In [69]:
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from typing import Tuple, List

In [70]:
np.random.seed(42)

In [71]:
##################################################################
# Starter code for exercise 5: Logistic Model for Argument Quality
##################################################################

GROUP = "04"  # TODO: write in your group number

In [72]:
def load_feature_vectors(filename: str) -> np.array:
    """
    Load the feature vectors from the dataset in the given file and return
    them as a numpy array with shape (number-of-examples, number-of-features + 1).
    """
    # TODO: Your code here
    # load_feature_vectors reads feature vectors from a features-*-cleaned.tsv and returns the contained multiset of feature vectors X as an n-by-(p+1) matrix
    data = pd.read_csv(filename, delimiter='\t')
    features = np.hstack([np.ones((data.shape[0], 1)), data.values])
    return features

# # print features
# print(load_feature_vectors("features-train-cleaned.tsv"))


In [73]:
def load_class_values(filename: str) -> np.array:
    """
    Load the class values for overall quality (class 0 for quality 1 and class 1
    for overall quality 2 or 3) from the dataset in the given file and return
    them as a one-dimensional numpy array.
    """
    # TODO: Your code here
    # Load data
    data = pd.read_csv(filename, delimiter='\t')

    # Check if 'overall quality' column exists
    if 'overall quality' not in data.columns:
        raise KeyError("Column 'overall quality' not found in the dataset.")

    # Create class values based on the 'overall quality' column
    classes = np.where(data['overall quality'] > 1, 1, 0)
    return classes
    

In [74]:
def misclassification_rate(cs: np.array, ys: np.array) -> float:
    """
    This function takes two vectors with gold and predicted labels and
    returns the percentage of positions where truth and prediction disagree
    """
    if len(cs) == 0:
        return float('nan')
    else:
        # TODO: Your code here
        return np.sum(cs) / len(cs)

In [75]:
def logistic_function(w: np.array, x: np.array) -> float:
    """
    Return the output of a logistic function with parameter vector `w` on
    example `x`.
    Hint: use np.exp(np.clip(..., -30, 30)) instead of np.exp(...) to avoid
    divisions by zero
    """
    # TODO: Your code here
    z = np.clip(np.dot(x, w.T), -30, 30)
    return 1 / (1 + np.exp(-z))

In [76]:
def logistic_prediction(w: np.array, x: np.array) -> float:
    """
    Making predictions based on the output of the logistic function
    """
    # TODO: Your code here
    return np.round(logistic_function(w, x))

In [77]:
def initialize_random_weights(p: int) -> np.array:
    """
    Generate a pseudorandom weight vector of dimension p.
    """
    # TODO: Your code here
    return np.random.uniform(-1, 1, (p + 1, 1))

In [78]:
def logistic_loss(w: np.array, x: np.array, c: int) -> float:
    """
    Calculate the logistic loss function
    """
    # TODO: Your code here
    return -np.sum(c * np.log(logistic_function(w, x)) + (1 - c) * np.log(1 - logistic_function(w, x)))

In [79]:
def train_logistic_regression_with_bgd(xs: np.array, cs: np.array, eta: float=1e-8, iterations: int=1000, validation_fraction: float=0) -> Tuple[np.array, float, float]:
    """
    Fit a logistic regression model using the Batch Gradient Descent algorithm and
    return the learned weights as a numpy array.

    Arguments:
    - `xs`: feature vectors in the training dataset as a two-dimensional numpy array with shape (n, p+1)
    - `cs`: class values c(x) for every element in `xs` as a one-dimensional numpy array with length n
    - `eta`: the learning rate as a float value
    - `iterations': the number of iterations to run the algorithm for
    - 'validation_fraction': fraction of xs and cs used for validation (not for training)

    Returns:
    - the learned weights as a column vector, i.e. a two-dimensional numpy array with shape (1, p)
    - logistic loss value
    - misclassification rate of predictions on training part of xs/cs
    - misclassification rate of predictions on validation part of xs/cs
    """
    # TODO: Your code here
     # Initialize weights
    w = initialize_random_weights(xs.shape[1])

    # Your implementation for BGD training

    loss = logistic_loss(w, xs, cs)
    train_misclassification_rate = misclassification_rate(logistic_prediction(w, xs), cs)
    
    # Define validation_xs and validation_cs here
    validation_xs = ...
    validation_cs = ...
    
    validation_misclassification_rate = misclassification_rate(logistic_prediction(w, validation_xs), validation_cs)

    return w, loss, train_misclassification_rate, validation_misclassification_rate

In [80]:
def plot_loss_and_misclassification_rates(losss: List[float], train_misclassification_rates: List[float], validation_misclassification_rates: List[float]):
    """
    Plots the normalized loss (divided by max(losss)) and both misclassification rates
    for each iteration.
    """
    # TODO: Your code here
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(losss / np.max(losss), label='Normalized Loss')
    plt.title('Normalized Loss Over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('Normalized Loss')

    plt.subplot(1, 2, 2)
    plt.plot(train_misclassification_rates, label='Training Misclassification Rate')
    plt.plot(validation_misclassification_rates, label='Validation Misclassification Rate')
    plt.title('Misclassification Rates Over Iterations')
    plt.xlabel('Iteration')
    plt.ylabel('Misclassification Rate')
    plt.legend()

    plt.show()

In [81]:
########################################################################
# Tests
import os
from pytest import approx


def test_logistic_function():
    x = np.array([1, 1, 2])
    assert logistic_function(np.array([0, 0, 0]), x) == approx(0.5)
    assert logistic_function(np.array([1e2, 1e2, 1e2]), x) == approx(1)
    assert logistic_function(np.array([-1e2, -1e2, -1e2]), x) == approx(0)
    assert logistic_function(np.array([1e2, -1e2, 0]), x) == approx(0.5)


def test_bgd():
    xs = np.array([
        [1, -1],
        [1, 2],
        [1, -2],
    ])
    cs = np.array([0, 1, 0])
    
    w, _, _, _ = train_logistic_regression_with_bgd(xs, cs, 0.1, 100)
    assert w @ [1, -1] < 0 and w @ [1, 2] > 0
    w, _, _, _ = train_logistic_regression_with_bgd(-xs, cs, 0.1, 100)
    assert w @ [1, -1] > 0 and w @ [1, 2] < 0



########################################################################
# Main program for running against the training dataset

if __name__ == "__main__":
    import pandas as pd
    import pytest
    import sys

    # Set your file names here (adjust as needed)
    train_features_file_name = "features-train-cleaned.tsv"
    train_classes_file_name = "quality-scores-train-cleaned.tsv"
    test_features_file_name = "features-test-cleaned.tsv"
    test_predictions_file_name = "quality-scores-test-predicted.tsv"

    # Load data
    xs = load_feature_vectors(train_features_file_name)
    xs_test = load_feature_vectors(test_features_file_name)
    cs = load_class_values(train_classes_file_name)

    print("(a)")
    # TODO: Print number of examples with each class

    print("(b)")
    # TODO: Print misclassification rate of random classifier

    print("(c)")
    test_c_result = pytest.main(['-k', 'test_logistic_function', '--tb=short', 'Classifier.ipynb'])
    if test_c_result != 0:
        sys.exit(test_c_result)
    print("Test logistic function successful")

    print("(d)")
    test_d_result = pytest.main(['-k', 'test_bgd', '--tb=short', 'Classifier.ipynb'])
    if test_d_result != 0:
        sys.exit(test_d_result)
    print("Test bgd successful")

    print("(e)")
    w, losss, train_misclassification_rates, validation_misclassification_rates = train_logistic_regression_with_bgd(xs, cs, validation_fraction=0.2)
    plot_loss_and_misclassification_rates(losss, train_misclassification_rates, validation_misclassification_rates)

    print("(f)")
    # (f) Predict on test set and write to test_predictions_file_name
    w, _, _, _ = train_logistic_regression_with_bgd(xs, cs, validation_fraction=0.2)
    predictions = np.round(logistic_function(w, xs_test))

    # Write predictions to file
    pd.DataFrame(predictions).to_csv(test_predictions_file_name, header=False, index=False)

(a)
(b)
(c)
platform win32 -- Python 3.11.3, pytest-7.4.3, pluggy-1.3.0
rootdir: e:\Hobby_Projects\Machine Learning\Introduction-to-Machine-Learning-\Logistic Regression Classifier\ml23-ex3-data
plugins: anyio-3.6.2, typeguard-2.13.3
collected 0 items



[31mERROR: found no collectors for e:\Hobby_Projects\Machine Learning\Introduction-to-Machine-Learning-\Logistic Regression Classifier\ml23-ex3-data\Classifier.ipynb
[0m


SystemExit: 4

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
