# COMPSCI 389: Introduction to Machine Learning
Data Cleaning Introduction

The code below runs gradient descent to minimize the sample mean squared error when using a linear parametric model, with the second-degree (order) polynomial basis. The first code block defines the various functions for this. I recommend skipping down to the next markdown block.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Function to calculate mean squared error (for evaluation)
def mean_squared_error(predictions, labels):
    """Return the mean of the elementwise squared differences.

    Args:
        predictions: Predicted values; must support elementwise subtraction
            with ``labels`` (for example NumPy arrays).
        labels: Target values to compare against.

    Returns:
        ``np.mean`` of ``(predictions - labels) ** 2``.
    """
    residuals = predictions - labels
    squared_residuals = residuals ** 2
    return np.mean(squared_residuals)

# Function to calculate gradients
def compute_gradients(X, y, weights):
    """Return the gradient of the mean squared error with respect to ``weights``.

    With predictions ``X.dot(weights)`` and ``n = X.shape[0]``, the value
    returned is ``(2 / n) * X.T.dot(X.dot(weights) - y)``.

    Args:
        X: Feature matrix with one row per sample.
        y: Target values, one per row of ``X``.
        weights: Current weight vector, one entry per column of ``X``.

    Returns:
        The gradient, with one entry per weight.
    """
    n_samples = X.shape[0]
    residuals = X.dot(weights) - y
    scale = 2 / n_samples
    return scale * X.T.dot(residuals)

class PolynomialRegressionGD(BaseEstimator):
    """Regressor that is linear in its weights over polynomial features.

    Inputs are expanded with ``PolynomialFeatures`` and the weights are
    updated for a fixed number of gradient-descent iterations, using
    ``compute_gradients`` for the update direction and
    ``mean_squared_error`` for the reported loss.
    """

    def __init__(self, learning_rate, iterations=1000, polynomial_degree=2):
        """Store the hyperparameters under the same names as the arguments.

        Args:
            learning_rate: Step size multiplied into each gradient update.
            iterations: Number of gradient-descent updates run by ``fit``.
            polynomial_degree: Degree passed to ``PolynomialFeatures``.
        """
        self.polynomial_degree = polynomial_degree
        self.iterations = iterations
        self.learning_rate = learning_rate

    def fit(self, X, y):
        """Fit the weights to ``(X, y)`` by gradient descent.

        Prints the loss before the first update and after every update.
        The post-update losses are also stored in ``self.loss_history``
        (the initial loss is printed but not stored).

        Args:
            X: Raw input features, one row per sample.
            y: Target values, one per row of ``X``.

        Returns:
            This estimator.
        """
        # Keep the fitted transformer so predict() can apply the same expansion
        self.poly = PolynomialFeatures(degree=self.polynomial_degree)
        phi = self.poly.fit_transform(X)

        # One weight per expanded feature, all starting at zero
        self.loss_history = []
        self.weights = np.zeros(phi.shape[1])

        # Report the loss of the all-zero starting weights
        loss = mean_squared_error(phi.dot(self.weights), y)
        print(f"Iteration 0/{self.iterations}, Loss: {loss:.4f}")

        for i in range(1, self.iterations + 1):
            # Step against the gradient, updating the weight array in place
            self.weights -= self.learning_rate * compute_gradients(phi, y, self.weights)

            # The loss is only tracked for reporting; the update does not use it
            loss = mean_squared_error(phi.dot(self.weights), y)
            self.loss_history.append(loss)
            print(f"Iteration {i}/{self.iterations}, Loss: {loss:.4f}")

        return self

    def predict(self, X):
        """Return predictions for ``X`` using the fitted transformer and weights."""
        return self.poly.transform(X).dot(self.weights)

# Read the GPA data set from disk
df = pd.read_csv("data/GPA.csv", delimiter=",")

# Every column except the last is a feature; the last column is the label
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Shuffle the rows and hold out 40% of them as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
)

def run(alpha):
    """Train the degree-2 polynomial model with step size ``alpha`` and report results.

    Fits ``PolynomialRegressionGD`` on the module-level ``X_train`` and
    ``y_train`` for 1000 iterations, plots the per-iteration training loss
    on a logarithmic y-axis, then prints the test-set MSE and its standard
    error computed from the module-level ``X_test`` and ``y_test``.

    Args:
        alpha: Learning rate passed to the model.
    """
    polynomial_degree = 2
    iterations = 1000

    # Build the model and train it; fit() returns the estimator itself
    model = PolynomialRegressionGD(
        learning_rate=alpha,
        iterations=iterations,
        polynomial_degree=polynomial_degree,
    ).fit(X_train, y_train)

    # Training loss after each update, on a log-scaled y-axis
    steps = range(1, iterations + 1)
    plt.plot(steps, model.loss_history)
    plt.xlabel('Iterations')
    plt.ylabel('Mean Squared Error')
    plt.yscale('log')
    plt.title(f'Gradient Descent Loss, Polynomial Degree: {polynomial_degree}')
    plt.show()

    # Evaluate on the held-out test set
    predictions = model.predict(X_test)
    mse_test = mean_squared_error(predictions, y_test)
    print(f"Test MSE: {mse_test:.4f}")

    # Standard error: spread of the per-sample squared errors over sqrt(count)
    squared_errors = (predictions - y_test) ** 2
    n_test = len(squared_errors)
    std_error = np.std(squared_errors) / np.sqrt(n_test)
    print(f"Standard Error of MSE: {std_error:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: 'data/GPA.csv'

The `run` function takes the step size (learning rate) `alpha` as its one argument. It then runs 1,000 iterations of gradient descent on the GPA data set using the second-degree polynomial basis. Let's recreate the plot from the last lecture!

(Note: `nan` and `inf` values can trigger runtime warnings and plotting errors, so don't worry if you see some in the output below.)

In [None]:
# Step size (learning rate) handed to run(). With this value the recorded
# output below shows the loss growing every iteration until it becomes nan.
alpha = 0.1
run(alpha)

Iteration 0/1000, Loss: 8.4292
Iteration 1/1000, Loss: 8699896084662712010801152.0000
Iteration 2/1000, Loss: 9927913236260449382623519869853028519575178706944.0000
Iteration 3/1000, Loss: 11329280668531715668446261968916915601861795744246812403193416257130987520.0000
Iteration 4/1000, Loss: 12928457111081828069683890035613448549819036535784470427398402767141167226351588060354875706310656.0000
Iteration 5/1000, Loss: 14753364151119199753114205040137636889869226880692379199057992888268351069036067823003692500014127297279929631793028792320.0000
Iteration 6/1000, Loss: 16835864628344307272589475773604450362611855240535958011321867455436839348556665472275716145363445910094398902967598946778284926697114636172394496.0000
Iteration 7/1000, Loss: 19212318958617501406591063372808495341392062396435145613797854227420322806597646929659828012963872468477382061532310091188202202126624261065043257210910732416153787826176.0000
Iteration 8/1000, Loss: 219242199860782063160337593775131316263219834323786

  self.weights -= self.learning_rate * gradients


Iteration 146/1000, Loss: nan
Iteration 147/1000, Loss: nan
Iteration 148/1000, Loss: nan
Iteration 149/1000, Loss: nan
Iteration 150/1000, Loss: nan
Iteration 151/1000, Loss: nan
Iteration 152/1000, Loss: nan
Iteration 153/1000, Loss: nan
Iteration 154/1000, Loss: nan
Iteration 155/1000, Loss: nan
Iteration 156/1000, Loss: nan
Iteration 157/1000, Loss: nan
Iteration 158/1000, Loss: nan
Iteration 159/1000, Loss: nan
Iteration 160/1000, Loss: nan
Iteration 161/1000, Loss: nan
Iteration 162/1000, Loss: nan
Iteration 163/1000, Loss: nan
Iteration 164/1000, Loss: nan
Iteration 165/1000, Loss: nan
Iteration 166/1000, Loss: nan
Iteration 167/1000, Loss: nan
Iteration 168/1000, Loss: nan
Iteration 169/1000, Loss: nan
Iteration 170/1000, Loss: nan
Iteration 171/1000, Loss: nan
Iteration 172/1000, Loss: nan
Iteration 173/1000, Loss: nan
Iteration 174/1000, Loss: nan
Iteration 175/1000, Loss: nan
Iteration 176/1000, Loss: nan
Iteration 177/1000, Loss: nan
Iteration 178/1000, Loss: nan
Iteration 

  ticklocs = b ** decades


OverflowError: cannot convert float infinity to integer

<Figure size 640x480 with 1 Axes>

Test MSE: nan
Standard Error of MSE: nan


**Question**: What went wrong, and how can we fix it?