In [None]:
'''Imports'''
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
import matplotlib.pyplot as plt
# Shared seed: passed as random_state to both train_test_split calls and to the
# SGDRegressor so that the splits and the SGD run are repeatable.
randomstate = 42

In [None]:
'''Hyperparameters'''
# NOTE(review): none of these values is consumed in the cells visible here;
# presumably they are passed to knn(neighbours) and sgd(alpha_, learning_rate_,
# epochs_) at call sites elsewhere in the notebook — confirm.
hp_nearest_neighbours = 20  # presumably the neighbour count for knn()
hp_alpha = 0.01  # presumably the regularisation strength (alpha_) for sgd()
hp_learning_rate = 0.001  # presumably the constant step size (learning_rate_) for sgd()
hp_epochs = 30  # presumably the number of training passes (epochs_) for sgd()

In [None]:
'''Pre-Processing'''
# Load the raw table and discard the nationality column before modelling.
data = pd.read_csv('football_wages.csv')
cleaned_data = data.drop(columns=["nationality_name"])

# Target is the log_wages column; every other remaining column is a feature.
y = cleaned_data["log_wages"].to_numpy()
X = cleaned_data.drop(columns=["log_wages"]).to_numpy()

# First split: 80 % train / 20 % held out.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.2, random_state=randomstate
)
# Second split: the held-out part is halved into test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_test_val, y_test_val, test_size=0.5, random_state=randomstate
)

In [None]:
def pipeline_standard():
    """Standardise every feature matrix with a StandardScaler fitted on X_train.

    Reads the module-level X_train, X_test, X_val and autograder_data_numpy.
    NOTE(review): autograder_data_numpy is not defined in the cells shown
    here; it must exist at module level before this function is called.

    Returns:
        tuple: (train, test, validation, autograder) scaled arrays, in that
        order.
    """
    standardiser = StandardScaler()
    # Only the training split is used to fit the scaler.
    train_scaled = standardiser.fit_transform(X_train)
    # The remaining matrices are transformed with the already-fitted scaler.
    test_scaled, val_scaled, autograder_scaled = map(
        standardiser.transform, (X_test, X_val, autograder_data_numpy)
    )
    return train_scaled, test_scaled, val_scaled, autograder_scaled

def pipeline_minmax():
    """Rescale the feature matrices with a MinMaxScaler fitted on X_train.

    Reads the module-level X_train, X_test and X_val.

    Returns:
        tuple: (train, test, validation) scaled arrays, in that order.
    """
    minmax = MinMaxScaler()
    # Only the training split is used to fit the scaler.
    scaled_train = minmax.fit_transform(X_train)
    # Test and validation reuse the fitted scaler rather than refitting.
    scaled_test, scaled_val = (minmax.transform(split) for split in (X_test, X_val))
    return scaled_train, scaled_test, scaled_val

# Active scaling choice is min-max. These module-level arrays are the ones
# read by baseline_model(), knn() and sgd().
X_train_scaled, X_test_scaled, X_val_scaled  = pipeline_minmax()

In [None]:
'''Dummy'''
def baseline_model():
    """Score a median-predicting DummyRegressor on the test split.

    Fits on the module-level X_train_scaled / y_train and evaluates on
    X_test_scaled / y_test.

    Returns:
        The mean absolute error of the dummy predictions on the test split.
    """
    median_baseline = DummyRegressor(strategy='median').fit(X_train_scaled, y_train)
    test_predictions = median_baseline.predict(X_test_scaled)
    return mean_absolute_error(y_test, test_predictions)

In [None]:
'''KNN'''
def knn(neighbours):
    """Fit a k-nearest-neighbours regressor and score it on test and validation.

    Fits on the module-level X_train_scaled / y_train, then evaluates on the
    scaled test and validation arrays against their own targets.

    Args:
        neighbours: value passed as n_neighbors to KNeighborsRegressor.

    Returns:
        tuple: (mae_test, mae_val) mean absolute errors on the test and
        validation splits respectively.
    """
    # Local is named `model` so it does not shadow this function's own name.
    model = KNeighborsRegressor(n_neighbors=neighbours)
    model.fit(X_train_scaled, y_train)

    mae_knn_test = mean_absolute_error(y_test, model.predict(X_test_scaled))
    # Validation predictions are scored against the validation targets (y_val);
    # scoring them against y_test compares unrelated rows.
    mae_knn_val = mean_absolute_error(y_val, model.predict(X_val_scaled))
    return mae_knn_test, mae_knn_val

In [None]:
'''SGD'''
def sgd(alpha_, learning_rate_, epochs_):
    """Train an SGDRegressor one pass at a time and track its test-split MAE.

    Fits on the module-level X_train_scaled / y_train with an
    epsilon-insensitive loss and a constant learning rate, calling
    partial_fit once per epoch.

    Args:
        alpha_: regularisation strength passed as `alpha`.
        learning_rate_: constant step size passed as `eta0`.
        epochs_: number of partial_fit passes over the training split; must
            be at least 1.

    Returns:
        tuple: (mae_sgd_test, mae_sgd_list, mae_sgd_val, model) where
        mae_sgd_test is the test MAE after the final epoch, mae_sgd_list holds
        the test MAE after every epoch, mae_sgd_val is the validation MAE of
        the final model, and model is the trained SGDRegressor.

    Raises:
        ValueError: if epochs_ is smaller than 1 (no model would be trained
            and there would be nothing to report).
    """
    if epochs_ < 1:
        raise ValueError(f"epochs_ must be at least 1, got {epochs_}")

    # Local is named `model` so it does not shadow this function's own name.
    model = SGDRegressor(
        loss='epsilon_insensitive',
        alpha=alpha_,
        learning_rate='constant',
        eta0=learning_rate_,
        # Per the scikit-learn docs, partial_fit performs a single pass per
        # call; warm_start / max_iter are kept so the returned estimator is
        # configured exactly as before if a caller later uses .fit() on it.
        warm_start=True,
        max_iter=1,
        random_state=randomstate,
    )

    mae_sgd_list = []
    for _ in range(epochs_):
        model.partial_fit(X_train_scaled, y_train)
        mae_sgd_list.append(mean_absolute_error(y_test, model.predict(X_test_scaled)))

    mae_sgd_test = mae_sgd_list[-1]
    # Only the final model's validation score is returned, so it is computed
    # once here instead of being recomputed and discarded on every epoch.
    mae_sgd_val = mean_absolute_error(y_val, model.predict(X_val_scaled))
    return mae_sgd_test, mae_sgd_list, mae_sgd_val, model

In [None]:
'''Plotter'''
def plot(x, y, title="Line Plot", xlabel="X", ylabel="Y"):
    """Draw a single line chart of y against x and display it.

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis.
        title: text shown above the chart.
        xlabel: label of the horizontal axis.
        ylabel: label of the vertical axis.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    axes.plot(x, y, alpha=0.7)
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    axes.grid(True)
    plt.show()

In [None]:
'''Hyperparameter Analysis'''
def knn_neighbours(max_neighbours=99):
    """Sweep the neighbour count for knn() and return the best-scoring value.

    Calls knn(k) for every k in 1..max_neighbours and keeps the k with the
    lowest first element of knn's result (the test-split MAE). Ties go to the
    smallest k.

    NOTE(review): selection is done on the test-split MAE; consider selecting
    on the validation MAE (second element returned by knn) so the test split
    stays untouched for the final evaluation.

    Args:
        max_neighbours: largest neighbour count to try (inclusive). Defaults
            to 99, which reproduces the original sweep over range(1, 100).

    Returns:
        tuple: (best_num_neighbours, best_mae).

    Raises:
        ValueError: if max_neighbours is smaller than 1.
    """
    if max_neighbours < 1:
        raise ValueError(f"max_neighbours must be at least 1, got {max_neighbours}")

    candidate_ks = list(range(1, max_neighbours + 1))
    test_maes = [knn(k)[0] for k in candidate_ks]

    # min() returns the first minimal position, so ties resolve to the smallest k.
    best_position = min(range(len(test_maes)), key=test_maes.__getitem__)
    return candidate_ks[best_position], test_maes[best_position]