#Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

In [2]:
# mount google drive
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


#Preprocessing

In [None]:
# read data into a pandas dataframe
df = pd.read_csv("/content/drive/MyDrive/Fall 2024/ml/epl_dataset.csv")

# remove players that have had less than 19 appearances (half of a season)
df = df[df['Appearances'] >= 19]

# remove features that are not useful for the model/too similar to target variable
df = df.drop(['Name', 'Goals', 'Appearances', 'Headed goals', 'Goals with right foot', 'Goals with left foot',
            'Penalties scored', 'Freekicks scored'], axis=1)

# remove goalkeepers and defenders
df = df[~df['Position'].isin(['Goalkeeper', 'Defender'])]

# dummy variable encode the position, club, and nationality columns
df = pd.get_dummies(df, columns=['Position', 'Club', 'Nationality',], drop_first=True, dtype=int)

# list of columns that contain percentages
percentage_columns = ['Shooting accuracy %', 'Cross accuracy %', 'Tackle success %']
# convert all percentage columns to decimal values (0 to 1)
for column in percentage_columns:
    df[column] = df[column].str.replace('%', '').astype(float) / 100

# fill any missing values with 0
df.fillna(0, inplace=True)

# replace values greater than 1 in the goals per match column with 0
df['Goals per match'] = df['Goals per match'].apply(lambda x: 0 if x > 1 else x)

# save the modified dataset
df.to_csv("/content/drive/MyDrive/Fall 2024/ml/epl_data_linear_processed.csv", index=False)

#Logistic Regression

In [3]:
df = df = pd.read_csv("/content/drive/MyDrive/Fall 2024/ml/epl_data_linear_processed.csv")
df = df.drop(columns=['Saves', 'Penalties saved', 'Punches', 'High Claims', 'Catches', 'Sweeper clearances', 'Throw outs', 'Goal Kicks'])

X = df.drop(columns= ['Position_Midfielder'])
y = df['Position_Midfielder'].values

print(X.shape)
print(y.shape)

(217, 104)
(217,)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)
scaler = StandardScaler()

# scale both the traing and test data using the fitted scaler.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape `y_train` into 2D array, and `y_test` into 2D array
y_2d_train = y_train.reshape(-1, 1)
y_2d_test = y_test.reshape(-1, 1)

print('X_train.shape: ', X_train.shape)
print('y_2d_train.shape: ', y_train.shape)

X_train.shape:  (162, 104)
y_2d_train.shape:  (162,)


In [36]:
# Appending a column of ones to X_train
ones = np.ones((X_train.shape[0],1))
X_train_1 = np.append(ones, X_train, axis=1)
# X_train_1 = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
# X_test_1 = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

print("The training data has dimensions: ", X_train_1.shape)

# Looking at the first two rows of X_train to check everything worked as expected
print(X_train_1[0:2])
print('Q03 - X_train_1.shape: ', X_train_1.shape)
print('Q03 - X_train_1: ', X_train_1)

The training data has dimensions:  (162, 105)
[[ 1.          1.00167902  0.66808172 -0.4986254   0.21907797 -0.93797411
  -0.44542142 -0.60908518 -1.66538242 -0.42215042 -0.39323321 -0.07881104
  -0.07881104  0.29407632  0.62121435  0.         -0.46575571  0.6294109
   0.53770349  0.30181756  0.          0.78579004  0.44490083  0.42609804
   0.53335897  0.61118242  0.22835837 -0.07881104  1.43337992 -0.31272263
   0.30800225  1.03900811 -0.45839391 -0.41090992  0.19324099 -0.22881762
   0.51163025 -0.33439556  0.59160798  0.08989481 -0.55464632 -0.19611614
   3.89871774 -0.21251186 -0.24253563 -0.2699031  -0.2699031  -0.21251186
  -0.1118034  -0.2699031  -0.22792115 -0.22792115 -0.21251186 -0.25649459
  -0.19611614 -0.19611614 -0.24253563 -0.21251186 -0.22792115 -0.17845765
  -0.1118034  -0.19611614 -0.07881104 -0.24253563  0.         -0.07881104
  -0.13736056 -0.07881104  0.         -0.1118034  -0.13736056 -0.7367884
  -0.25649459 -0.1118034  -0.13736056 -0.13736056 -0.07881104 -0.078

In [22]:
# sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Initialize w_init
w_init = np.zeros((X_train_1.shape[1],1))

# hypothesis function
def hypothesis(X, w):
    return sigmoid(np.dot(X,w))

y_hat_init = hypothesis(X_train_1,w_init)

In [38]:
# log likelihood function
from sklearn.model_selection import KFold
def log_likelihood(X, y, w, l):
    h = hypothesis(X, w)
    h = np.clip(h, 1e-15, 1 - 1e-15)  # Clip to prevent log(0) issues
    a = y * np.log(h)
    b = (1 - y) * np.log(1 - h)
    # Add the ridge regularization term
    ridge_penalty = (l / 2) * np.sum(w[1:] ** 2)
    return np.sum(a + b) - ridge_penalty

def cross_validate(X, y, lambda_values, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    best_lambda = None
    best_score = -np.inf

    for lambda_ in lambda_values:
        total_log_likelihood = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Initialize weights
            w = np.zeros(X_train.shape[1])

            # Perform gradient ascent to optimize weights
            for _ in range(100):
                gradient = np.dot(X_train.T, (y_train - hypothesis(X_train, w))) - lambda_ * np.append(0, w[1:])
                w += 0.01 * gradient  # Update weights with a small learning rate

            # Calculate log likelihood on the test set
            log_likelihood_value = log_likelihood(X_test, y_test, w, lambda_)
            total_log_likelihood += log_likelihood_value

        avg_log_likelihood = total_log_likelihood / n_splits
        if avg_log_likelihood > best_score:
            best_score = avg_log_likelihood
            best_lambda = lambda_
    return best_lambda

lambda_values = np.logspace(-4, 4, 10)  # range of lambda values
optimal_lambda = cross_validate(X_train_1, y_train, lambda_values)
# optimal_lambda = cross_validate(X, y, lambda_values)
print("Optimal λ:", optimal_lambda)
print('Q08 - likelihood: ', log_likelihood(X_train_1, y_2d_train, w_init, 0.0001))

  return 1 / (1 + np.exp(-z))


Optimal λ: 0.0001
Q08 - likelihood:  -112.28984325071114


  ridge_penalty = (l / 2) * np.sum(w[1:] ** 2)


In [39]:
# Gradient ascent function
l = 0.0001
def Gradient_Ascent(X, y, learning_rate, num_iters):
    N = X.shape[0]
    w = np.zeros((X.shape[1],1))
    log_likelihood_values = []

    # Gradient Ascent - local optimization technique
    for i in range(num_iters):
        # Computing log likelihood of seeing examples for current value of w
        if (i % 10) == 0:
            log_likelihood_values.append(log_likelihood(X, y, w, l))
            print(log_likelihood(X, y, w, l))
        error = y - hypothesis(X, w)
        gradient = np.dot(X.T, error)
        # Update w
        w += learning_rate * gradient

    return w, log_likelihood_values

learning_rate = 0.01
num_iters = 10000
# Calculate w and likelihood values using Gradient_Ascent with X_train_1, y_2d_train
w, log_likelihood_values = Gradient_Ascent(X_train_1, y_2d_train, learning_rate, num_iters)
print(w, log_likelihood_values)

-112.28984325071114
-6.213868418944692
-4.3216634269388
-3.3998258455994868
-2.839805746711626
-2.457779690652727
-2.177648424269908
-1.961859851445975
-1.789603578379056
-1.6483436044574182
-1.5300374398126748
-1.429265421355219
-1.3422287016312713
-1.266177403974138
-1.1990669156023945
-1.1393422022184911
-1.0857974431811201
-1.0374817979812783
-0.9936344192752049
-0.953638572020812
-0.9169885662230886
-0.8832654846972157
-0.8521190732616309
-0.8232540291460994
-0.7964194810109406
-0.7714008200424125
-0.7480132867646396
-0.7260968854242615
-0.7055123137539764
-0.686137677551977
-0.6678658177908133
-0.6506021201118934
-0.6342627074023062
-0.6187729389709157
-0.6040661569009341
-0.5900826330333171
-0.5767686798428249
-0.5640758960008754
-0.5519605232507464
-0.5403828957683748
-0.5293069667539754
-0.518699899823746
-0.5085317150176568
-0.4987749810375568
-0.48940454677702666
-0.480397306375464
-0.4717319929813457
-0.46338899718793786
-0.45535020674375953
-0.44759886466717014
-0.44011944

In [40]:
def predict_class(X, w, t):
    p = hypothesis(X, w)
    return (p >= t).astype(int)

In [41]:
ones = np.ones((X_test.shape[0],1))
X_test_1 = np.append(ones, X_test, axis=1)
y_hat = predict_class(X_test_1, w, 0.5)

# precision_recall function
def precision_recall(y_hat, y, threshold):

    false_pos = np.sum((y_hat == 1) & (y == 0))
    false_neg = np.sum((y_hat == 0) & (y == 1))

    true_pos = np.sum((y_hat == 1) & (y == 1))

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    return precision,recall

# Calculate precision and recall using on the test data where the threshold is 0.5

precision, recall = precision_recall(y_hat, y_test, 0.5)

print('precision: ', precision)
print('recall: ', recall)

precision:  0.6909090909090909
recall:  0.6727272727272727


In [48]:
# F1_score function
def f1_score(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

print('f1:', f1_score(precision, recall))

f1: 0.6816969696969697


In [43]:
# Sklearn's implementation of Logistic regression
from sklearn import linear_model
logreg = linear_model.LogisticRegression(penalty = None)
logreg.fit(X_train, y_train)

In [44]:
# Print out all the coefficients
w_logreg = logreg.coef_
intercept_logreg = logreg.intercept_
print('w_logreg: ', w_logreg)
print('intercept_logreg: ', intercept_logreg)

w_logreg:  [[-3.00969845 -0.58540522 -0.62896965 -1.38204254 -1.71398859 -1.04950391
  -1.02170811 -1.76582446 -0.41072082 -1.0797681  -0.30041281 -0.30041281
  -0.05850586  4.89038849  0.         -0.73188399 -0.33821747 -0.50018127
  -0.56378822  0.          0.40019479  0.63096763  0.96224875  0.59441381
   0.08818973  0.85720687 -0.30041281 -0.21549612  0.49672954 -0.19118246
   4.25764419  0.56433779  0.25330011  3.74872327  0.52108098  0.00594755
   1.09131844 -0.49290402 -0.71942015 -0.75955849  0.43566872 -1.03443314
   1.51198663 -0.3299345  -0.47890787 -0.57633837 -0.00750818 -0.55727617
   2.95103011 -0.4689339   0.90280956 -0.23735814 -0.71580066 -0.8370866
   0.04746389  0.25646229 -0.94956644 -0.92082637  0.22288281 -0.3312295
  -0.57639267  0.02850465 -1.32650678  0.         -0.26419471 -0.92722513
  -0.20668305  0.         -0.48765066  1.17254391 -0.070107   -0.87612475
  -0.16148738 -0.05247328  0.47179994  0.23596025  0.34553252 -0.37413466
  -0.45192095 -0.3297075  -0.

In [45]:
# predicted values on test set (X_test not X_test_1) using logreg.predict
y_hat_logreg = logreg.predict(X_test)

# accuracy achieved on test set using logreg.score and y_test
acc_logreg = logreg.score(X_test, y_test)

print("Accuracy on training data = %f" % acc_logreg)

Accuracy on training data = 0.945455


In [46]:
from sklearn.metrics import precision_recall_fscore_support
# Precision, recall and fscore using precision_recall_fscore_support method of sklearn

prec, recal, fscore, sup = precision_recall_fscore_support(y_test, y_hat_logreg, average='binary')
print('prec: ', prec)
print('recal: ', recal)
print('fscore: ', fscore)

prec:  0.972972972972973
recal:  0.9473684210526315
fscore:  0.96
