In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv("./../Data/Final Data/all_combined.csv")
labels = pd.read_csv("./../Data/Final Data/labels.csv")
data['label'] = labels['label']
# .set_index(["teamName", "year"]).sort_index()

In [3]:
data_0 = data[data['label'] == 0]
data_1 = data[data['label'] == 1]

# print(data_0)
# print(data_1)

training_data_0 = data_0.sample(frac=0.8)
testing_data_0 = data_0.drop(training_data_0.index)

training_data_1 = data_1.sample(frac=0.8)
testing_data_1 = data_1.drop(training_data_1.index)

# print("data class 0")
# print(training_data_0)
# print(testing_data_0)

# print("data class 1")
# print(training_data_1)
# print(testing_data_1)

X = training_data_0.append(training_data_1, ignore_index=True)
# X_train = X.loc[:, X.columns != 'label']
X_train = X.loc[:, ~X.columns.isin(['teamName', 'year', 'label', 'NIT', 'R64', 'R32', 'S16', 'E8', 'F4', 'F2', 'CHMP'])]
X_label = X['label']

# print("training")
# print(X)

Y = testing_data_0.append(testing_data_1, ignore_index=True)
# Y_test = Y.loc[:, Y.columns != 'label']
Y_test = Y.loc[:, ~Y.columns.isin(['teamName', 'year', 'label', 'NIT', 'R64', 'R32', 'S16', 'E8', 'F4', 'F2', 'CHMP'])]
Y_label = Y['label']
print(str(X_label.shape) + " " + str(Y_label.shape))

# print("testing")
# print(Y)

print(X_train.shape)
print(Y_test.shape)

(2514,) (628,)
(2514, 13)
(628, 13)


In [4]:
print(X[X['label'] == 0].loc[:, 'teamName'])

0           UT Arlington
1       Tennessee Martin
2         Louisiana Tech
3              Weber St.
4       Eastern Kentucky
              ...       
2019             Seattle
2020     Bethune Cookman
2021      South Carolina
2022    Prairie View A&M
2023           Milwaukee
Name: teamName, Length: 2024, dtype: object


In [5]:
class LogisticRegression:
    
    # set parameters - learning rate, number ot iterations, bias, 
    # and verbose which says whether to print anything or not like, loss etc.
    def __init__(self, learning_rate=0.05, num_iterations=50000, fit_intercept=True, verbose=False):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    # function to define the Incercept value.
    def __bias(self, X):
        # set bias as 1
        bias = np.ones((X.shape[0], 1))
        # concat bias to data
        return np.concatenate((bias, X), axis=1)
    
    def __sigmoid_function(self, x):
        # sigmoid function to predicts yp
        return 1 / (1 + np.exp(-x))
    
    def __loss(self, yp, y):
        # minimize loss
        return (-y * np.log(yp) - (1 - y) * np.log(1 - yp)).mean()
    
    # training function
    def fit(self, X, y):
        
        # use bias if specified
        if self.fit_intercept:
            X = self.__bias(X)
        
        # initialize weights as 0 initially
        # self.W = np.zeros(X.shape[1])
        self.W = np.random.randn(X.shape[1])  # Randomly generate initial weight matrix with normal distribution (mu=0, sigma = 2)
        
        # run for number of iterations provided
        for i in range(self.num_iterations):
            
            z = np.dot(X, self.W)
            
            # prediction probabilities
            yp = self.__sigmoid_function(z)
            
            # calculate gradient
            gradient = np.dot(X.T, (yp - y)) / y.size
            
            # update W
            self.W -= self.learning_rate * gradient
            
            # new W * Xi
            z = np.dot(X, self.W)
            yp = self.__sigmoid_function(z)
            
            # calculate loss
            if len(yp) == 0 or len(y) == 0:
                print("Here")
            loss = self.__loss(yp, y)
            
            # to print loss with verbose
            if(self.verbose ==True and i % 1000 == 0):
                print(f'loss: {loss} \t')
    
    # predict the probabilities using W
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__bias(X)
        
        return self.__sigmoid_function(np.dot(X, self.W))
    
    # predict class from probabilities; less than 0.5 = 0 or more than 0.5 = 1
    def predict(self, X):
        return self.predict_prob(X).round()
    
    def set_learning_rate(self, lr):
        self.learning_rate = lr
        
    def set_num_iterations(self, itrs):
        self.num_iterations = itrs

In [6]:
model = LogisticRegression(learning_rate=0.05, num_iterations=10000, verbose=True)

In [7]:
%%time
'''model.set_learning_rate(0.1)
model.set_num_iterations(20000)'''
model.fit(X_train, X_label)

loss: 0.7608785775831106 	
loss: 0.3834704718694863 	
loss: 0.37432422295480594 	
loss: 0.3681994854690352 	
loss: 0.36353048782363495 	
loss: 0.35988633130265374 	
loss: 0.3570083947407293 	
loss: 0.354713868730809 	
loss: 0.35286855031536 	
loss: 0.35137225882310613 	
Wall time: 9.82 s


In [8]:
preds = model.predict(Y_test)
(preds == Y_label).mean()

0.8710191082802548

In [9]:
# Print confusion matrix
tn, fp, fn, tp = confusion_matrix(pd.Series(Y_label), preds).ravel()
print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ', fn, '\nTrue Positives: ', tp)

True negatives:  492 
False positives:  14 
False negatives:  67 
True Positives:  55


In [10]:
print("----------WEIGHTS----------")
print("Bias: \t\t\t" + str(model.W[0]))
for i in range(len(X_train.columns)):
    if X_train.columns[i] == "sos" or X_train.columns[i] == "stars" or X_train.columns[i] == "NIT":
        print(X_train.columns[i] + ": \t\t\t" + str(model.W[i+1]))
    elif X_train.columns[i] == "adjTempo" or X_train.columns[i] == "win ratio" or X_train.columns[i] == "seed_points":
        print(X_train.columns[i] + ": \t\t" + str(model.W[i+1]))
    else:
        print(X_train.columns[i] + ": \t" + str(model.W[i+1]))
# print(model.W.shape)

----------WEIGHTS----------
Bias: 			-3.483991509968205
prev_3s_recruits: 	0.018574616257029414
prev_4s_recruits: 	0.09009135580145805
prev_5s_recruits: 	0.228791631064723
curr_3s_recruits: 	-0.03143330160020543
curr_4s_recruits: 	0.10022800345748707
curr_5s_recruits: 	0.34328940542625
seed_points: 		0.043193351984931024
returningMins%: 	2.091986373417421
sos: 			-0.08345299232679514
adjTempo: 		0.008912852662272637
win ratio: 		0.5937985953452354
overall efficiency: 	1.1886527750307758
stars: 			0.05829843644520059


In [11]:
from sklearn.linear_model import LogisticRegression as LogReg
clf = LogReg(max_iter = 10000).fit(X_train, X_label)
clf.predict(Y_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [12]:
clf.score(Y_test, Y_label)

0.8742038216560509

### SMOTE Oversampling Algorithm for data augmentation to get equally sized classes

See the following link: <a>https://towardsdatascience.com/smote-fdce2f605729</a>. Here you will find a description for the SMOTE (Synthetic Minority Oversampling Technique) algorithm, which is useful for oversampling data augmentation. We perform that here using the imbalanced learn python library in order to increase the number of data points labeled as having made the NCAA tournament.

In [13]:
from imblearn.over_sampling import SMOTE

# Oversample teams who made tournament for training data
X_train_resampled, X_label_resampled = SMOTE().fit_resample(X_train, X_label)
print("Old X:\tOverall Length = " + str(len(X_train)) + "\tNumber of 0 labels = " + 
      str(pd.Series(X_label).value_counts()[0]) + "\tNumber of 1 labels = " + str(pd.Series(X_label).value_counts()[1]))
print("New X:\tOverall Length = " + str(len(X_train_resampled)) + "\tNumber of 0 labels = " + 
      str(pd.Series(X_label_resampled).value_counts()[0]) + "\tNumber of 1 labels = " + 
      str(pd.Series(X_label_resampled).value_counts()[1]))

# Oversample teams who made tournament for testing data
Y_test_resampled, Y_label_resampled = SMOTE().fit_resample(Y_test, Y_label)
print("\nOld Y:\tOverall Length = " + str(len(Y_test)) + "\tNumber of 0 labels = " + 
      str(pd.Series(Y_label).value_counts()[0]) + "\tNumber of 1 labels = " + str(pd.Series(Y_label).value_counts()[1]))
print("New Y:\tOverall Length = " + str(len(Y_test_resampled)) + "\tNumber of 0 labels = " + 
      str(pd.Series(Y_label_resampled).value_counts()[0]) + "\tNumber of 1 labels = " + 
      str(pd.Series(Y_label_resampled).value_counts()[1]))

Old X:	Overall Length = 2514	Number of 0 labels = 2024	Number of 1 labels = 490
New X:	Overall Length = 4048	Number of 0 labels = 2024	Number of 1 labels = 2024

Old Y:	Overall Length = 628	Number of 0 labels = 506	Number of 1 labels = 122
New Y:	Overall Length = 1012	Number of 0 labels = 506	Number of 1 labels = 506


##### Now we retrain and retest model

In [19]:
%%time
model = LogisticRegression(learning_rate=0.05, num_iterations=10000, verbose=True)
model.fit(X_train_resampled, X_label_resampled)

loss: 2.304279059241988 	
loss: 0.49589523974382577 	
loss: 0.4900711862684863 	
loss: 0.4864265326658166 	
loss: 0.48368270280071823 	
loss: 0.48155688079099673 	
loss: 0.479884262606916 	
loss: 0.47855030252855846 	
loss: 0.47747273830815784 	
loss: 0.47659166614529114 	
Wall time: 9.65 s


In [20]:
new_preds = model.predict(Y_test_resampled)
(new_preds == Y_label_resampled).mean()

0.7964426877470355

In [21]:
# Print confusion matrix
print(str(type(Y_test_resampled)) + " " + str(type(new_preds)))
tn, fp, fn, tp = confusion_matrix(pd.Series(Y_label_resampled), new_preds).ravel()
print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ', fn, '\nTrue Positives: ', tp)

<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
True negatives:  402 
False positives:  104 
False negatives:  102 
True Positives:  404


In [22]:
print("----------WEIGHTS----------")
print("Bias: \t\t\t" + str(model.W[0]))
for i in range(len(X_train_resampled.columns)):
    if X_train_resampled.columns[i] == "sos" or X_train_resampled.columns[i] == "stars" or X_train_resampled.columns[i] == "NIT":
        print(X_train_resampled.columns[i] + ": \t\t\t" + str(model.W[i+1]))
    elif X_train_resampled.columns[i] == "adjTempo" or X_train_resampled.columns[i] == "win ratio" or X_train_resampled.columns[i] == "seed_points":
        print(X_train_resampled.columns[i] + ": \t\t" + str(model.W[i+1]))
    else:
        print(X_train_resampled.columns[i] + ": \t" + str(model.W[i+1]))

----------WEIGHTS----------
Bias: 			-2.59166378771914
prev_3s_recruits: 	-0.08290081284385
prev_4s_recruits: 	-0.0019280790313977558
prev_5s_recruits: 	0.04027271411977194
curr_3s_recruits: 	-0.12346138052173256
curr_4s_recruits: 	-0.013801893205736096
curr_5s_recruits: 	0.15120226938257658
seed_points: 		0.06705979949886619
returningMins%: 	3.048490648321752
sos: 			0.34312745812487844
adjTempo: 		0.0547130571711723
win ratio: 		0.661982399109369
overall efficiency: 	1.4023540328881923
stars: 			0.1546210608569627


In [18]:
clf_new = LogReg(max_iter = 10000).fit(X_train_resampled, X_label_resampled)
clf_new.predict(Y_test_resampled)
clf.score(Y_test_resampled, Y_label_resampled)

0.682806324110672