In [1]:
# Now we are going to read the data we scraped and use it to train a logistic regression model

In [9]:
import pandas as pd 
import csv
import numpy as np
import pickle

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [10]:
# Locations of the per-team season statistics, one CSV per season
path_stats_basic19 = r"C:\Python\March Madness\NCAA_Team_Data_basic19.csv"
path_stats_basic18 = r"C:\Python\March Madness\NCAA_Team_Data_basic18.csv"

# df of all team stats for each year (2019 is read first, then 2018)
df_stats19, df_stats18 = (
    pd.read_csv(stats_path)
    for stats_path in (path_stats_basic19, path_stats_basic18)
)

# School name plus the stats we will use as features in our model;
# these narrower frames are what the match_stats functions below look teams up in
feature_columns = ['SCHOOL', 'SRS', 'FG%', '3P%', 'FT%']
df_features19 = df_stats19[feature_columns]
df_features18 = df_stats18[feature_columns]

In [11]:
# Locations of the regular-season game logs, one CSV per season
path_games19 = r"C:\Python\March Madness\NCAA_Reg_Season19.csv"
path_games18 = r"C:\Python\March Madness\NCAA_Reg_Season18.csv"

# df of all games played for each year (2019 is read first, then 2018)
df_games19, df_games18 = map(pd.read_csv, (path_games19, path_games18))

In [5]:
# Since we have data on games that include some very small schools,
# let's count how many game rows we do not have team data for (to make sure it
# is not a large amount, which would indicate an issue)
def game_error_counter(df_games, df_stats):
    """Print how many rows of ``df_games`` name a team that is missing from ``df_stats``.

    Parameters
    ----------
    df_games : pandas.DataFrame
        Game log; the team name is read from the second column (position 1).
    df_stats : pandas.DataFrame
        Team statistics containing a 'SCHOOL' column.

    Returns
    -------
    None
        The count is printed, not returned. Neither frame is modified.
    """
    # NaN never compares equal to anything, so it must not count as a known school.
    known_schools = set(df_stats['SCHOOL'].dropna())

    # Read the team column by position with .iloc: integer keys on a
    # label-indexed row (row[1]) rely on a deprecated positional fallback.
    errors = sum(1 for school in df_games.iloc[:, 1] if school not in known_schools)
    print(errors)

In [6]:
# Report the number of unmatched game rows for 2019, then 2018
for season_games, season_stats in ((df_games19, df_stats19), (df_games18, df_stats18)):
    game_error_counter(season_games, season_stats)

# 63 and 60 games respectively; we can ignore these games from our data set

63
60


In [12]:
# This function calculates the match stats of each regular season game,
# skipping the games we don't have team data for, and saves the result to a list
reg_season = []
def match_stats_reg_season(df_games, df_features):
    """Append one feature-difference row per game to the module-level ``reg_season``.

    For every row of ``df_games`` with an even index label, the stats of the
    team in column position 4 are subtracted from the stats of the team in
    column position 1, and the value in column position 6 is appended as the
    last element of the row.

    Parameters
    ----------
    df_games : pandas.DataFrame
        Game log. Assumes the default integer index from ``read_csv`` and two
        consecutive rows per game (one per team) -- TODO confirm against the scraper.
    df_features : pandas.DataFrame
        Has a 'SCHOOL' column; every column after the first is converted to float.

    Returns
    -------
    list
        The same module-level ``reg_season`` list, which keeps accumulating
        across calls.
    """
    # Map each school to the position of its first row, so lookups cost O(1)
    # instead of scanning the whole frame for every game.
    feature_rows = df_features.values
    first_row_of = {}
    for position, school in enumerate(df_features['SCHOOL']):
        if not pd.isna(school):  # NaN never equals a real school name
            first_row_of.setdefault(school, position)

    for index, row in df_games.iterrows():
        # Each game has 2 rows (one for each team); skipping odd index labels
        # gets rid of the redundant data.
        if index % 2 != 0:
            continue

        # Use .iloc for positional access: integer keys on a label-indexed
        # row (row[1]) rely on a deprecated positional fallback.
        team_position = first_row_of.get(row.iloc[1])
        opponent_position = first_row_of.get(row.iloc[4])
        if team_position is None or opponent_position is None:
            # We have no team data for one of the schools: skip this game.
            continue

        team_stats = feature_rows[team_position][1:].astype(float)
        opponent_stats = feature_rows[opponent_position][1:].astype(float)
        reg_season.append(np.append(team_stats - opponent_stats, [row.iloc[6]]))
    return reg_season

In [13]:
# match_stats_reg_season appends to the module-level reg_season list, so empty
# it first: otherwise re-running this cell would add every game a second time.
reg_season.clear()
match_stats_reg_season(df_games19, df_features19)
match_stats_reg_season(df_games18, df_features18);

In [14]:
# Now our data is ready to be used to train our model

features = ['SRS','FG%','3P%','FT%']

df_match_stats_reg_season = pd.DataFrame(columns=['SRS','FG%','3P%','FT%','W/L'],data=reg_season)

# Encode the categorical outcome as 0/1. get_dummies returns one indicator
# column per label, so pick a single column explicitly: assigning the whole
# multi-column frame to 'W/L' is rejected by recent pandas versions.
# The first indicator column belongs to the label that sorts first -- presumably
# 'L', which makes 0 a win and 1 a loss; TODO confirm against the labels in the games CSV.
outcome_dummies = pd.get_dummies(df_match_stats_reg_season['W/L'])
df_match_stats_reg_season['W/L'] = outcome_dummies.iloc[:, 0].astype(int)

# Each row of reg_season was built by appending the outcome label to the stat
# differences, so the stat columns may hold numeric strings; cast them explicitly.
all_x = df_match_stats_reg_season[features].astype(float)
all_y = df_match_stats_reg_season['W/L']

model = LogisticRegression()
model.fit(all_x, all_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Serializing our model to a file called finalized_model.sav
filename = 'finalized_model.sav'
# The context manager closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [16]:
# Estimate accuracy on unseen games: hold out 20% of the rows, fit on the
# remaining 80%, and compare the predictions for the held-out rows to the truth.
# random_state is fixed so the split is the same on every run.
xTrain, xTest, yTrain, yTest = train_test_split(
    all_x,
    all_y,
    test_size=0.2,
    random_state=0,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
# Note that this rebinds `model` to the estimator trained on the 80% split.
model = LogisticRegression().fit(xTrain, yTrain)

holdout_predicitons = model.predict(xTest)
score = accuracy_score(yTest, holdout_predicitons)

score

# Our model predicts with an accuracy of 76.385%

0.76385104450499541