In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sportsreference.ncaab.teams import Teams
from sportsreference.ncaab.teams import Team
from sportsreference.ncaab.roster import Player
from tqdm import tqdm
from sportsreference.ncaab.boxscore import Boxscore
from sklearn.linear_model import LogisticRegression,LinearRegression
import xgboost as xgb
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler
import shap
from scipy import stats

In [59]:
df = pd.read_csv("ml_df_l4.csv")
df = df.drop(columns = [df.columns[i] for i in [6,39,66,67,68,69,70,73,74]])
print("Assuring features are scaled appropriately")
for i in df.loc[:,df.columns[df.columns.str.contains('percent')]].columns:
    if df[i].max() > 1:
        df[i] = [j/100 for j in df[i]]
df.sample(5)

Assuring features are scaled appropriately


Unnamed: 0,assist_percentage,block_percentage,effective_field_goal_percentage,field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,offensive_rating,offensive_rebound_percentage,allowed_assist_percentage,allowed_block_percentage,allowed_effective_field_goal_percentage,allowed_field_goal_percentage,allowed_free_throw_attempt_rate,allowed_free_throw_percentage,allowed_offensive_rebound_percentage,allowed_steal_percentage,allowed_three_point_attempt_rate,allowed_three_point_field_goal_percentage,allowed_two_point_field_goal_percentage,allowed_total_rebound_percentage,allowed_true_shooting_percentage,allowed_turnover_percentage,pace,simple_rating_system,steal_percentage,three_point_attempt_rate,three_point_field_goal_percentage,two_point_field_goal_percentage,total_rebound_percentage,true_shooting_percentage,turnover_percentage,win_percentage,opp_assist_percentage,opp_block_percentage,opp_effective_field_goal_percentage,opp_field_goal_percentage,opp_free_throw_attempt_rate,opp_free_throw_percentage,opp_offensive_rating,opp_offensive_rebound_percentage,opp_pace,opp_simple_rating_system,opp_steal_percentage,opp_three_point_attempt_rate,opp_three_point_field_goal_percentage,opp_two_point_field_goal_percentage,opp_total_rebound_percentage,opp_true_shooting_percentage,opp_turnover_percentage,opp_win_percentage,opp_assist_percentage_allowed,opp_block_percentage_allowed,opp_effective_field_goal_percentage_allowed,opp_field_goal_percentage_allowed,opp_free_throw_attempt_rate_allowed,opp_free_throw_percentage_allowed,opp_offensive_rebound_percentage_allowed,opp_steal_percentage_allowed,opp_three_point_attempt_rate_allowed,opp_three_point_field_goal_percentage_allowed,opp_two_point_field_goal_percentage_allowed,opp_total_rebound_percentage_allowed,opp_true_shooting_percentage_allowed,opp_turnover_percentage_allowed,home,result,AdjO,AdjD,AdjT,Luck,OppO,OppD
26810,0.601,0.113,0.516,0.476,0.353,0.701,111.6,0.4,0.524,0.082,0.454,0.398,0.378,0.701,0.312,0.096,0.368,0.3,0.455,0.448,0.497,0.151,69.2,20.63,0.095,0.223,0.358,0.51,0.552,0.548,0.151,0.684,0.464,0.131,0.493,0.436,0.354,0.686,108.0,0.34,64.5,14.11,0.059,0.322,0.357,0.473,0.523,0.526,0.135,0.611,0.515,0.101,0.455,0.403,0.347,0.704,0.301,0.082,0.313,0.33,0.436,0.477,0.495,0.127,0,1,105.4,99.4,59.4,0.027,98.3,107.4
22157,0.522,0.078,0.505,0.445,0.301,0.759,104.4,0.257,0.554,0.076,0.543,0.468,0.314,0.707,0.289,0.095,0.381,0.397,0.511,0.517,0.569,0.145,71.4,-5.51,0.075,0.323,0.367,0.483,0.483,0.542,0.148,0.267,0.521,0.072,0.537,0.455,0.404,0.771,109.6,0.274,69.2,-2.3,0.074,0.456,0.36,0.534,0.507,0.581,0.164,0.514,0.53,0.1,0.536,0.45,0.26,0.722,0.281,0.096,0.463,0.373,0.517,0.493,0.561,0.143,0,0,104.4,110.7,69.1,-0.077,106.9,105.1
54421,0.535,0.106,0.516,0.43,0.392,0.743,107.2,0.298,0.52,0.094,0.468,0.412,0.349,0.665,0.258,0.092,0.349,0.322,0.46,0.468,0.501,0.133,71.4,15.44,0.071,0.446,0.382,0.469,0.532,0.557,0.164,0.6,0.559,0.077,0.534,0.453,0.315,0.762,108.5,0.247,65.4,8.85,0.082,0.445,0.367,0.522,0.504,0.569,0.147,0.533,0.503,0.092,0.494,0.433,0.327,0.71,0.248,0.077,0.37,0.332,0.492,0.496,0.528,0.152,1,1,114.0,96.9,70.7,-0.046,107.6,96.9
19079,0.549,0.128,0.52,0.46,0.393,0.704,108.4,0.329,0.468,0.099,0.463,0.401,0.379,0.703,0.285,0.087,0.383,0.323,0.45,0.474,0.506,0.189,68.3,19.38,0.103,0.335,0.359,0.511,0.526,0.555,0.154,0.73,0.557,0.109,0.575,0.492,0.27,0.708,115.5,0.293,70.2,21.39,0.092,0.412,0.401,0.556,0.503,0.594,0.146,0.795,0.485,0.099,0.491,0.426,0.246,0.713,0.31,0.079,0.391,0.333,0.486,0.497,0.519,0.156,1,0,113.3,90.8,66.7,-0.007,110.7,100.4
23984,0.472,0.088,0.481,0.43,0.356,0.735,99.5,0.299,0.607,0.085,0.538,0.466,0.402,0.73,0.3,0.092,0.402,0.361,0.536,0.51,0.575,0.176,69.0,2.09,0.096,0.313,0.326,0.477,0.49,0.523,0.176,0.281,0.571,0.077,0.519,0.457,0.411,0.688,109.0,0.345,68.2,14.8,0.088,0.361,0.345,0.52,0.544,0.552,0.157,0.632,0.55,0.104,0.516,0.45,0.334,0.698,0.255,0.093,0.395,0.334,0.525,0.456,0.546,0.155,0,0,103.5,104.5,67.7,-0.069,108.8,101.8


# Outlier Removal

In [68]:
idx_to_remove = []
for i in df.columns.tolist():
    stat,p = stats.normaltest(df[i])
    if p <= 0.05:
        zscores = stats.zscore(df[i])
        zscores = np.abs(zscores)
        idx_list = []
        for idx,i in enumerate(zscores):
            if i >=3.2:
                idx_list.append(idx)
        for i in idx_list:
            if i not in idx_to_remove:
                idx_to_remove.append(i)
print("Identified {} outliers to remove".format(len(idx_to_remove)))
df2 = df.drop(idx_to_remove,axis = 0)

X = df2.drop('result',1)
y = df2.result
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y)

Identified 5457 outliers to remove


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


In [69]:
clf = RandomForestClassifier(n_estimators = 500,n_jobs = -1,max_depth = 4)
clf.fit(X_train,y_train)
print(clf.score(X_train,y_train))
print(clf.score(X_test,y_test))

0.7469765485329688
0.7408312958435208


In [72]:
imp_dict = {key:[] for key in ('feature','importance')}
for feat,imp in zip(X_train.columns,clf.feature_importances_):
    imp_dict['feature'].append(feat)
    imp_dict['importance'].append(imp)
    
imp_df = pd.DataFrame.from_dict(imp_dict)
imp_feats = imp_df.sort_values('importance',ascending = False).head(40)['feature'].tolist()
df2 = df.loc[:,df.columns[df.columns.isin(imp_feats)]]


corr=df2.corr()
high_corr_var=np.where(corr>0.9)
high_corr_var=[(corr.columns[x],corr.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
for tup in high_corr_var:
    print('Removing {} as it is highly correllated with {}'.format(tup[1],tup[0]))

to_drop = []
for x,y in high_corr_var:
    if y not in to_drop:
        to_drop.append(y)

df2 = df2.drop([i for i in to_drop],axis = 1)

corr=df2.corr()
high_corr_var=np.where(corr>0.9)
high_corr_var=[(corr.columns[x],corr.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
assert high_corr_var == []
X = df2
y = df.result
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify = y)
wl_cols = X_train.columns.tolist()
X_train

Removing field_goal_percentage as it is highly correllated with effective_field_goal_percentage
Removing true_shooting_percentage as it is highly correllated with effective_field_goal_percentage
Removing true_shooting_percentage as it is highly correllated with field_goal_percentage
Removing allowed_field_goal_percentage as it is highly correllated with allowed_effective_field_goal_percentage
Removing allowed_true_shooting_percentage as it is highly correllated with allowed_effective_field_goal_percentage
Removing allowed_two_point_field_goal_percentage as it is highly correllated with allowed_field_goal_percentage
Removing allowed_true_shooting_percentage as it is highly correllated with allowed_field_goal_percentage
Removing opp_field_goal_percentage as it is highly correllated with opp_effective_field_goal_percentage
Removing opp_true_shooting_percentage as it is highly correllated with opp_effective_field_goal_percentage
Removing opp_true_shooting_percentage as it is highly correll

Unnamed: 0,effective_field_goal_percentage,offensive_rating,allowed_effective_field_goal_percentage,allowed_three_point_field_goal_percentage,allowed_total_rebound_percentage,simple_rating_system,three_point_field_goal_percentage,two_point_field_goal_percentage,total_rebound_percentage,turnover_percentage,win_percentage,opp_block_percentage,opp_effective_field_goal_percentage,opp_offensive_rating,opp_simple_rating_system,opp_three_point_field_goal_percentage,opp_two_point_field_goal_percentage,opp_total_rebound_percentage,opp_turnover_percentage,opp_win_percentage,opp_effective_field_goal_percentage_allowed,opp_steal_percentage_allowed,opp_three_point_field_goal_percentage_allowed,opp_total_rebound_percentage_allowed,home,AdjO,AdjD,Luck,OppO,OppD
25594,0.464,100.9,0.521,0.359,0.494,-14.21,0.338,0.449,0.506,0.170,0.333,0.068,0.505,103.4,-9.20,0.376,0.469,0.477,0.159,0.586,0.520,0.089,0.362,0.523,1,96.4,111.4,-0.059,101.0,108.6
12183,0.470,102.4,0.465,0.312,0.507,7.69,0.337,0.449,0.493,0.118,0.636,0.140,0.516,109.0,15.89,0.365,0.500,0.511,0.140,0.694,0.445,0.076,0.326,0.489,1,107.0,98.1,0.095,107.8,101.5
40732,0.540,108.4,0.504,0.359,0.499,-1.75,0.372,0.528,0.501,0.159,0.606,0.099,0.494,102.6,-7.29,0.357,0.465,0.501,0.179,0.433,0.498,0.099,0.358,0.499,1,106.0,105.8,0.003,104.3,108.4
42290,0.500,102.3,0.502,0.341,0.490,0.40,0.295,0.531,0.510,0.173,0.469,0.072,0.529,106.5,2.58,0.357,0.525,0.509,0.152,0.500,0.513,0.073,0.357,0.491,0,103.4,103.0,-0.006,107.1,102.2
14242,0.531,115.1,0.496,0.349,0.484,6.30,0.398,0.493,0.516,0.132,0.657,0.066,0.514,101.2,0.18,0.373,0.488,0.474,0.152,0.629,0.480,0.074,0.329,0.526,0,116.3,106.7,-0.058,103.8,104.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21439,0.484,96.2,0.485,0.300,0.504,-0.23,0.336,0.469,0.496,0.181,0.500,0.111,0.494,103.5,3.12,0.344,0.483,0.523,0.152,0.613,0.492,0.082,0.329,0.477,1,99.6,100.4,0.057,102.8,101.5
19178,0.530,107.4,0.523,0.366,0.520,-4.78,0.351,0.532,0.480,0.154,0.515,0.097,0.505,101.4,-8.79,0.324,0.520,0.473,0.164,0.469,0.536,0.078,0.360,0.527,1,106.9,109.7,-0.024,100.5,106.8
10659,0.510,105.8,0.488,0.318,0.455,15.55,0.333,0.517,0.545,0.160,0.643,0.109,0.474,97.2,3.23,0.295,0.489,0.502,0.162,0.370,0.525,0.102,0.378,0.498,0,113.3,92.9,0.010,111.2,94.6
50181,0.557,112.5,0.480,0.371,0.468,15.50,0.366,0.561,0.532,0.160,0.750,0.091,0.542,113.3,18.52,0.375,0.532,0.517,0.145,0.771,0.483,0.080,0.344,0.483,1,115.6,98.8,0.112,109.8,100.0


In [73]:
clf = LogisticRegression(max_iter = 10000)
clf.fit(X_train,y_train)
score1 = accuracy_score(y_train,clf.predict(X_train))
score2 = accuracy_score(y_test,clf.predict(X_test))
print(score1,score2)

0.7659094685371122 0.7686391796624653


In [41]:
df_agg = pd.DataFrame()
for t in Teams(2022):
    try:
        temp_df = t.dataframe
        df_agg = pd.concat([df_agg,temp_df],axis=0)
    except:
        pass
        
in_agg = ['Southern Methodist','Brigham Young','University of California','Illinois-Chicago','Connecticut','Massachusetts','Nevada-Las Vegas','North Carolina',
         'North Carolina-Wilmington','North Carolina State','Pittsburgh','Louisiana State','Texas Christian','Southern California','Virginia Commonwealth']
to_change = ['SMU','BYU','California','UIC','UConn','UMass','UNLV','UNC','UNC Wilmington','NC State','Pitt','LSU','TCU','USC','VCU']
rename_dict = dict(zip(in_agg,to_change))
df_agg['name'] = df_agg.name.replace(rename_dict)
df_agg.to_csv('2022_agg.csv',index = False)
df_agg

Unnamed: 0,abbreviation,assist_percentage,assists,away_losses,away_wins,block_percentage,blocks,conference,conference_losses,conference_wins,defensive_rebounds,effective_field_goal_percentage,field_goal_attempts,field_goal_percentage,field_goals,free_throw_attempt_rate,free_throw_attempts,free_throw_percentage,free_throws,free_throws_per_field_goal_attempt,games_played,home_losses,home_wins,losses,minutes_played,name,net_rating,offensive_rating,offensive_rebound_percentage,offensive_rebounds,opp_assist_percentage,opp_assists,opp_block_percentage,opp_blocks,opp_defensive_rebounds,opp_effective_field_goal_percentage,opp_field_goal_attempts,opp_field_goal_percentage,opp_field_goals,opp_free_throw_attempt_rate,opp_free_throw_attempts,opp_free_throw_percentage,opp_free_throws,opp_free_throws_per_field_goal_attempt,opp_offensive_rating,opp_offensive_rebound_percentage,opp_offensive_rebounds,opp_personal_fouls,opp_points,opp_steal_percentage,opp_steals,opp_three_point_attempt_rate,opp_three_point_field_goal_attempts,opp_three_point_field_goal_percentage,opp_three_point_field_goals,opp_two_point_field_goal_attempts,opp_two_point_field_goal_percentage,opp_two_point_field_goals,opp_total_rebound_percentage,opp_total_rebounds,opp_true_shooting_percentage,opp_turnover_percentage,opp_turnovers,pace,personal_fouls,points,simple_rating_system,steal_percentage,steals,strength_of_schedule,three_point_attempt_rate,three_point_field_goal_attempts,three_point_field_goal_percentage,three_point_field_goals,two_point_field_goal_attempts,two_point_field_goal_percentage,two_point_field_goals,total_rebound_percentage,total_rebounds,true_shooting_percentage,turnover_percentage,turnovers,win_percentage,wins
ABILENE-CHRISTIAN,ABILENE-CHRISTIAN,55.5,259,4,3,5.7,28,wac,4,2,386,0.503,1045,0.447,467,0.352,368,0.736,271,0.259,17,2,8,6,695,Abilene Christian,,104.7,27.1,167,51.8,186,8.7,59,450,0.496,844,0.425,359,0.440,371,0.693,257,0.305,,26.3,138,356,1094,7.8,98,0.419,354,0.336,119,490,0.490,240,51.5,588,0.536,27.0,377,72.7,370,1323,2.59,15.3,193,-0.41,0.353,369,0.320,118,676,0.516,349,48.5,553,0.542,15.0,216,0.647,11
AIR-FORCE,AIR-FORCE,60.8,186,4,1,10.0,48,mwc,2,1,319,0.535,681,0.449,306,0.233,159,0.667,106,0.156,14,1,5,6,560,Air Force,,93.7,17.7,70,46.0,138,9.4,34,326,0.465,719,0.417,300,0.349,251,0.737,185,0.257,,29.4,133,227,853,9.4,84,0.332,239,0.285,68,480,0.483,232,54.1,459,0.509,17.8,182,63.6,241,834,-5.76,9.7,86,-4.40,0.471,321,0.361,116,360,0.528,190,45.9,389,0.551,21.5,207,0.571,8
AKRON,AKRON,49.0,171,2,1,8.1,41,mac,2,2,369,0.525,771,0.453,349,0.415,320,0.656,210,0.272,14,1,7,5,560,Akron,,109.0,32.4,154,38.3,124,6.1,27,322,0.477,773,0.419,324,0.274,212,0.670,142,0.184,,23.8,115,286,879,8.5,80,0.343,265,0.336,89,508,0.463,235,45.5,437,0.503,16.8,176,66.8,233,1020,-0.33,8.9,83,-2.33,0.429,331,0.338,112,440,0.539,237,54.5,523,0.553,15.4,168,0.643,9
ALABAMA-AM,ALABAMA-AM,43.3,143,10,2,8.3,40,swac,2,3,376,0.410,879,0.375,330,0.271,238,0.689,164,0.187,15,1,2,11,600,Alabama A&M,,85.7,26.5,155,62.7,227,7.3,48,430,0.488,877,0.413,362,0.275,241,0.701,169,0.193,,32.0,177,233,1025,10.2,105,0.449,394,0.335,132,483,0.476,230,53.3,607,0.517,17.7,213,68.7,238,884,-14.43,10.7,110,-5.03,0.247,217,0.276,60,662,0.408,270,46.7,531,0.446,16.6,197,0.267,4
ALABAMA-BIRMINGHAM,ALABAMA-BIRMINGHAM,43.4,253,2,2,12.3,84,cusa,1,5,509,0.526,1248,0.467,583,0.260,324,0.701,227,0.182,19,1,12,4,760,Alabama-Birmingham,,111.9,32.2,236,47.7,200,8.8,75,496,0.452,1065,0.393,419,0.288,307,0.717,220,0.207,,25.3,172,294,1183,6.9,95,0.361,384,0.326,125,681,0.432,294,47.3,668,0.489,22.0,342,72.4,309,1540,12.88,14.7,202,-1.59,0.319,398,0.369,147,850,0.513,436,52.7,745,0.549,12.8,206,0.789,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WRIGHT-STATE,WRIGHT-STATE,45.6,209,4,3,7.8,53,horizon,2,6,427,0.509,997,0.459,458,0.346,345,0.774,267,0.268,17,1,6,8,680,Wright State,,105.0,26.0,149,43.4,208,7.0,47,424,0.512,1059,0.452,479,0.240,254,0.752,191,0.180,,29.3,177,324,1276,10.2,125,0.355,376,0.338,127,683,0.515,352,51.1,601,0.541,15.4,214,71.8,246,1282,-6.27,7.6,93,-4.58,0.327,326,0.304,99,671,0.535,359,48.9,576,0.552,15.6,214,0.529,9
WYOMING,WYOMING,45.4,172,1,4,5.6,29,mwc,0,1,399,0.559,792,0.479,379,0.350,277,0.697,193,0.244,14,0,6,2,565,Wyoming,,113.0,23.8,107,40.7,136,6.7,29,343,0.447,849,0.393,334,0.219,186,0.699,130,0.153,,25.7,138,269,889,6.4,61,0.388,329,0.277,91,520,0.467,243,48.7,481,0.474,14.0,152,67.6,204,1079,12.33,6.3,60,1.49,0.453,359,0.357,128,433,0.580,251,51.3,506,0.584,13.4,143,0.857,12
XAVIER,XAVIER,56.4,233,1,2,13.1,79,big-east,2,3,453,0.515,921,0.448,413,0.391,360,0.722,260,0.282,16,1,10,3,640,Xavier,,107.6,31.4,174,47.6,181,9.4,52,381,0.455,951,0.400,380,0.212,202,0.738,149,0.157,,23.1,136,302,1015,9.8,110,0.366,348,0.305,106,603,0.454,274,45.2,517,0.485,17.0,215,70.2,232,1209,18.31,10.7,120,6.19,0.397,366,0.336,123,555,0.523,290,54.8,627,0.554,15.7,204,0.813,13
YALE,YALE,51.4,200,4,1,8.3,42,ivy,0,1,397,0.497,908,0.428,389,0.298,271,0.742,201,0.221,15,2,5,8,605,Yale,,101.9,25.5,141,49.2,180,12.0,64,411,0.489,867,0.422,366,0.361,313,0.687,215,0.248,,27.8,153,276,1063,10.0,108,0.419,363,0.320,116,504,0.496,250,51.2,564,0.523,17.3,212,71.6,290,1103,0.85,8.2,89,1.28,0.414,376,0.330,124,532,0.498,265,48.8,538,0.532,15.8,194,0.467,7


In [115]:
missing = ['Loyola (IL)',
 'Louisiana-Monroe',
 'UNC',
 'Miami (FL)',
 'UConn',
 'College of Charleston',
 'Alabama-Birmingham',
 'Albany (NY)',
 'St. Francis (NY)',
 'Prairie View',
 'Savannah St.',
 'Omaha',
 'Maryland-Eastern Shore',
 'Texas-Arlington',
 'Loyola (MD)',
 'NC St.',
 'Pitt',
 'Citadel',
 'Grambling',
 "St. John's (NY)",
 'UC-Irvine',
 'UIC',
 'California Baptist',
 'Bowling Green St.',
 'Bethune-Cookman',
 'Gardner-Webb',
 'Florida International',
 'Texas-Rio Grande Valley',
 'Saint Francis (PA)',
 'UC-Riverside',
 'Purdue-Fort Wayne',
 'UMass',
 'UC-Davis',
 'Miami (OH)',
 'Texas A&M-Corpus Christi',
 'Arkansas-Pine Bluff',
 'UC-San Diego']
in_kp = ['Loyola Chicago','Louisiana Monroe','North Carolina','Miami FL','Connecticut','Charleston','UAB','Albany','St. Francis NY',
        'Prairie View A&M','Savannah St.','Nebraska Omaha','Maryland Eastern Shore','UT Arlington','Loyola MD','N.C. State','Pittsburgh','The Citadel',
        'Grambling St.',"St. John's",'UC Irvine','Cal Baptist','Bowling Green','Bethune Cookman','Gardner Webb','FIU','UT Rio Grande Valley','St. Francis PA',
        'UC Riverside','Purdue Fort Wayne','Massachusetts','UC Davis','Miami OH','Texas A&M Corpus Christi','Arkansas Pine Bluff','UC San Diego']


from kenpompy.utils import login
import kenpompy.summary as kp
import kenpompy.misc as kpm
kp_df2 = pd.DataFrame()
b = ['Rk', 'Team', 'Conf', 'W-L', 'AdjEM', 'AdjO', 'AdjO_r', 'AdjD', 'AdjD_R','AdjT', 'AdjT_r', 'Luck', 'Luck_r', 'AdjEM', 'AdjEM_r', 'OppO', 'OppO_r','OppD', 'OppD_r', 'AdjEM', 'AdjEM_r', 'Seed']
browser = login("xxxx", 'xxxx')
temp_df = kpm.get_pomeroy_ratings(browser,season = 2022)
temp_df.columns = b
temp_df = temp_df.drop([temp_df.columns[i] for i in [0,2,3,6,8,10,12,14,16,18,20,19,21]],1)
temp_df['Team'] = temp_df.Team.replace(dict(zip(in_kp,missing)))
temp_df['Season'] = 2022
kp_df2 = pd.concat([kp_df2,temp_df])
#time.sleep(10)
kp_df2[kp_df2.Team != "Team"].dropna(axis = 0)
kp_df2.Team = kp_df2.Team.str.replace(" St.",' State')

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only
The default value of regex will change from True to False in a future version.


In [74]:
team_1 = 'Purdue'
team_2 = 'Nebraska'
home_team = team_1
away_team = team_2
if team_1 in df_agg.name.unique().tolist():
    if team_2 in df_agg.name.unique().tolist():
        temp_df_2 = df_agg.loc[df_agg.name == team_1].loc[:,df_agg.columns[df_agg.columns.str.contains('percen') | df_agg.columns.str.contains('rat') | (df_agg.columns == 'pace') | (df_agg.columns == 'name')]]
        temp_df_3 = df_agg.loc[df_agg.name == team_2].loc[:,df_agg.columns[df_agg.columns.str.contains('percen') | df_agg.columns.str.contains('rat') | (df_agg.columns == 'pace') | (df_agg.columns == 'name')]]
        old_names = temp_df_2.columns[temp_df_2.columns.str.contains('opp_')].to_list()
        new_names = [i.replace('opp','allowed') for i in temp_df_2.columns[temp_df_2.columns.str.contains('opp_')].to_list()]
        name_dict = dict(zip(old_names,new_names))
        temp_df_2_dict = temp_df_2.rename(columns = name_dict).to_dict(orient = 'list')
        temp_df_2_dict.update(temp_df_3.iloc[0,:10].rename(dict(zip(temp_df_3.iloc[0,:10].index,['opp_'+str(i) for i in temp_df_3.iloc[0,:10].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_2_dict.update(temp_df_3.iloc[0,25:].rename(dict(zip(temp_df_3.iloc[0,25:].index,['opp_'+str(i) for i in temp_df_3.iloc[0,25:].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_2_dict.update(temp_df_3.iloc[0,10:25].rename(dict(zip(temp_df_3.iloc[0,10:25].index,[i + "_allowed" for i in temp_df_3.iloc[0,10:25].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_3_dict = temp_df_3.rename(columns = name_dict).to_dict(orient = 'list')
        temp_df_3_dict.update(temp_df_2.iloc[0,:10].rename(dict(zip(temp_df_2.iloc[0,:10].index,['opp_'+str(i) for i in temp_df_2.iloc[0,:10].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_3_dict.update(temp_df_2.iloc[0,25:].rename(dict(zip(temp_df_2.iloc[0,25:].index,['opp_'+str(i) for i in temp_df_2.iloc[0,25:].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_3_dict.update(temp_df_2.iloc[0,10:25].rename(dict(zip(temp_df_2.iloc[0,10:25].index,[i + "_allowed" for i in temp_df_2.iloc[0,10:25].index]))).to_frame().transpose().to_dict(orient = 'list'))
        for i in temp_df_3_dict.keys():
            temp_df_2_dict[i]+=(temp_df_3_dict[i])
        master_dict = {key:[] for key in temp_df_2_dict.keys()}

        for i in temp_df_2_dict.keys():
            master_dict[i] += temp_df_2_dict[i]
        df_pred = pd.DataFrame.from_dict(master_dict)
        df_pred = df_pred[df_pred.name == home_team]
        df_pred['home'] = 1
        for i in df_pred.loc[:,df_pred.columns[df_pred.columns.str.contains('percent')]].columns:
             if df_pred[i].max() > 1:
                df_pred[i] = [j/100 for j in df_pred[i]]
df_pred = pd.merge(df_pred,kp_df2,left_on = 'name',right_on = "Team",how = 'left').drop("Team",axis = 1)
df_wl = df_pred[wl_cols]

loss_prob = clf.predict_proba(df_wl)[0][0]
win_prob = clf.predict_proba(df_wl)[0][1]

if win_prob >= loss_prob:
    print(home_team + ' has a ' + str(round(win_prob*100,2)) + '% chance of winning at home against ' +away_team)
else:
    print(away_team + ' has a ' + str(round(loss_prob*100,2)) + '% chance of winning on the road against ' + home_team)

Purdue has a 98.14% chance of winning at home against Nebraska


In [75]:
df2

Unnamed: 0,effective_field_goal_percentage,offensive_rating,allowed_effective_field_goal_percentage,allowed_three_point_field_goal_percentage,allowed_total_rebound_percentage,simple_rating_system,three_point_field_goal_percentage,two_point_field_goal_percentage,total_rebound_percentage,turnover_percentage,win_percentage,opp_block_percentage,opp_effective_field_goal_percentage,opp_offensive_rating,opp_simple_rating_system,opp_three_point_field_goal_percentage,opp_two_point_field_goal_percentage,opp_total_rebound_percentage,opp_turnover_percentage,opp_win_percentage,opp_effective_field_goal_percentage_allowed,opp_steal_percentage_allowed,opp_three_point_field_goal_percentage_allowed,opp_total_rebound_percentage_allowed,home,AdjO,AdjD,Luck,OppO,OppD
0,0.513,108.9,0.486,0.343,0.495,11.75,0.352,0.505,0.505,0.154,0.658,0.139,0.517,106.6,12.63,0.380,0.497,0.508,0.183,0.611,0.454,0.100,0.329,0.492,0,113.8,99.9,0.044,108.9,100.7
1,0.528,107.6,0.448,0.321,0.465,13.73,0.379,0.516,0.535,0.174,0.730,0.162,0.474,103.9,11.08,0.310,0.479,0.514,0.154,0.639,0.439,0.089,0.285,0.486,0,111.3,93.7,-0.043,106.3,102.2
2,0.474,103.9,0.439,0.285,0.486,11.08,0.310,0.479,0.514,0.154,0.639,0.137,0.528,107.6,13.73,0.379,0.516,0.535,0.174,0.730,0.448,0.118,0.321,0.465,1,107.1,93.7,0.008,108.5,102.6
3,0.503,108.2,0.530,0.388,0.479,1.74,0.360,0.486,0.521,0.156,0.529,0.103,0.541,113.0,0.98,0.376,0.529,0.515,0.153,0.676,0.491,0.088,0.322,0.485,0,109.5,107.5,0.057,107.8,104.3
4,0.469,102.6,0.498,0.365,0.462,-1.66,0.329,0.458,0.538,0.167,0.576,0.126,0.510,113.9,-5.53,0.325,0.524,0.495,0.116,0.629,0.485,0.073,0.337,0.505,0,102.8,101.5,0.080,105.6,104.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56167,0.500,103.1,0.464,0.323,0.454,9.41,0.340,0.496,0.546,0.156,0.742,0.089,0.510,102.5,1.44,0.340,0.510,0.489,0.159,0.543,0.517,0.090,0.362,0.511,1,105.8,94.4,0.026,103.1,101.2
56168,0.510,102.5,0.517,0.362,0.511,1.44,0.340,0.510,0.489,0.159,0.543,0.126,0.500,103.1,9.41,0.340,0.496,0.546,0.156,0.742,0.464,0.097,0.323,0.454,0,104.5,102.5,0.020,103.0,101.6
56169,0.527,103.6,0.483,0.312,0.503,-5.91,0.342,0.538,0.497,0.160,0.613,0.117,0.471,98.0,-11.30,0.297,0.485,0.492,0.156,0.419,0.493,0.079,0.351,0.508,0,100.6,105.5,0.101,101.0,106.1
56170,0.520,107.4,0.498,0.300,0.516,4.20,0.338,0.527,0.484,0.155,0.667,0.082,0.516,108.9,-0.98,0.346,0.515,0.521,0.156,0.636,0.529,0.087,0.361,0.479,1,108.9,103.3,0.023,102.3,101.4


# Predicting Spreads

In [108]:
df_points = pd.read_csv('ml_df_l4.csv')
df_points = df_points.drop([df_points.columns[i] for i in [6,39,66,67,68,69,70,72]],axis = 1)

idx_to_remove = []
for i in df_points.columns.tolist():
    stat,p = stats.normaltest(df_points[i])
    if p <= 0.05:
        zscores = stats.zscore(df_points[i])
        zscores = np.abs(zscores)
        idx_list = []
        for idx,i in enumerate(zscores):
            if i >=3.2:
                idx_list.append(idx)
        for i in idx_list:
            if i not in idx_to_remove:
                idx_to_remove.append(i)
print("Identified {} outliers to remove".format(len(idx_to_remove)))

df_points['spread'] = df_points.apply(lambda x:x.points_for-x.points_against,axis = 1)
df_points = df_points.drop(['points_for','points_against'],axis = 1)
df_points = df_points.drop(idx_to_remove,axis = 0)

corr=df_points.corr()
high_corr_var=np.where(corr>0.9)
high_corr_var=[(corr.columns[x],corr.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
for tup in high_corr_var:
    if tup[1] != 'points_for':
        print('Removing {} as it is highly correllated with {}'.format(tup[1],tup[0]))

to_drop = []
for x,y in high_corr_var:
    if y not in to_drop:
        to_drop.append(y)

df_points = df_points.drop([i for i in to_drop],axis = 1)

corr=df_points.corr()
high_corr_var=np.where(corr>0.9)
high_corr_var=[(corr.columns[x],corr.columns[y]) for x,y in zip(*high_corr_var) if x!=y and x<y]
assert high_corr_var == []
X = df_points.drop('spread',1)
y = df_points.spread
X_train,X_test,y_train,y_test = train_test_split(X,y)
spread_cols = X_train.columns.tolist()
X_train

Identified 5618 outliers to remove
Removing field_goal_percentage as it is highly correllated with effective_field_goal_percentage
Removing true_shooting_percentage as it is highly correllated with effective_field_goal_percentage
Removing allowed_field_goal_percentage as it is highly correllated with allowed_effective_field_goal_percentage
Removing allowed_true_shooting_percentage as it is highly correllated with allowed_effective_field_goal_percentage
Removing allowed_true_shooting_percentage as it is highly correllated with allowed_field_goal_percentage
Removing AdjT as it is highly correllated with pace
Removing opp_field_goal_percentage as it is highly correllated with opp_effective_field_goal_percentage
Removing opp_true_shooting_percentage as it is highly correllated with opp_effective_field_goal_percentage
Removing opp_true_shooting_percentage as it is highly correllated with opp_field_goal_percentage
Removing opp_field_goal_percentage_allowed as it is highly correllated with op

In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only


Unnamed: 0,assist_percentage,block_percentage,effective_field_goal_percentage,free_throw_attempt_rate,free_throw_percentage,offensive_rating,offensive_rebound_percentage,allowed_assist_percentage,allowed_block_percentage,allowed_effective_field_goal_percentage,allowed_free_throw_attempt_rate,allowed_free_throw_percentage,allowed_offensive_rebound_percentage,allowed_steal_percentage,allowed_three_point_attempt_rate,allowed_three_point_field_goal_percentage,allowed_two_point_field_goal_percentage,allowed_total_rebound_percentage,allowed_turnover_percentage,pace,simple_rating_system,steal_percentage,three_point_attempt_rate,three_point_field_goal_percentage,two_point_field_goal_percentage,total_rebound_percentage,turnover_percentage,win_percentage,opp_assist_percentage,opp_block_percentage,opp_effective_field_goal_percentage,opp_free_throw_attempt_rate,opp_free_throw_percentage,opp_offensive_rating,opp_offensive_rebound_percentage,opp_pace,opp_simple_rating_system,opp_steal_percentage,opp_three_point_attempt_rate,opp_three_point_field_goal_percentage,opp_two_point_field_goal_percentage,opp_total_rebound_percentage,opp_turnover_percentage,opp_win_percentage,opp_assist_percentage_allowed,opp_block_percentage_allowed,opp_effective_field_goal_percentage_allowed,opp_free_throw_attempt_rate_allowed,opp_free_throw_percentage_allowed,opp_offensive_rebound_percentage_allowed,opp_steal_percentage_allowed,opp_three_point_attempt_rate_allowed,opp_three_point_field_goal_percentage_allowed,opp_two_point_field_goal_percentage_allowed,opp_total_rebound_percentage_allowed,opp_turnover_percentage_allowed,home,AdjO,AdjD,Luck,OppO,OppD
47607,58.9,6.9,0.489,0.377,0.737,100.1,26.9,48.1,10.4,0.506,0.311,0.712,28.8,10.7,0.282,0.392,0.474,50.6,14.2,65.4,-4.85,8.0,0.399,0.338,0.477,49.4,17.2,0.467,49.5,7.0,0.471,0.484,0.666,99.0,28.0,67.9,3.27,9.9,0.387,0.347,0.441,46.0,16.4,0.429,50.8,13.3,0.523,0.412,0.698,33.8,9.8,0.326,0.332,0.535,54.0,18.5,0,103.1,106.9,0.043,105.6,104.7
47418,44.4,8.9,0.500,0.271,0.712,102.3,27.2,53.6,7.5,0.520,0.288,0.736,26.9,7.8,0.398,0.356,0.511,49.9,14.3,70.7,-5.09,7.2,0.300,0.338,0.497,50.1,14.9,0.500,59.1,5.8,0.541,0.390,0.679,106.8,26.5,69.7,6.70,9.2,0.393,0.366,0.536,52.0,16.4,0.625,50.9,7.4,0.516,0.239,0.725,23.8,8.9,0.426,0.342,0.517,48.0,15.9,0,103.2,107.7,0.077,104.0,104.3
53514,51.1,9.7,0.503,0.269,0.764,103.8,24.1,47.7,9.6,0.464,0.253,0.690,25.9,7.7,0.325,0.331,0.449,50.4,14.8,64.4,15.95,6.9,0.445,0.352,0.483,49.6,13.8,0.677,55.4,8.1,0.526,0.375,0.720,105.1,27.6,73.0,3.47,11.1,0.368,0.327,0.548,49.5,17.4,0.576,58.5,7.0,0.524,0.245,0.691,30.1,9.6,0.426,0.346,0.526,50.5,17.6,0,110.8,91.9,0.031,108.7,96.9
37318,56.8,12.9,0.529,0.398,0.695,109.0,27.2,43.8,8.7,0.431,0.344,0.677,29.2,8.0,0.306,0.294,0.426,49.2,12.0,69.2,17.20,6.0,0.380,0.382,0.501,50.8,14.7,0.576,58.7,13.8,0.497,0.376,0.694,107.5,30.9,71.3,10.23,10.0,0.273,0.397,0.461,50.0,13.2,0.500,55.7,9.7,0.498,0.428,0.700,30.0,7.1,0.349,0.359,0.477,50.0,17.0,0,113.6,94.9,-0.106,109.2,100.6
43765,47.8,10.5,0.510,0.343,0.718,106.3,33.4,51.9,8.3,0.476,0.287,0.739,28.1,9.2,0.347,0.330,0.466,47.0,16.0,64.7,14.48,9.9,0.326,0.356,0.498,53.0,16.6,0.636,49.3,13.7,0.537,0.384,0.689,113.1,33.6,72.6,18.87,10.2,0.313,0.347,0.544,52.7,14.1,0.743,49.5,7.1,0.479,0.373,0.715,29.5,6.9,0.361,0.349,0.454,47.3,16.9,0,109.9,94.9,0.031,110.7,101.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13682,42.1,8.1,0.452,0.429,0.717,96.7,35.6,53.2,10.0,0.482,0.455,0.703,32.9,10.4,0.339,0.336,0.470,48.9,17.4,61.3,-15.52,8.9,0.346,0.301,0.453,51.1,19.2,0.344,56.7,12.5,0.553,0.379,0.685,109.9,32.4,62.6,0.66,10.6,0.356,0.369,0.553,53.1,17.4,0.758,46.1,8.6,0.410,0.422,0.675,29.7,8.3,0.350,0.282,0.402,46.9,18.2,1,107.6,111.4,0.085,103.9,105.8
42284,59.7,15.2,0.554,0.333,0.754,116.2,30.9,51.2,6.3,0.475,0.343,0.653,29.4,8.8,0.406,0.348,0.442,47.7,15.6,69.4,21.55,8.5,0.320,0.367,0.556,52.3,13.7,0.838,51.2,13.4,0.530,0.419,0.739,111.9,36.4,67.3,21.43,8.8,0.296,0.354,0.529,56.9,15.9,0.811,47.9,7.8,0.466,0.268,0.705,25.2,8.6,0.386,0.343,0.436,43.1,15.7,1,122.7,96.5,0.034,109.7,99.0
3007,40.3,8.5,0.504,0.439,0.711,111.7,29.1,44.2,9.6,0.494,0.380,0.694,32.8,6.9,0.317,0.327,0.496,51.9,14.3,70.9,2.73,8.4,0.345,0.347,0.495,48.1,11.6,0.714,49.9,12.2,0.503,0.423,0.689,106.1,29.6,65.1,17.84,11.8,0.329,0.324,0.511,50.2,14.9,0.714,43.8,8.3,0.452,0.298,0.711,29.6,6.3,0.316,0.293,0.459,49.8,19.0,0,112.0,106.2,0.064,105.2,105.9
32882,54.9,9.0,0.488,0.327,0.707,99.6,26.0,60.9,10.7,0.481,0.311,0.698,32.6,8.8,0.332,0.340,0.467,52.4,14.3,65.5,-6.52,7.5,0.316,0.359,0.465,47.6,15.9,0.438,51.2,6.1,0.540,0.339,0.744,107.9,26.8,65.6,-7.64,7.9,0.349,0.398,0.509,47.3,16.3,0.355,52.0,8.6,0.540,0.367,0.727,33.6,9.4,0.373,0.373,0.529,52.7,15.6,1,98.0,104.1,0.031,104.3,107.6


In [109]:
import statsmodels.api as sm
X2 = sm.add_constant(X_train)
est = sm.OLS(y_train, X2)
est2 = est.fit()
print(est2.summary())

In a future version of pandas all arguments of concat except for the argument 'objs' will be keyword-only


                            OLS Regression Results                            
Dep. Variable:                 spread   R-squared:                       0.512
Model:                            OLS   Adj. R-squared:                  0.511
Method:                 Least Squares   F-statistic:                     639.4
Date:                Sun, 16 Jan 2022   Prob (F-statistic):               0.00
Time:                        14:05:45   Log-Likelihood:            -1.4185e+05
No. Observations:               37915   AIC:                         2.838e+05
Df Residuals:                   37852   BIC:                         2.844e+05
Df Model:                          62                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [110]:
pvals = est2.pvalues.to_frame('p_val').reset_index().rename(columns = {'index':'coef'})
pvals['p_val'] = round(pvals.p_val,3)
sig_vars = pvals[pvals.p_val <= 0.05]['coef'].tolist()
sig_vars.append("spread")
df_points = df_points[sig_vars]
X = df_points.drop('spread',axis = 1)
y = df_points.spread
X_train,X_test,y_train,y_test = train_test_split(X,y)
spread_cols = X_train.columns.tolist()
X_train.sample(5)

Unnamed: 0,simple_rating_system,opp_simple_rating_system,home
5178,3.81,-8.62,1
35234,-3.48,-7.75,0
46558,-11.15,12.88,0
40163,-3.49,5.33,0
20051,-12.22,-8.42,0


In [111]:
pf_reg = LinearRegression(n_jobs = -1)
pf_reg.fit(X_train,y_train)
print('Training:',pf_reg.score(X_train,y_train),mean_squared_error(y_train,pf_reg.predict(X_train),squared = False))
print('Testing:',pf_reg.score(X_test,y_test),mean_squared_error(y_test,pf_reg.predict(X_test),squared = False))

Training: 0.5111185391082598 10.18557243382938
Testing: 0.5068644683982028 10.280112112881145


In [117]:
team_1 = 'Illinois'
team_2 = 'Purdue'
home_team = team_1
away_team = team_2
if team_1 in df_agg.name.unique().tolist():
    if team_2 in df_agg.name.unique().tolist():
        temp_df_2 = df_agg.loc[df_agg.name == team_1].loc[:,df_agg.columns[df_agg.columns.str.contains('percen') | df_agg.columns.str.contains('rat') | (df_agg.columns == 'pace') | (df_agg.columns == 'name')]]
        temp_df_3 = df_agg.loc[df_agg.name == team_2].loc[:,df_agg.columns[df_agg.columns.str.contains('percen') | df_agg.columns.str.contains('rat') | (df_agg.columns == 'pace') | (df_agg.columns == 'name')]]
        old_names = temp_df_2.columns[temp_df_2.columns.str.contains('opp_')].to_list()
        new_names = [i.replace('opp','allowed') for i in temp_df_2.columns[temp_df_2.columns.str.contains('opp_')].to_list()]
        name_dict = dict(zip(old_names,new_names))
        temp_df_2_dict = temp_df_2.rename(columns = name_dict).to_dict(orient = 'list')
        temp_df_2_dict.update(temp_df_3.iloc[0,:10].rename(dict(zip(temp_df_3.iloc[0,:10].index,['opp_'+str(i) for i in temp_df_3.iloc[0,:10].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_2_dict.update(temp_df_3.iloc[0,25:].rename(dict(zip(temp_df_3.iloc[0,25:].index,['opp_'+str(i) for i in temp_df_3.iloc[0,25:].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_2_dict.update(temp_df_3.iloc[0,10:25].rename(dict(zip(temp_df_3.iloc[0,10:25].index,[i + "_allowed" for i in temp_df_3.iloc[0,10:25].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_3_dict = temp_df_3.rename(columns = name_dict).to_dict(orient = 'list')
        temp_df_3_dict.update(temp_df_2.iloc[0,:10].rename(dict(zip(temp_df_2.iloc[0,:10].index,['opp_'+str(i) for i in temp_df_2.iloc[0,:10].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_3_dict.update(temp_df_2.iloc[0,25:].rename(dict(zip(temp_df_2.iloc[0,25:].index,['opp_'+str(i) for i in temp_df_2.iloc[0,25:].index]))).to_frame().transpose().to_dict(orient = 'list'))
        temp_df_3_dict.update(temp_df_2.iloc[0,10:25].rename(dict(zip(temp_df_2.iloc[0,10:25].index,[i + "_allowed" for i in temp_df_2.iloc[0,10:25].index]))).to_frame().transpose().to_dict(orient = 'list'))
        for i in temp_df_3_dict.keys():
            temp_df_2_dict[i]+=(temp_df_3_dict[i])
        master_dict = {key:[] for key in temp_df_2_dict.keys()}

        for i in temp_df_2_dict.keys():
            master_dict[i] += temp_df_2_dict[i]
        df_pred = pd.DataFrame.from_dict(master_dict)
        df_pred = df_pred[df_pred.name == home_team]
        df_pred['home'] = 1
        for i in df_pred.loc[:,df_pred.columns[df_pred.columns.str.contains('percent')]].columns:
             if df_pred[i].max() > 1:
                df_pred[i] = [j/100 for j in df_pred[i]]
df_pred = pd.merge(df_pred,kp_df2,left_on = 'name',right_on = "Team",how = 'left').drop("Team",axis = 1)
df_wl = df_pred[wl_cols]

loss_prob = clf.predict_proba(df_wl)[0][0]
win_prob = clf.predict_proba(df_wl)[0][1]

if win_prob >= loss_prob:
    print(home_team + ' has a ' + str(round(win_prob*100,2)) + '% chance of winning at home against ' +away_team)
else:
    print(away_team + ' has a ' + str(round(loss_prob*100,2)) + '% chance of winning on the road against ' + home_team)
    
df_spread = df_pred[spread_cols]
predicted_spread = int(round(pf_reg.predict(df_spread)[0]))
print("Predicted Spread is {}".format(predicted_spread))

Illinois has a 59.54% chance of winning at home against Purdue
Predicted Spread is 1
