In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the training matchups, discard incomplete rows, and remove the
# identifier columns so that only model inputs and the MinWin label remain.
df = (
    pd.read_csv('final_bracket_train.csv')
    .dropna()
    .drop(labels=['Season', 'MinTeam', 'MaxTeam'], axis='columns')
)

In [3]:
# Separate the feature matrix from the MinWin label column.
x_all = df.drop(labels='MinWin', axis='columns')
y_all = df.loc[:, 'MinWin']
x_all.shape

(968, 140)

In [4]:
# fit scaler to use later
scaler = preprocessing.StandardScaler().fit(x_all)
x_scale = scaler.transform(x_all)

In [5]:
# split up the data 
x_train, x_test, y_train, y_test = train_test_split(x_scale, y_all, test_size=0.33)

In [6]:
# Logistic regression with the library's default settings, trained on the
# scaled training split. The fit call is left as the cell's last expression
# so the estimator repr is displayed.
clf = LogisticRegression()
clf.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# Score the fitted classifier on the held-out split.
clf.score(X=x_test, y=y_test)

0.74687499999999996

In [8]:
# Confusion matrix of held-out labels versus the classifier's predictions.
print(confusion_matrix(y_true=y_test, y_pred=clf.predict(X=x_test)))

[[124  36]
 [ 45 115]]


In [9]:
# Per-class precision / recall / f1 on the held-out split.
print(classification_report(y_true=y_test, y_pred=clf.predict(X=x_test)))

             precision    recall  f1-score   support

          0       0.73      0.78      0.75       160
          1       0.76      0.72      0.74       160

avg / total       0.75      0.75      0.75       320



In [12]:
# Read the matchups to predict. The team identifiers are set aside for the
# final report; the identifier and label columns are then removed so that
# only the model inputs remain in `output`.
output = pd.read_csv('final_bracket_predict.csv')
out_teams = output.loc[:, ['MinTeam', 'MaxTeam']]
output = output.drop(labels=['Season', 'MinTeam', 'MaxTeam', 'MinWin'], axis='columns')
out_teams.head()

Unnamed: 0,MinTeam,MaxTeam
0,1104,1112
1,1104,1438
2,1112,1438
3,1113,1438
4,1116,1438


In [13]:
# Preview the first five rows of the prediction feature frame.
output.head(n=5)

Unnamed: 0,min.g1.OffRtg,min.g1.DefRtg,min.g1.NetRtg,min.g1.AstR,min.g1.TOR,min.g1.TSP,min.g1.eFGP,min.g1.FTAR,min.g1.ORP,min.g1.DRP,...,pom.min.NCSOS_AdjEM,pom.max.AdjEM,pom.max.AdjO,pom.max.AdjD,pom.max.AdjT,pom.max.Luck,pom.max.SOS_AdjEM,pom.max.SOS_OppO,pom.max.SOS_OppD,pom.max.NCSOS_AdjEM
0,99.193044,125.737661,-26.544617,15.10574,18.342685,57.555123,0.548077,0.423077,0.206897,0.707317,...,3.02,19.37,119.0,99.6,67.3,0.025,6.33,108.9,102.6,2.82
1,99.193044,125.737661,-26.544617,15.10574,18.342685,57.555123,0.548077,0.423077,0.206897,0.707317,...,3.02,32.15,116.5,84.4,59.2,0.032,9.99,110.9,100.9,0.22
2,111.575264,96.698562,14.876702,12.146643,14.355124,56.340144,0.526786,0.428571,0.366667,0.766667,...,2.82,32.15,116.5,84.4,59.2,0.032,9.99,110.9,100.9,0.22
3,101.889145,112.377733,-10.488588,13.107722,9.532888,52.37215,0.45614,0.315789,0.166667,0.735294,...,1.53,32.15,116.5,84.4,59.2,0.032,9.99,110.9,100.9,0.22
4,108.131488,130.658881,-22.527393,14.906303,9.582624,50.761421,0.468254,0.285714,0.277778,0.470588,...,1.3,32.15,116.5,84.4,59.2,0.032,9.99,110.9,100.9,0.22


In [14]:
# Select the feature columns by name, in the same order as the training
# frame, before scaling. The scaler and classifier treat columns
# positionally, so a reordered prediction file would otherwise be scaled
# and scored against the wrong features without any error; a missing
# column now raises a KeyError instead.
x_pred = scaler.transform(output[x_all.columns])

In [15]:
# Wrap the predicted labels in a single-column frame named MinWin.
predictions = pd.DataFrame({'MinWin': clf.predict(x_pred)})
predictions.head()

Unnamed: 0,MinWin
0,0
1,0
2,0
3,0
4,0


In [16]:
# Place the team identifiers and the predicted MinWin label side by side.
final_file = pd.concat(
    [out_teams, predictions],
    axis='columns',
)
final_file.head()

Unnamed: 0,MinTeam,MaxTeam,MinWin
0,1104,1112,0
1,1104,1438,0
2,1112,1438,0
3,1113,1438,0
4,1116,1438,0


In [17]:
# Persist the predictions as an Excel workbook.
final_file.to_excel(excel_writer='logistic_predictions.xlsx')