In [83]:
# import packages
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

scaler = preprocessing.MinMaxScaler()

In [84]:
# data preparation
# game --> team
# team --> game
# write all data into csv

In [85]:
# model construction
# read data 
data = pd.read_csv("all_data.csv")
target = data['result']

In [86]:
# seed difference

seed = data['different_seed']

x_train, x_test, y_train, y_test = train_test_split(seed, target, random_state=0)
x_train = x_train.reshape(-1,1)
x_test = x_test.reshape(-1,1)

# logistic regression

log = LogisticRegression()
print("accuracy:",log.fit(x_train, y_train).score(x_test,y_test));

pred = log.predict(x_test)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred))

pred_prob = log.fit(x_train, y_train).predict_proba(x_test)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.71615720524
confusion matrix:
 [[88 37]
 [28 76]]
P: 0.672566371681
R: 0.730769230769
log_loss: 0.532172798705




In [87]:
# all other difference
data0 = data.iloc[:,51:65]
data1 = data.iloc[:,1:3]
data0 = pd.concat([data1,data0], axis=1)
data0 = pd.concat([data['team1'],data0], axis=1)
data0 = pd.concat([data['result'],data0], axis=1)
data0 = data0.dropna()
data0.head()

Unnamed: 0,result,team1,team2,Season,diff_score,diff_fgm,diff_fga,diff_fgm3,diff_fga3,diff_ftm,diff_fta,diff_or,diff_dr,diff_ast,diff_to,diff_stl,diff_blk,diff_pf
0,1,1112,1125,2013,3.61875,2.602083,0.310417,0.64375,1.00625,-2.229167,-2.679167,-2.164583,-1.885417,1.95625,0.472917,2.595833,-1.172917,1.85625
1,0,1116,1137,2006,-8.012236,-3.516129,-8.914349,0.324805,-1.220245,-1.304783,-1.79644,-2.883204,-0.459399,-0.96218,1.095662,0.272525,-4.439377,0.61624
2,0,1137,1139,2013,1.174242,-0.154356,0.457386,1.920455,6.216856,-0.4375,0.989583,2.574811,-2.246212,-0.558712,3.835227,2.166667,-1.75947,3.404356
3,1,1112,1156,2009,-7.21875,-2.625,0.09375,-1.4375,-0.3125,-0.53125,0.09375,0.46875,-0.65625,-1.0,-0.125,2.625,0.21875,2.75
4,1,1124,1160,2012,-7.991477,-3.372159,-4.3125,-1.164773,-1.535985,-0.082386,1.825758,-2.558712,1.340909,-3.352273,-1.285985,-1.761364,-1.386364,-0.542614


In [88]:
df = data0.iloc[:,4:18]
target = data0['result']

x_train, x_test, y_train, y_test = train_test_split(df, target, random_state=0)

x_train_scaled = scaler.fit(x_train).transform(x_train)
x_test_scaled = scaler.fit(x_train).transform(x_test)
df_scaled = scaler.fit_transform(df)

In [89]:
# logistic regression
log = LogisticRegression()
print("accuracy:",log.fit(x_train, y_train).score(x_test,y_test));

pred = log.predict(x_test)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

pred_prob = log.fit(x_train, y_train).predict_proba(x_test)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.638766519824
confusion matrix:
 [[72 40]
 [42 73]]
P: 0.646017699115
R: 0.634782608696
log_loss: 0.605148199814


In [90]:
# logistic regression using scaled data
log = LogisticRegression()
print("accuracy:",log.fit(x_train_scaled, y_train).score(x_test_scaled,y_test));

pred = log.predict(x_test_scaled)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

pred_prob = log.fit(x_train_scaled, y_train).predict_proba(x_test_scaled)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.678414096916
confusion matrix:
 [[78 34]
 [39 76]]
P: 0.690909090909
R: 0.660869565217
log_loss: 0.603924386801


In [91]:
# svm
clf = svm.SVC()

print("accuracy:",clf.fit(x_train, y_train).score(x_test,y_test));

pred = clf.fit(x_train, y_train).predict(x_test)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

clf = svm.SVC(probability=True)
pred_prob = clf.fit(x_train, y_train).predict_proba(x_test)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.546255506608
confusion matrix:
 [[82 30]
 [73 42]]
P: 0.583333333333
R: 0.365217391304
log_loss: 0.685906620939


In [92]:
# svm using scaled data
clf = svm.SVC()

print("accuracy:",clf.fit(x_train_scaled, y_train).score(x_test_scaled,y_test));

pred = clf.fit(x_train_scaled, y_train).predict(x_test_scaled)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

clf = svm.SVC(probability=True)
pred_prob = clf.fit(x_train_scaled, y_train).predict_proba(x_test_scaled)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.660792951542
confusion matrix:
 [[82 30]
 [47 68]]
P: 0.69387755102
R: 0.591304347826
log_loss: 0.604725501994


In [93]:
# all difference with seed difference
data0 = data.iloc[:,50:65]
data1 = data.iloc[:,1:3]
data0 = pd.concat([data1,data0], axis=1)
data0 = pd.concat([data['team1'],data0], axis=1)
data0 = pd.concat([data['result'],data0], axis=1)
data0 = data0.dropna()
data0.head()

Unnamed: 0,result,team1,team2,Season,different_seed,diff_score,diff_fgm,diff_fga,diff_fgm3,diff_fga3,diff_ftm,diff_fta,diff_or,diff_dr,diff_ast,diff_to,diff_stl,diff_blk,diff_pf
0,1,1112,1125,2013,-5,3.61875,2.602083,0.310417,0.64375,1.00625,-2.229167,-2.679167,-2.164583,-1.885417,1.95625,0.472917,2.595833,-1.172917,1.85625
1,0,1116,1137,2006,-1,-8.012236,-3.516129,-8.914349,0.324805,-1.220245,-1.304783,-1.79644,-2.883204,-0.459399,-0.96218,1.095662,0.272525,-4.439377,0.61624
2,0,1137,1139,2013,5,1.174242,-0.154356,0.457386,1.920455,6.216856,-0.4375,0.989583,2.574811,-2.246212,-0.558712,3.835227,2.166667,-1.75947,3.404356
3,1,1112,1156,2009,-1,-7.21875,-2.625,0.09375,-1.4375,-0.3125,-0.53125,0.09375,0.46875,-0.65625,-1.0,-0.125,2.625,0.21875,2.75
4,1,1124,1160,2012,-8,-7.991477,-3.372159,-4.3125,-1.164773,-1.535985,-0.082386,1.825758,-2.558712,1.340909,-3.352273,-1.285985,-1.761364,-1.386364,-0.542614


In [94]:
df = data0.iloc[:,4:18]
target = data0['result']

x_train, x_test, y_train, y_test = train_test_split(df, target, random_state=0)

x_train_scaled = scaler.fit(x_train).transform(x_train)
x_test_scaled = scaler.fit(x_train).transform(x_test)
df_scaled = scaler.fit_transform(df)

# logistic regression
log = LogisticRegression()
print("accuracy:",log.fit(x_train, y_train).score(x_test,y_test));

pred = log.predict(x_test)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

pred_prob = log.fit(x_train, y_train).predict_proba(x_test)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.709251101322
confusion matrix:
 [[83 29]
 [37 78]]
P: 0.728971962617
R: 0.678260869565
log_loss: 0.548221113352


In [95]:
# logistic regression using scaled data
log = LogisticRegression()
print("accuracy:",log.fit(x_train_scaled, y_train).score(x_test_scaled,y_test));

pred = log.predict(x_test_scaled)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

pred_prob = log.fit(x_train_scaled, y_train).predict_proba(x_test_scaled)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.718061674009
confusion matrix:
 [[86 26]
 [38 77]]
P: 0.747572815534
R: 0.669565217391
log_loss: 0.550236931041


In [96]:
# svm
clf = svm.SVC()

print("accuracy:",clf.fit(x_train, y_train).score(x_test,y_test));

pred = clf.fit(x_train, y_train).predict(x_test)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

clf = svm.SVC(probability=True)
pred_prob = clf.fit(x_train, y_train).predict_proba(x_test)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.581497797357
confusion matrix:
 [[92 20]
 [75 40]]
P: 0.666666666667
R: 0.347826086957
log_loss: 0.661858129374


In [97]:
# svm using scaled data
clf = svm.SVC()

print("accuracy:",clf.fit(x_train_scaled, y_train).score(x_test_scaled,y_test));

pred = clf.fit(x_train_scaled, y_train).predict(x_test_scaled)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

clf = svm.SVC(probability=True)
pred_prob = clf.fit(x_train_scaled, y_train).predict_proba(x_test_scaled)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.726872246696
confusion matrix:
 [[86 26]
 [36 79]]
P: 0.752380952381
R: 0.686956521739
log_loss: 0.544766890539


In [98]:
# only score and win rate with scale
data0 = data.dropna()
df = data0[['score1','score2','rate1','rate2']]
target = data0['result']

x_train, x_test, y_train, y_test = train_test_split(df, target, random_state=0)

x_train_scaled = scaler.fit(x_train).transform(x_train)
x_test_scaled = scaler.fit(x_train).transform(x_test)
df_scaled = scaler.fit_transform(df)

# logistic regression
log = LogisticRegression()
print("accuracy:",log.fit(x_train, y_train).score(x_test,y_test));

pred = log.predict(x_test)
confusion = confusion_matrix(y_test,pred)
print("confusion matrix:\n",confusion);
print("P:",precision_score(y_test,pred));
print("R:",recall_score(y_test,pred));

pred_prob = log.fit(x_train, y_train).predict_proba(x_test)
print("log_loss:",log_loss(y_test,pred_prob))

accuracy: 0.656387665198
confusion matrix:
 [[77 35]
 [43 72]]
P: 0.672897196262
R: 0.626086956522
log_loss: 0.632094655657


In [101]:
data.head()

Unnamed: 0.1,Unnamed: 0,team2,Season,fgm1,fga1,fgm31,fga31,ftm1,fta1,or1,...,diff_fga3,diff_ftm,diff_fta,diff_or,diff_dr,diff_ast,diff_to,diff_stl,diff_blk,diff_pf
0,0,1125,2013,27.633333,55.966667,8.3,22.1,13.333333,18.133333,9.366667,...,1.00625,-2.229167,-2.679167,-2.164583,-1.885417,1.95625,0.472917,2.595833,-1.172917,1.85625
1,1,1137,2006,23.0,48.827586,6.034483,15.586207,14.275862,20.655172,9.310345,...,-1.220245,-1.304783,-1.79644,-2.883204,-0.459399,-0.96218,1.095662,0.272525,-4.439377,0.61624
2,2,1139,2013,23.939394,53.363636,6.545455,19.060606,14.0,20.333333,11.606061,...,6.216856,-0.4375,0.989583,2.574811,-2.246212,-0.558712,3.835227,2.166667,-1.75947,3.404356
3,3,1156,2009,23.03125,54.15625,4.75,15.34375,14.15625,20.09375,11.5625,...,-0.3125,-0.53125,0.09375,0.46875,-0.65625,-1.0,-0.125,2.625,0.21875,2.75
4,4,1160,2012,22.71875,51.6875,5.5625,16.3125,15.28125,22.25,9.5625,...,-1.535985,-0.082386,1.825758,-2.558712,1.340909,-3.352273,-1.285985,-1.761364,-1.386364,-0.542614
