In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import seaborn as sns
import sklearn.model_selection as ms
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

## Data Processing

In [3]:
df = pd.read_csv(r'D:\Github\Projects\Dota_Winrate_Analysis\Data\Export\outladnersexport.csv')
df['Radiant']='R'+df['player_1'].map(str)+','+'R'+df['player_2'].map(str)+','+'R'+df['player_3'].map(str)+','+'R'+df['player_4'].map(str)+','+'R'+df['player_5'].map(str)
df['Dire']='D'+df['player_6'].map(str)+','+'D'+df['player_7'].map(str)+','+'D'+df['player_8'].map(str)+','+'D'+df['player_9'].map(str)+','+'D'+df['player_10'].map(str)
Radiant = df['Radiant'].str.get_dummies(sep=',')
Dire = df['Dire'].str.get_dummies(sep=',')
dfmodel = Radiant.join(Dire)
dfmodel['Radiant Win'] = df['radiant_win']

We want to check the radiant winrate, our goal is to create a model that's better than this mean.

In [17]:
df.radiant_win.mean()

0.5233

## Train Test Split

In [18]:
Train, Test = train_test_split(dfmodel, test_size=0.2,random_state=1)
xTrain = Train.drop(labels = 'Radiant Win',axis=1)
xTest = Test.drop(labels = 'Radiant Win',axis=1)
yTrain = Train['Radiant Win']
yTest = Test['Radiant Win']

## Creation of Model

In [19]:
model = LogisticRegression().fit(xTrain,yTrain)

## Testing of Model

In [20]:
#accuracy_score(yTest, model.predict(xTest))
(model.predict(xTest) == yTest).mean()

0.56075

In [21]:
print ('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(), xTest, yTest, scoring='accuracy', cv = 3)))

Logistic Regression accuracy: 0.5487498186390676


## Fine-Tune Model

In [22]:
model = LogisticRegression(max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:',
      np.mean(cross_val_score(LogisticRegression(max_iter=1000), xTest, yTest, scoring='accuracy', cv = 3)))

model = LogisticRegression(C=1,max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:',
      np.mean(cross_val_score(LogisticRegression(C=1e5,max_iter=1000), xTest, yTest, scoring='accuracy', cv = 3)))

model = LogisticRegression(C=10000,max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:', 
      np.mean(cross_val_score(LogisticRegression(C=1e5,solver='newton-cg',max_iter=1000), xTest, yTest, scoring='accuracy', 
                              cv = ms.StratifiedKFold(random_state=1,shuffle = False))))

model = LogisticRegression(C=10000,solver='newton-cg',max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:',
      np.mean(cross_val_score(LogisticRegression(C=1e5,solver='newton-cg',max_iter=1000), xTest, yTest, scoring='accuracy',
                              cv = ms.StratifiedKFold(random_state=1,shuffle = True))))

Logistic Regression accuracy: 0.5487498186390676
Logistic Regression accuracy: 0.5492501311234855




Logistic Regression accuracy: 0.5455
Logistic Regression accuracy: 0.5475000000000001


## Logit summary

In [23]:
dfmodel['Yes'] = (dfmodel['Radiant Win']==True).astype(int) 
cols = ' + '.join(dfmodel.columns[:-2])
results = smf.logit('Yes ~ '+ cols, data=dfmodel).fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.672463
         Iterations 5


  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,Yes,No. Observations:,20000.0
Model:,Logit,Df Residuals:,19763.0
Method:,MLE,Df Model:,236.0
Date:,"Tue, 10 Mar 2020",Pseudo R-squ.:,0.02832
Time:,01:50:58,Log-Likelihood:,-13449.0
converged:,True,LL-Null:,-13841.0
Covariance Type:,nonrobust,LLR p-value:,5.444e-60

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0571,,,,,
R1,0.2506,7.71e+05,3.25e-07,1.000,-1.51e+06,1.51e+06
R10,-0.1434,7.71e+05,-1.86e-07,1.000,-1.51e+06,1.51e+06
R100,0.0176,7.71e+05,2.28e-08,1.000,-1.51e+06,1.51e+06
R101,-0.1080,7.71e+05,-1.4e-07,1.000,-1.51e+06,1.51e+06
R102,0.2161,7.71e+05,2.8e-07,1.000,-1.51e+06,1.51e+06
R103,0.5386,7.71e+05,6.99e-07,1.000,-1.51e+06,1.51e+06
R104,0.0003,7.71e+05,3.25e-10,1.000,-1.51e+06,1.51e+06
R105,-0.1770,7.71e+05,-2.3e-07,1.000,-1.51e+06,1.51e+06


## Conclusion

There were limited options for find-tuning the model, because the data was all categorical, so there was no feature scaling to be done. One way to increase accuracy is by increasing dataset or by improving the dataset.

In [11]:
dfmodel.corr()

Unnamed: 0,R1,R10,R100,R101,R102,R103,R104,R105,R106,R107,...,D91,D92,D93,D94,D95,D96,D97,D98,D99,Radiant Win
R1,1.000000,-0.035499,-0.006195,0.008185,0.002624,-0.005396,-0.009554,-0.001816,0.000970,-0.000101,...,-0.000145,-0.011935,0.016484,0.034603,0.010016,-0.007972,-0.001368,-0.006159,-0.006398,0.026243
R10,-0.035499,1.000000,-0.006568,-0.009594,-0.008334,0.001876,0.002673,0.004251,-0.014295,-0.009902,...,0.010371,-0.008760,0.001296,0.003330,-0.000228,0.000265,0.006467,0.000499,-0.011743,-0.015177
R100,-0.006195,-0.006568,1.000000,-0.011533,-0.019325,0.002769,-0.008981,-0.018824,0.009331,-0.012481,...,-0.010356,-0.008485,-0.002040,-0.002850,-0.000671,0.001754,0.001763,0.006147,-0.001015,0.000224
R101,0.008185,-0.009594,-0.011533,1.000000,-0.004235,-0.006619,0.034695,-0.010157,-0.014805,-0.001945,...,-0.006081,-0.004799,-0.017840,0.004428,-0.011059,0.001269,-0.007821,0.002130,0.007605,-0.010782
R102,0.002624,-0.008334,-0.019325,-0.004235,1.000000,0.006888,-0.029227,-0.011413,-0.003947,-0.005783,...,-0.001838,-0.008441,0.016751,-0.005202,0.012035,-0.000279,0.003811,0.002527,0.000812,0.013510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D96,-0.007972,0.000265,0.001754,0.001269,-0.000279,-0.007414,-0.008899,0.002972,-0.002120,0.000267,...,-0.001537,-0.000971,0.005950,0.003783,-0.004743,1.000000,-0.014618,-0.020083,-0.027234,0.019076
D97,-0.001368,0.006467,0.001763,-0.007821,0.003811,0.000844,-0.004431,-0.007107,0.009102,-0.002486,...,-0.008558,-0.000067,-0.010986,-0.016322,0.000479,-0.014618,1.000000,-0.014592,-0.025490,0.011157
D98,-0.006159,0.000499,0.006147,0.002130,0.002527,0.005056,-0.017935,-0.012420,-0.002418,-0.010369,...,-0.011773,-0.008944,0.006533,0.002404,-0.005219,-0.020083,-0.014592,1.000000,-0.029631,0.014696
D99,-0.006398,-0.011743,-0.001015,0.007605,0.000812,0.000602,0.033895,0.007702,0.002356,0.007271,...,-0.005556,-0.008049,-0.004447,0.011274,0.002424,-0.027234,-0.025490,-0.029631,1.000000,0.003649
