In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import seaborn as sns
import sklearn.model_selection as ms
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

## Data Processing

In [4]:
df = pd.read_csv(r'C:\Users\caleb\Documents\GitHub\Dota_Winrate_Analysis\Data\outladnersexport.csv')
df['Radiant']='R'+df['player_1'].map(str)+','+'R'+df['player_2'].map(str)+','+'R'+df['player_3'].map(str)+','+'R'+df['player_4'].map(str)+','+'R'+df['player_5'].map(str)
df['Dire']='D'+df['player_6'].map(str)+','+'D'+df['player_7'].map(str)+','+'D'+df['player_8'].map(str)+','+'D'+df['player_9'].map(str)+','+'D'+df['player_10'].map(str)
Radiant = df['Radiant'].str.get_dummies(sep=',')
Dire = df['Dire'].str.get_dummies(sep=',')
dfmodel = Radiant.join(Dire)
dfmodel['Radiant Win'] = df['radiant_win']

We want to check the radiant winrate, our goal is to create a model that's better than this mean.

In [5]:
df.radiant_win.mean()

0.5233

## Train Test Split

In [6]:
Train, Test = train_test_split(dfmodel, test_size=0.2,random_state=1)
xTrain = Train.drop(labels = 'Radiant Win',axis=1)
xTest = Test.drop(labels = 'Radiant Win',axis=1)
yTrain = Train['Radiant Win']
yTest = Test['Radiant Win']

## Creation of Model

In [7]:
model = LogisticRegression().fit(xTrain,yTrain)

## Testing of Model

In [8]:
#accuracy_score(yTest, model.predict(xTest))
(model.predict(xTest) == yTest).mean()

0.56075

In [9]:
print ('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(), xTest, yTest, scoring='accuracy', cv = 3)))

Logistic Regression accuracy: 0.5487498186390676


## Fine-Tune Model

In [10]:
model = LogisticRegression(max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(max_iter=1000), xTest, yTest, scoring='accuracy', cv = 3)))

model = LogisticRegression(C=1,max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(C=1e5,max_iter=1000), xTest, yTest, scoring='accuracy', cv = 3)))

model = LogisticRegression(C=10000,max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(C=1e5,solver='newton-cg',max_iter=1000), xTest, yTest, scoring='accuracy', cv = ms.StratifiedKFold(random_state=1,shuffle = False))))

model = LogisticRegression(C=10000,solver='newton-cg',max_iter=1000).fit(xTest,yTest)
print('Logistic Regression accuracy:', np.mean(cross_val_score(LogisticRegression(C=1e5,solver='newton-cg',max_iter=1000), xTest, yTest, scoring='accuracy', cv = ms.StratifiedKFold(random_state=1,shuffle = True))))

Logistic Regression accuracy: 0.5487498186390676
Logistic Regression accuracy: 0.5492501311234855




Logistic Regression accuracy: 0.5455
Logistic Regression accuracy: 0.5475000000000001


## Logit summary

In [11]:
dfmodel['Yes'] = (dfmodel['Radiant Win']==True).astype(int) 
cols = ' + '.join(dfmodel.columns[:-2])
results = smf.logit('Yes ~ '+ cols, data=dfmodel).fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.672463
         Iterations 20


0,1,2,3
Dep. Variable:,Yes,No. Observations:,20000.0
Model:,Logit,Df Residuals:,19763.0
Method:,MLE,Df Model:,236.0
Date:,"Thu, 05 Mar 2020",Pseudo R-squ.:,0.02832
Time:,11:21:15,Log-Likelihood:,-13449.0
converged:,True,LL-Null:,-13841.0
Covariance Type:,nonrobust,LLR p-value:,5.444e-60

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0571,8.32e+05,6.86e-08,1.000,-1.63e+06,1.63e+06
R1,0.2506,2.09e+06,1.2e-07,1.000,-4.1e+06,4.1e+06
R10,-0.1434,2.09e+06,-6.85e-08,1.000,-4.1e+06,4.1e+06
R100,0.0176,2.09e+06,8.42e-09,1.000,-4.1e+06,4.1e+06
R101,-0.1080,2.09e+06,-5.16e-08,1.000,-4.1e+06,4.1e+06
R102,0.2161,2.09e+06,1.03e-07,1.000,-4.1e+06,4.1e+06
R103,0.5386,2.09e+06,2.58e-07,1.000,-4.1e+06,4.1e+06
R104,0.0003,2.09e+06,1.2e-10,1.000,-4.1e+06,4.1e+06
R105,-0.1770,2.09e+06,-8.46e-08,1.000,-4.1e+06,4.1e+06


## Conclusion

There were limited options for find-tuning the model, because the data was all categorical, so there was no feature scaling to be done. One way to increase accuracy is by increasing dataset or by improving the dataset.

In [12]:
dfmodel.corr()

Unnamed: 0,R1,R10,R100,R101,R102,R103,R104,R105,R106,R107,...,D92,D93,D94,D95,D96,D97,D98,D99,Radiant Win,Yes
R1,1.000000,-0.035499,-0.006195,0.008185,0.002624,-0.005396,-0.009554,-0.001816,0.000970,-0.000101,...,-0.011935,0.016484,0.034603,0.010016,-0.007972,-0.001368,-0.006159,-0.006398,0.026243,0.026243
R10,-0.035499,1.000000,-0.006568,-0.009594,-0.008334,0.001876,0.002673,0.004251,-0.014295,-0.009902,...,-0.008760,0.001296,0.003330,-0.000228,0.000265,0.006467,0.000499,-0.011743,-0.015177,-0.015177
R100,-0.006195,-0.006568,1.000000,-0.011533,-0.019325,0.002769,-0.008981,-0.018824,0.009331,-0.012481,...,-0.008485,-0.002040,-0.002850,-0.000671,0.001754,0.001763,0.006147,-0.001015,0.000224,0.000224
R101,0.008185,-0.009594,-0.011533,1.000000,-0.004235,-0.006619,0.034695,-0.010157,-0.014805,-0.001945,...,-0.004799,-0.017840,0.004428,-0.011059,0.001269,-0.007821,0.002130,0.007605,-0.010782,-0.010782
R102,0.002624,-0.008334,-0.019325,-0.004235,1.000000,0.006888,-0.029227,-0.011413,-0.003947,-0.005783,...,-0.008441,0.016751,-0.005202,0.012035,-0.000279,0.003811,0.002527,0.000812,0.013510,0.013510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D97,-0.001368,0.006467,0.001763,-0.007821,0.003811,0.000844,-0.004431,-0.007107,0.009102,-0.002486,...,-0.000067,-0.010986,-0.016322,0.000479,-0.014618,1.000000,-0.014592,-0.025490,0.011157,0.011157
D98,-0.006159,0.000499,0.006147,0.002130,0.002527,0.005056,-0.017935,-0.012420,-0.002418,-0.010369,...,-0.008944,0.006533,0.002404,-0.005219,-0.020083,-0.014592,1.000000,-0.029631,0.014696,0.014696
D99,-0.006398,-0.011743,-0.001015,0.007605,0.000812,0.000602,0.033895,0.007702,0.002356,0.007271,...,-0.008049,-0.004447,0.011274,0.002424,-0.027234,-0.025490,-0.029631,1.000000,0.003649,0.003649
Radiant Win,0.026243,-0.015177,0.000224,-0.010782,0.013510,0.020924,-0.008560,-0.020386,-0.019751,-0.013833,...,-0.004367,0.018062,0.021321,0.012275,0.019076,0.011157,0.014696,0.003649,1.000000,1.000000


In [15]:
dotaFrame = pd.read_csv(r'C:\Users\caleb\Documents\GitHub\Dota_Winrate_Analysis\Data\dotaFrame_20160229_to_20160306.csv')

In [16]:
dotaFrame['Radiant']='R'+dotaFrame['Radiant 1'].map(str)+','+'R'+dotaFrame['Radiant 2'].map(str)+','+'R'+dotaFrame['Radiant 3'].map(str)+','+'R'+dotaFrame['Radiant 4'].map(str)+','+'R'+dotaFrame['Radiant 5'].map(str)
dotaFrame['Dire']='D'+dotaFrame['Dire 1'].map(str)+','+'D'+dotaFrame['Dire 2'].map(str)+','+'D'+dotaFrame['Dire 3'].map(str)+','+'D'+dotaFrame['Dire 4'].map(str)+','+'D'+dotaFrame['Dire 5'].map(str)

In [17]:
dotaFrameLogitRadiant = dotaFrame['Radiant'].str.get_dummies(sep=',')
dotaFrameLogitDire = dotaFrame['Dire'].str.get_dummies(sep=',')
dotaFrameLogit = dotaFrameLogitRadiant.join(dotaFrameLogitDire)
dotaFrameLogit['Intercept']=1
dotaFrameLogit['Radiant Win Y/N'] = dotaFrame['Radiant Win Y/N']

In [19]:
dotaFrameLogit.corr()

Unnamed: 0,R1,R10,R100,R101,R102,R103,R104,R105,R106,R107,...,D92,D93,D94,D95,D96,D97,D98,D99,Intercept,Radiant Win Y/N
R1,1.000000,-0.033890,-0.002919,0.005391,-0.007390,-0.003084,-0.005736,0.006486,-0.024471,0.003148,...,0.001678,0.002838,0.027850,0.001567,0.000230,-0.004613,-0.002057,-0.006132,,-0.064709
R10,-0.033890,1.000000,0.003549,-0.007639,-0.000071,-0.002149,-0.011783,-0.002539,-0.017899,0.008195,...,-0.002425,0.005896,-0.005257,-0.004114,-0.001226,-0.001385,-0.004526,-0.001025,,-0.027986
R100,-0.002919,0.003549,1.000000,-0.001593,-0.011849,-0.000364,-0.017938,-0.004967,-0.003927,-0.016267,...,-0.004298,0.004279,-0.007296,0.007046,0.006646,0.003535,0.004614,0.001804,,-0.006298
R101,0.005391,-0.007639,-0.001593,1.000000,-0.007219,-0.000839,-0.003411,-0.005709,-0.003559,-0.010378,...,0.004218,0.003246,0.001405,0.002205,-0.000053,-0.003267,0.002600,0.004945,,-0.019175
R102,-0.007390,-0.000071,-0.011849,-0.007219,1.000000,-0.005208,-0.018263,-0.005312,-0.008467,-0.021432,...,0.007248,-0.003213,0.022840,0.000120,0.004817,0.003680,0.010111,-0.004481,,0.033105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D97,-0.004613,-0.001385,0.003535,-0.003267,0.003680,-0.000363,0.003039,-0.000091,-0.001582,-0.002613,...,-0.003318,-0.017212,-0.010355,-0.004533,-0.009537,1.000000,-0.007869,-0.016501,,0.026182
D98,-0.002057,-0.004526,0.004614,0.002600,0.010111,0.000594,-0.004581,0.000004,-0.001386,-0.002739,...,-0.003445,-0.011349,-0.000446,-0.004172,-0.005064,-0.007869,1.000000,-0.016798,,0.029201
D99,-0.006132,-0.001025,0.001804,0.004945,-0.004481,0.002929,0.008671,0.000751,-0.000810,-0.004877,...,-0.004961,-0.011930,-0.002564,-0.003888,-0.013409,-0.016501,-0.016798,1.000000,,0.021071
Intercept,,,,,,,,,,,...,,,,,,,,,,
