In [1]:
import time
import datetime
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display setting: with "all", IPython echoes the value of every
# top-level expression statement in a cell, not only the last one. Several
# cells below rely on this to show more than one result (e.g. two `.head()`
# previews in the same cell).
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Read the training and the test feature tables. Both paths are relative to
# the notebook's working directory, so the '7_final' folder must sit next to
# the notebook (or the working directory must be changed accordingly).
data, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('7_final/features.csv', '7_final/features_test.csv')
)

# Preview the first rows of each table.
data.head()
data_test.head()

FileNotFoundError: File b'7_final/features.csv' does not exist

In [None]:
# Keep the target aside before it is removed from the feature table.
y = data['radiant_win']

# Columns removed from the feature table: the target itself plus the other
# listed match columns. errors='ignore' skips names that are not present.
columns_dropping = [
    'duration',
    'radiant_win',
    'tower_status_radiant',
    'tower_status_dire',
    'barracks_status_radiant',
    'barracks_status_dire',
]
data.drop(columns_dropping, axis=1, inplace=True, errors='ignore')

# Number of missing values per column ...
xx = data.isnull().sum(axis=0)

# ... and the names of the columns that contain at least one missing value.
nan_columns = list(xx[xx > 0].index)
nan_columns

In [None]:
# Replace every missing value with 0 (in place), then take NumPy copies of the
# feature table and of the target for scikit-learn.
data.fillna(0, inplace=True)
X_train, y_train = np.array(data), np.array(y)

In [None]:
# Shared splitter: 5 shuffled folds with a fixed seed, reused by later cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Ensemble sizes to compare, and the mean cross-validated ROC-AUC of each one
# (same order as `n_estimators`).
n_estimators = [10, 20, 30]
mean_rocauc_score = []
for N in n_estimators:
    clf = GradientBoostingClassifier(n_estimators=N, verbose=False, random_state=241)
    fold_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='roc_auc')
    mean_rocauc_score.append(np.mean(fold_scores))

In [None]:
# Mean cross-validated ROC-AUC of gradient boosting for each ensemble size.
# The explicit figure/axes interface is used and both axes are labelled, so
# the figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(n_estimators, mean_rocauc_score, 'ro', label='roc_auc')
ax.set_xlabel('n_estimators')
ax.set_ylabel('mean ROC-AUC (cross-validation)')
ax.set_title('Gradient boosting: ROC-AUC vs. number of estimators')
ax.legend()
plt.show()

In [None]:
# Wall-clock timing of one cross-validation run (folds from `kf`) of gradient
# boosting with 30 estimators.
start_time = datetime.datetime.now()

clf = GradientBoostingClassifier(n_estimators=30, verbose=False, random_state=241)
n_30_fold_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='roc_auc')
n_30_rocauc_cross_val_score = np.mean(n_30_fold_scores)

elapsed = datetime.datetime.now() - start_time
print('Time elapsed:', elapsed)
n_30_rocauc_cross_val_score

In [None]:
# Standardise the features before fitting the L2-penalised logistic regression.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Candidate regularisation strengths C = 10^-5, 10^-4, ..., 10^5.
grid = {'C': np.power(10.0, np.arange(-5, 6))}
clf_logreg = LogisticRegression(penalty='l2', random_state=241)

# Score every candidate with ROC-AUC on the shared KFold splitter `kf`
# (defined in the gradient-boosting cell), so both model families are
# evaluated on identical folds. No variable named `cv` exists in this notebook.
gs = GridSearchCV(clf_logreg, grid, scoring='roc_auc', cv=kf)
gs.fit(X_train_scaled, y_train)

# Best regularisation strength and its mean cross-validated ROC-AUC.
C = gs.best_params_['C']
score = gs.best_score_
C, score

In [None]:
# Wall-clock timing of one cross-validation run (folds from `kf`) of logistic
# regression with the regularisation strength `C` selected by the grid search.
start_time = datetime.datetime.now()

clf_logreg = LogisticRegression(penalty='l2', random_state=241, C=C)
logreg_fold_scores = cross_val_score(clf_logreg, X_train_scaled, y_train, cv=kf, scoring='roc_auc')
logreg_mean_score = np.mean(logreg_fold_scores)

elapsed = datetime.datetime.now() - start_time
print('Time elapsed:', elapsed)
logreg_mean_score

In [None]:
# Categorical columns: the lobby type plus the ten hero-pick columns
# r1_hero..r5_hero and d1_hero..d5_hero.
categorial = ['lobby_type'] + [
    '{}{}_hero'.format(side, slot)
    for side in ['r', 'd']
    for slot in range(1, 6)
]

# Drop the categorical columns and standardise what is left. This re-fits the
# shared `scaler`, so from here on it holds the statistics of the
# non-categorical training columns.
X_no_categorial = data.drop(categorial, axis=1, errors='ignore')
X_train_nocat_scaled = scaler.fit_transform(X_no_categorial)

# From here on `categorial` lists only the hero-pick columns; the hero
# encoding cell iterates over it.
categorial.remove('lobby_type')

# Repeat the grid search over C on the reduced feature set, scored with
# ROC-AUC on the shared KFold splitter `kf`. No variable named `cv` exists in
# this notebook.
clf_logreg = LogisticRegression(penalty='l2', random_state=241)
gs = GridSearchCV(clf_logreg, grid, scoring='roc_auc', cv=kf)
gs.fit(X_train_nocat_scaled, y_train)

# Best regularisation strength and its mean cross-validated ROC-AUC.
C = gs.best_params_['C']
score = gs.best_score_
C, score

In [None]:
# Collect every hero identifier that occurs in any of the hero-pick columns
# listed in `categorial`.
heroes = set()
for cat in categorial:
    heroes.update(data[cat].unique())

# Identifiers are used as 1-based column positions, so the largest identifier
# fixes the width of the encoding (identifiers that never occur simply leave
# an all-zero column).
N = int(max(heroes))

# Bag-of-words encoding of the picks: +1 in column (hero - 1) when the hero is
# in one of the r*_hero slots, -1 when it is in one of the d*_hero slots.
X_pick = np.zeros((data.shape[0], N))

# Vectorised over rows: one integer-array assignment per slot instead of ten
# scalar `.loc` lookups per match. For any given row the writes still happen
# in the order r1, d1, r2, d2, ..., so the result matches the row-by-row loop,
# and the assignment is positional, so it does not depend on index labels.
row_positions = np.arange(data.shape[0])
for p in range(1, 6):
    X_pick[row_positions, data['r%d_hero' % p].values.astype(int) - 1] = 1
    X_pick[row_positions, data['d%d_hero' % p].values.astype(int) - 1] = -1

In [None]:
# Final training matrix: scaled non-categorical features followed by the
# bag-of-words hero encoding.
X = np.hstack((np.array(X_train_nocat_scaled), X_pick))

# Grid search over C on the combined features, scored with ROC-AUC on the
# shared KFold splitter `kf`. No variable named `cv` exists in this notebook.
gs = GridSearchCV(clf_logreg, grid, scoring='roc_auc', cv=kf)
gs.fit(X, y_train)

# Best regularisation strength and its mean cross-validated ROC-AUC.
C = gs.best_params_['C']
score = gs.best_score_
C, score

The best ROC-AUC score was achieved by the logistic regression model trained on scaled data with the categorical features removed and the hero picks encoded with a bag-of-words approach (regularization constant C = 0.10).

In [None]:
# Prepare the test table with the same steps that were applied to the
# training table.
data_test.fillna(value=0, inplace=True)

# Categorical columns: the lobby type plus the ten hero-pick columns.
categorial = ['lobby_type'] + [
    '{}{}_hero'.format(side, slot)
    for side in ['r', 'd']
    for slot in range(1, 6)
]

# Bag-of-words hero encoding of the TEST matches: the picks must be read from
# `data_test`, not from the training frame `data`. `N` (encoding width) comes
# from the training encoding so train and test matrices have the same columns.
# NOTE: this overwrites the training `X_pick`; the training matrix `X` must
# already have been built before this cell runs.
X_pick = np.zeros((data_test.shape[0], N))
row_positions = np.arange(data_test.shape[0])
for p in range(1, 6):
    X_pick[row_positions, data_test['r%d_hero' % p].values.astype(int) - 1] = 1
    X_pick[row_positions, data_test['d%d_hero' % p].values.astype(int) - 1] = -1

data_test.drop(categorial, axis=1, errors='ignore', inplace=True)

# Reuse the statistics learned on the training data (`transform`, not
# `fit_transform`): re-fitting on the test set would put the test features on
# a different scale from the one the model was trained on.
# Assumes the remaining test columns match the non-categorical training
# columns the scaler was last fitted on -- TODO confirm against the CSV files.
X_test = scaler.transform(data_test)
X_test = np.hstack((X_test, X_pick))



In [None]:
# Final model: L2-penalised logistic regression with C fixed at 0.10, fitted
# on the full training matrix `X` and used to predict labels for `X_test`.
final_params = dict(penalty='l2', random_state=241, C=0.10)
clf_logreg_final = LogisticRegression(**final_params)
clf_logreg_final.fit(X, y_train)
y_hat = clf_logreg_final.predict(X_test)

In [None]:
# Sanity check on the predicted probabilities: smallest and largest value of
# the second `predict_proba` column (the class listed second in
# `clf_logreg_final.classes_`; presumably radiant_win == 1 -- verify).
# The probabilities are computed once and reduced with NumPy instead of
# running inference twice and iterating element by element with the builtins.
proba_second_class = clf_logreg_final.predict_proba(X_test)[:, 1]
proba_second_class.min()
proba_second_class.max()