In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from scipy.sparse import coo_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

def missingdata(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    ms=pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    ms= ms[ms["Percent"] > 0]
   
    return ms

features = pd.read_csv('features.csv', index_col='match_id')
features.drop(columns=['duration', 'tower_status_radiant', 'tower_status_dire', 'barracks_status_radiant', 'barracks_status_dire'], inplace = True)


for (columnName, columnData) in (missingdata(features).T).iteritems():
    features[columnName].fillna(0, inplace = True)
    
data_x = features.iloc[:, 0:(len(features.columns)-1)]
data_y = features.iloc[:, (len(features.columns)-1):(len(features.columns))]

In [2]:
scaler = preprocessing.StandardScaler()
# Fit your data on the scaler object
x_scaled = scaler.fit_transform(data_x)
x_scaled = pd.DataFrame(x_scaled)

In [3]:
clf = LogisticRegression(penalty = 'l2', random_state=241, max_iter = 200, n_jobs = 1)

In [4]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': np.power(10.0, np.arange(-5, 6))}
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv, n_jobs=-1)
gs.fit(data_x, data_y.values.ravel())
best_c = gs.best_estimator_.C
best_c

1e-05

In [5]:
clf = LogisticRegression(penalty = 'l2', random_state=241, max_iter = 200, n_jobs = 1, C=best_c)
np.mean(cross_val_score(clf, data_x, data_y.values.ravel(), cv=cv, scoring='roc_auc'))

0.5134556409261802

In [6]:
data_x_part = data_x.drop(columns=['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'])

In [7]:
cv = KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': np.power(10.0, np.arange(-5, 6))}
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv, n_jobs=-1)
gs.fit(data_x_part, data_y.values.ravel())
best_c = gs.best_estimator_.C
best_c

1e-05

In [8]:
clf = LogisticRegression(penalty = 'l2', random_state=241, max_iter = 200, n_jobs = 1, C=best_c)
np.mean(cross_val_score(clf, data_x_part, data_y.values.ravel(), cv=cv, scoring='roc_auc'))

0.5134556409261802

In [9]:
a = np.array(data_x['r1_hero'])
np.unique(a)

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 109, 110, 112], dtype=int64)

In [10]:
X_pick = np.zeros((data_x.shape[0], 112))

for i, match_id in enumerate(data_x.index):
    for p in range(5):
        X_pick[i, data_x.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick[i, data_x.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [11]:
np.where(~X_pick.any(axis=0))[0]

array([ 23, 106, 107, 110], dtype=int32)

In [12]:
X_pick = np.delete(X_pick, np.where(~X_pick.any(axis=0))[0], 1) 

In [13]:
x_scaled_part = scaler.fit_transform(data_x_part)
x_scaled_part = pd.DataFrame(x_scaled_part)

In [14]:
X = np.hstack([x_scaled_part, X_pick])

In [15]:
cv = KFold(n_splits=5, shuffle=True, random_state=241)
grid = {'C': np.power(10.0, np.arange(-5, 6))}
gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv, n_jobs=-1)
gs.fit(X, data_y.values.ravel())
best_c = gs.best_estimator_.C
best_c

0.1

In [16]:
clf = LogisticRegression(penalty = 'l2', random_state=241, max_iter = 200, n_jobs = 1, C=best_c)
np.mean(cross_val_score(clf, X, data_y.values.ravel(), cv=cv, scoring='roc_auc'))

0.7519379754795084

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

x_train, x_test, y_train, y_test=train_test_split(X, data_y, shuffle= True, test_size=0.2, random_state=241)

clf2 = LogisticRegression(penalty = 'l2', random_state=241, max_iter = 200, n_jobs = 1, C= best_c)
clf2.fit(x_train, y_train.values.ravel())
pred = clf2.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, pred)

0.7551591793314203

In [19]:
test_data = pd.read_csv('features_test.csv', index_col='match_id')
for (columnName, columnData) in (missingdata(test_data).T).iteritems():
    test_data[columnName].fillna(0, inplace = True)
    
test_data_part = test_data.drop(columns=['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'])

In [20]:
x_scaled_test = scaler.fit_transform(test_data_part)
x_scaled_test = pd.DataFrame(x_scaled_test)

In [21]:
X_pick_test = np.zeros((test_data.shape[0], 112))

for i, match_id in enumerate(test_data.index):
    for p in range(5):
        X_pick_test[i, test_data.loc[match_id, 'r%d_hero' % (p+1)]-1] = 1
        X_pick_test[i, test_data.loc[match_id, 'd%d_hero' % (p+1)]-1] = -1

In [22]:
X_pick_test = np.delete(X_pick_test, np.where(~X_pick_test.any(axis=0))[0], 1)  # delete second column of C

In [23]:
X_train_test = np.hstack([x_scaled_test, X_pick_test])

In [33]:
clf.fit(X, data_y.values.ravel())
pred = clf.predict_proba(X_train_test)[:, 1]

MemoryError: Unable to allocate 148. MiB for an array with shape (97230, 199) and data type float64

In [28]:
pred2 = clf2.predict_proba(X_train_test)[:, 1]

In [30]:
np.mean(pred)

0.5184133225443168

In [31]:
np.mean(pred2)

0.5184133225443168

In [34]:
np.mean(pred2)-np.mean(pred)

0.0

In [35]:
test_data_1 = pd.read_csv('features_test.csv')

In [37]:
import csv
submission = pd.DataFrame({ 'match_id': test_data_1['match_id'], 'radiant_win': pred2 })
submission.to_csv("submission.csv", index=False)