In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import time
import datetime

In [2]:
RANDOM_STATE = 777
FOLDS_COUNT = 5
FUTURE_COLUMNS = 'duration radiant_win tower_status_radiant tower_status_dire barracks_status_radiant barracks_status_dire'.split(' ')
CATEGORICAL_COLUMNS = 'lobby_type r1_hero r2_hero r3_hero r4_hero r5_hero d1_hero d2_hero d3_hero d4_hero d5_hero'.split(' ')

In [3]:
def get_kf_score_for_classifier(clf, X, y):
    kf = KFold(n_splits=FOLDS_COUNT, shuffle=True, random_state=RANDOM_STATE)
    scores = []
    
    start_time = datetime.datetime.now()
    
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        print '  Fitting with fold', (i + 1), '...'
        
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        clf.fit(X_train, y_train)
        predicted = clf.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_test, predicted))
        
    print 'Time elapsed:', datetime.datetime.now() - start_time
    
    return np.mean(scores)

In [4]:
def variate_and_cross_validate_gradient_boosting(X):
    for n_estimators in [10, 20, 30]:
        print 'Running cross-validation for gradient boosting with', n_estimators, 'estimators:'

        clf = GradientBoostingClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)
        score = get_kf_score_for_classifier(clf, X, y)

        print 'Mean score for', n_estimators, 'estimators:', score
        print ''

def variate_and_cross_validate_logistic_regression(X):
    clfs = {}
    
    for C in [1., 5., 10.]:
        print 'Running cross-validation for logistic regression with C =', C, ':'

        clfs[C] = LogisticRegression(C=C)
        score = get_kf_score_for_classifier(clfs[C], X, y)
        
        print 'Mean score for C =', C, ':', score
        print ''
        
    return clfs

In [5]:
def make_word_bag_for_heros(X, heros_columns):
    heros = X[heros_columns].stack().unique()
    heros_count = len(heros)
    print 'Total heros:', heros_count

    def get_hero_index(hero_id):
        return np.where(heros == hero_id)

    print 'Making words bag...'

    X_pick = np.zeros((X.shape[0], heros_count))

    for i, match_id in enumerate(X.index):
        for p in xrange(5):
            X_pick[i, get_hero_index(X.loc[match_id, 'r%d_hero' % (p + 1)])] = 1
            X_pick[i, get_hero_index(X.loc[match_id, 'd%d_hero' % (p + 1)])] = -1

    print 'OK'

    return pd.DataFrame(X_pick, index=X.index)

In [6]:
print 'Task 1.1'

data = pd.read_csv('./features.csv', index_col='match_id')
X = data.drop(FUTURE_COLUMNS, axis=1)

y_name = 'radiant_win'
y = data[y_name]

X.head()

Task 1.1


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


In [7]:
print 'Task 1.2'

total_count = X.shape[0]
counts = X.count()
print ', '.join(counts[counts < total_count].index.tolist())

Task 1.2
first_blood_time, first_blood_team, first_blood_player1, first_blood_player2, radiant_bottle_time, radiant_courier_time, radiant_flying_courier_time, radiant_first_ward_time, dire_bottle_time, dire_courier_time, dire_flying_courier_time, dire_first_ward_time


In [8]:
print 'Task 1.3'

X.fillna(0, inplace=True)
X.head()

Task 1.3


Unnamed: 0_level_0,start_time,lobby_type,r1_hero,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,...,radiant_ward_sentry_count,radiant_first_ward_time,dire_bottle_time,dire_courier_time,dire_flying_courier_time,dire_tpscroll_count,dire_boots_count,dire_ward_observer_count,dire_ward_sentry_count,dire_first_ward_time
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1430198770,7,11,5,2098,1489,20,0,0,7,...,0,35.0,103.0,-84.0,221.0,3,4,2,2,-52.0
1,1430220345,0,42,4,1188,1033,9,0,1,12,...,0,-20.0,149.0,-84.0,195.0,5,4,3,1,-5.0
2,1430227081,7,33,4,1319,1270,22,0,0,12,...,1,-39.0,45.0,-77.0,221.0,3,4,3,1,13.0
3,1430263531,1,29,4,1779,1056,14,0,0,5,...,0,-30.0,124.0,-80.0,184.0,0,4,2,0,27.0
4,1430282290,7,13,4,1431,1090,8,1,0,8,...,0,46.0,182.0,-80.0,225.0,6,3,3,0,-16.0


In [9]:
print 'Task 1.4'

print y_name

Task 1.4
radiant_win


In [10]:
print 'Task 1.5'

variate_and_cross_validate_gradient_boosting(X)

Task 1.5
Running cross-validation for gradient boosting with 10 estimators:
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:26.586780
Mean score for 10 estimators: 0.664544090022

Running cross-validation for gradient boosting with 20 estimators:
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:51.436300
Mean score for 20 estimators: 0.682601160317

Running cross-validation for gradient boosting with 30 estimators:
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:01:20.047864
Mean score for 30 estimators: 0.689809669147



In [11]:
print 'Scaling...'
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

print 'Dropping categorical features...'
X_no_cat = X_scaled.drop(CATEGORICAL_COLUMNS, axis=1)

print 'OK'

Scaling...
Dropping categorical features...
OK


In [12]:
print 'Task 2.1'

variate_and_cross_validate_logistic_regression(X_scaled)

Task 2.1
Running cross-validation for logistic regression with C = 1.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:15.516868
Mean score for C = 1.0 : 0.71644257557

Running cross-validation for logistic regression with C = 5.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:12.751656
Mean score for C = 5.0 : 0.716442315065

Running cross-validation for logistic regression with C = 10.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:11.773597
Mean score for C = 10.0 : 0.716442300229



{1.0: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 5.0: LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 10.0: LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

In [13]:
print 'Task 2.2'

variate_and_cross_validate_logistic_regression(X_no_cat)

Task 2.2
Running cross-validation for logistic regression with C = 1.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:13.110995
Mean score for C = 1.0 : 0.716481406549

Running cross-validation for logistic regression with C = 5.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:11.204941
Mean score for C = 5.0 : 0.716480934124

Running cross-validation for logistic regression with C = 10.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:10.617866
Mean score for C = 10.0 : 0.716480866338



{1.0: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 5.0: LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 10.0: LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

In [15]:
print 'Tasks 2.3, 2.4'

hero_bag = make_word_bag_for_heros(X, CATEGORICAL_COLUMNS[1:])

X_with_bag = pd.concat([X_no_cat, hero_bag], axis=1)
X_with_bag.head()

Tasks 2.3, 2.4
Total heros: 108
Making words bag...
OK


Unnamed: 0_level_0,start_time,r1_level,r1_xp,r1_gold,r1_lh,r1_kills,r1_deaths,r1_items,r2_level,r2_xp,...,98,99,100,101,102,103,104,105,106,107
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.544364,1.400808,1.525972,0.734957,0.969743,-0.537757,-0.578083,-0.509023,-0.332256,-0.625222,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-2.540452,0.501314,-0.080139,-0.24757,-0.246859,-0.537757,1.017574,1.49293,0.578881,0.732454,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-2.539231,0.501314,0.15107,0.263085,1.190944,-0.537757,-0.578083,1.49293,-0.332256,0.224676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.532622,0.501314,0.96295,-0.198013,0.306142,-0.537757,-0.578083,-1.309804,-1.243393,-1.170813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.529221,0.501314,0.348745,-0.124754,-0.357459,0.968527,-0.578083,-0.108632,-1.243393,-1.008757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
print 'Task 2.5'

clfs_with_bag = variate_and_cross_validate_logistic_regression(X_with_bag)

Task 2.5
Running cross-validation for logistic regression with C = 1.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:23.486848
Mean score for C = 1.0 : 0.751848261683

Running cross-validation for logistic regression with C = 5.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:25.887392
Mean score for C = 5.0 : 0.751845753715

Running cross-validation for logistic regression with C = 10.0 :
  Fitting with fold 1 ...
  Fitting with fold 2 ...
  Fitting with fold 3 ...
  Fitting with fold 4 ...
  Fitting with fold 5 ...
Time elapsed: 0:00:24.727854
Mean score for C = 10.0 : 0.7518455715



In [17]:
print 'Task 2.6'

best_clf = clfs_with_bag[1.]

X = pd.read_csv('./features_test.csv', index_col='match_id')
X.fillna(0, inplace=True)

X_no_cat = X.drop(CATEGORICAL_COLUMNS, axis=1)
X_no_cat_scaled = pd.DataFrame(scaler.fit_transform(X_no_cat), index=X_no_cat.index, columns=X_no_cat.columns)

hero_bag = make_word_bag_for_heros(X, CATEGORICAL_COLUMNS[1:])
X_with_bag = pd.concat([X_no_cat_scaled, hero_bag], axis=1)

predicted = best_clf.predict_proba(X_with_bag)[:, 1]

print 'Max:', predicted.max(), 'Min:', predicted.min()

Task 2.6
Total heros: 108
Making words bag...
OK
Max: 0.996967686715 Min: 0.00411188890906
