In [1]:
import sys

sys.path.insert(0,'../')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.utils import shuffle
import pandas as pd
from itertools import chain
import numpy as np
from scipy.sparse import csr_matrix
from tqdm import tqdm
from multiprocessing import Pool
from scipy.sparse import vstack
from functools import reduce

pd.set_option("display.max_columns", 150)

In [2]:
def save_sparse_csr(filename, array):
    """Persist a CSR matrix as an uncompressed ``.npz`` archive.

    The archive holds the four pieces needed to rebuild the matrix, stored
    under the keys ``data``, ``indices``, ``indptr`` and ``shape``.

    Parameters
    ----------
    filename : str or file-like
        Destination handed straight to ``np.savez``.
    array : scipy.sparse.csr_matrix
        Matrix whose CSR components are written.
    """
    csr_parts = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **csr_parts)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an ``.npz`` archive.

    The archive must contain the keys ``data``, ``indices``, ``indptr`` and
    ``shape``.

    Parameters
    ----------
    filename : str or file-like
        Archive to read; handed straight to ``np.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
    """
    # np.load on an .npz returns a handle that keeps the zip file open, so
    # close it deterministically instead of waiting for garbage collection.
    with np.load(filename) as loader:
        # Hand the constructor a plain tuple of ints rather than an ndarray.
        shape = tuple(int(dim) for dim in loader['shape'])
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=shape)

In [3]:
# Read both zipped feature files and stack them into a single frame; the
# original row labels of each file are kept at this point.
all_data = pd.concat([
    pd.read_csv(archive_path, compression='zip')
    for archive_path in ('../data/features.csv.zip',
                         '../data/features_test.csv.zip')
])

In [4]:
# Replace the row labels inherited from the two source files with a fresh
# 0..n-1 range. drop=True discards the old labels directly instead of first
# materialising them as an 'index' column and deleting it afterwards.
all_data = all_data.reset_index(drop=True)
all_data.head()

Unnamed: 0,barracks_status_dire,barracks_status_radiant,d1_deaths,d1_gold,d1_hero,d1_items,d1_kills,d1_level,d1_lh,d1_xp,d2_deaths,d2_gold,d2_hero,d2_items,d2_kills,d2_level,d2_lh,d2_xp,d3_deaths,d3_gold,d3_hero,d3_items,d3_kills,d3_level,d3_lh,d3_xp,d4_deaths,d4_gold,d4_hero,d4_items,d4_kills,d4_level,d4_lh,d4_xp,d5_deaths,d5_gold,d5_hero,d5_items,d5_kills,d5_level,d5_lh,d5_xp,dire_boots_count,dire_bottle_time,dire_courier_time,dire_first_ward_time,dire_flying_courier_time,dire_tpscroll_count,dire_ward_observer_count,dire_ward_sentry_count,duration,first_blood_player1,first_blood_player2,first_blood_team,first_blood_time,lobby_type,match_id,r1_deaths,r1_gold,r1_hero,r1_items,r1_kills,r1_level,r1_lh,r1_xp,r2_deaths,r2_gold,r2_hero,r2_items,r2_kills,r2_level,r2_lh,r2_xp,r3_deaths,r3_gold,r3_hero,r3_items,r3_kills,r3_level,r3_lh,r3_xp,r4_deaths,r4_gold,r4_hero,r4_items,r4_kills,r4_level,r4_lh,r4_xp,r5_deaths,r5_gold,r5_hero,r5_items,r5_kills,r5_level,r5_lh,r5_xp,radiant_boots_count,radiant_bottle_time,radiant_courier_time,radiant_first_ward_time,radiant_flying_courier_time,radiant_tpscroll_count,radiant_ward_observer_count,radiant_ward_sentry_count,radiant_win,start_time,tower_status_dire,tower_status_radiant
0,0.0,51.0,0,996,4,6,0,3,12,1058,0,986,42,4,0,4,12,1085,0,1536,21,6,0,5,23,2052,0,500,37,8,0,3,2,742,0,1003,84,9,1,3,3,958,4,103.0,-84.0,-52.0,221.0,3,2,2,2874.0,9.0,,1.0,7.0,7,0,0,1489,11,7,0,5,20,2098,0,991,67,4,0,3,10,842,0,1143,29,8,0,5,10,1909,0,741,20,7,0,3,6,757,1,658,105,11,0,3,4,732,2,134.0,-80.0,35.0,244.0,2,2,0,1.0,1430198770,0.0,1796.0
1,1.0,63.0,0,1384,39,8,0,5,16,1960,1,566,88,5,0,3,1,640,0,1350,79,12,2,3,2,720,0,583,7,7,0,2,0,440,0,1622,12,9,0,4,24,1470,4,149.0,-84.0,-5.0,195.0,5,3,1,2463.0,7.0,,1.0,54.0,0,1,1,1033,42,12,0,4,9,1188,1,993,49,7,0,4,10,1596,0,1502,67,7,1,4,18,1506,0,631,37,7,0,3,7,669,0,539,26,5,0,2,1,415,0,173.0,-80.0,-20.0,,2,2,0,1.0,1430220345,0.0,1974.0
2,63.0,0.0,1,2028,22,10,1,5,19,2305,1,959,66,10,0,3,19,1024,0,620,86,8,0,3,3,755,0,667,29,7,0,4,4,1319,0,1512,80,7,0,3,25,1350,4,45.0,-77.0,13.0,221.0,3,3,1,2130.0,3.0,,0.0,224.0,7,2,0,1270,33,12,0,4,22,1319,0,775,98,6,0,3,6,1314,0,909,20,6,1,3,0,1297,1,2096,27,6,1,5,26,2360,0,1627,4,9,0,3,27,1395,5,63.0,-82.0,-39.0,,2,2,1,0.0,1430227081,1830.0,0.0
3,63.0,50.0,0,1174,96,6,0,5,17,1878,0,1468,48,10,0,3,22,732,0,1051,15,7,0,4,11,1681,0,537,102,7,0,2,1,674,0,499,20,7,0,2,0,510,4,124.0,-80.0,27.0,184.0,0,2,0,1459.0,,,,,1,3,0,1056,29,5,0,4,14,1779,0,539,30,6,0,2,1,539,0,1139,75,6,0,5,15,2037,0,499,37,6,0,2,0,591,0,1075,41,6,0,3,12,712,3,208.0,-75.0,-30.0,,0,2,0,0.0,1430263531,2047.0,1920.0
4,63.0,3.0,2,586,26,9,0,3,1,704,0,1665,69,7,1,3,20,1169,0,638,22,9,0,3,1,1055,0,1275,25,8,0,5,18,1815,1,904,8,7,0,4,6,1119,3,182.0,-80.0,-16.0,225.0,6,3,0,2449.0,6.0,,1.0,-21.0,7,4,0,1090,13,8,1,4,8,1431,1,552,27,7,0,2,0,629,0,927,30,8,1,3,0,884,0,1439,72,11,1,3,16,925,0,880,93,8,0,4,7,1482,4,166.0,-81.0,46.0,181.0,1,2,0,0.0,1430282290,1974.0,4.0


In [5]:
# Number of rows in the combined frame.
all_data.shape[0]

114407

In [6]:
# first_blood_team: existing 0.0 values are recoded to -1.0 first, and only
# then are missing values filled with 0.0, so "missing" stays distinct from
# the recoded value.
all_data['first_blood_team'] = (
    all_data['first_blood_team'].replace(0.0, -1.0).fillna(0.0)
)

# first_blood_player1/2: missing values become 0 and the columns are cast to
# int.
for player_col in ('first_blood_player1', 'first_blood_player2'):
    all_data[player_col] = all_data[player_col].fillna(0.0).astype(int)

In [7]:
# for feat in list(all_data.columns):
#     if feat.endswith('level'):
#         prefix = feat[:3]
#         all_data[prefix+'levhero'] = all_data[feat].astype(str).str.cat(all_data[prefix+'hero'].astype(str),sep='_'+prefix[0]+'_')

In [8]:
# Preview the first five rows after the first-blood columns were cleaned.
all_data.head(5)

Unnamed: 0,barracks_status_dire,barracks_status_radiant,d1_deaths,d1_gold,d1_hero,d1_items,d1_kills,d1_level,d1_lh,d1_xp,d2_deaths,d2_gold,d2_hero,d2_items,d2_kills,d2_level,d2_lh,d2_xp,d3_deaths,d3_gold,d3_hero,d3_items,d3_kills,d3_level,d3_lh,d3_xp,d4_deaths,d4_gold,d4_hero,d4_items,d4_kills,d4_level,d4_lh,d4_xp,d5_deaths,d5_gold,d5_hero,d5_items,d5_kills,d5_level,d5_lh,d5_xp,dire_boots_count,dire_bottle_time,dire_courier_time,dire_first_ward_time,dire_flying_courier_time,dire_tpscroll_count,dire_ward_observer_count,dire_ward_sentry_count,duration,first_blood_player1,first_blood_player2,first_blood_team,first_blood_time,lobby_type,match_id,r1_deaths,r1_gold,r1_hero,r1_items,r1_kills,r1_level,r1_lh,r1_xp,r2_deaths,r2_gold,r2_hero,r2_items,r2_kills,r2_level,r2_lh,r2_xp,r3_deaths,r3_gold,r3_hero,r3_items,r3_kills,r3_level,r3_lh,r3_xp,r4_deaths,r4_gold,r4_hero,r4_items,r4_kills,r4_level,r4_lh,r4_xp,r5_deaths,r5_gold,r5_hero,r5_items,r5_kills,r5_level,r5_lh,r5_xp,radiant_boots_count,radiant_bottle_time,radiant_courier_time,radiant_first_ward_time,radiant_flying_courier_time,radiant_tpscroll_count,radiant_ward_observer_count,radiant_ward_sentry_count,radiant_win,start_time,tower_status_dire,tower_status_radiant
0,0.0,51.0,0,996,4,6,0,3,12,1058,0,986,42,4,0,4,12,1085,0,1536,21,6,0,5,23,2052,0,500,37,8,0,3,2,742,0,1003,84,9,1,3,3,958,4,103.0,-84.0,-52.0,221.0,3,2,2,2874.0,9,0,1.0,7.0,7,0,0,1489,11,7,0,5,20,2098,0,991,67,4,0,3,10,842,0,1143,29,8,0,5,10,1909,0,741,20,7,0,3,6,757,1,658,105,11,0,3,4,732,2,134.0,-80.0,35.0,244.0,2,2,0,1.0,1430198770,0.0,1796.0
1,1.0,63.0,0,1384,39,8,0,5,16,1960,1,566,88,5,0,3,1,640,0,1350,79,12,2,3,2,720,0,583,7,7,0,2,0,440,0,1622,12,9,0,4,24,1470,4,149.0,-84.0,-5.0,195.0,5,3,1,2463.0,7,0,1.0,54.0,0,1,1,1033,42,12,0,4,9,1188,1,993,49,7,0,4,10,1596,0,1502,67,7,1,4,18,1506,0,631,37,7,0,3,7,669,0,539,26,5,0,2,1,415,0,173.0,-80.0,-20.0,,2,2,0,1.0,1430220345,0.0,1974.0
2,63.0,0.0,1,2028,22,10,1,5,19,2305,1,959,66,10,0,3,19,1024,0,620,86,8,0,3,3,755,0,667,29,7,0,4,4,1319,0,1512,80,7,0,3,25,1350,4,45.0,-77.0,13.0,221.0,3,3,1,2130.0,3,0,-1.0,224.0,7,2,0,1270,33,12,0,4,22,1319,0,775,98,6,0,3,6,1314,0,909,20,6,1,3,0,1297,1,2096,27,6,1,5,26,2360,0,1627,4,9,0,3,27,1395,5,63.0,-82.0,-39.0,,2,2,1,0.0,1430227081,1830.0,0.0
3,63.0,50.0,0,1174,96,6,0,5,17,1878,0,1468,48,10,0,3,22,732,0,1051,15,7,0,4,11,1681,0,537,102,7,0,2,1,674,0,499,20,7,0,2,0,510,4,124.0,-80.0,27.0,184.0,0,2,0,1459.0,0,0,0.0,,1,3,0,1056,29,5,0,4,14,1779,0,539,30,6,0,2,1,539,0,1139,75,6,0,5,15,2037,0,499,37,6,0,2,0,591,0,1075,41,6,0,3,12,712,3,208.0,-75.0,-30.0,,0,2,0,0.0,1430263531,2047.0,1920.0
4,63.0,3.0,2,586,26,9,0,3,1,704,0,1665,69,7,1,3,20,1169,0,638,22,9,0,3,1,1055,0,1275,25,8,0,5,18,1815,1,904,8,7,0,4,6,1119,3,182.0,-80.0,-16.0,225.0,6,3,0,2449.0,6,0,1.0,-21.0,7,4,0,1090,13,8,1,4,8,1431,1,552,27,7,0,2,0,629,0,927,30,8,1,3,0,884,0,1439,72,11,1,3,16,925,0,880,93,8,0,4,7,1482,4,166.0,-81.0,46.0,181.0,1,2,0,0.0,1430282290,1974.0,4.0


In [9]:
# all_hero_unique_vals = list(all_data.r1_hero.unique())

# def get_hero_ordered_level(level,side,max_level=7):
#     return [(side if j<=int(level) else 0.0) for j in range(max_level+1)]

# def encode_hero(herolev_val,heroes_total=108,max_level=7):
#     hero_level,hero_side,hero_index = herolev_val.split('_')
    
#     side_multiplier = 1.0 if hero_side=='r' else -1.0
    
#     res = [get_hero_ordered_level(hero_level,side_multiplier) if int(hero_index)==i else [0.0 for i in range(max_level+1)]
#             for i in all_hero_unique_vals]
#     return csr_matrix([r for r in chain(*res)])
    
# def encode_df_col(df_col):
#     return all_data[df_col].apply(encode_hero)

In [10]:
# p1 = Pool(4)

# levhero_cols = [col for col in all_data.columns if col.endswith('levhero')]

# all_heroes_vectors_accumulated = p1.map(encode_df_col, levhero_cols)

# v_stacked = [vstack(rows) for rows in all_heroes_vectors_accumulated]

# v_stacked_additioned  = reduce(lambda x,y: x + y, v_stacked)

# save_sparse_csr('./sparse_herolev.csr',v_stacked_additioned)

In [3]:
# Cached sparse matrix; the commented-out cell that calls save_sparse_csr
# writes './sparse_herolev.csr', to which np.savez appends the '.npz' suffix.
sparse_herolev_path = './sparse_herolev.csr.npz'
v_stacked_additioned = load_sparse_csr(sparse_herolev_path)

In [4]:
# Expand the sparse matrix into a dense ndarray so rows can be inspected
# directly. NOTE(review): v_stacked_dense is not referenced by any later cell
# visible here - confirm it is still needed before paying the memory cost.
v_stacked_dense = v_stacked_additioned.toarray()

In [6]:
# Show the second row of the dense matrix.
v_stacked_dense[1]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1., -1.,
       -1., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [15]:
# NOTE(review): df_data_to_sort_radiants is only assigned in a cell further
# down the notebook, so on a fresh kernel this cell fails when run in file
# order; it relies on state left over from an earlier session.
df_data_to_sort_radiants.head(5)

Unnamed: 0,r1_deaths,r1_gold,r1_hero,r1_items,r1_kills,r1_level,r1_lh,r1_xp,r2_deaths,r2_gold,r2_hero,r2_items,r2_kills,r2_level,r2_lh,r2_xp,r3_deaths,r3_gold,r3_hero,r3_items,r3_kills,r3_level,r3_lh,r3_xp,r4_deaths,r4_gold,r4_hero,r4_items,r4_kills,r4_level,r4_lh,r4_xp,r5_deaths,r5_gold,r5_hero,r5_items,r5_kills,r5_level,r5_lh,r5_xp
0,0,1489,11,7,0,5,20,2098,0,991,67,4,0,3,10,842,0,1143,29,8,0,5,10,1909,0,741,20,7,0,3,6,757,1,658,105,11,0,3,4,732
1,1,1033,42,12,0,4,9,1188,1,993,49,7,0,4,10,1596,0,1502,67,7,1,4,18,1506,0,631,37,7,0,3,7,669,0,539,26,5,0,2,1,415
2,0,1270,33,12,0,4,22,1319,0,775,98,6,0,3,6,1314,0,909,20,6,1,3,0,1297,1,2096,27,6,1,5,26,2360,0,1627,4,9,0,3,27,1395
3,0,1056,29,5,0,4,14,1779,0,539,30,6,0,2,1,539,0,1139,75,6,0,5,15,2037,0,499,37,6,0,2,0,591,0,1075,41,6,0,3,12,712
4,0,1090,13,8,1,4,8,1431,1,552,27,7,0,2,0,629,0,927,30,8,1,3,0,884,0,1439,72,11,1,3,16,925,0,880,93,8,0,4,7,1482


In [33]:
import re

# Select every per-player stat column of the radiant team (r1_* ... r5_*).
# The stat names are spelled out as a real alternation and anchored with '$'.
# The previous pattern, '[de|go|it|ki|lh|xp|he]', was a character class that
# matched one single character, so it picked up e.g. '*_level' only by
# accident (via the 'l' of 'lh').
cols_with_rad_heroes_nums = re.compile(
    r'r[1-5]_(?:deaths|gold|hero|items|kills|level|lh|xp)$')

cols_to_sort_rad = [col for col in all_data.columns
                    if cols_with_rad_heroes_nums.match(col)]
df_data_to_sort_radiants = all_data[cols_to_sort_rad]

In [34]:
# Reorder the per-player column blocks of every row so that the players
# appear in ascending order of their '*_xp' value. The columns in
# cols_to_sort_rad are expected to form equally sized, contiguous blocks (one
# per player) that all share the same internal column order.
match_to_sort = re.compile(r'.[1-5]_xp')
cols_sort_by = [c for c in cols_to_sort_rad if match_to_sort.match(c)]
print('sorting by {}'.format(cols_sort_by))

if not cols_sort_by or len(cols_to_sort_rad) % len(cols_sort_by):
    # An uneven split would silently shuffle values between unrelated columns.
    raise ValueError('player columns do not split evenly into per-player blocks')

n_rows = len(df_data_to_sort_radiants)
n_players = len(cols_sort_by)
propagate_num = len(cols_to_sort_rad) // n_players  # columns per player block

player_values = df_data_to_sort_radiants.values

# One sort key per player: the i-th key column ranks the i-th block.
key_positions = [cols_to_sort_rad.index(c) for c in cols_sort_by]
# Stable sort: players with equal xp keep their original relative order.
xp_order = np.argsort(player_values[:, key_positions], axis=1, kind='mergesort')

# View each row as (player, stat) and permute whole player blocks at once.
player_blocks = player_values.reshape(n_rows, n_players, propagate_num)
sorted_blocks = player_blocks[np.arange(n_rows)[:, np.newaxis], xp_order]
sorted_values = sorted_blocks.reshape(n_rows, -1)

# Positional write-back: row i of sorted_values belongs to row i of all_data.
for ind, col in enumerate(cols_to_sort_rad):
    all_data[col] = sorted_values[:, ind]

sorting by ['r1_xp', 'r2_xp', 'r3_xp', 'r4_xp', 'r5_xp']


In [35]:
# Preview the selected player columns after the xp-based reordering.
all_data[cols_to_sort_rad].head(5)

Unnamed: 0,r1_deaths,r1_gold,r1_hero,r1_items,r1_kills,r1_level,r1_lh,r1_xp,r2_deaths,r2_gold,r2_hero,r2_items,r2_kills,r2_level,r2_lh,r2_xp,r3_deaths,r3_gold,r3_hero,r3_items,r3_kills,r3_level,r3_lh,r3_xp,r4_deaths,r4_gold,r4_hero,r4_items,r4_kills,r4_level,r4_lh,r4_xp,r5_deaths,r5_gold,r5_hero,r5_items,r5_kills,r5_level,r5_lh,r5_xp
0,1,658,105,11,0,3,4,732,0,741,20,7,0,3,6,757,0,991,67,4,0,3,10,842,0,1143,29,8,0,5,10,1909,0,1489,11,7,0,5,20,2098
1,0,539,26,5,0,2,1,415,0,631,37,7,0,3,7,669,1,1033,42,12,0,4,9,1188,0,1502,67,7,1,4,18,1506,1,993,49,7,0,4,10,1596
2,0,909,20,6,1,3,0,1297,0,775,98,6,0,3,6,1314,0,1270,33,12,0,4,22,1319,0,1627,4,9,0,3,27,1395,1,2096,27,6,1,5,26,2360
3,0,539,30,6,0,2,1,539,0,499,37,6,0,2,0,591,0,1075,41,6,0,3,12,712,0,1056,29,5,0,4,14,1779,0,1139,75,6,0,5,15,2037
4,1,552,27,7,0,2,0,629,0,927,30,8,1,3,0,884,0,1439,72,11,1,3,16,925,0,1090,13,8,1,4,8,1431,0,880,93,8,0,4,7,1482


In [36]:
# Select every per-player stat column of the dire team (d1_* ... d5_*).
# The variable names still say "rad" because the following cells read these
# exact names. The stat names are spelled out as a real alternation and
# anchored with '$'; the previous '[de|go|it|ki|lh|xp|he]' was a character
# class matching one single character, so it picked up e.g. '*_level' only by
# accident (via the 'l' of 'lh').
cols_with_rad_heroes_nums = re.compile(
    r'd[1-5]_(?:deaths|gold|hero|items|kills|level|lh|xp)$')

cols_to_sort_rad = [col for col in all_data.columns
                    if cols_with_rad_heroes_nums.match(col)]
df_data_to_sort_radiants = all_data[cols_to_sort_rad]

In [37]:
# Reorder the per-player column blocks of every row so that the players
# appear in ascending order of their '*_xp' value. At this point
# cols_to_sort_rad / df_data_to_sort_radiants hold whatever the preceding
# selection cell assigned (the d1_* ... d5_* columns), despite their names.
# The columns are expected to form equally sized, contiguous blocks (one per
# player) that all share the same internal column order.
match_to_sort = re.compile(r'.[1-5]_xp')
cols_sort_by = [c for c in cols_to_sort_rad if match_to_sort.match(c)]
print('sorting by {}'.format(cols_sort_by))

if not cols_sort_by or len(cols_to_sort_rad) % len(cols_sort_by):
    # An uneven split would silently shuffle values between unrelated columns.
    raise ValueError('player columns do not split evenly into per-player blocks')

n_rows = len(df_data_to_sort_radiants)
n_players = len(cols_sort_by)
propagate_num = len(cols_to_sort_rad) // n_players  # columns per player block

player_values = df_data_to_sort_radiants.values

# One sort key per player: the i-th key column ranks the i-th block.
key_positions = [cols_to_sort_rad.index(c) for c in cols_sort_by]
# Stable sort: players with equal xp keep their original relative order.
xp_order = np.argsort(player_values[:, key_positions], axis=1, kind='mergesort')

# View each row as (player, stat) and permute whole player blocks at once.
player_blocks = player_values.reshape(n_rows, n_players, propagate_num)
sorted_blocks = player_blocks[np.arange(n_rows)[:, np.newaxis], xp_order]
sorted_values = sorted_blocks.reshape(n_rows, -1)

# Positional write-back: row i of sorted_values belongs to row i of all_data.
for ind, col in enumerate(cols_to_sort_rad):
    all_data[col] = sorted_values[:, ind]

sorting by ['d1_xp', 'd2_xp', 'd3_xp', 'd4_xp', 'd5_xp']


In [38]:
# Preview the selected player columns after the xp-based reordering.
all_data[cols_to_sort_rad].head(5)

Unnamed: 0,d1_deaths,d1_gold,d1_hero,d1_items,d1_kills,d1_level,d1_lh,d1_xp,d2_deaths,d2_gold,d2_hero,d2_items,d2_kills,d2_level,d2_lh,d2_xp,d3_deaths,d3_gold,d3_hero,d3_items,d3_kills,d3_level,d3_lh,d3_xp,d4_deaths,d4_gold,d4_hero,d4_items,d4_kills,d4_level,d4_lh,d4_xp,d5_deaths,d5_gold,d5_hero,d5_items,d5_kills,d5_level,d5_lh,d5_xp
0,0,500,37,8,0,3,2,742,0,1003,84,9,1,3,3,958,0,996,4,6,0,3,12,1058,0,986,42,4,0,4,12,1085,0,1536,21,6,0,5,23,2052
1,0,583,7,7,0,2,0,440,1,566,88,5,0,3,1,640,0,1350,79,12,2,3,2,720,0,1622,12,9,0,4,24,1470,0,1384,39,8,0,5,16,1960
2,0,620,86,8,0,3,3,755,1,959,66,10,0,3,19,1024,0,667,29,7,0,4,4,1319,0,1512,80,7,0,3,25,1350,1,2028,22,10,1,5,19,2305
3,0,499,20,7,0,2,0,510,0,537,102,7,0,2,1,674,0,1468,48,10,0,3,22,732,0,1051,15,7,0,4,11,1681,0,1174,96,6,0,5,17,1878
4,2,586,26,9,0,3,1,704,0,638,22,9,0,3,1,1055,1,904,8,7,0,4,6,1119,0,1665,69,7,1,3,20,1169,0,1275,25,8,0,5,18,1815


In [39]:
# Write the reordered frame to disk without the row index.
sorted_csv_path = 'sorted_all_data.csv'
all_data.to_csv(sorted_csv_path, index=False)

In [40]:
# Preview the full frame after both reordering passes.
all_data.head(5)

Unnamed: 0,barracks_status_dire,barracks_status_radiant,d1_deaths,d1_gold,d1_hero,d1_items,d1_kills,d1_level,d1_lh,d1_xp,d2_deaths,d2_gold,d2_hero,d2_items,d2_kills,d2_level,d2_lh,d2_xp,d3_deaths,d3_gold,d3_hero,d3_items,d3_kills,d3_level,d3_lh,d3_xp,d4_deaths,d4_gold,d4_hero,d4_items,d4_kills,d4_level,d4_lh,d4_xp,d5_deaths,d5_gold,d5_hero,d5_items,d5_kills,d5_level,d5_lh,d5_xp,dire_boots_count,dire_bottle_time,dire_courier_time,dire_first_ward_time,dire_flying_courier_time,dire_tpscroll_count,dire_ward_observer_count,dire_ward_sentry_count,duration,first_blood_player1,first_blood_player2,first_blood_team,first_blood_time,lobby_type,match_id,r1_deaths,r1_gold,r1_hero,r1_items,r1_kills,r1_level,r1_lh,r1_xp,r2_deaths,r2_gold,r2_hero,r2_items,r2_kills,r2_level,r2_lh,r2_xp,r3_deaths,r3_gold,r3_hero,r3_items,r3_kills,r3_level,r3_lh,r3_xp,r4_deaths,r4_gold,r4_hero,r4_items,r4_kills,r4_level,r4_lh,r4_xp,r5_deaths,r5_gold,r5_hero,r5_items,r5_kills,r5_level,r5_lh,r5_xp,radiant_boots_count,radiant_bottle_time,radiant_courier_time,radiant_first_ward_time,radiant_flying_courier_time,radiant_tpscroll_count,radiant_ward_observer_count,radiant_ward_sentry_count,radiant_win,start_time,tower_status_dire,tower_status_radiant
0,0.0,51.0,0,500,37,8,0,3,2,742,0,1003,84,9,1,3,3,958,0,996,4,6,0,3,12,1058,0,986,42,4,0,4,12,1085,0,1536,21,6,0,5,23,2052,4,103.0,-84.0,-52.0,221.0,3,2,2,2874.0,9,0,1.0,7.0,7,0,1,658,105,11,0,3,4,732,0,741,20,7,0,3,6,757,0,991,67,4,0,3,10,842,0,1143,29,8,0,5,10,1909,0,1489,11,7,0,5,20,2098,2,134.0,-80.0,35.0,244.0,2,2,0,1.0,1430198770,0.0,1796.0
1,1.0,63.0,0,583,7,7,0,2,0,440,1,566,88,5,0,3,1,640,0,1350,79,12,2,3,2,720,0,1622,12,9,0,4,24,1470,0,1384,39,8,0,5,16,1960,4,149.0,-84.0,-5.0,195.0,5,3,1,2463.0,7,0,1.0,54.0,0,1,0,539,26,5,0,2,1,415,0,631,37,7,0,3,7,669,1,1033,42,12,0,4,9,1188,0,1502,67,7,1,4,18,1506,1,993,49,7,0,4,10,1596,0,173.0,-80.0,-20.0,,2,2,0,1.0,1430220345,0.0,1974.0
2,63.0,0.0,0,620,86,8,0,3,3,755,1,959,66,10,0,3,19,1024,0,667,29,7,0,4,4,1319,0,1512,80,7,0,3,25,1350,1,2028,22,10,1,5,19,2305,4,45.0,-77.0,13.0,221.0,3,3,1,2130.0,3,0,-1.0,224.0,7,2,0,909,20,6,1,3,0,1297,0,775,98,6,0,3,6,1314,0,1270,33,12,0,4,22,1319,0,1627,4,9,0,3,27,1395,1,2096,27,6,1,5,26,2360,5,63.0,-82.0,-39.0,,2,2,1,0.0,1430227081,1830.0,0.0
3,63.0,50.0,0,499,20,7,0,2,0,510,0,537,102,7,0,2,1,674,0,1468,48,10,0,3,22,732,0,1051,15,7,0,4,11,1681,0,1174,96,6,0,5,17,1878,4,124.0,-80.0,27.0,184.0,0,2,0,1459.0,0,0,0.0,,1,3,0,539,30,6,0,2,1,539,0,499,37,6,0,2,0,591,0,1075,41,6,0,3,12,712,0,1056,29,5,0,4,14,1779,0,1139,75,6,0,5,15,2037,3,208.0,-75.0,-30.0,,0,2,0,0.0,1430263531,2047.0,1920.0
4,63.0,3.0,2,586,26,9,0,3,1,704,0,638,22,9,0,3,1,1055,1,904,8,7,0,4,6,1119,0,1665,69,7,1,3,20,1169,0,1275,25,8,0,5,18,1815,3,182.0,-80.0,-16.0,225.0,6,3,0,2449.0,6,0,1.0,-21.0,7,4,1,552,27,7,0,2,0,629,0,927,30,8,1,3,0,884,0,1439,72,11,1,3,16,925,0,1090,13,8,1,4,8,1431,0,880,93,8,0,4,7,1482,4,166.0,-81.0,46.0,181.0,1,2,0,0.0,1430282290,1974.0,4.0


In [41]:
# Columns excluded when no explicit feature list is given: every column with
# one of these suffixes, followed by two fixed lists of column names.
excluded_suffixes = ('_time', '_levhero', '_hero')
dont_use_cols = [c for suffix in excluded_suffixes
                 for c in all_data.columns if c.endswith(suffix)]
dont_use_cols += ['radiant_win', 'match_id', 'duration']
dont_use_cols += ['start_time', 'tower_status_dire', 'tower_status_radiant',
                  'barracks_status_radiant', 'barracks_status_dire']

In [86]:
import re

# Keep per-player columns (r1_* ... r5_*, d1_* ... d5_*) and anything
# containing 'y_type' from the start-anchored match (e.g. lobby_type).
# '[rd]' replaces the earlier '[r|d]', a character class that also accepted a
# literal '|' as the first character.
use_regexp = re.compile(r'([rd][1-5].*|.*y_type)')

# Candidate features, grouped by stat suffix in this order.
feature_suffixes = ('_level', '_gold', '_xp', '_lh',
                    '_deaths', '_kills', '_items', '_hero')
cols_to_use = [c for suffix in feature_suffixes
               for c in all_data.columns if c.endswith(suffix)]
cols_to_use += [c for c in all_data.columns
                if c.endswith(('first_blood_team', 'lobby_type'))]

# NOTE(review): 'first_blood_team' matches neither branch of use_regexp, so
# this filter always removes it again - confirm that is intended.
cols_to_use = [c for c in cols_to_use if use_regexp.match(c)]

# Earlier experiments, kept for reference:
#   cols_to_use = []                      (fall back to dont_use_cols)
#   add the columns ending in '_levhero'

In [87]:
# Rows with a radiant_win value form the training set; rows where it is
# missing form the test set.
is_unlabelled = all_data['radiant_win'].isnull()
train = all_data[~is_unlabelled]
test = all_data[is_unlabelled]

In [88]:
Y_train = train.radiant_win

# An explicit feature list wins; an empty one falls back to "every column
# except those in dont_use_cols".
if cols_to_use:
    X_train, X_test = train[cols_to_use], test[cols_to_use]
else:
    X_train = train[train.columns.difference(dont_use_cols)]
    X_test = test[test.columns.difference(dont_use_cols)]

# Shuffle features and target together so their rows stay aligned.
X_train, Y_train = shuffle(X_train, Y_train)

In [89]:
# Fit a 90-tree forest on the raw arrays. `.values` replaces the deprecated
# (and later removed) DataFrame/Series.as_matrix(); it returns the same
# ndarray on every pandas version.
clf = RandomForestClassifier(n_estimators=90)
clf.fit(X_train.values, Y_train.values)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=90, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [90]:
# Pair each importance with its column name and print the pairs from least to
# most important.
zipped = list(zip(clf.feature_importances_, X_train.columns))
print(sorted(zipped, key=lambda pair: pair[0]))

[(0.0026597674990077669, 'r1_kills'), (0.0028516302505383695, 'd5_deaths'), (0.0028683927739286412, 'r5_deaths'), (0.0029717342160406235, 'd1_kills'), (0.0030597136594689538, 'r4_kills'), (0.0032287363010895954, 'd4_deaths'), (0.0032325693719790203, 'd4_kills'), (0.0032422634710251438, 'r2_kills'), (0.0032795104519049278, 'd2_deaths'), (0.0033427581950520808, 'r4_deaths'), (0.0033546683220703551, 'r3_kills'), (0.0033893444172124359, 'r2_level'), (0.0034539059459273388, 'd3_deaths'), (0.0034653701136624286, 'r5_level'), (0.0035244468097128216, 'd2_level'), (0.0035470708075677729, 'd5_level'), (0.003565119074838319, 'd1_deaths'), (0.0035734675110476151, 'd2_kills'), (0.0036474772420855609, 'd4_level'), (0.0036481286482400429, 'd5_kills'), (0.0036804564204836613, 'r3_deaths'), (0.0036915330518035245, 'r2_deaths'), (0.0037163135791975048, 'd1_level'), (0.003745646420484511, 'd3_kills'), (0.0037580682656912154, 'r5_kills'), (0.0037939733380956161, 'r1_level'), (0.003845657422041752, 'r3_lev

In [82]:
#cols_tree = [c[1] for c in sorted(zipped,key=lambda x:x[0])[-7:]]

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

# 8-fold ROC-AUC of a plain 90-tree forest. `.values` replaces the deprecated
# (and later removed) as_matrix(); it returns the same ndarray on every pandas
# version.
scores = cross_val_score(RandomForestClassifier(n_estimators=90),
                         X_train.values, Y_train.values,
                         scoring='roc_auc', cv=8, n_jobs=4)

In [97]:
# Mean score first, then the per-fold scores.
for summary in (np.mean(scores), scores):
    print(summary)

0.703882911115
[ 0.7066477   0.70222574  0.70374769  0.7104968   0.70702336  0.70458306
  0.69751008  0.69882885]


In [93]:
from my_calibration import CalibratedClassifierCV
from sklearn import linear_model, decomposition, datasets
from sklearn.cross_validation import train_test_split


class vw_calibre():
    """Random forest whose probabilities are recalibrated on a held-out slice.

    ``fit`` splits the data it receives: the forest is trained on a
    ``train_size`` share and a prefit ``CalibratedClassifierCV`` is trained on
    the remainder.

    Parameters
    ----------
    n_estimators : int, default 90
        Number of trees in the forest.
    train_size : float, default 0.96
        Share of the fit data used to train the forest; the rest calibrates.
    method : str, default 'sigmoid'
        Calibration method handed to ``CalibratedClassifierCV``.
    """

    def __init__(self, n_estimators=90, train_size=0.96, method='sigmoid'):
        # Constructor arguments are kept verbatim so get_params can report
        # them and a clone is rebuilt with the same settings.
        self.n_estimators = n_estimators
        self.train_size = train_size
        self.method = method
        self.vw = RandomForestClassifier(n_estimators=n_estimators)
        self.calibre = None

    def fit(self, X_train, Y_train):
        """Train the forest, then calibrate it on the held-out share.

        Returns ``self``.
        """
        X_tr, X_cal, Y_tr, Y_cal = train_test_split(
            X_train, Y_train, train_size=self.train_size, stratify=Y_train)
        self.vw.fit(X_tr, Y_tr)
        # cv='prefit': the calibrator reuses the already-fitted forest.
        self.calibre = CalibratedClassifierCV(self.vw, cv='prefit',
                                              method=self.method)
        self.calibre.fit(X_cal, Y_cal)
        print('calibrating on {}'.format(len(Y_cal)))
        return self

    def get_params(self, deep=False):
        """Return the constructor arguments of this instance.

        Reports the stored values rather than a hard-coded 90, so rebuilding
        the estimator from these params keeps a non-default configuration.
        """
        return {'n_estimators': self.n_estimators,
                'train_size': self.train_size,
                'method': self.method}

    def set_params(self, header_dict=None, learning_rate=None, passes=None,
                   log_stderr_to_file=None, deep=False, **params):
        """Update constructor parameters in place and return ``self``.

        ``header_dict``, ``learning_rate``, ``passes``, ``log_stderr_to_file``
        and ``deep`` are accepted only for backward compatibility and are
        ignored, as before.

        Raises
        ------
        ValueError
            If ``params`` contains a name that is not a constructor argument.
        """
        for name in params:
            if name not in ('n_estimators', 'train_size', 'method'):
                raise ValueError(
                    'Invalid parameter {} for estimator vw_calibre'.format(name))
        for name, value in params.items():
            setattr(self, name, value)
        if 'n_estimators' in params:
            # Keep the wrapped forest in sync with the reported parameter.
            self.vw.set_params(n_estimators=params['n_estimators'])
        return self

    def predict_proba(self, X):
        """Return calibrated class probabilities; requires a prior ``fit``."""
        return self.calibre.predict_proba(X)

In [94]:
# 8-fold negative log-loss of the calibrated forest. 'neg_log_loss' is the
# current name of the scorer formerly called 'log_loss' (the old alias only
# triggers a deprecation warning and was later removed). `.values` replaces
# the deprecated as_matrix().
scores = cross_val_score(vw_calibre(n_estimators=90),
                         X_train.values, Y_train.values,
                         scoring='neg_log_loss', cv=8, n_jobs=4)

calibrating on 3404


  sample_weight=sample_weight)


calibrating on 3404
calibrating on 3404


  sample_weight=sample_weight)
  sample_weight=sample_weight)


calibrating on 3404


  sample_weight=sample_weight)


calibrating on 3404


  sample_weight=sample_weight)


calibrating on 3404


  sample_weight=sample_weight)


calibrating on 3404


  sample_weight=sample_weight)


calibrating on 3404


  sample_weight=sample_weight)


In [95]:
# Mean score first, then the per-fold scores.
for summary in (np.mean(scores), scores):
    print(summary)

-0.625469644776
[-0.62326723 -0.62648903 -0.6223194  -0.62239373 -0.62319384 -0.62609369
 -0.63064573 -0.6293545 ]


In [52]:
from my_calibration import CalibratedClassifierCV
from sklearn import linear_model, decomposition, datasets
from sklearn.cross_validation import train_test_split


class vw_calibre():
    """Random forest whose probabilities are recalibrated on a held-out slice.

    ``fit`` splits the data it receives: the forest is trained on a
    ``train_size`` share and a prefit ``CalibratedClassifierCV`` is trained on
    the remainder.

    Parameters
    ----------
    n_estimators : int, default 90
        Number of trees in the forest.
    train_size : float, default 0.90
        Share of the fit data used to train the forest; the rest calibrates.
    method : str, default 'isotonic'
        Calibration method handed to ``CalibratedClassifierCV``.
    """

    def __init__(self, n_estimators=90, train_size=0.90, method='isotonic'):
        # Constructor arguments are kept verbatim so get_params can report
        # them and a clone is rebuilt with the same settings.
        self.n_estimators = n_estimators
        self.train_size = train_size
        self.method = method
        self.vw = RandomForestClassifier(n_estimators=n_estimators)
        self.calibre = None

    def fit(self, X_train, Y_train):
        """Train the forest, then calibrate it on the held-out share.

        Returns ``self``.
        """
        X_tr, X_cal, Y_tr, Y_cal = train_test_split(
            X_train, Y_train, train_size=self.train_size, stratify=Y_train)
        self.vw.fit(X_tr, Y_tr)
        # cv='prefit': the calibrator reuses the already-fitted forest.
        self.calibre = CalibratedClassifierCV(self.vw, cv='prefit',
                                              method=self.method)
        self.calibre.fit(X_cal, Y_cal)
        print('calibrating on {}'.format(len(Y_cal)))
        return self

    def get_params(self, deep=False):
        """Return the constructor arguments of this instance.

        Reports the stored values rather than a hard-coded 90, so rebuilding
        the estimator from these params keeps a non-default configuration.
        """
        return {'n_estimators': self.n_estimators,
                'train_size': self.train_size,
                'method': self.method}

    def set_params(self, header_dict=None, learning_rate=None, passes=None,
                   log_stderr_to_file=None, deep=False, **params):
        """Update constructor parameters in place and return ``self``.

        ``header_dict``, ``learning_rate``, ``passes``, ``log_stderr_to_file``
        and ``deep`` are accepted only for backward compatibility and are
        ignored, as before.

        Raises
        ------
        ValueError
            If ``params`` contains a name that is not a constructor argument.
        """
        for name in params:
            if name not in ('n_estimators', 'train_size', 'method'):
                raise ValueError(
                    'Invalid parameter {} for estimator vw_calibre'.format(name))
        for name, value in params.items():
            setattr(self, name, value)
        if 'n_estimators' in params:
            # Keep the wrapped forest in sync with the reported parameter.
            self.vw.set_params(n_estimators=params['n_estimators'])
        return self

    def predict_proba(self, X):
        """Return calibrated class probabilities; requires a prior ``fit``."""
        return self.calibre.predict_proba(X)

In [53]:
# 8-fold negative log-loss of the calibrated forest. 'neg_log_loss' is the
# current name of the scorer formerly called 'log_loss' (the old alias only
# triggers a deprecation warning and was later removed). `.values` replaces
# the deprecated as_matrix().
scores = cross_val_score(vw_calibre(n_estimators=90),
                         X_train.values, Y_train.values,
                         scoring='neg_log_loss', cv=8, n_jobs=4)

calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


calibrating on 8508


  sample_weight=sample_weight)


In [54]:
# Mean score first, then the per-fold scores.
for summary in (np.mean(scores), scores):
    print(summary)

-0.65350699595
[-0.65363331 -0.6485373  -0.64296607 -0.68003853 -0.65183194 -0.64973103
 -0.6504942  -0.6508236 ]
