In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold

from sklearn.cross_validation import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso, Ridge

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
def normalize_data(X):
    """Return a standardized copy of the DataFrame ``X``.

    A fresh ``StandardScaler`` is fitted on ``X`` and immediately applied to
    it. The scaled values are wrapped back into a DataFrame that reuses the
    row index and the column labels of ``X``; ``X`` itself is not modified.
    """
    scaled_values = StandardScaler().fit_transform(X)
    return pd.DataFrame(scaled_values, index=X.index, columns=X.columns)

In [4]:
# Load the train and test match tables from the data directory.
train_matches, test_matches = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
)

In [5]:
# Per-player gold table indexed by match id (`mid`). Only the rows with
# times == 600 are kept (NOTE(review): unit of `times` is not visible here --
# confirm), after which the `times` column itself is no longer needed.
# `axis` is passed by keyword and the drop is not done in place, so the cell
# works on pandas versions that reject a positional axis and does not mutate
# a filtered slice.
gold = pd.read_csv('data/gold.csv', index_col='mid')
gold = gold[gold.times == 600].drop('times', axis=1)

# Team totals: players 0-4 are summed as "radiant", players 5-9 as "dire".
radiant_gold = gold[['player_0', 'player_1', 'player_2', 'player_3', 'player_4']].sum(axis=1)
dire_gold = gold[['player_5', 'player_6', 'player_7', 'player_8', 'player_9']].sum(axis=1)

# Rename every remaining column player_N -> gold_player_N (order unchanged).
gold = gold.add_prefix('gold_')
gold.head()

Unnamed: 0_level_0,gold_player_0,gold_player_1,gold_player_2,gold_player_3,gold_player_4,gold_player_5,gold_player_6,gold_player_7,gold_player_8,gold_player_9
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,3454,5206,2613,4426,5755,4072,3997,5917,1725,6384
1,2477,5760,3816,4353,5759,7659,5066,2748,4440,4623
2,3604,1948,8581,4390,2869,3096,2301,5130,2530,2491
3,3457,5464,4432,2961,4314,3345,4791,1906,5328,2247
4,3675,4103,5154,3030,2076,3920,3494,3392,4458,2220


In [6]:
# Rank-wise gold difference between the two teams: each team's five values
# (columns 0-4 vs. columns 5-9 of `gold`) are sorted ascending within the
# row, then subtracted position by position.
radiant_sorted = np.sort(gold.values[:, 0:5])
dire_sorted = np.sort(gold.values[:, 5:10])
gold_rank_dif = pd.DataFrame(
    radiant_sorted - dire_sorted,
    index=gold.index,
    columns=['gold_rank_{}'.format(rank) for rank in range(5)],
)
# Left unscaled here; the merged feature table is normalized later on.
gold_rank_dif.head()

Unnamed: 0_level_0,gold_rank_0,gold_rank_1,gold_rank_2,gold_rank_3,gold_rank_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,888,-543,354,-711,-629
1,-271,-624,-270,693,-1899
2,-353,378,1074,1294,3451
3,1055,1210,969,-359,136
4,-144,-362,181,183,696


In [7]:
# Same rank-wise comparison as for the raw gold values, but on squared gold:
# square each team's five values, sort them ascending within the row, and
# subtract position by position (columns 0-4 minus columns 5-9).
radiant_squares = np.sort(np.square(gold.values[:, 0:5]))
dire_squares = np.sort(np.square(gold.values[:, 5:10]))
gold_square_rank_dif = pd.DataFrame(
    radiant_squares - dire_squares,
    index=gold.index,
    columns=['gold_square_rank_{}'.format(rank) for rank in range(5)],
)
# Left unscaled here; the merged feature table is normalized later on.
gold_square_rank_dif.head()

Unnamed: 0_level_0,gold_square_rank_0,gold_square_rank_1,gold_square_rank_2,gold_square_rank_3,gold_square_rank_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3852144,-4045893,3008292,-7908453,-7635431
1,-1415975,-5151744,-2423520,7501725,-25482681
2,-1499897,2026080,6587916,9686884,47316661
3,5134685,6901840,7421571,-3311057,1467712
4,-618624,-2324764,1297589,1468209,6689952


In [30]:
# NOTE(review): this cell re-reads gold.csv and rebinds `gold` to the table
# with the raw player_N column names; its execution count (30) is out of
# sequence with the surrounding cells. Later cells in view only use
# `gold.index`, which is the same either way.
# `axis` is passed by keyword and the drop is not done in place, so the cell
# works on pandas versions that reject a positional axis and does not mutate
# a filtered slice.
gold = pd.read_csv('data/gold.csv', index_col='mid')
gold = gold[gold.times == 600].drop('times', axis=1)

# Rank-wise difference of log-gold: take the natural log of each team's five
# values (columns 0-4 vs. columns 5-9), sort ascending within the row, and
# subtract position by position.
# NOTE(review): np.log gives -inf for a zero value -- confirm gold is
# strictly positive in this snapshot.
gold_log_rank_dif = pd.DataFrame(
    data=np.sort(np.log(gold.values[:, 0:5])) - np.sort(np.log(gold.values[:, 5:10])),
    index=gold.index,
    columns=['gold_log_rank_{}'.format(i) for i in range(5)],
)
# Left unscaled here; the merged feature table is normalized later on.
gold_log_rank_dif.head()

Unnamed: 0_level_0,gold_log_rank_0,gold_log_rank_1,gold_log_rank_2,gold_log_rank_3,gold_log_rank_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.415272,-0.146011,0.083362,-0.128018,-0.103726
1,-0.103825,-0.151452,-0.060179,0.128212,-0.284944
2,-0.166541,0.141279,0.353825,0.349218,0.514445
3,0.44052,0.430805,0.254399,-0.077888,0.025205
4,-0.067064,-0.112857,0.050506,0.045627,0.145073


In [9]:
# Per-hero mean gold at the times == 600 snapshot.
#
# `new_gold` holds the gold of player_0..player_9 per match (`mid`), and
# `heroes` holds the hero id picked in each of the same ten player columns.
new_gold = pd.read_csv('data/gold.csv', index_col='mid')
new_gold = new_gold[new_gold.times == 600].drop('times', axis=1)
heroes = pd.read_csv('data/heroes.csv', index_col='mid')

# mean_gold[h] = truncated mean of the gold earned by hero id h, pooled over
# all ten player slots. Hero ids 0..110 are covered.
mean_gold = np.zeros(111)
for hero_num in range(111):
    hero_values = []
    for player_num in range(10):
        colname = 'player_{}'.format(player_num)
        hero_index = heroes.index[heroes[colname] == hero_num]
        hero_values.extend(new_gold.loc[hero_index, colname].values)
    # A hero id that never occurs has no samples; int(np.mean([])) would
    # raise. Leave its entry at 0 so the threshold step below assigns it the
    # fallback value.
    if hero_values:
        mean_gold[hero_num] = int(np.mean(hero_values))

# Hero id 0 is overridden with a very large mean.
# NOTE(review): presumably id 0 is a placeholder and this pushes its score
# towards zero -- confirm.
mean_gold[0] = 100000

# Single-argument print(...) behaves the same as a statement (Python 2) and
# as a function (Python 3).
print('Source:\n{}'.format(mean_gold))
print('Sorted:\n{}'.format(np.sort(mean_gold)))

# Means that are not above 4400 are replaced by 10000. np.where is used
# instead of np.array(map(...)), which on Python 3 wraps the lazy map object
# rather than building a numeric array.
new_mean_gold = np.where(mean_gold > 4400, mean_gold, 10000)
mean_gold = new_mean_gold
print('After threshold:\n{}'.format(mean_gold))

Source:
[ 100000.    4793.    4149.    3651.    3124.    3789.    3245.    3224.
    4364.    4589.    3101.    4808.    3217.    4635.    4535.    5610.
    3314.    4150.    3062.    2610.    3785.    3000.    4603.    3873.
    4939.    4383.    3343.    4516.    4352.    4605.    4443.    4447.
    3635.    3305.    5088.    4666.    4527.    4603.    4285.    2838.
    2793.    3345.    3678.    3807.    4264.    4556.    4426.    4225.
    4485.    4493.    2736.    4022.    4648.    4187.    4117.    4461.
    4843.    3359.    4398.    4403.    2970.    2825.    3845.    3786.
    4562.    4364.    2684.    4646.    4162.    2895.    4601.    3486.
    4865.    4075.    6446.    4589.    3293.    4323.    4425.    4224.
    2965.    4139.    4968.    4354.    4374.    4719.    2680.    2754.
    2921.    3712.    2940.    2838.    2795.    3290.    3938.    3838.
    3374.    3016.    2959.    3370.    3129.    4483.    4430.    2908.
    2873.    4453.    3710.    4110.    392

In [10]:
# Look up, for every player slot of every match, the thresholded mean gold of
# the hero picked in that slot.
mean_predict_gold = heroes.apply(lambda hero_ids: mean_gold[hero_ids])

# Score = actual gold divided by the looked-up mean for the hero.
temp_gold_score = new_gold / mean_predict_gold
threshold_gold_score = temp_gold_score.add_prefix('threshold_gold_score_')

# Sort each team's five scores ascending within the row (columns 0-4 for
# radiant, columns 5-9 for dire). Both frames use the same column labels so
# that they can be combined element-wise afterwards.
score_values = threshold_gold_score.values
rank_labels = ['gold_score_rank_{0}'.format(rank) for rank in range(5)]
radiant_rank_gold_score = pd.DataFrame(
    np.sort(score_values[:, 0:5]),
    index=threshold_gold_score.index,
    columns=list(rank_labels),
)
dire_rank_gold_score = pd.DataFrame(
    np.sort(score_values[:, 5:10]),
    index=threshold_gold_score.index,
    columns=list(rank_labels),
)
dire_rank_gold_score.head()

Unnamed: 0_level_0,gold_score_rank_0,gold_score_rank_1,gold_score_rank_2,gold_score_rank_3,gold_score_rank_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.1725,0.3997,0.846922,0.990382,1.162932
1,0.2748,0.444,0.5066,0.961522,1.188179
2,0.2301,0.2491,0.3096,0.564103,1.054471
3,0.1906,0.2247,0.3345,1.041295,1.078761
4,0.222,0.3494,0.392,0.760368,0.977203


In [11]:
# Display the first rows of the radiant team's row-wise sorted gold scores.
radiant_rank_gold_score.head()

Unnamed: 0_level_0,gold_score_rank_0,gold_score_rank_1,gold_score_rank_2,gold_score_rank_3,gold_score_rank_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.2613,0.3454,0.5206,0.5755,0.788948
1,0.2477,0.3816,0.881353,1.220598,1.262385
2,0.1948,0.2869,0.3604,0.944899,1.929615
3,0.2961,0.3457,0.4314,0.5464,0.988624
4,0.2076,0.3675,0.65798,0.731373,1.123121


In [12]:
# Rank-wise ratio of the sorted gold scores: radiant divided by dire,
# position by position.
gold_score_rel = radiant_rank_gold_score / dire_rank_gold_score

# Relabel the columns in place of the copy-then-drop loop: same order, new
# names gold_score_rank_rel_0 .. gold_score_rank_rel_{k-1}. This avoids
# mutating the frame while iterating over its columns and the positional
# `axis` argument of drop, which newer pandas versions reject.
gold_score_rel.columns = [
    'gold_score_rank_rel_{0}'.format(i) for i in range(len(gold_score_rel.columns))
]
gold_score_rel.head()

Unnamed: 0_level_0,gold_score_rank_rel_0,gold_score_rank_rel_1,gold_score_rank_rel_2,gold_score_rank_rel_3,gold_score_rank_rel_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.514783,0.864148,0.614697,0.581089,0.678413
1,0.901383,0.859459,1.73974,1.269443,1.062454
2,0.846588,1.151746,1.164083,1.675048,1.829937
3,1.553515,1.538496,1.289686,0.524731,0.916444
4,0.935135,1.051803,1.678522,0.961867,1.149322


In [13]:
# Rank-wise difference of the sorted gold scores: radiant minus dire,
# position by position.
gold_score_dif = radiant_rank_gold_score - dire_rank_gold_score

# Relabel the columns in place of the copy-then-drop loop: same order, new
# names gold_score_rank_dif_0 .. gold_score_rank_dif_{k-1}. This avoids
# mutating the frame while iterating over its columns and the positional
# `axis` argument of drop, which newer pandas versions reject.
gold_score_dif.columns = [
    'gold_score_rank_dif_{0}'.format(i) for i in range(len(gold_score_dif.columns))
]
gold_score_dif.head()

Unnamed: 0_level_0,gold_score_rank_dif_0,gold_score_rank_dif_1,gold_score_rank_dif_2,gold_score_rank_dif_3,gold_score_rank_dif_4
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0888,-0.0543,-0.326322,-0.414882,-0.373984
1,-0.0271,-0.0624,0.374753,0.259075,0.074206
2,-0.0353,0.0378,0.0508,0.380796,0.875145
3,0.1055,0.121,0.0969,-0.494895,-0.090137
4,-0.0144,0.0181,0.26598,-0.028995,0.145918


In [15]:
# Move the `mid` index of every feature table into a regular column so the
# tables can be merged on 'mid'.
# The guard makes the cell safe to re-run: a second unconditional
# reset_index would add a stray 'index' column to each table.
for feature_table in (
    gold_rank_dif,
    gold_square_rank_dif,
    gold_log_rank_dif,
    gold_score_rel,
    gold_score_dif,
):
    if 'mid' not in feature_table.columns:
        feature_table.reset_index(inplace=True)

In [28]:
# Assemble one table with every gold feature, keyed by match id (`mid`).
#
# The team gold difference is a Series indexed by `mid`. Its own index is
# used for the 'mid' column so that each difference stays on the row of its
# match; assigning the Series to a frame with a 0..n-1 row index would align
# on row position and only be correct when mids happen to be 0..n-1.
gold_dif = radiant_gold - dire_gold
all_stats_gold = pd.DataFrame(
    {'mid': gold_dif.index.values, 'gold_dif': gold_dif.values},
    columns=['mid', 'gold_dif'],
)

# Left-join each feature table on 'mid'. The tables must already expose
# `mid` as a column (see the reset_index cell that precedes this one).
for feature_table in (
    gold_rank_dif,
    gold_square_rank_dif,
    gold_log_rank_dif,
    gold_score_rel,
    gold_score_dif,
):
    all_stats_gold = pd.merge(all_stats_gold, feature_table, on='mid', how='left')

# Standardize all feature columns at once; `mid` is moved to the index first
# so it is not scaled, then restored as a column.
all_stats_gold = normalize_data(all_stats_gold.set_index('mid'))
all_stats_gold.reset_index(inplace=True)
all_stats_gold.head()

Unnamed: 0,mid,gold_dif,gold_rank_0,gold_rank_1,gold_rank_2,gold_rank_3,gold_rank_4,gold_square_rank_0,gold_square_rank_1,gold_square_rank_2,...,gold_score_rank_rel_0,gold_score_rank_rel_1,gold_score_rank_rel_2,gold_score_rank_rel_3,gold_score_rank_rel_4,gold_score_rank_dif_0,gold_score_rank_dif_1,gold_score_rank_dif_2,gold_score_rank_dif_3,gold_score_rank_dif_4
0,0,-0.191463,1.2354,-0.636307,0.361294,-0.665978,-0.463171,0.969054,-0.701083,0.373339,...,1.128138,-0.495837,-0.869275,-0.969156,-1.020963,1.113889,-0.382653,-1.121482,-1.157989,-1.140037
1,1,-0.657847,-0.364904,-0.732195,-0.275577,0.611509,-1.316314,-0.343611,-0.894262,-0.303178,...,-0.34471,-0.507585,1.027717,0.256079,-0.007774,-0.326219,-0.440294,1.299354,0.712788,0.197841
2,2,1.556804,-0.478126,0.453976,1.096144,1.158354,2.277633,-0.364522,0.359618,0.819171,...,-0.476278,0.224744,0.057072,0.978037,2.017024,-0.428108,0.272745,0.180734,1.050663,2.588696
3,3,0.793066,1.465987,1.4389,0.988979,-0.345696,0.05073,1.288627,1.211356,0.923,...,1.22114,1.19375,0.268858,-1.069471,-0.392982,1.321393,0.864809,0.339919,-1.380092,-0.292735
4,4,0.130693,-0.189547,-0.422038,0.184725,0.147465,0.426918,-0.144934,-0.400422,0.160275,...,-0.263666,-0.025665,0.924493,-0.29139,0.221403,-0.168416,0.132557,0.92376,-0.086841,0.411904


In [29]:
# Persist the normalized gold feature table; `index=None` is passed so the
# row index is not meant to be written (the `mid` key is a regular column).
all_stats_gold.to_csv('processing_tables/all_stats_gold.csv', index=None)

In [None]:
# Compact gold feature table keyed by match id (`mid`): total gold
# difference, the three highest rank-wise gold differences, and the same
# quantities for the hero-relative gold scores.
#
# Column assignment aligns on the index, so every source must be indexed by
# `mid`. `gold_rank_dif` has its `mid` index moved into a column by the
# reset_index cell; restore it here so the rows are matched by match id
# regardless of whether that cell has been run.
if 'mid' in gold_rank_dif.columns:
    rank_dif_by_mid = gold_rank_dif.set_index('mid')
else:
    rank_dif_by_mid = gold_rank_dif

# Rank-wise score difference, radiant minus dire (ranks sorted ascending, so
# rank 4 is the highest score of each team).
score_rank_dif = radiant_rank_gold_score - dire_rank_gold_score

gold_stats_threshold = pd.DataFrame(index=new_gold.index)
gold_stats_threshold['gold_dif'] = radiant_gold - dire_gold
gold_stats_threshold['top_gold_dif'] = rank_dif_by_mid.gold_rank_4
gold_stats_threshold['second_gold_dif'] = rank_dif_by_mid.gold_rank_3
gold_stats_threshold['third_gold_dif'] = rank_dif_by_mid.gold_rank_2
gold_stats_threshold['threshold_score_sum_dif'] = (
    radiant_rank_gold_score.sum(axis=1) - dire_rank_gold_score.sum(axis=1)
)
gold_stats_threshold['top_threshold_gold_score_dif'] = score_rank_dif.gold_score_rank_4
gold_stats_threshold['second_threshold_gold_score_dif'] = score_rank_dif.gold_score_rank_3
gold_stats_threshold['third_threshold_gold_score_dif'] = score_rank_dif.gold_score_rank_2

# Standardize the features, expose `mid` as a column again, and persist.
gold_stats_threshold = normalize_data(gold_stats_threshold)
gold_stats_threshold.reset_index(inplace=True)
gold_stats_threshold.to_csv('processing_tables/gold_stats_threshold_dif.csv', index=None)
gold_stats_threshold.head()