### Parse the Heroes' Attribute from the website (http://dota2.gamepedia.com/Table_of_hero_attributes)

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import json
from sklearn import cross_validation, linear_model
import dota2api as d2
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


%matplotlib inline

### Get Heroes Features table

In [2]:
url = 'http://dota2.gamepedia.com/Table_of_hero_attributes'
d2 = requests.get(url)

In [3]:
soup = BeautifulSoup(d2.content)

In [4]:
table = soup.find('table', attrs={'class':'wikitable'})

In [98]:
headings = [th.get_text().strip() for th in table.find("tr").find_all("th")]

In [126]:
datasets=[]
att =[]

for row in table.find_all("tr")[1:]:
    datasets.append([td.get_text().strip() for td in row.find_all("td")])

    for td in row.find_all('td')[1:2]:
        for a in td.find_all('a'):
            att.append(a['title'])

In [128]:
datasets_np = np.array(datasets)

In [129]:
datasets_np[:,1] = att

In [130]:
df = pd.DataFrame(datasets_np, columns=headings)

In [131]:
df['HERO'][4]

u'Arc Warden'

In [146]:
df = df[df['HERO'] != 'Arc Warden']

In [151]:
df.to_csv('file/hero_attributes', sep='t')    

### EDA
1. Missing Data:
- Only consider 10 human player match. The other type might have missing data
- There were extra features in captain mode, it won't be consider here.


In [2]:
pwd

u'/home/ubuntu/repo/dsi/DOTA2_Recommendation-System/notebook'

In [3]:
cd ..

/home/ubuntu/repo/dsi/DOTA2_Recommendation-System


In [4]:
import data_pipeline.data_clean as d_clean

In [5]:
df = d_clean.json_to_df(sample=100)

In [6]:
df = d_clean.qualify_matches(df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9999
Data columns (total 36 columns):
barracks_status_dire       9715 non-null int64
barracks_status_radiant    9715 non-null int64
cluster                    9715 non-null int64
dire_captain               100 non-null float64
dire_guild_id              1 non-null float64
dire_guild_logo            1 non-null float64
dire_guild_name            1 non-null object
dire_logo                  8 non-null float64
dire_name                  8 non-null object
dire_team_complete         8 non-null float64
dire_team_id               8 non-null float64
duration                   9715 non-null int64
engine                     9715 non-null int64
first_blood_time           9715 non-null int64
game_mode                  9715 non-null int64
human_players              9715 non-null int64
leagueid                   9715 non-null int64
lobby_type                 9715 non-null int64
match_id                   9715 non-null int64
match_se

In [8]:
X, y = d_clean.matches_features(df)

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9714
Columns: 224 entries, 0 to 223
dtypes: int64(224)
memory usage: 16.7 MB


In [10]:
X.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,214,215,216,217,218,219,220,221,222,223
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
X.columns =X.columns+1

In [12]:
X.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            215, 216, 217, 218, 219, 220, 221, 222, 223, 224],
           dtype='int64', length=224)

In [13]:
X_sum = X.apply(sum)

In [14]:
X_delete = X_sum[X_sum==0]

In [15]:
X_delete.index

Int64Index([24, 108, 136, 220], dtype='int64')

In [16]:
X_temp = X.drop(X.columns[[23, 107, 135, 219]], axis=1)

In [17]:
X_temp.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            214, 215, 216, 217, 218, 219, 221, 222, 223, 224],
           dtype='int64', length=220)

In [18]:
X_temp.apply(sum).min()

58

In [19]:
X = X_temp

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9714
Columns: 220 entries, 1 to 224
dtypes: int64(220)
memory usage: 16.4 MB


In [21]:
X_sum = X.apply(sum)

In [22]:
X_sum.sort(ascending=False)

  if __name__ == '__main__':


In [23]:
X_sum[:20]

133    1749
21     1659
11     1540
14     1409
123    1398
126    1360
8      1182
120    1168
119    1149
7      1146
44     1045
156    1039
74      939
104     916
186     909
185     909
32      891
216     880
144     867
73      867
dtype: int64

In [24]:
hero_pop_id = X_sum.index[:20]

In [36]:
hero_pop_name_rad = []
hero_pop_name_dir = []
for i in hero_pop_id:
    if i <= 112:
        hero_pop_name_rad.append(heroes_df[heroes_df['id']==i].localized_name)
    else:
        hero_pop_name_dir.append(heroes_df[heroes_df['id']==i-112].localized_name)

In [37]:
hero_pop_name_rad

[20    Windranger
 Name: localized_name, dtype: object, 9    Shadow Fiend
 Name: localized_name, dtype: object, 13    Pudge
 Name: localized_name, dtype: object, 7    Juggernaut
 Name: localized_name, dtype: object, 6    Earthshaker
 Name: localized_name, dtype: object, 42    Phantom Assassin
 Name: localized_name, dtype: object, 72    Invoker
 Name: localized_name, dtype: object, 102    Legion Commander
 Name: localized_name, dtype: object, 30    Riki
 Name: localized_name, dtype: object, 71    Alchemist
 Name: localized_name, dtype: object]

In [38]:
hero_pop_name_dir

[20    Windranger
 Name: localized_name, dtype: object, 9    Shadow Fiend
 Name: localized_name, dtype: object, 13    Pudge
 Name: localized_name, dtype: object, 7    Juggernaut
 Name: localized_name, dtype: object, 6    Earthshaker
 Name: localized_name, dtype: object, 42    Phantom Assassin
 Name: localized_name, dtype: object, 72    Invoker
 Name: localized_name, dtype: object, 71    Alchemist
 Name: localized_name, dtype: object, 102    Legion Commander
 Name: localized_name, dtype: object, 30    Riki
 Name: localized_name, dtype: object]

#### Summary:
 - delete two heroes id doesn't exist, it could be done in data preprocessing
 - Use updated matrix in the prediction

In [39]:
X.shape

(9715, 220)

In [40]:
y.shape

(9715,)

In [41]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.9)

In [42]:
clf_lgit = linear_model.LogisticRegression()

In [43]:
cross_validation.cross_val_score(clf_lgit,X_train, y_train, cv=5)

array([ 0.58461538,  0.58762887,  0.54639175,  0.56185567,  0.56185567])

In [44]:
clf_lgit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
lgit_coef = clf_lgit.coef_

In [46]:
lgit_coef.shape

(1, 220)

In [47]:
np.argmax(lgit_coef[0])

101

In [48]:
lgit_coef[0][108]

0.81071508494210842

In [49]:
lgit_coef[0][:110]

array([-0.3055207 , -0.07413302, -0.22183268, -0.3114596 ,  0.4153105 ,
        0.15685986,  0.76005157,  0.64573422,  0.42250949, -0.06218987,
        0.16743837, -0.27491298, -1.21338573,  0.14994133,  0.04057047,
        0.5972274 , -0.4696202 , -0.04383802, -0.51516781, -0.13683052,
       -0.04833735,  0.13727569,  0.44298107, -0.78930575,  0.25710296,
        0.80692671,  0.11558378, -0.37672863,  0.0673019 ,  0.67310991,
       -0.0563991 ,  0.09859625, -0.4759347 , -0.09412004,  0.54745734,
        0.59099003, -0.06869453, -0.51918745, -0.0673933 ,  0.03206799,
        0.19794379, -0.11782566, -0.03138579, -0.43624422, -0.3969547 ,
        0.78826319,  0.01380005,  0.03009636,  0.4801992 ,  0.49234275,
        0.62393978, -0.17405836, -0.75280756,  0.32406983, -0.04808448,
        0.97901511, -0.36521335, -0.30549392,  0.12065284, -0.86854949,
       -0.21649113, -0.9278149 ,  1.11536567, -0.33702598,  0.61344058,
        0.21350144,  0.02882311,  0.06376328, -0.32560823,  0.03

In [50]:
lgit_coef[0][110:]

array([ 0.25152484, -0.06963197,  0.04784691, -0.19859189, -0.31957486,
        0.24691199, -0.65178947,  0.01649309,  0.65596929,  0.10530963,
       -0.02701696, -0.43916126, -0.01770561,  0.25284972,  0.73731389,
        0.05194307,  0.63027381, -0.26077115, -0.52036052, -0.10154376,
       -0.09998432, -0.12261149,  0.32220991,  0.15838003,  0.29119924,
        0.30656339,  0.23303196,  0.29143959, -0.35030561,  0.11804591,
       -0.53630498,  0.2469048 , -0.2488462 ,  0.39397321, -0.30109932,
       -0.48425004, -0.83410132,  0.73715197,  0.17374745, -0.22226101,
       -0.61668823,  0.61316301,  0.64945211, -0.05278453, -0.15026281,
        0.47165355, -0.1430178 , -0.00849185, -0.00673319, -0.27946699,
        0.59046083,  0.40342911, -0.00705721,  0.83405962,  0.35389135,
       -0.02253736,  0.17574322,  0.02376565,  0.08990487, -0.55244814,
        0.21894326,  0.02686887, -0.17054859,  0.33405017,  0.38079275,
       -0.78272564, -0.67737293, -0.7401226 , -0.78242745, -0.08

In [51]:
# Check random_forest
clf_rf = RandomForestClassifier(n_estimators=5000, min_samples_leaf=50, oob_score=True, n_jobs=-1, random_state=50)
cross_validation.cross_val_score(clf_rf, X_train, y_train, cv=5)

array([ 0.53333333,  0.53608247,  0.53608247,  0.53608247,  0.53608247])

### Hero Recommendation based on the winning probablity

In [52]:
X_test.shape

(8744, 220)

In [71]:
x_input = X_test.iloc[0,:]

In [72]:
clf_lgit.predict_proba(x_input)



array([[ 0.88170526,  0.11829474]])

In [73]:
clf_lgit.predict_log_proba(x_input)



array([[-0.12589745, -2.13457598]])

In [75]:
# Providing a bunch of heroes, find the largest winning probability of Hero combination

## Input
chosen_heroes_rad = [1, 2]
chosen_heroes_dir = [28, 99]

## Output
suggest_heroes_rad =[]
suggest_heroes_dir=[]

In [83]:
# Construct feature matrix according to input

## construct heroes pool with all the heroes in this patch
heroes_pool = heroes_df['id'][:-1].values

In [104]:
## feature matrix with all given heroes

x_temp = np.zeros(224)

for i in chosen_heroes_rad:
    x_temp[i-1] =1
for j in chosen_heroes_dir:
    x_temp[j+110-1] =1

In [105]:
x_temp

array([ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [110]:
def fit_transform(np_arr):
    return np.delete(np_arr,[24, 108, 24+112, 108+112])

In [111]:
fit_transform(x_temp)

array([ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [114]:
prob_t = clf_lgit.predict_proba(fit_transform(x_temp))



In [115]:
type(prob_t), prob_t.shape

(numpy.ndarray, (1, 2))

In [119]:
prob_t[0][1]

0.58806490941936651

In [102]:
## Find the highest probability for radiant win

### update heroes pool

heroes_pool_updated = np.copy(heroes_pool)

for i in chosen_heroes_rad:
    heroes_pool_updated = heroes_pool_updated[heroes_pool_updated!=i]
for j in chosen_heroes_dir:
    heroes_pool_updated = heroes_pool_updated[heroes_pool_updated!=j]

In [103]:
heroes_pool_updated

array([  3,   4,   5,   6,   7,   8,   9,  11,  10,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  25,  31,  26,  27,  29,
        30,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
        44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
        57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  69,  68,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,
        83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
        96,  97,  98, 100, 101, 102, 103, 104, 106, 107, 109, 110, 111,
       105, 112])

In [124]:
### make possible heroes feature matrix

candidate_prob = []

for hero_candidate in heroes_pool_updated:
    x_temp_candidate = np.copy(x_temp)
    x_temp_candidate[hero_candidate-1] = 1
    prob = clf_lgit.predict_proba(fit_transform(x_temp_candidate))
    candidate_prob.append((prob[0][1],hero_candidate))




In [125]:
sorted(candidate_prob, reverse=True)[0:5]

[(0.82472114219661041, 103),
 (0.81326055029583633, 64),
 (0.79166428911884379, 57),
 (0.7752354747754, 96),
 (0.7625455859095307, 111)]

In [26]:
api = d2.Initialise()

In [27]:
heroes = api.get_heroes()

In [28]:
type(heroes)

dota2api.src.response.Dota2Dict

In [29]:
heroes.keys()

[u'status', u'count', u'heroes']

In [30]:
heroes_df = pd.DataFrame(heroes['heroes'])

In [31]:
heroes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 110
Data columns (total 3 columns):
id                111 non-null int64
localized_name    111 non-null object
name              111 non-null object
dtypes: int64(1), object(2)
memory usage: 3.5+ KB


In [32]:
heroes_df.head(1)

Unnamed: 0,id,localized_name,name
0,1,Anti-Mage,npc_dota_hero_antimage


In [33]:
for i in range(1,114):
    if (heroes_df['id']==i).any():
        pass
    else:
        print i

24
108


In [34]:
heroes_df[heroes_df['id']==101+1]

Unnamed: 0,id,localized_name,name
100,102,Abaddon,npc_dota_hero_abaddon


In [35]:
heroes_df[heroes_df['id']==21]

Unnamed: 0,id,localized_name,name
20,21,Windranger,npc_dota_hero_windrunner


In [205]:
heroes_df.to_csv('file/hero_id_list.csv',sep=',')