### Parse the Heroes' Attribute from the website (http://dota2.gamepedia.com/Table_of_hero_attributes)

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import json
from sklearn import cross_validation, linear_model
import dota2api as d2
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


%matplotlib inline

### Get Heroes Features table

In [2]:
url = 'http://dota2.gamepedia.com/Table_of_hero_attributes'
d2 = requests.get(url)

In [3]:
soup = BeautifulSoup(d2.content)

In [4]:
table = soup.find('table', attrs={'class':'wikitable'})

In [98]:
headings = [th.get_text().strip() for th in table.find("tr").find_all("th")]

In [126]:
datasets=[]
att =[]

for row in table.find_all("tr")[1:]:
    datasets.append([td.get_text().strip() for td in row.find_all("td")])

    for td in row.find_all('td')[1:2]:
        for a in td.find_all('a'):
            att.append(a['title'])

In [128]:
datasets_np = np.array(datasets)

In [129]:
datasets_np[:,1] = att

In [130]:
df = pd.DataFrame(datasets_np, columns=headings)

In [131]:
df['HERO'][4]

u'Arc Warden'

In [146]:
df = df[df['HERO'] != 'Arc Warden']

In [151]:
df.to_csv('file/hero_attributes', sep='t')    

### EDA
1. Missing Data:
- Only consider 10 human player match. The other type might have missing data
- There were extra features in captain mode, it won't be consider here.


In [2]:
pwd

u'/home/ubuntu/repo/dsi/DOTA2_Recommendation-System/notebook'

In [3]:
cd ..

/home/ubuntu/repo/dsi/DOTA2_Recommendation-System


In [4]:
import data_pipeline.data_clean as d_clean

In [5]:
df = d_clean.json_to_df(sample=100)

In [6]:
df = d_clean.qualify_matches(df)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9999
Data columns (total 36 columns):
barracks_status_dire       9715 non-null int64
barracks_status_radiant    9715 non-null int64
cluster                    9715 non-null int64
dire_captain               100 non-null float64
dire_guild_id              1 non-null float64
dire_guild_logo            1 non-null float64
dire_guild_name            1 non-null object
dire_logo                  8 non-null float64
dire_name                  8 non-null object
dire_team_complete         8 non-null float64
dire_team_id               8 non-null float64
duration                   9715 non-null int64
engine                     9715 non-null int64
first_blood_time           9715 non-null int64
game_mode                  9715 non-null int64
human_players              9715 non-null int64
leagueid                   9715 non-null int64
lobby_type                 9715 non-null int64
match_id                   9715 non-null int64
match_se

In [8]:
X, y = d_clean.matches_features(df)

In [9]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9714
Columns: 224 entries, 0 to 223
dtypes: int64(224)
memory usage: 16.7 MB


In [10]:
X.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,214,215,216,217,218,219,220,221,222,223
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
X.columns =X.columns+1

In [12]:
X.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            215, 216, 217, 218, 219, 220, 221, 222, 223, 224],
           dtype='int64', length=224)

In [13]:
X_sum = X.apply(sum)

In [14]:
X_delete = X_sum[X_sum==0]

In [15]:
X_delete.index

Int64Index([24, 108, 136, 220], dtype='int64')

In [16]:
X_temp = X.drop(X.columns[[23, 107, 135, 219]], axis=1)

In [17]:
X_temp.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            214, 215, 216, 217, 218, 219, 221, 222, 223, 224],
           dtype='int64', length=220)

In [18]:
X_temp.apply(sum).min()

58

In [19]:
X = X_temp

In [20]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9714
Columns: 220 entries, 1 to 224
dtypes: int64(220)
memory usage: 16.4 MB


In [21]:
X_sum = X.apply(sum)

In [22]:
X_sum.sort(ascending=False)

  if __name__ == '__main__':


In [23]:
X_sum[:20]

133    1749
21     1659
11     1540
14     1409
123    1398
126    1360
8      1182
120    1168
119    1149
7      1146
44     1045
156    1039
74      939
104     916
186     909
185     909
32      891
216     880
144     867
73      867
dtype: int64

In [24]:
hero_pop_id = X_sum.index[:20]

In [35]:
hero_pop_name_rad = []
hero_pop_name_dia = []
for i in hero_pop_id:
    if i <= 112:
        hero_pop_name_rad.append(heroes_df[heroes_df['id']==i].localized_name)
    else:
        hero_pop_name_dia.append(heroes_df[heroes_df['id']==i-112].localized_name)

In [36]:
hero_pop_name_rad

[20    Windranger
 Name: localized_name, dtype: object, 9    Shadow Fiend
 Name: localized_name, dtype: object, 13    Pudge
 Name: localized_name, dtype: object, 7    Juggernaut
 Name: localized_name, dtype: object, 6    Earthshaker
 Name: localized_name, dtype: object, 42    Phantom Assassin
 Name: localized_name, dtype: object, 72    Invoker
 Name: localized_name, dtype: object, 102    Legion Commander
 Name: localized_name, dtype: object, 30    Riki
 Name: localized_name, dtype: object, 71    Alchemist
 Name: localized_name, dtype: object]

In [37]:
hero_pop_name_dia

[20    Windranger
 Name: localized_name, dtype: object, 9    Shadow Fiend
 Name: localized_name, dtype: object, 13    Pudge
 Name: localized_name, dtype: object, 7    Juggernaut
 Name: localized_name, dtype: object, 6    Earthshaker
 Name: localized_name, dtype: object, 42    Phantom Assassin
 Name: localized_name, dtype: object, 72    Invoker
 Name: localized_name, dtype: object, 71    Alchemist
 Name: localized_name, dtype: object, 102    Legion Commander
 Name: localized_name, dtype: object, 30    Riki
 Name: localized_name, dtype: object]

#### Summary:
 - delete two heroes id doesn't exist, it could be done in data preprocessing
 - Use updated matrix in the prediction

In [38]:
X.shape

(9715, 220)

In [39]:
y.shape

(9715,)

In [40]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.9)

In [41]:
clf_lgit = linear_model.LogisticRegression()

In [42]:
cross_validation.cross_val_score(clf_lgit,X_train, y_train, cv=5)

array([ 0.55384615,  0.56923077,  0.56701031,  0.50515464,  0.58549223])

In [43]:
clf_lgit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
lgit_coef = clf_lgit.coef_

In [45]:
lgit_coef.shape

(1, 220)

In [46]:
np.argmax(lgit_coef[0])

108

In [50]:
lgit_coef[0][108]

1.1282473751961479

In [52]:
lgit_coef[0][:110]

array([-0.52870201,  0.6163506 , -0.0587006 ,  0.2341668 ,  0.26000573,
       -0.18579704,  0.27434046, -0.16223665,  0.45425061, -1.20583622,
       -0.10166066, -0.2399234 ,  0.39816525,  0.07628064, -0.04610872,
        0.28080203, -0.57040252, -0.03140006, -0.00253736,  0.086976  ,
       -0.06894919,  0.34731554,  0.52515853, -0.19341738,  0.35382741,
        0.17710438, -0.01434613,  0.75774921,  0.18943232,  0.46705415,
        0.46870767, -0.43344516,  0.11842902, -0.30297552,  0.46276033,
        0.26876514, -0.49211659, -0.66160476,  0.11736965, -0.17125105,
        0.50090481,  0.24272329,  0.03041913, -0.53391669, -0.30204574,
        0.12694257,  0.04730296,  0.55606683, -0.3255795 ,  0.23700577,
       -0.18368891, -0.37411746, -0.27010607, -0.43598791, -0.16413498,
       -0.01482596, -0.28590108, -0.16920838,  0.38176099, -0.94693283,
        0.12276545,  0.11779374,  0.66879691, -0.44900306,  0.53244501,
        0.71479837,  0.16474162,  0.05286256,  0.04745959,  0.22

In [53]:
lgit_coef[0][110:]

array([-0.24987788, -0.05284231, -0.14664745, -0.31226795,  0.05337615,
        0.28046604,  0.10065281, -0.4125716 , -0.40013192,  0.03877261,
       -0.02166151,  0.29953087,  0.30471751,  0.09833867,  0.07843367,
        0.20040942,  0.34302121,  0.13782525,  0.10537193,  0.12823541,
       -0.14997235, -0.48721794,  0.46750065,  0.37254214, -0.38244776,
        0.02837457,  0.01895218,  0.03041116, -0.09578147, -0.54296718,
        0.05638939, -0.20078123,  0.50293612,  0.10009229, -0.47760618,
       -0.82010539, -0.6596078 ,  0.31725468, -0.01800856, -0.25343872,
       -0.44931573, -0.03222735,  0.64425736,  0.28647165, -0.19877898,
       -0.19232841, -0.13660101,  0.76389066,  0.00719104,  0.04434256,
        0.12550807,  0.56693264,  0.25995263,  0.31825716,  0.10710971,
       -0.90775254,  0.25509864,  0.5667826 ,  0.8897012 ,  0.52667424,
       -0.14306892,  0.02366361, -0.42121501,  0.03059159, -0.08786413,
       -0.47015632, -0.20516368, -0.46075762, -0.77957616, -0.57

In [47]:
# Check random_forest
clf_rf = RandomForestClassifier(n_estimators=5000, min_samples_leaf=50, oob_score=True, n_jobs=-1, random_state=50)
cross_validation.cross_val_score(clf_rf, X_train, y_train, cv=5)

array([ 0.5025641 ,  0.44615385,  0.46907216,  0.5       ,  0.50259067])

### Hero Recommendation based on the winning probablity

In [None]:
# Providing a bunch of heroes, find the largest winning probability of Hero combination

## Input
chosen_heroes_rad = [1, 2]
chosen_heroes_dia = [28, 99]

## Output
suggest_heroes_rad =[]
suggest_heroes_dia=[]

In [25]:
api = d2.Initialise()

In [26]:
heroes = api.get_heroes()

In [27]:
type(heroes)

dota2api.src.response.Dota2Dict

In [28]:
heroes.keys()

[u'status', u'count', u'heroes']

In [29]:
heroes_df = pd.DataFrame(heroes['heroes'])

In [30]:
heroes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 110
Data columns (total 3 columns):
id                111 non-null int64
localized_name    111 non-null object
name              111 non-null object
dtypes: int64(1), object(2)
memory usage: 3.5+ KB


In [31]:
heroes_df.head(1)

Unnamed: 0,id,localized_name,name
0,1,Anti-Mage,npc_dota_hero_antimage


In [32]:
for i in range(1,114):
    if (heroes_df['id']==i).any():
        pass
    else:
        print i

24
108


In [33]:
heroes_df[heroes_df['id']==101+1]

Unnamed: 0,id,localized_name,name
100,102,Abaddon,npc_dota_hero_abaddon


In [34]:
heroes_df[heroes_df['id']==21]

Unnamed: 0,id,localized_name,name
20,21,Windranger,npc_dota_hero_windrunner


In [205]:
heroes_df.to_csv('file/hero_id_list.csv',sep=',')