### Parse the Heroes' Attribute from the website (http://dota2.gamepedia.com/Table_of_hero_attributes)

In [202]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import json
from sklearn import cross_validation, linear_model
import dota2api as d2
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier


%matplotlib inline

### Get Heroes Features table

In [2]:
url = 'http://dota2.gamepedia.com/Table_of_hero_attributes'
d2 = requests.get(url)

In [3]:
soup = BeautifulSoup(d2.content)

In [4]:
table = soup.find('table', attrs={'class':'wikitable'})

In [98]:
headings = [th.get_text().strip() for th in table.find("tr").find_all("th")]

In [126]:
datasets=[]
att =[]

for row in table.find_all("tr")[1:]:
    datasets.append([td.get_text().strip() for td in row.find_all("td")])

    for td in row.find_all('td')[1:2]:
        for a in td.find_all('a'):
            att.append(a['title'])

In [128]:
datasets_np = np.array(datasets)

In [129]:
datasets_np[:,1] = att

In [130]:
df = pd.DataFrame(datasets_np, columns=headings)

In [131]:
df['HERO'][4]

u'Arc Warden'

In [146]:
df = df[df['HERO'] != 'Arc Warden']

In [151]:
df.to_csv('file/hero_attributes', sep='t')    

### EDA
1. Missing Data:
- Only consider 10 human player match. The other type might have missing data
- There were extra features in captain mode, it won't be consider here.


In [2]:
pwd

u'/home/hao/Repos/DSI/DOTA2_Recommendation-System/notebook'

In [38]:
import data_pipeline.data_clean as d_clean

In [39]:
df = d_clean.json_to_df(sample=100)

In [41]:
df = d_clean.qualify_matches(df)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9999
Data columns (total 36 columns):
barracks_status_dire       9715 non-null int64
barracks_status_radiant    9715 non-null int64
cluster                    9715 non-null int64
dire_captain               100 non-null float64
dire_guild_id              1 non-null float64
dire_guild_logo            1 non-null float64
dire_guild_name            1 non-null object
dire_logo                  8 non-null float64
dire_name                  8 non-null object
dire_team_complete         8 non-null float64
dire_team_id               8 non-null float64
duration                   9715 non-null int64
engine                     9715 non-null int64
first_blood_time           9715 non-null int64
game_mode                  9715 non-null int64
human_players              9715 non-null int64
leagueid                   9715 non-null int64
lobby_type                 9715 non-null int64
match_id                   9715 non-null int64
match_se

In [172]:
X, y = d_clean.matches_features(df)

In [173]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9714
Columns: 224 entries, 0 to 223
dtypes: int64(224)
memory usage: 16.7 MB


In [174]:
X.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,214,215,216,217,218,219,220,221,222,223
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [175]:
X.columns =X.columns+1

In [176]:
X.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10, 
            ...
            215, 216, 217, 218, 219, 220, 221, 222, 223, 224],
           dtype='int64', length=224)

In [177]:
X_sum = X.apply(sum)

In [178]:
X_delete = X_sum[X_sum==0]

In [179]:
X_delete.index

Int64Index([24, 108, 136, 220], dtype='int64')

In [180]:
X_temp = X.drop(X.columns[[23, 107, 135, 219]], axis=1)

In [181]:
X_temp.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10, 
            ...
            214, 215, 216, 217, 218, 219, 221, 222, 223, 224],
           dtype='int64', length=220)

In [182]:
X_temp.apply(sum).min()

58

In [183]:
X = X_temp

In [184]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9715 entries, 0 to 9714
Columns: 220 entries, 1 to 224
dtypes: int64(220)
memory usage: 16.4 MB


In [185]:
X_sum = X.apply(sum)

In [186]:
X_sum.sort(ascending=False)

In [201]:
X_sum[:20]

133    1749
21     1659
11     1540
14     1409
123    1398
126    1360
8      1182
120    1168
119    1149
7      1146
44     1045
156    1039
74      939
104     916
186     909
185     909
32      891
216     880
144     867
73      867
dtype: int64

In [187]:
hero_pop_id = X_sum.index[:20]

In [188]:
hero_pop_name_rad = []
hero_pop_name_dia = []
for i in hero_pop_id:
    if i <= 112:
        hero_pop_name_rad.append(heroes_df[heroes_df['id']==i].localized_name)
    else:
        hero_pop_name_dia.append(heroes_df[heroes_df['id']==i-112].localized_name)

In [189]:
hero_pop_name_rad

[20    Windranger
 Name: localized_name, dtype: object, 9    Shadow Fiend
 Name: localized_name, dtype: object, 13    Pudge
 Name: localized_name, dtype: object, 7    Juggernaut
 Name: localized_name, dtype: object, 6    Earthshaker
 Name: localized_name, dtype: object, 42    Phantom Assassin
 Name: localized_name, dtype: object, 72    Invoker
 Name: localized_name, dtype: object, 102    Legion Commander
 Name: localized_name, dtype: object, 30    Riki
 Name: localized_name, dtype: object, 71    Alchemist
 Name: localized_name, dtype: object]

In [190]:
hero_pop_name_dia

[20    Windranger
 Name: localized_name, dtype: object, 9    Shadow Fiend
 Name: localized_name, dtype: object, 13    Pudge
 Name: localized_name, dtype: object, 7    Juggernaut
 Name: localized_name, dtype: object, 6    Earthshaker
 Name: localized_name, dtype: object, 42    Phantom Assassin
 Name: localized_name, dtype: object, 72    Invoker
 Name: localized_name, dtype: object, 71    Alchemist
 Name: localized_name, dtype: object, 102    Legion Commander
 Name: localized_name, dtype: object, 30    Riki
 Name: localized_name, dtype: object]

#### Summary:
 - delete two heroes id doesn't exist, it could be done in data preprocessing
 - Use updated matrix in the prediction

In [191]:
X.shape

(9715, 220)

In [192]:
y.shape

(9715,)

In [193]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.9)

In [194]:
clf_lgit = linear_model.LogisticRegression()

In [195]:
cross_validation.cross_val_score(clf_lgit,X_train, y_train, cv=5)

array([ 0.48205128,  0.44615385,  0.51546392,  0.54123711,  0.49222798])

In [196]:
clf_lgit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [197]:
lgit_coef = clf_lgit.coef_

In [198]:
lgit_coef.shape

(1, 220)

In [199]:
np.argmax(lgit_coef[0])

101

In [204]:
# Check random_forest
clf_rf = RandomForestClassifier(n_estimators=5000, min_samples_leaf=50, oob_score=True, n_jobs=-1, random_state=50)
cross_validation.cross_val_score(clf_rf, X_train, y_train, cv=5)

array([ 0.45641026,  0.49230769,  0.53608247,  0.50515464,  0.49222798])

In [71]:
api = d2.Initialise()

In [73]:
heroes = api.get_heroes()

In [74]:
type(heroes)

dota2api.src.response.Dota2Dict

In [75]:
heroes.keys()

[u'status', u'count', u'heroes']

In [79]:
heroes_df = pd.DataFrame(heroes['heroes'])

In [164]:
heroes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 111 entries, 0 to 110
Data columns (total 3 columns):
id                111 non-null int64
localized_name    111 non-null object
name              111 non-null object
dtypes: int64(1), object(2)
memory usage: 3.5+ KB


In [165]:
heroes_df.head(1)

Unnamed: 0,id,localized_name,name
0,1,Anti-Mage,npc_dota_hero_antimage


In [89]:
for i in range(1,114):
    if (heroes_df['id']==i).any():
        pass
    else:
        print i

24
108


In [200]:
heroes_df[heroes_df['id']==101+1]

Unnamed: 0,id,localized_name,name
100,102,Abaddon,npc_dota_hero_abaddon


In [157]:
heroes_df[heroes_df['id']==21]

Unnamed: 0,id,localized_name,name
20,21,Windranger,npc_dota_hero_windrunner


In [205]:
heroes_df.to_csv('file/hero_id_list.csv',sep=',')