In [1]:
from scrapy.selector import Selector
from selenium import webdriver
import pandas as pd
import time

# Players Ranking

In [110]:
url = 'http://www.hltv.org/stats/players'

try:
    driver = webdriver.Safari()
    driver.get(url)
    time.sleep(2)
    response = driver.page_source

    players = {
        'names': [],
        'urls': [],
        'country': [],
    }
    
    tr = Selector(text=response).xpath('/html/body/div/div/div/div/div/table/tbody/tr')
    for player in tr:
        players['names'].append(player.css('a::text').get())
        players['urls'].append(player.css('a').attrib['href'])
        players['country'].append(player.css('img').attrib['title'])
finally:
    driver.close()

In [111]:
len(players['names'])

713

In [113]:
df = pd.DataFrame({
    'player_name': players['names'],
    'url': [f"http://www.hltv.org{x}" for x in players['urls']],
    'country': players['country'],
    'dpr': None,
    'kast': None,
    'impact': None,
    'adr': None,
    'kpr': None,
    'rating': None
})

In [114]:
df.head()

Unnamed: 0,player_name,url,country,dpr,kast,impact,adr,kpr,rating
0,ZywOo,http://www.hltv.org/stats/players/11893/zywoo,France,,,,,,
1,s1mple,http://www.hltv.org/stats/players/7998/s1mple,Ukraine,,,,,,
2,sh1ro,http://www.hltv.org/stats/players/16920/sh1ro,Russia,,,,,,
3,degster,http://www.hltv.org/stats/players/17306/degster,Russia,,,,,,
4,Kaze,http://www.hltv.org/stats/players/8950/kaze,Malaysia,,,,,,


In [115]:
df.tail()

Unnamed: 0,player_name,url,country,dpr,kast,impact,adr,kpr,rating
708,B1ad3,http://www.hltv.org/stats/players/472/b1ad3,Ukraine,,,,,,
709,shinobi,http://www.hltv.org/stats/players/9152/shinobi,United States,,,,,,
710,netrick,http://www.hltv.org/stats/players/8422/netrick,Denmark,,,,,,
711,HUNDEN,http://www.hltv.org/stats/players/7415/hunden,Denmark,,,,,,
712,OCEAN,http://www.hltv.org/stats/players/6455/ocean,Canada,,,,,,


In [116]:
df.to_csv('players.csv')

In [224]:
df[df['country'] == 'Brazil']

Unnamed: 0,player_name,url,country,dpr,kast,impact,adr,kpr,rating
12,coldzera,http://www.hltv.org/stats/players/9216/coldzera,Brazil,0.61,73.8,1.1,80.0,0.78,1.16
14,vsm,http://www.hltv.org/stats/players/16816/vsm,Brazil,0.66,72.5,1.31,85.9,0.8,1.22
16,KSCERATO,http://www.hltv.org/stats/players/15631/kscerato,Brazil,0.57,75.7,1.08,79.2,0.74,1.19
22,yuurih,http://www.hltv.org/stats/players/12553/yuurih,Brazil,0.64,73.6,1.18,85.1,0.77,1.2
28,kNgV-,http://www.hltv.org/stats/players/5736/kngv,Brazil,0.65,70.3,1.23,79.0,0.77,1.13
55,HEN1,http://www.hltv.org/stats/players/8565/hen1,Brazil,0.58,72.9,1.1,74.7,0.71,1.11
79,exit,http://www.hltv.org/stats/players/11154/exit,Brazil,0.62,73.8,1.04,79.8,0.73,1.14
85,trk,http://www.hltv.org/stats/players/11163/trk,Brazil,0.66,74.0,1.16,81.0,0.74,1.16
97,fer,http://www.hltv.org/stats/players/8564/fer,Brazil,0.68,71.4,1.22,83.2,0.75,1.09
100,hardzao,http://www.hltv.org/stats/players/16817/hardzao,Brazil,0.63,72.9,1.09,76.5,0.72,1.13


# Player Statistics

In [118]:
df = pd.read_csv('players.csv', index_col=0)
df.head()

Unnamed: 0,player_name,url,country,dpr,kast,impact,adr,kpr,rating
0,ZywOo,http://www.hltv.org/stats/players/11893/zywoo,France,,,,,,
1,s1mple,http://www.hltv.org/stats/players/7998/s1mple,Ukraine,,,,,,
2,sh1ro,http://www.hltv.org/stats/players/16920/sh1ro,Russia,,,,,,
3,degster,http://www.hltv.org/stats/players/17306/degster,Russia,,,,,,
4,Kaze,http://www.hltv.org/stats/players/8950/kaze,Malaysia,,,,,,


In [123]:
from tqdm import tqdm

try:
    driver = webdriver.Safari()
    
    player_stats = {
        'dpr': [],
        'kast': [],
        'impact': [],
        'adr': [],
        'kpr': [],
        'rating': []
    }
    
    for url in tqdm(df['url']):
        driver.get(url)
        time.sleep(.5)
        
        r = Selector(text=str(driver.page_source))
        
        rating = r.xpath('/html/body/div/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[2]/div[1]/div[2]/div[1]/text()').get()
        if rating is None:
            rating = r.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[2]/div[1]/div[2]/div[1]/text()').get()
            
        dpr = r.xpath('/html/body/div/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[2]/div[2]/div[2]/div[1]/text()').get()
        if dpr is None:
            dpr = r.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[2]/div[2]/div[2]/div[1]/text()').get()
        
        kast = r.xpath('/html/body/div/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[2]/div[3]/div[2]/div[1]/text()').get()
        if kast is None:
            kast = r.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[2]/div[3]/div[2]/div[1]/text()').get()
        
        impact = r.xpath('/html/body/div/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[3]/div[1]/div[2]/div[1]/text()').get()
        if impact is None:
            impact = r.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[3]/div[1]/div[2]/div[1]/text()').get()
        
        adr = r.xpath('/html/body/div/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[3]/div[2]/div[2]/div[1]/text()').get()
        if adr is None:
            adr = r.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[3]/div[2]/div[2]/div[1]/text()').get()
            
        kpr = r.xpath('/html/body/div/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[3]/div[3]/div[2]/div[1]/text()').get()
        if kpr is None:
            kpr = r.xpath('/html/body/div[3]/div[1]/div[2]/div[1]/div[2]/div[6]/div[2]/div[3]/div[3]/div[2]/div[1]/text()').get()          
            
        player_stats['rating'].append(rating)
        player_stats['dpr'].append(dpr)
        player_stats['kast'].append(kast)
        player_stats['impact'].append(impact)
        player_stats['adr'].append(adr)
        player_stats['kpr'].append(kpr)
        
        #print(f"{rating} - {dpr} {kast} {impact} {adr} {kpr}")
finally:
    driver.close()

100%|██████████| 713/713 [32:30<00:00,  2.74s/it]


In [174]:
def to_float(x: str) -> float:
    try:
        x = x.replace('%', '')
        return float(x)
    except:
        return 0.00

df['dpr'] = list(map(to_float, player_stats['dpr']))
df['kast'] = list(map(to_float, player_stats['kast']))
df['impact'] = list(map(to_float, player_stats['impact']))
df['adr'] = list(map(to_float, player_stats['adr']))
df['kpr'] = list(map(to_float, player_stats['kpr']))
df['rating'] = list(map(to_float, player_stats['rating']))

df.to_csv('players.csv')
df.head()

Unnamed: 0,player_name,url,country,dpr,kast,impact,adr,kpr,rating
0,ZywOo,http://www.hltv.org/stats/players/11893/zywoo,France,0.62,74.8,1.48,89.4,0.86,1.34
1,s1mple,http://www.hltv.org/stats/players/7998/s1mple,Ukraine,0.64,73.8,1.35,86.1,0.85,1.24
2,sh1ro,http://www.hltv.org/stats/players/16920/sh1ro,Russia,0.54,76.0,1.23,79.6,0.77,1.26
3,degster,http://www.hltv.org/stats/players/17306/degster,Russia,0.61,72.1,1.3,80.7,0.8,1.23
4,Kaze,http://www.hltv.org/stats/players/8950/kaze,Malaysia,0.6,73.6,1.25,81.0,0.79,1.2


In [25]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [26]:
df = pd.read_csv('players.csv')
df_train = df.copy()

#df_train['kast'] = df_train['kast'] / 100.
#df_train['adr'] = df_train['adr'] / 100.
#df_train['impact'] = df_train['impact'] / 100.

df_train.head()

Unnamed: 0.1,Unnamed: 0,player_name,url,country,dpr,kast,impact,adr,kpr,rating
0,0,ZywOo,http://www.hltv.org/stats/players/11893/zywoo,France,0.62,74.8,1.48,89.4,0.86,1.34
1,1,s1mple,http://www.hltv.org/stats/players/7998/s1mple,Ukraine,0.64,73.8,1.35,86.1,0.85,1.24
2,2,sh1ro,http://www.hltv.org/stats/players/16920/sh1ro,Russia,0.54,76.0,1.23,79.6,0.77,1.26
3,3,degster,http://www.hltv.org/stats/players/17306/degster,Russia,0.61,72.1,1.3,80.7,0.8,1.23
4,4,Kaze,http://www.hltv.org/stats/players/8950/kaze,Malaysia,0.6,73.6,1.25,81.0,0.79,1.2


In [39]:
X = np.array(df_train[['dpr', 'kast', 'impact', 'adr', 'kpr']])
y = np.array(df_train[['rating']])

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,
                                                    test_size=0.2)

In [41]:
reg = LinearRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

0.5032026561793783

In [42]:
y_pred = reg.predict(X_test)

print(f'Coefficients: {reg.coef_}')

print(f'R2 score:{r2_score(y_test, y_pred)}')
print(f'RMSE:{mean_squared_error(y_test, y_pred, squared=False)}')
print(f'MAE:{mean_absolute_error(y_test, y_pred)}')

Coefficients: [[-5.49675329e-01  6.74824619e-03  1.60900273e-01 -4.99125230e-04
   9.05451206e-01]]
R2 score:0.5032026561793783
RMSE:0.052974992819747525
MAE:0.02553702518752474
