In [130]:
from datetime import datetime, timezone
from io import StringIO

import numpy as np
import pandas as pd
import requests

In [131]:
def url_from_page(page: int) -> str:
    """Build the HLTV match-statistics URL for a 1-based page number.

    Args:
        page: 1-based page number. Page 1 maps to offset 0 and every
            following page advances the offset by 50.

    Returns:
        The listing URL with its ``offset`` query parameter filled in.

    Raises:
        ValueError: If ``page`` is smaller than 1, which would otherwise
            produce a negative offset.
    """
    if page < 1:
        raise ValueError(f"page must be >= 1, got {page}")
    offset = (page - 1) * 50
    return f"https://www.hltv.org/stats/matches?startDate=all&offset={offset}"

# Download the listing page by page and stack every parsed table into one frame.
FIRST_PAGE = 1
LAST_PAGE = 19  # inclusive; same pages as the previous range(1, 20)
REQUEST_TIMEOUT_SECONDS = 30

data_frames = []

for page in range(FIRST_PAGE, LAST_PAGE + 1):
    url = url_from_page(page)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error status instead of handing an error page
    # to the HTML table parser.
    response.raise_for_status()

    # read_html wants a file-like object; passing literal HTML is deprecated.
    page_tables = pd.read_html(StringIO(response.text))
    data_frames.append(pd.concat(page_tables, ignore_index=True))

# ignore_index gives every row a unique label instead of repeating the
# per-table labels once per page.
teams = pd.concat(data_frames, ignore_index=True)
teams

ImportError: html5lib not found, please install it

In [None]:
# Dump raw matches into CSV file

# Timezone-aware replacement for the deprecated datetime.utcnow(); the
# formatted UTC timestamp is the same.
prefix = datetime.now(timezone.utc).strftime('%d%m%Y_%H%M%S')

teams.to_csv(f'../data/{prefix}_matches.csv')

In [None]:
import re

def get_score(value: str) -> int:
    """Extract the score from a label that ends in a parenthesised number.

    The pattern is anchored to the end of the string so that digits in
    parentheses inside the name itself (e.g. ``"Team (2) (13)"``) are not
    mistaken for the score.

    Args:
        value: Label of the form ``"<name> (<score>)"``.

    Returns:
        The trailing parenthesised number as an ``int``.

    Raises:
        ValueError: If ``value`` does not end in ``(<digits>)``.
    """
    match = re.search(r'\((\d+)\)\s*$', value)
    if match is None:
        raise ValueError(f"no trailing '(score)' found in {value!r}")
    return int(match.group(1))

# Parse the score out of both team labels into dedicated score columns.
for side in ('Team1', 'Team2'):
    teams[f'{side} Score'] = [get_score(label) for label in teams[side]]

In [None]:
# Remove score from team name

def get_team(value: str) -> str:
    """Return the label with its trailing ``(<score>)`` suffix removed.

    Only a parenthesised number at the very end of the string is stripped,
    so parentheses that belong to the name (e.g. ``"Team (B) (16)"``) are
    kept. A label without such a suffix is returned unchanged, so applying
    the function to an already cleaned name is harmless.

    Args:
        value: Label of the form ``"<name> (<score>)"``.

    Returns:
        The name part of the label.
    """
    return re.sub(r'\s*\(\d+\)\s*$', '', value)

# Overwrite both team columns with whatever get_team returns for each label.
for column in ('Team1', 'Team2'):
    teams[column] = [get_team(label) for label in teams[column]]
teams

In [None]:
# Flag, per row, which side finished with the strictly higher score.
team1_score = teams['Team1 Score']
team2_score = teams['Team2 Score']
teams['Team1 Won'] = team1_score.gt(team2_score)
teams['Team2 Won'] = team1_score.lt(team2_score)
teams

In [None]:
# Resolve the winner of every row without a Python-level row loop.
# A draw (neither "Won" flag set) gets no winner (None) instead of being
# credited to Team2.
team1_won = teams['Team1 Won'].to_numpy()
team2_won = teams['Team2 Won'].to_numpy()

# Plain arrays are assigned positionally, so row labels play no part here.
teams['Winner'] = np.where(
    team1_won,
    teams['Team1'].to_numpy(),
    np.where(team2_won, teams['Team2'].to_numpy(), None),
)
teams

In [None]:
# Keep a deep copy of the frame as it stands, then narrow `teams` down to a
# fixed subset of columns.
full_teams = teams.copy(deep=True)
kept_columns = ['Date', 'Team1', 'Team2', 'Team1 Score', 'Team2 Score', 'Winner', 'Map']
teams = teams[kept_columns]
teams

In [None]:
def _rows_for_side(matches, side, other):
    """Return `matches` relabelled so the `side` column is the 'Team'.

    The `other` column becomes 'Opponent', the matching score columns become
    'Score' and 'Opponent Score', and 'Is Winner' marks the rows where
    'Team' equals 'Winner'.
    """
    ordered = [side, f'{side} Score', 'Winner', 'Map', other, f'{other} Score']
    perspective = matches[ordered].rename(columns={
        side: 'Team',
        f'{side} Score': 'Score',
        other: 'Opponent',
        f'{other} Score': 'Opponent Score',
    })
    perspective['Is Winner'] = perspective['Team'] == perspective['Winner']
    return perspective


# Each original row appears twice in the result: once for each side.
team_1_df = _rows_for_side(teams, 'Team1', 'Team2')
team_2_df = _rows_for_side(teams, 'Team2', 'Team1')

teams = pd.concat([team_1_df, team_2_df])
teams

In [None]:
# Dump into CSV file

# Timezone-aware replacement for the deprecated datetime.utcnow(); the
# formatted UTC timestamp is the same.
prefix = datetime.now(timezone.utc).strftime('%d%m%Y_%H%M%S')

teams.to_csv(f'../data/{prefix}_teams.csv')

In [None]:
# Per-team totals of the numeric columns.
# The first parameter of GroupBy.sum is `numeric_only`, so the previous
# positional 'Score' acted as that flag rather than as a column selector;
# spell the flag out explicitly.
sum_teams = teams.groupby('Team').sum(numeric_only=True)
sum_teams

In [None]:
# Count the rows belonging to each team and attach the tally as 'Matches'.
matches_per_team = teams.groupby('Team').size()
sum_teams['Matches'] = matches_per_team
sum_teams

In [None]:
# Win rate in percent: summed 'Is Winner' scaled by 100, divided by 'Matches'.
win_counts = sum_teams['Is Winner']
sum_teams['Winrate'] = win_counts.mul(100).div(sum_teams['Matches'])
sum_teams.sort_values(by='Matches', ascending=False)

In [None]:
# Rank by win rate, keeping only teams with more than 15 matches.
enough_matches = sum_teams['Matches'].gt(15)
sum_teams[enough_matches].sort_values(by='Winrate', ascending=False)