In [1]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
df = pd.read_csv("data/all_data.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# remove unknown columns
columns = df.columns
unknown_cols = [c for c in columns if "Unnamed" in c]
df = df.drop(columns=unknown_cols)

# remove rows with only na values
df = df.dropna(how="all")

features_to_keep = ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "FTR", "HS", "AS", "HST", "AST", 
                    "B365H", "B365D", "B365A",
                    "IWH", "IWD", "IWA",
                    "WHH", "WHD", "WHA"]
df = df[features_to_keep]

In [4]:
name_conversions = {
'St Etienne': 'Saint-Etienne',
'Ajaccio GFCO': 'Ajaccio', 
'Ath Madrid': 'Atletico',
'Ein Frankfurt': 'Frankfurt',
'Espanol': 'Espanyol',
'La Coruna': 'Depor',
'Nurnberg': 'Nuernberg',
'M\'gladbach': 'Gladbach',
'Bayern Munich': 'Bayern',
'Greuther Furth': 'Fuerth',
'Sp Gijon': 'Gijon',
'FC Koln': 'Koeln',
'Ath Bilbao': 'Bilbao',
'Hansa Rostock': 'Rostock',
'Fortuna Dusseldorf': 'Duesseldorf',
'Schalke 04': 'Schalke',
'Werder Bremen': 'Werder',
'Vallecano': 'RayoVallecano'
}

In [5]:
from os.path import exists
all_teams = set(df["HomeTeam"].unique()).union(set(df["AwayTeam"].unique()))


In [6]:

club_elo_base_url = "http://api.clubelo.com/"
name_mismatches = []
exceptions = []
for team_name in all_teams:
    try:
        if exists("elo/" + team_name + ".csv"):
            continue
        team_name_url = name_conversions.get(team_name, team_name).replace(" ", "")
        team_data = requests.get(club_elo_base_url + team_name_url, stream=True)
        byte_data = team_data.content
        if len(byte_data) == 38:
            # means that only header was returned, so team was not found
            name_mismatches.append(team_name)
            continue
        with open("elo/" + team_name + ".csv", mode='wb') as f:
            f.write(team_data.content)
    except:
        exceptions.append(team_name)
        continue
print("Name mismatches: ", name_mismatches)
print("Exceptions: ", exceptions)

Name mismatches:  ['Evian Thonon Gaillard', 'Gimnastic', 'Munich 1860', 'Kaiserslautern', 'Arles']
Exceptions:  []


In [8]:
import datetime
from functools import partial

# for team in all_teams:
#     df.loc[(df["HomeTeam"] == team) | (df["AwayTeam"] == team)]
# # df = df.apply(add_elo_feature, axis=1)
# df.to_csv("data/all_data_with_elo.csv")
# df

dfs = {}

for team in all_teams:
    if exists("elo/" + team + ".csv"):
        dfs[team] = pd.read_csv("elo/" + team + ".csv")
        
def find_team_elo_at(team_name, date):
    elo_df = dfs.get(team_name, None)
    if elo_df is None:
        return None
    elo_df["From"] = pd.to_datetime(elo_df["From"], dayfirst=True)
    try:
        elo = elo_df.loc[elo_df["From"] >= date]["Elo"].iloc[0]
        return elo
    except:
        return None
    
def add_elo_feature(s):
    s["HomeTeamELO"] = find_team_elo_at(s["HomeTeam"], s["Date"])
    s["AwayTeamELO"] = find_team_elo_at(s["AwayTeam"], s["Date"])
    return s

f = np.vectorize(find_team_elo_at)

df["HomeTeamELO"] = f(df["HomeTeam"], df["Date"])
df["AwayTeamELO"] = f(df["AwayTeam"], df["Date"])

# df = df.apply(add_elo_feature, axis=1)
df.to_csv("data/all_data_with_elo.csv")
df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,...,B365D,B365A,IWH,IWD,IWA,WHH,WHD,WHA,HomeTeamELO,AwayTeamELO
0,2000-07-28,Marseille,Troyes,3.0,1.0,H,,,,,...,,,1.45,3.50,5.00,1.45,3.50,6.00,1690.283447,1580.911621
1,2000-07-28,Paris SG,Strasbourg,3.0,1.0,H,,,,,...,,,1.35,3.60,6.50,1.40,3.70,6.50,1719.976318,1636.466431
2,2000-07-29,Auxerre,Sedan,0.0,1.0,A,,,,,...,,,1.70,3.10,3.80,1.65,3.25,4.70,1635.098511,1634.386719
3,2000-07-29,Bordeaux,Metz,1.0,1.0,D,,,,,...,,,1.55,3.30,4.50,1.50,3.40,6.00,1729.042725,1677.067505
4,2000-07-29,Guingamp,St Etienne,2.0,2.0,D,,,,,...,,,2.20,2.80,2.80,2.40,2.90,2.75,1576.033813,1621.786011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37394,2021-10-24,Sevilla,Levante,5.0,3.0,H,18.0,12.0,11.0,7.0,...,4.2,8.0,1.47,4.40,7.00,1.44,4.20,7.50,1839.052124,1629.306763
37395,2021-10-24,Stuttgart,Union Berlin,1.0,1.0,D,8.0,11.0,2.0,3.0,...,3.5,2.9,2.50,3.40,2.85,2.45,3.40,2.80,1641.394043,1690.527466
37396,2021-10-24,Verona,Lazio,4.0,1.0,H,16.0,9.0,7.0,4.0,...,3.6,2.2,3.05,3.55,2.25,3.10,3.50,2.25,1591.301147,1709.902466
37397,2021-10-24,West Ham,Tottenham,1.0,0.0,H,13.0,7.0,4.0,4.0,...,3.5,3.0,2.40,3.45,2.95,2.38,3.40,2.90,1834.961426,1804.304077
