# Moneyball Project: UEFA Euro 2020 Fantasy Football
Passion project to leverage data-driven decision making for team selection in [UEFA Euro 2020 Fantasy Football](https://gaming.uefa.com/en/uefaeuro2020fantasyfootball/overview)

## Data Preparation and Cleansing
-----------------------------
### Purpose
Initial exploration on available dataset, aggregating and merging to dataframe for further exploration.

### Author
[Christian Wibisono](https://github.com/christianwbsn)



## 1. Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import re

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_row',50) 

In [2]:
import difflib
from tqdm import tqdm
from nltk import everygrams

In [3]:
DATA_DIR = "../data"

## 2. Common Function

In [4]:
def camel_to_snake(name):
    name = re.sub(" ", "", name)
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()

In [5]:
def extract_date(date):
    return pd.Series([date.year, date.month, date.day])

In [6]:
def euro_fantasy_score(df):
    # Not covered by dataset
    # Common - Goal from outside the box 2 points
    # Common - Winning a penalty  2 points
    # Common - Conceding a penalty -1 points
    # Common - Own Goal -2 points
   
    # common
    score = 1 
    if df["min"] >= 60:
        score += 1
    if df["assists"] > 0:
        score += (df["assists"] * 3)
    if df["penalty_kick_miss"] > 0:
        score -= (df["penalty_kick_miss"] * 2)
    if df["yellow_cards"] > 0:
        score -= 1
    if df["red_cards"] > 0:
        score -= 3
        
    # position specific    
    if df["position"] == "F":
        score += (df["goals"] * 4)
    if df["position"] == "M":
        score += (df["goals"] * 5)
        if df["min"] >= 60 and df["clean_sheet"] > 0:
            score += 1
    if df["position"] == "D":
        score += (df["goals"] * 6)
        if df["min"] >= 60 and df["clean_sheet"] > 0:
            score += 4
        score -= (df['goals_allowed'] // 2)
    if df["position"] == "GK":
        score += (df["goals"] * 6)
        score += (df["penalty_kick_saved"] * 5)
        if df["min"] >= 60 and df["clean_sheet"] > 0:
            score += 4
        score += (df["saves"] // 3)
        score -= (df["goals_allowed"] // 2)
    return score

## 3. Dataset Exploration

### 3.1 Main Dataset

In [7]:
## Using this dataset as SSOT for player name and team name
main_df  = pd.read_csv("{}/interim/md_1_df.csv".format(DATA_DIR))

In [8]:
main_df["date"] = pd.to_datetime(main_df["date"])

### 3.2 Euro 2020 Dataset

#### 3.2.1 Players

#### 3.2.1.1 Appending last matchday data

In [9]:
with open('{}/raw/euro-2020/players_1.json'.format(DATA_DIR))as f:
    data    = json.load(f)
    players = data["data"]["value"]["playerList"]

In [10]:
old_players_df = pd.json_normalize(players)
old_players_df.rename(camel_to_snake, axis=1, inplace=True)

In [11]:
with open('{}/raw/euro-2020/players_2.json'.format(DATA_DIR))as f:
    data    = json.load(f)
    players = data["data"]["value"]["playerList"]

In [12]:
players_df = pd.json_normalize(players)

In [13]:
players_df.rename(camel_to_snake, axis=1, inplace=True)

In [14]:
players_df = players_df[players_df["trained"]!='']

In [15]:
players_df = pd.merge(players_df, old_players_df[["p_f_name", "g_s",  "assist", "y_c", "r_c", "p_m"]],
                     on="p_f_name", suffixes=("", "_last_md"))

In [16]:
players_df["g_s"] = players_df["g_s"] - players_df["g_s_last_md"]
players_df["assist"] = players_df["assist"] - players_df["assist_last_md"]
players_df["y_c"] = players_df["y_c"] - players_df["y_c_last_md"]
players_df["r_c"] = players_df["r_c"] - players_df["r_c_last_md"]
players_df["p_m"] = players_df["p_m"] - players_df["p_m_last_md"]

In [17]:
players_df = players_df.drop(["g_s_last_md",  "assist_last_md", "y_c_last_md", "r_c_last_md", "p_m_last_md"], axis=1)

In [18]:
players_df["date"] = players_df["current_matches_list"].apply(lambda x: x[0]["matchDate"])
players_df["opponent_name"] = players_df["current_matches_list"].apply(lambda x: x[0]["vsTSCode"])

In [19]:
all_players_name = main_df["player"].unique()
def get_closest_match(name):
    # return closest match for join operation
    return ''.join(list(difflib.get_close_matches(name, all_players_name, n=1, cutoff=0.7)))

In [20]:
players_df["closest_match"] = players_df["p_f_name"].apply(get_closest_match)
players_df["player"] = players_df.apply(lambda x: x["closest_match"] if x["closest_match"] != "" else x["p_f_name"], axis=1)

In [21]:
players_df["date"] = pd.to_datetime(players_df["date"])
players_df[["year", "month", "day"]] = players_df["date"].apply(extract_date)

In [22]:
main_df.head()

Unnamed: 0,player,date,league_name,team_name,opponent_name,fantasy_points,min,position,goals,assists,shots,shots_on_goal,crosses,fouls_drawn,fouls_committed,tackles_won,interceptions,yellow_cards,red_cards,penalty_kick_miss,clean_sheet,goals_allowed,accurate_passes,shots_assisted,shootout_goals,shootout_misses,game_started,year,month,day,saves,wins,penalty_kick_saved,shootout_saves,points
0,Artem Dzyuba,2019-06-08,European Championship Qualifiers,Russia,San Marino,73.66,90,F,4,1,16.0,7.0,1.0,1.0,1.0,0.0,0.0,0,0,1,1,0,23.0,3.0,0.0,0.0,1,2019,6,8,0.0,0.0,0.0,0.0,19.0
1,Cristiano Ronaldo,2019-09-10,European Championship Qualifiers,Portugal,Lithuania,56.7,79,F,4,0,8.0,5.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,35.0,3.0,0.0,0.0,1,2019,9,10,0.0,0.0,0.0,0.0,18.0
2,Memphis Depay,2019-03-21,European Championship Qualifiers,Netherlands,Belarus,54.0,90,F,2,2,6.0,4.0,6.0,3.0,0.0,1.0,0.0,0,0,0,1,0,40.0,3.0,0.0,0.0,1,2019,3,21,0.0,0.0,0.0,0.0,16.0
3,Denis Cheryshev,2019-10-13,European Championship Qualifiers,Russia,Cyprus,53.32,90,M,2,2,4.0,3.0,13.0,0.0,3.0,1.0,0.0,0,0,0,1,0,36.0,5.0,0.0,0.0,1,2019,10,13,0.0,0.0,0.0,0.0,19.0
4,Cristiano Ronaldo,2019-11-14,European Championship Qualifiers,Portugal,Lithuania,51.8,83,F,3,0,13.0,5.0,0.0,1.0,0.0,0.0,0.0,0,0,0,1,0,40.0,2.0,0.0,0.0,1,2019,11,14,0.0,0.0,0.0,0.0,14.0


In [23]:
players_df.head()

Unnamed: 0,id,p_d_name,p_f_name,latin_name,t_name,t_id,team_played,c_code,skill,value,is_active,sel_per,md_id,tot_pts,g_s,assist,c_s,g_c,y_c,r_c,o_g,p_s,p_c,p_e,saves,p_m,b_r,g_ob,m_om,m_om_pts,p_status,match_atd,trained,is_played,sel_in_per,sel_out_per,upcoming_matches_list,current_matches_list,avg_player_pts,avg_player_value,last_gd_points,category1,category2,category3,category4,category5,category6,category7,category8,category9,category10,category11,category12,category13,category14,category15,date,opponent_name,closest_match,player,year,month,day
0,63706,C. Ronaldo,Cristiano Ronaldo,Cristiano Ronaldo,Portugal,110,1,POR,4,12.0,1,29,2,19,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,1,2,"[{'mdId': '3', 'tId': '110', 'tSCode': 'Portug...","[{'mdId': '2', 'tId': '110', 'tSCode': 'Portug...",9.5,1.6,9,1,0,14,0.9,2.0,0,0,0,0,0,0,0,0,1,0,2021-06-19 18:00:00,Germany,Cristiano Ronaldo,Cristiano Ronaldo,2021,6,19
1,250076574,K. Mbappé,Kylian Mbappé,Kylian Mbappe,France,43,1,FRA,4,12.0,1,42,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,3,1,"[{'mdId': '3', 'tId': '43', 'tSCode': 'France'...","[{'mdId': '2', 'tId': '43', 'tSCode': 'France'...",2.0,0.3,2,1,0,19,3.2,2.3,0,0,0,0,0,0,0,0,1,0,2021-06-19 15:00:00,Hungary,Kylian Mbappé,Kylian Mbappé,2021,6,19
2,250016833,H. Kane,Harry Kane,Harry Kane,England,39,1,ENG,4,11.5,1,35,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,2,1,"[{'mdId': '3', 'tId': '39', 'tSCode': 'England...","[{'mdId': '2', 'tId': '39', 'tSCode': 'England...",2.0,0.4,2,1,0,10,1.6,0.1,0,0,0,0,0,0,0,0,1,0,2021-06-18 21:00:00,Scotland,Harry Kane,Harry Kane,2021,6,18
3,250002096,R. Lewandowski,Robert Lewandowski,Robert Lewandowski,Poland,109,1,POL,4,11.5,1,11,2,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,2,"[{'mdId': '3', 'tId': '109', 'tSCode': 'Poland...","[{'mdId': '2', 'tId': '109', 'tSCode': 'Poland...",3.5,0.6,5,1,0,3,0.1,1.1,0,0,0,0,0,0,0,0,1,0,2021-06-19 21:00:00,Spain,Robert Lewandowski,Robert Lewandowski,2021,6,19
4,250010802,R. Lukaku,Romelu Lukaku,Romelu Lukaku,Belgium,13,1,BEL,4,11.0,1,54,2,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,3,1,"[{'mdId': '3', 'tId': '13', 'tSCode': 'Belgium...","[{'mdId': '2', 'tId': '13', 'tSCode': 'Belgium...",6.0,1.1,2,1,0,16,3.0,0.9,0,0,0,0,0,0,0,0,1,0,2021-06-17 18:00:00,Denmark,Romelu Lukaku,Romelu Lukaku,2021,6,17


In [24]:
with open('{}/raw/euro-2020/fixtures.json'.format(DATA_DIR))as f:
    data    = json.load(f)
    fixtures = data["data"]["value"][1]["match"]

In [25]:
fixtures_df = pd.json_normalize(fixtures)
fixtures_df["atName"] = fixtures_df["atName"].apply(lambda x: x.strip())
fixtures_df["htName"] = fixtures_df["htName"].apply(lambda x: x.strip())

In [26]:
def heuristic_minutes_played(df):
    if (df["g_s"] == 0) and (df["assist"] == 0) and (df["y_c"] == 0) and (df["r_c"] == 0) and (df["last_gd_points"] == 2):
        return 90
    elif df["last_gd_points"] >= 2:
        return 90
    elif df["last_gd_points"] == 0:
        return 0
    else:
        return 59

In [27]:
players_df["min"] = players_df.apply(heuristic_minutes_played, axis=1)
players_df.rename(columns={"t_name": "team_name", "g_s": "goals", "assist": "assists", 
                           "y_c": "yellow_cards", "r_c" : "red_cards", "last_gd_points": "points",
                           "p_m": "penalty_kick_miss"}, inplace=True)
players_df["league_name"] = "European Championship 2020"

In [28]:
fixtures_df.head()

Unnamed: 0,mId,gmIsCurrent,gmIsLocked,gdId,isFeedLive,isLive,dateTime,dateTimeLock,htId,htName,htShortName,htCCode,atId,atName,atShortName,atCCode,gameNo,mdName,htScore,atScore,htAggScore,atAggScore,aggDescription,aggFlag,matchStatus,teamSc,teamScStartDate,teamScEndDate,lineupAnnounced,isMatchPostponed,groupId,groupName,stadiumId,stadiumName,stadiumThumb,venueId,venueName,venueCountryCode
0,2024460,2,1,6,2,2,06/16/2021 15:00:00,06/16/2021 15:00:00,42,Finland,Finland,FIN,57451,Russia,Russia,RUS,33674,Match 15,0,1,0,0,,0,2,,,,1,0,2006439,Group B,250003363,Gazprom Arena,https://img.uefa.com/imgml/stadium/matchinfo/w...,2850,St Petersburg,RUS
1,2024457,2,1,6,2,2,06/16/2021 18:00:00,06/16/2021 18:00:00,135,Turkey,Turkey,TUR,144,Wales,Wales,WAL,33674,Match 14,0,2,0,0,,0,2,,,,1,0,2006438,Group A,250002745,Bakı Olimpiya Stadionu,https://img.uefa.com/imgml/stadium/matchinfo/w...,1162,Baku,AZE
2,2024458,2,1,6,2,2,06/16/2021 21:00:00,06/16/2021 21:00:00,66,Italy,Italy,ITA,128,Switzerland,Switzerland,SUI,33674,Match 13,3,0,0,0,,0,2,,,,1,0,2006438,Group A,57775,Olimpico in Rome,https://img.uefa.com/imgml/stadium/matchinfo/w...,2637,Rome,ITA
3,2024444,2,1,7,2,2,06/17/2021 15:00:00,06/17/2021 15:00:00,57166,Ukraine,Ukraine,UKR,59205,North Macedonia,North Macedonia,MKD,33674,Match 18,2,1,0,0,,0,2,,,,1,0,2006440,Group C,250001298,National Arena Bucharest,https://img.uefa.com/imgml/stadium/matchinfo/w...,1326,Bucharest,ROU
4,2024459,2,1,7,2,2,06/17/2021 18:00:00,06/17/2021 18:00:00,35,Denmark,Denmark,DEN,13,Belgium,Belgium,BEL,33674,Match 16,1,2,0,0,,0,2,,,,1,0,2006439,Group B,63462,Parken,https://img.uefa.com/imgml/stadium/matchinfo/w...,1449,Copenhagen,DEN


In [29]:
fixtures_df[fixtures_df["htName"] == "Belgium"]["htScore"]

Series([], Name: htScore, dtype: object)

In [30]:
def goals_allowed(df):
    if df["team_name"] in fixtures_df["htName"].values:
        return int(fixtures_df[fixtures_df["htName"] == df["team_name"]].reset_index()["atScore"])
    else:
        return int(fixtures_df[fixtures_df["atName"] == df["team_name"]].reset_index()["htScore"])

In [31]:
players_df["goals_allowed"] = players_df.apply(goals_allowed, axis=1)
players_df["clean_sheet"] = players_df["goals_allowed"].apply(lambda x: 1 if x == 0 else 0)
players_df["game_started"] = players_df["min"].apply(lambda x: 1 if x >= 60 else 0)

In [32]:
players_df.shape

(620, 68)

In [33]:
# if players have multiple position choose the most common position
position = main_df.groupby("player").agg(position=('position', "first")).to_dict()["position"]

In [34]:
players_df["position"] = players_df["player"].apply(lambda x: position[x]
                                                    if x in position.keys() else "")

In [35]:
players_df.shape

(620, 69)

In [36]:
players_df

Unnamed: 0,id,p_d_name,p_f_name,latin_name,team_name,t_id,team_played,c_code,skill,value,is_active,sel_per,md_id,tot_pts,goals,assists,c_s,g_c,yellow_cards,red_cards,o_g,p_s,p_c,p_e,saves,penalty_kick_miss,b_r,g_ob,m_om,m_om_pts,p_status,match_atd,trained,is_played,sel_in_per,sel_out_per,upcoming_matches_list,current_matches_list,avg_player_pts,avg_player_value,points,category1,category2,category3,category4,category5,category6,category7,category8,category9,category10,category11,category12,category13,category14,category15,date,opponent_name,closest_match,player,year,month,day,min,league_name,goals_allowed,clean_sheet,game_started,position
0,63706,C. Ronaldo,Cristiano Ronaldo,Cristiano Ronaldo,Portugal,110,1,POR,4,12.0,1,29,2,19,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,1,2,"[{'mdId': '3', 'tId': '110', 'tSCode': 'Portug...","[{'mdId': '2', 'tId': '110', 'tSCode': 'Portug...",9.5,1.6,9,1,0,14,0.9,2.0,0,0,0,0,0,0,0,0,1,0,2021-06-19 18:00:00,Germany,Cristiano Ronaldo,Cristiano Ronaldo,2021,6,19,90,European Championship 2020,4,0,1,F
1,250076574,K. Mbappé,Kylian Mbappé,Kylian Mbappe,France,43,1,FRA,4,12.0,1,42,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,3,1,"[{'mdId': '3', 'tId': '43', 'tSCode': 'France'...","[{'mdId': '2', 'tId': '43', 'tSCode': 'France'...",2.0,0.3,2,1,0,19,3.2,2.3,0,0,0,0,0,0,0,0,1,0,2021-06-19 15:00:00,Hungary,Kylian Mbappé,Kylian Mbappé,2021,6,19,90,European Championship 2020,1,0,1,F
2,250016833,H. Kane,Harry Kane,Harry Kane,England,39,1,ENG,4,11.5,1,35,2,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,2,1,"[{'mdId': '3', 'tId': '39', 'tSCode': 'England...","[{'mdId': '2', 'tId': '39', 'tSCode': 'England...",2.0,0.4,2,1,0,10,1.6,0.1,0,0,0,0,0,0,0,0,1,0,2021-06-18 21:00:00,Scotland,Harry Kane,Harry Kane,2021,6,18,90,European Championship 2020,0,1,1,F
3,250002096,R. Lewandowski,Robert Lewandowski,Robert Lewandowski,Poland,109,1,POL,4,11.5,1,11,2,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,2,"[{'mdId': '3', 'tId': '109', 'tSCode': 'Poland...","[{'mdId': '2', 'tId': '109', 'tSCode': 'Poland...",3.5,0.6,5,1,0,3,0.1,1.1,0,0,0,0,0,0,0,0,1,0,2021-06-19 21:00:00,Spain,Robert Lewandowski,Robert Lewandowski,2021,6,19,90,European Championship 2020,1,0,1,F
4,250010802,R. Lukaku,Romelu Lukaku,Romelu Lukaku,Belgium,13,1,BEL,4,11.0,1,54,2,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,3,1,"[{'mdId': '3', 'tId': '13', 'tSCode': 'Belgium...","[{'mdId': '2', 'tId': '13', 'tSCode': 'Belgium...",6.0,1.1,2,1,0,16,3.0,0.9,0,0,0,0,0,0,0,0,1,0,2021-06-17 18:00:00,Denmark,Romelu Lukaku,Romelu Lukaku,2021,6,17,90,European Championship 2020,1,0,1,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615,250107124,N. Patterson,Nathan Patterson,Nathan Patterson,Scotland,117,1,SCO,2,4.0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,0,"[{'mdId': '3', 'tId': '117', 'tSCode': 'Scotla...","[{'mdId': '2', 'tId': '117', 'tSCode': 'Scotla...",0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,2021-06-18 21:00:00,England,Nathan Patterson,Nathan Patterson,2021,6,18,0,European Championship 2020,0,1,0,
616,250107375,A. Bayındır,Altay Bayındır,Altay Bayindir,Turkey,135,1,TUR,1,4.0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,0,"[{'mdId': '3', 'tId': '135', 'tSCode': 'Turkey...","[{'mdId': '2', 'tId': '135', 'tSCode': 'Turkey...",0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,2021-06-16 18:00:00,Wales,Altay Bayındır,Altay Bayındır,2021,6,16,0,European Championship 2020,2,0,0,
617,250091355,J. Lotomba,Jordan Lotomba,Jordan Lotomba,Switzerland,128,1,SUI,2,4.0,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,0,"[{'mdId': '3', 'tId': '128', 'tSCode': 'Switze...","[{'mdId': '2', 'tId': '128', 'tSCode': 'Switze...",0.0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,2021-06-16 21:00:00,Italy,Jordan Lotomba,Jordan Lotomba,2021,6,16,0,European Championship 2020,3,0,0,D
618,1905874,N. Boilesen,Nicolai Boilesen,Nicolai Boilesen,Denmark,35,1,DEN,2,4.0,1,8,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,1,"[{'mdId': '3', 'tId': '35', 'tSCode': 'Denmark...","[{'mdId': '2', 'tId': '35', 'tSCode': 'Denmark...",0.0,0.0,0,0,0,0,0.1,0.0,0,0,0,0,0,0,0,0,0,0,2021-06-17 18:00:00,Belgium,Nicolai Boilesen,Nicolai Boilesen,2021,6,17,0,European Championship 2020,2,0,0,D


In [37]:
def update_data(df):
    main_df_columns = ["player", "date", "league_name", "game_started",
                       "team_name", "opponent_name", "position", "goals_allowed", "clean_sheet",
                       "year", "month", "day", "min", "goals", "assists", 
                       "penalty_kick_miss","yellow_cards", "red_cards", "saves", "points"]
    return df[main_df_columns]

In [38]:
new_train = update_data(players_df)

In [39]:
main_df = pd.concat([main_df, new_train])

In [40]:
main_df = main_df.fillna(0)

In [41]:
main_df.to_csv("{}/interim/md_2_df.csv".format(DATA_DIR), index=False)

#### 3.2.1.2 Generating test data

In [42]:
with open('{}/raw/euro-2020/players_3.json'.format(DATA_DIR))as f:
    data    = json.load(f)
    players = data["data"]["value"]["playerList"]

In [43]:
players_df = pd.json_normalize(players)

In [44]:
players_df.rename(camel_to_snake, axis=1, inplace=True)

In [45]:
players_df = players_df[players_df["trained"]!='']

In [46]:
players_df.head()

Unnamed: 0,id,p_d_name,p_f_name,latin_name,t_name,t_id,team_played,c_code,skill,value,is_active,sel_per,md_id,tot_pts,g_s,assist,c_s,g_c,y_c,r_c,o_g,p_s,p_c,p_e,saves,p_m,b_r,g_ob,m_om,m_om_pts,p_status,match_atd,trained,is_played,sel_in_per,sel_out_per,upcoming_matches_list,current_matches_list,avg_player_pts,avg_player_value,last_gd_points,category1,category2,category3,category4,category5,category6,category7,category8,category9,category10,category11,category12,category13,category14,category15
0,63706,C. Ronaldo,Cristiano Ronaldo,Cristiano Ronaldo,Portugal,110,0,POR,4,12.1,1,32,3,19,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,2,1,"[{'mdId': '3', 'tId': '110', 'tSCode': 'Portug...","[{'mdId': '3', 'tId': '110', 'tSCode': 'Portug...",9.5,1.6,9,1,0,13,1.8,2.0,0,0,0,0,0,0,0,0,1,0
1,250076574,K. Mbappé,Kylian Mbappé,Kylian Mbappe,France,43,0,FRA,4,12.0,1,36,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,1,2,"[{'mdId': '3', 'tId': '43', 'tSCode': 'France'...","[{'mdId': '3', 'tId': '43', 'tSCode': 'France'...",2.0,0.3,2,1,0,9,0.8,0.7,0,0,0,0,0,0,0,0,1,0
2,250016833,H. Kane,Harry Kane,Harry Kane,England,39,0,ENG,4,11.5,1,29,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,3,"[{'mdId': '3', 'tId': '39', 'tSCode': 'England...","[{'mdId': '3', 'tId': '39', 'tSCode': 'England...",2.0,0.4,2,1,0,9,0.5,0.4,0,0,0,0,0,0,0,0,1,0
3,250002096,R. Lewandowski,Robert Lewandowski,Robert Lewandowski,Poland,109,0,POL,4,11.5,1,11,3,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,1,"[{'mdId': '3', 'tId': '109', 'tSCode': 'Poland...","[{'mdId': '3', 'tId': '109', 'tSCode': 'Poland...",3.5,0.6,5,1,0,3,0.2,0.3,0,0,0,0,0,0,0,0,1,0
4,250010802,R. Lukaku,Romelu Lukaku,Romelu Lukaku,Belgium,13,0,BEL,4,11.1,1,55,3,12,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,2,1,"[{'mdId': '3', 'tId': '13', 'tSCode': 'Belgium...","[{'mdId': '3', 'tId': '13', 'tSCode': 'Belgium...",6.0,1.1,2,1,0,22,2.2,1.8,0,0,0,0,0,0,0,0,1,0


In [47]:
players_df["date"] = players_df["upcoming_matches_list"].apply(lambda x: x[0]["matchDate"])
players_df["opponent_name"] = players_df["upcoming_matches_list"].apply(lambda x: x[0]["vsTSCode"])

In [48]:
all_players_name = main_df["player"].unique()
def get_closest_match(name):
    # return closest match for join operation
    return ''.join(list(difflib.get_close_matches(name, all_players_name, n=1, cutoff=0.7)))

In [49]:
players_df["closest_match"] = players_df["p_f_name"].apply(get_closest_match)
players_df["player"] = players_df.apply(lambda x: x["closest_match"] if x["closest_match"] != "" else x["p_f_name"], axis=1)

In [50]:
players_df["date"] = pd.to_datetime(players_df["date"])
players_df[["year", "month", "day"]] = players_df["date"].apply(extract_date)

In [51]:
main_df.head()

Unnamed: 0,player,date,league_name,team_name,opponent_name,fantasy_points,min,position,goals,assists,shots,shots_on_goal,crosses,fouls_drawn,fouls_committed,tackles_won,interceptions,yellow_cards,red_cards,penalty_kick_miss,clean_sheet,goals_allowed,accurate_passes,shots_assisted,shootout_goals,shootout_misses,game_started,year,month,day,saves,wins,penalty_kick_saved,shootout_saves,points
0,Artem Dzyuba,2019-06-08,European Championship Qualifiers,Russia,San Marino,73.66,90,F,4,1,16.0,7.0,1.0,1.0,1.0,0.0,0.0,0,0,1,1,0,23.0,3.0,0.0,0.0,1,2019,6,8,0.0,0.0,0.0,0.0,19.0
1,Cristiano Ronaldo,2019-09-10,European Championship Qualifiers,Portugal,Lithuania,56.7,79,F,4,0,8.0,5.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,35.0,3.0,0.0,0.0,1,2019,9,10,0.0,0.0,0.0,0.0,18.0
2,Memphis Depay,2019-03-21,European Championship Qualifiers,Netherlands,Belarus,54.0,90,F,2,2,6.0,4.0,6.0,3.0,0.0,1.0,0.0,0,0,0,1,0,40.0,3.0,0.0,0.0,1,2019,3,21,0.0,0.0,0.0,0.0,16.0
3,Denis Cheryshev,2019-10-13,European Championship Qualifiers,Russia,Cyprus,53.32,90,M,2,2,4.0,3.0,13.0,0.0,3.0,1.0,0.0,0,0,0,1,0,36.0,5.0,0.0,0.0,1,2019,10,13,0.0,0.0,0.0,0.0,19.0
4,Cristiano Ronaldo,2019-11-14,European Championship Qualifiers,Portugal,Lithuania,51.8,83,F,3,0,13.0,5.0,0.0,1.0,0.0,0.0,0.0,0,0,0,1,0,40.0,2.0,0.0,0.0,1,2019,11,14,0.0,0.0,0.0,0.0,14.0


In [52]:
players_df.head()

Unnamed: 0,id,p_d_name,p_f_name,latin_name,t_name,t_id,team_played,c_code,skill,value,is_active,sel_per,md_id,tot_pts,g_s,assist,c_s,g_c,y_c,r_c,o_g,p_s,p_c,p_e,saves,p_m,b_r,g_ob,m_om,m_om_pts,p_status,match_atd,trained,is_played,sel_in_per,sel_out_per,upcoming_matches_list,current_matches_list,avg_player_pts,avg_player_value,last_gd_points,category1,category2,category3,category4,category5,category6,category7,category8,category9,category10,category11,category12,category13,category14,category15,date,opponent_name,closest_match,player,year,month,day
0,63706,C. Ronaldo,Cristiano Ronaldo,Cristiano Ronaldo,Portugal,110,0,POR,4,12.1,1,32,3,19,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,2,1,"[{'mdId': '3', 'tId': '110', 'tSCode': 'Portug...","[{'mdId': '3', 'tId': '110', 'tSCode': 'Portug...",9.5,1.6,9,1,0,13,1.8,2.0,0,0,0,0,0,0,0,0,1,0,2021-06-23 21:00:00,France,Cristiano Ronaldo,Cristiano Ronaldo,2021,6,23
1,250076574,K. Mbappé,Kylian Mbappé,Kylian Mbappe,France,43,0,FRA,4,12.0,1,36,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,1,2,"[{'mdId': '3', 'tId': '43', 'tSCode': 'France'...","[{'mdId': '3', 'tId': '43', 'tSCode': 'France'...",2.0,0.3,2,1,0,9,0.8,0.7,0,0,0,0,0,0,0,0,1,0,2021-06-23 21:00:00,Portugal,Kylian Mbappé,Kylian Mbappé,2021,6,23
2,250016833,H. Kane,Harry Kane,Harry Kane,England,39,0,ENG,4,11.5,1,29,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,3,"[{'mdId': '3', 'tId': '39', 'tSCode': 'England...","[{'mdId': '3', 'tId': '39', 'tSCode': 'England...",2.0,0.4,2,1,0,9,0.5,0.4,0,0,0,0,0,0,0,0,1,0,2021-06-22 21:00:00,Czech Republic,Harry Kane,Harry Kane,2021,6,22
3,250002096,R. Lewandowski,Robert Lewandowski,Robert Lewandowski,Poland,109,0,POL,4,11.5,1,11,3,7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,0,1,"[{'mdId': '3', 'tId': '109', 'tSCode': 'Poland...","[{'mdId': '3', 'tId': '109', 'tSCode': 'Poland...",3.5,0.6,5,1,0,3,0.2,0.3,0,0,0,0,0,0,0,0,1,0,2021-06-23 18:00:00,Sweden,Robert Lewandowski,Robert Lewandowski,2021,6,23
4,250010802,R. Lukaku,Romelu Lukaku,Romelu Lukaku,Belgium,13,0,BEL,4,11.1,1,55,3,12,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,In contention to start next game,0,2,1,"[{'mdId': '3', 'tId': '13', 'tSCode': 'Belgium...","[{'mdId': '3', 'tId': '13', 'tSCode': 'Belgium...",6.0,1.1,2,1,0,22,2.2,1.8,0,0,0,0,0,0,0,0,1,0,2021-06-21 21:00:00,Finland,Romelu Lukaku,Romelu Lukaku,2021,6,21


In [53]:
players_df.rename(columns={"t_name": "team_name"}, inplace=True)
players_df["league_name"] = "European Championship 2020"

In [54]:
players_df.shape

(620, 64)

In [55]:
# if players have multiple position choose the most common position
position = main_df.groupby("player").agg(position=('position',
                                                   lambda x: x.value_counts().sort_index().sort_values(ascending=False).index[0])).to_dict()["position"]

In [56]:
players_df["position"] = players_df["player"].apply(lambda x: position[x]
                                                    if x in position.keys() else "")

In [57]:
players_df.shape

(620, 65)

In [58]:
def generate_test_data(df):
    main_df_columns = ["player", "date", "league_name", 
                       "team_name", "opponent_name", "position",
                       "year", "month", "day"]
    return df[main_df_columns]

In [59]:
test = generate_test_data(players_df)

In [60]:
main_df = pd.concat([main_df, test])

In [61]:
main_df = pd.merge(main_df, players_df[["player", "value", "skill"]], on=["player"], how="left")

In [62]:
def get_agg_before(df):
    merged_df = df.copy()
    merged_df = pd.merge(merged_df, df, on=["player", "team_name"])
    merged_df = merged_df[merged_df['date_y'] < merged_df["date_x"]]
    merged_df["is_scoring"] = merged_df["goals_y"].apply(lambda x: 1 if x > 0 else 0)
    merged_df["is_assisting"] = merged_df["assists_y"].apply(lambda x: 1 if x > 0 else 0)
    merged_df_1 = merged_df.groupby(["player", "team_name", "date_x"]).agg(
                                                                        prev_mean_points=("points_y", "mean"),
                                                                        prev_mean_goals=("goals_y", "median"),
                                                                        prev_mean_assists=("assists_y", "mean"),
                                                                        prev_max_points=("points_y", "max"),
                                                                        prev_std_points=("points_y", "std"),
                                                                        prev_std_goals=("goals_y", "std"),
                                                                        prev_std_assists=("assists_y", "std"),
                                                                        prev_median_min=("min_y", "median"),
                                                                        prev_ratio_starter=("game_started_y", "mean"),
                                                                        count_played=("date_y","nunique"),
                                                                        goal_consistency=("is_scoring", "mean"),
                                                                        assist_consistency=("is_assisting", "mean"),
                                                                        clean_sheet_consistency=("clean_sheet_y", "mean")
                                                                       )
    
    merged_df_1 = merged_df_1.reset_index()
    merged_df_1.rename(columns={"date_x": "date"}, inplace=True)
    
    merged_df_2 = merged_df.groupby(["team_name", "date_x"]).agg(count_team_played=("date_y", "nunique"))
    merged_df_2 = merged_df_2.reset_index()
    merged_df_2.rename(columns={"date_x": "date"}, inplace=True)
    
    
    merged_df_3 = merged_df[merged_df["opponent_name_x"] == merged_df["opponent_name_y"]]
    merged_df_3 = merged_df_3.groupby(["player", "team_name", "date_x"]).agg(prev_max_goal_to_specific_opp=("goals_y", "max"),
                                                                           prev_max_points_to_specific_opp=("points_y", "max"),
                                                                           prev_mean_points_to_specific_opp=("points_y", "mean"))
    merged_df_3 = merged_df_3.reset_index()
    merged_df_3.rename(columns={"date_x": "date", "opponent_name_y": "opponent_name"}, inplace=True)
    
    merged_df = pd.merge(merged_df_1, merged_df_2, on=["team_name", "date"], how="left")
    merged_df = pd.merge(merged_df, merged_df_3, on=["player", "team_name", "date"], how="left")
    merged_df["prev_ratio_played"] = merged_df["count_played"] / merged_df["count_team_played"]
    return merged_df


In [63]:
agg = get_agg_before(main_df)

In [64]:
main_df.head()

Unnamed: 0,player,date,league_name,team_name,opponent_name,fantasy_points,min,position,goals,assists,shots,shots_on_goal,crosses,fouls_drawn,fouls_committed,tackles_won,interceptions,yellow_cards,red_cards,penalty_kick_miss,clean_sheet,goals_allowed,accurate_passes,shots_assisted,shootout_goals,shootout_misses,game_started,year,month,day,saves,wins,penalty_kick_saved,shootout_saves,points,value,skill
0,Artem Dzyuba,2019-06-08,European Championship Qualifiers,Russia,San Marino,73.66,90.0,F,4.0,1.0,16.0,7.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,23.0,3.0,0.0,0.0,1.0,2019,6,8,0.0,0.0,0.0,0.0,19.0,8.5,4.0
1,Cristiano Ronaldo,2019-09-10,European Championship Qualifiers,Portugal,Lithuania,56.7,79.0,F,4.0,0.0,8.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.0,3.0,0.0,0.0,1.0,2019,9,10,0.0,0.0,0.0,0.0,18.0,12.1,4.0
2,Memphis Depay,2019-03-21,European Championship Qualifiers,Netherlands,Belarus,54.0,90.0,F,2.0,2.0,6.0,4.0,6.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,40.0,3.0,0.0,0.0,1.0,2019,3,21,0.0,0.0,0.0,0.0,16.0,10.0,4.0
3,Denis Cheryshev,2019-10-13,European Championship Qualifiers,Russia,Cyprus,53.32,90.0,M,2.0,2.0,4.0,3.0,13.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,36.0,5.0,0.0,0.0,1.0,2019,10,13,0.0,0.0,0.0,0.0,19.0,8.0,3.0
4,Cristiano Ronaldo,2019-11-14,European Championship Qualifiers,Portugal,Lithuania,51.8,83.0,F,3.0,0.0,13.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,40.0,2.0,0.0,0.0,1.0,2019,11,14,0.0,0.0,0.0,0.0,14.0,12.1,4.0


In [65]:
main_df = main_df.sort_values(["player", "date"])

In [66]:
main_df['last_md_points'] = main_df.groupby("player")["points"].shift()
main_df['last_md_goals'] = main_df.groupby("player")["goals"].shift()
main_df['last_md_assists'] = main_df.groupby("player")["assists"].shift()

In [67]:
main_df = main_df.drop(["goals", "assists", "shots", "shots_on_goal", "crosses", "fouls_drawn", 
                        "fouls_committed", "tackles_won", "interceptions", "yellow_cards", "red_cards",
                        "penalty_kick_miss", "clean_sheet", "goals_allowed", "accurate_passes",
                        "shots_assisted", "shootout_goals", "shootout_misses", "game_started", "saves", "wins", 
                        "penalty_kick_saved", "shootout_saves"], axis=1)
main_df = pd.merge(main_df, agg, how="left", on=["player", "team_name", "date"])

In [68]:
main_df.columns

Index(['player', 'date', 'league_name', 'team_name', 'opponent_name',
       'fantasy_points', 'min', 'position', 'year', 'month', 'day', 'points',
       'value', 'skill', 'last_md_points', 'last_md_goals', 'last_md_assists',
       'prev_mean_points', 'prev_mean_goals', 'prev_mean_assists',
       'prev_max_points', 'prev_std_points', 'prev_std_goals',
       'prev_std_assists', 'prev_median_min', 'prev_ratio_starter',
       'count_played', 'goal_consistency', 'assist_consistency',
       'clean_sheet_consistency', 'count_team_played',
       'prev_max_goal_to_specific_opp', 'prev_max_points_to_specific_opp',
       'prev_mean_points_to_specific_opp', 'prev_ratio_played'],
      dtype='object')

In [69]:
players_df.to_csv("{}/interim/fantasy_euro.csv".format(DATA_DIR), index=False)

In [70]:
main_df

Unnamed: 0,player,date,league_name,team_name,opponent_name,fantasy_points,min,position,year,month,day,points,value,skill,last_md_points,last_md_goals,last_md_assists,prev_mean_points,prev_mean_goals,prev_mean_assists,prev_max_points,prev_std_points,prev_std_goals,prev_std_assists,prev_median_min,prev_ratio_starter,count_played,goal_consistency,assist_consistency,clean_sheet_consistency,count_team_played,prev_max_goal_to_specific_opp,prev_max_points_to_specific_opp,prev_mean_points_to_specific_opp,prev_ratio_played
0,Aaron Ramsey,2018-09-06 00:00:00,UEFA Nations League,Wales,Republic of Ireland,21.00,90.0,M,2018,9,6,7.0,8.5,3.0,,,,,,,,,,,,,,,,,,,,,
1,Aaron Ramsey,2018-09-06 00:00:00,UEFA Nations League,Wales,Republic of Ireland,21.00,90.0,M,2018,9,6,7.0,4.0,1.0,7.0,1.0,0.0,,,,,,,,,,,,,,,,,,
2,Aaron Ramsey,2018-09-09 00:00:00,UEFA Nations League,Wales,Denmark,13.50,90.0,M,2018,9,9,2.0,8.5,3.0,7.0,1.0,0.0,7.000000,1.0,0.0,7.0,0.000000,0.000000,0.0,90.0,1.000000,1.0,1.000000,0.0,0.000000,1.0,,,,1.000000
3,Aaron Ramsey,2018-09-09 00:00:00,UEFA Nations League,Wales,Denmark,13.50,90.0,M,2018,9,9,2.0,4.0,1.0,2.0,0.0,0.0,7.000000,1.0,0.0,7.0,0.000000,0.000000,0.0,90.0,1.000000,1.0,1.000000,0.0,0.000000,1.0,,,,1.000000
4,Aaron Ramsey,2018-10-11 00:00:00,International Friendlies,Wales,Spain,1.42,90.0,M,2018,10,11,2.0,8.5,3.0,2.0,0.0,0.0,4.500000,0.5,0.0,7.0,2.672612,0.534522,0.0,90.0,1.000000,2.0,0.500000,0.0,0.000000,2.0,,,,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9071,Çaglar Söyüncü,2021-03-27 00:00:00,European World Cup Qualifiers,Turkey,Norway,16.72,90.0,D,2021,3,27,12.0,5.0,2.0,1.0,0.0,0.0,3.200000,0.0,0.0,6.0,2.274078,0.000000,0.0,90.0,0.933333,15.0,0.000000,0.0,0.400000,27.0,,,,0.555556
9072,Çaglar Söyüncü,2021-03-30 00:00:00,European World Cup Qualifiers,Turkey,Latvia,3.54,90.0,D,2021,3,30,0.0,5.0,2.0,12.0,1.0,0.0,3.750000,0.0,0.0,12.0,3.109126,0.250000,0.0,90.0,0.937500,16.0,0.062500,0.0,0.437500,28.0,,,,0.571429
9073,Çaglar Söyüncü,2021-06-11 21:00:00,European Championship 2020,Turkey,Italy,0.00,0.0,D,2021,6,11,0.0,5.0,2.0,0.0,0.0,0.0,3.529412,0.0,0.0,12.0,3.144790,0.242536,0.0,90.0,0.941176,17.0,0.058824,0.0,0.411765,29.0,,,,0.586207
9074,Çaglar Söyüncü,2021-06-16 18:00:00,European Championship 2020,Turkey,Wales,0.00,59.0,D,2021,6,16,1.0,5.0,2.0,0.0,0.0,0.0,3.333333,0.0,0.0,12.0,3.162278,0.235702,0.0,90.0,0.888889,18.0,0.055556,0.0,0.388889,30.0,,,,0.600000


## 3.3 National Team FIFA Rank Dataset

In [71]:
main_df["date"].describe()

  main_df["date"].describe()


count                    9076
unique                    112
top       2020-10-14 00:00:00
freq                      251
first     2018-09-05 00:00:00
last      2021-06-23 21:00:00
Name: date, dtype: object

In [72]:
fifa_rank = pd.read_csv("{}/raw/historical-match-and-rank/fifa_ranking-2021-05-27.csv".format(DATA_DIR))

In [73]:
CUTOFF_DATE = "2018-01-01" 
fifa_rank = fifa_rank[fifa_rank["rank_date"] > CUTOFF_DATE]
fifa_rank = fifa_rank[["country_full", "rank", "total_points", "rank_date"]]
fifa_rank["rank_date"] = pd.to_datetime(fifa_rank["rank_date"])
fifa_rank = fifa_rank.sort_values(by=["country_full", "rank_date"])

In [74]:
# get fifa rank closest to the match date
df_with_rank = pd.merge(main_df[["team_name", "date"]], fifa_rank, how="left", left_on="team_name", right_on="country_full")
df_with_rank["time_diff"] = df_with_rank.apply(lambda x: (x['date']-x['rank_date']).total_seconds(), axis=1)
df_with_rank = df_with_rank[df_with_rank["time_diff"] > 0] # filter out rank after match
df_with_rank = df_with_rank.sort_values(by=["team_name", "time_diff"], ascending=False)
df_with_rank = df_with_rank.groupby(["team_name", "date"]).agg(prev_team_highest_rank=("rank", "min"),
                                                               team_rank=("rank", "last"),
                                                               team_total_points=("total_points", "last")).reset_index()

In [75]:
main_df = pd.merge(main_df, df_with_rank, how="left", on=["team_name", "date"])

In [76]:
# get fifa rank closest to the match date
df_with_rank = pd.merge(main_df[["opponent_name", "date"]], fifa_rank, how="left", left_on="opponent_name", right_on="country_full")
df_with_rank["time_diff"] = df_with_rank.apply(lambda x: (x['date']-x['rank_date']).total_seconds(), axis=1)
df_with_rank = df_with_rank[df_with_rank["time_diff"] > 0] # filter out rank after match
df_with_rank = df_with_rank.sort_values(by=["opponent_name", "time_diff"], ascending=False)
df_with_rank = df_with_rank.groupby(["opponent_name", "date"]).agg(prev_opponent_highest_rank=("rank", "min"),
                                                                   opponent_rank=("rank", "last"),
                                                                   opponent_total_points=("total_points", "last")).reset_index()

In [77]:
main_df = pd.merge(main_df, df_with_rank, how="left", on=["opponent_name", "date"])

In [78]:
main_df.head()

Unnamed: 0,player,date,league_name,team_name,opponent_name,fantasy_points,min,position,year,month,day,points,value,skill,last_md_points,last_md_goals,last_md_assists,prev_mean_points,prev_mean_goals,prev_mean_assists,prev_max_points,prev_std_points,prev_std_goals,prev_std_assists,prev_median_min,prev_ratio_starter,count_played,goal_consistency,assist_consistency,clean_sheet_consistency,count_team_played,prev_max_goal_to_specific_opp,prev_max_points_to_specific_opp,prev_mean_points_to_specific_opp,prev_ratio_played,prev_team_highest_rank,team_rank,team_total_points,prev_opponent_highest_rank,opponent_rank,opponent_total_points
0,Aaron Ramsey,2018-09-06,UEFA Nations League,Wales,Republic of Ireland,21.0,90.0,M,2018,9,6,7.0,8.5,3.0,,,,,,,,,,,,,,,,,,,,,,18,19,1536,26,29,1484
1,Aaron Ramsey,2018-09-06,UEFA Nations League,Wales,Republic of Ireland,21.0,90.0,M,2018,9,6,7.0,4.0,1.0,7.0,1.0,0.0,,,,,,,,,,,,,,,,,,,18,19,1536,26,29,1484
2,Aaron Ramsey,2018-09-09,UEFA Nations League,Wales,Denmark,13.5,90.0,M,2018,9,9,2.0,8.5,3.0,7.0,1.0,0.0,7.0,1.0,0.0,7.0,0.0,0.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,1.0,,,,1.0,18,19,1536,9,9,1580
3,Aaron Ramsey,2018-09-09,UEFA Nations League,Wales,Denmark,13.5,90.0,M,2018,9,9,2.0,4.0,1.0,2.0,0.0,0.0,7.0,1.0,0.0,7.0,0.0,0.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,1.0,,,,1.0,18,19,1536,9,9,1580
4,Aaron Ramsey,2018-10-11,International Friendlies,Wales,Spain,1.42,90.0,M,2018,10,11,2.0,8.5,3.0,2.0,0.0,0.0,4.5,0.5,0.0,7.0,2.672612,0.534522,0.0,90.0,1.0,2.0,0.5,0.0,0.0,2.0,,,,1.0,18,19,1536,6,9,1597


In [79]:
main_df.to_csv("{}/interim/main.csv".format(DATA_DIR), index=False)

In [80]:
historical_matches = pd.read_csv("{}/raw/historical-match-and-rank/international-footbal-match.csv".format(DATA_DIR))

In [81]:
historical_matches["date"] = pd.to_datetime(historical_matches["date"])

In [82]:
historical_matches  = historical_matches[historical_matches["date"] > "2010-01-01"]

In [83]:
historical_matches["match"] = historical_matches["home_team"] + ',' + historical_matches['away_team']

In [84]:
historical_matches["match"] = historical_matches["match"].apply(lambda x: ' '.join(sorted(x.split(","))))

In [85]:
def get_match_result(df):
    if df["home_score"] > df["away_score"]:
        return df["home_team"]
    elif df["away_score"] > df["home_score"]:
        return df["away_team"]
    else:
        return "Draw"

In [86]:
historical_matches["result"] = historical_matches.apply(get_match_result, axis=1)

In [87]:
historical_matches["margin"] = historical_matches.apply(lambda x: abs(x["home_score"] - x["away_score"]), axis=1)

In [88]:
historical_matches.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,match,result,margin
31892,2010-01-02,Iran,North Korea,1,0,Friendly,Doha,Qatar,True,Iran North Korea,Iran,1
31893,2010-01-02,Qatar,Mali,0,0,Friendly,Doha,Qatar,False,Mali Qatar,Draw,0
31894,2010-01-02,Syria,Zimbabwe,6,0,Friendly,Kuala Lumpur,Malaysia,True,Syria Zimbabwe,Syria,6
31895,2010-01-02,Yemen,Tajikistan,0,1,Friendly,Sana'a,Yemen,False,Tajikistan Yemen,Tajikistan,1
31896,2010-01-03,Angola,Gambia,1,1,Friendly,Vila Real de Santo António,Portugal,True,Angola Gambia,Draw,0


In [89]:
def get_all_historical_matches(df, team, opp, date):
    name_tuple = ' '.join(sorted([team, opp]))
    hist = df[(df['match'] == name_tuple) & (df["date"] < date)]
    hth = hist["result"].value_counts()
    team_win, opp_win, draw = 0, 0, 0
    if "Draw" in hth.keys():
        draw = hth["Draw"]
    if team in hth.keys():
        team_win = hth[team]
    if opp in hth.keys():
        opp_win = hth[opp]
    max_margin = hist["margin"].max()
    team_score = hist[hist['home_team'] == team]["home_score"].sum() + hist[hist['away_team'] == team]["away_score"].sum()
    opp_score = hist[hist['home_team'] == opp]["home_score"].sum() + hist[hist['away_team'] == opp]["away_score"].sum()
    return pd.Series([team_win, opp_win, draw, team_score, opp_score, max_margin])

In [90]:
main_df[["hth_team_win", "hth_opp_win", "hth_draw", "hth_team_score", "hth_opp_score", "htt_max_margin"]] = main_df.apply(lambda x: get_all_historical_matches(historical_matches, x["team_name"], x["opponent_name"], x["date"]), axis=1)

## 3.4 Transfermarkt Dataset

### 3.4.1 National Team Level

In [91]:
euro = pd.read_excel("{}/raw/transfermarkt/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=0)
nations_league = pd.read_excel("{}/raw/transfermarkt/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=1)
euro_qual = pd.read_excel("{}/raw/transfermarkt/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=2)
wc_euro_qual = pd.read_excel("{}/raw/transfermarkt/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=3)

In [92]:
nations_league["league_name"] = "UEFA Nations League"
euro_qual["league_name"] = "European Championship Qualifiers"
wc_euro_qual["league_name"] = "European World Cup Qualifiers"
euro["league_name"] = "European Championship 2020"

In [93]:
euro = euro.drop(["EURO participations"], axis=1)
euro.rename(columns={"Average Age": "Age"}, inplace=True)

In [94]:
def preprocess_market_value(text):
    match = re.sub("€", "", text)
    match = re.search("(\d+(?:\.\d+)?)", text)
    val = float(match.group())
    num = text[match.end():]
    if num == "bn":
        val *= 10e9
    elif num == "m":
        val *= 10e6
    elif num == "Th.":
        val *= 10e3
    return val

In [95]:
mv_df = pd.concat([nations_league, euro_qual, wc_euro_qual, euro])

In [96]:
mv_df["market_value"] = mv_df["Market Value"].apply(preprocess_market_value)
mv_df["mean_market_value"] = mv_df["Average Market Value"].apply(preprocess_market_value)

In [97]:
mv_df = mv_df.drop_duplicates(subset=["Club", "league_name"], keep="first")
mv_df = mv_df[["Club", "league_name", "Age", "market_value", "mean_market_value"]]
mv_df.rename(columns={"Club" : "team_name", "Age": "mean_squad_age"}, inplace=True)

In [98]:
main_df = pd.merge(main_df, mv_df, how="left", on=["team_name", "league_name"])

In [99]:
main_df.rename(columns={"mean_squad_age" : "team_mean_squad_age",
                        "mean_market_value": "team_mean_market_value",
                        "market_value" : "team_market_value"
                       }, inplace=True)

In [100]:
mv_df.rename(columns={"team_name" : "opponent_name"}, inplace=True)

In [101]:
main_df = pd.merge(main_df, mv_df, how="left", on=["opponent_name", "league_name"])

In [102]:
main_df.rename(columns={"mean_squad_age" : "opponent_mean_squad_age",
                        "mean_market_value": "opponent_mean_market_value",
                        "market_value" : "opponent_market_value"
                       }, inplace=True)

## 3.6 FIFA Dataset

In [103]:
fifa_21 =  pd.read_csv("{}/raw/fifa/fifa-players_21.csv".format(DATA_DIR))
fifa_20 =  pd.read_csv("{}/raw/fifa/players_20.csv".format(DATA_DIR))
fifa_19 =  pd.read_csv("{}/raw/fifa/players_19.csv".format(DATA_DIR))
fifa_18 =  pd.read_csv("{}/raw/fifa/players_18.csv".format(DATA_DIR))

In [104]:
fifa_21["nationality"] = fifa_21["nationality"].apply(lambda x: x.strip())
fifa_20["nationality"] = fifa_20["nationality"].apply(lambda x: x.strip())
fifa_19["nationality"] = fifa_19["nationality"].apply(lambda x: x.strip())
fifa_18["nationality"] = fifa_18["nationality"].apply(lambda x: x.strip())

In [105]:
fifa_21 = fifa_21[fifa_21['nationality'].isin(main_df['team_name'].unique())]
fifa_20 = fifa_20[fifa_20['nationality'].isin(main_df['team_name'].unique())]
fifa_19 = fifa_19[fifa_19['nationality'].isin(main_df['team_name'].unique())]
fifa_18 = fifa_18[fifa_18['nationality'].isin(main_df['team_name'].unique())]

In [106]:
fifa_21['len_name'] = fifa_21["long_name"].apply(lambda x: len(x.split(" ")))
fifa_21['len_short_name'] = fifa_21["short_name"].apply(lambda x: len(x.split(" ")))
fifa_21['min_char_in_name'] = fifa_21['long_name'].apply(lambda x: min(len(y) for y in x.split()))
fifa_21['min_char_in_short_name'] = fifa_21['short_name'].apply(lambda x: min(len(y) for y in x.split()))

fifa_20['len_name'] = fifa_20["long_name"].apply(lambda x: len(x.split(" ")))
fifa_20['len_short_name'] = fifa_20["short_name"].apply(lambda x: len(x.split(" ")))
fifa_20['min_char_in_name'] = fifa_20['long_name'].apply(lambda x: min(len(y) for y in x.split()))
fifa_20['min_char_in_short_name'] = fifa_20['short_name'].apply(lambda x: min(len(y) for y in x.split()))

fifa_19['len_name'] = fifa_19["long_name"].apply(lambda x: len(x.split(" ")))
fifa_19['len_short_name'] = fifa_19["short_name"].apply(lambda x: len(x.split(" ")))
fifa_19['min_char_in_name'] = fifa_19['long_name'].apply(lambda x: min(len(y) for y in x.split()))
fifa_19['min_char_in_short_name'] = fifa_19['short_name'].apply(lambda x: min(len(y) for y in x.split()))

fifa_18['len_name'] = fifa_18["long_name"].apply(lambda x: len(x.split(" ")))
fifa_18['len_short_name'] = fifa_18["short_name"].apply(lambda x: len(x.split(" ")))
fifa_18['min_char_in_name'] = fifa_18['long_name'].apply(lambda x: min(len(y) for y in x.split()))
fifa_18['min_char_in_short_name'] = fifa_18['short_name'].apply(lambda x: min(len(y) for y in x.split()))

In [107]:
def join_tuple_string(strings_tuple):
    return ' '.join(strings_tuple)

def create_unigram_bigram_trigram_quadgram(text, x):
    token_list = text.split(" ")
    tuple_gram = list(everygrams(token_list, 2, x))
    result = map(join_tuple_string, tuple_gram) 
    if x > 2:
         return list(result) + [' '.join(token_list[::len(token_list)-1])]
    return list(result)

def calculate_closest_token(df):
    everygram = create_unigram_bigram_trigram_quadgram(df["long_name"], df['len_name'])
    closest = difflib.get_close_matches(df["short_name"], everygram, n=1)
    return ''.join(closest)

In [108]:
fifa_21['closest_match'] = fifa_21.apply(calculate_closest_token, axis=1)
fifa_20['closest_match'] = fifa_20.apply(calculate_closest_token, axis=1)
fifa_19['closest_match'] = fifa_19.apply(calculate_closest_token, axis=1)
fifa_18['closest_match'] = fifa_18.apply(calculate_closest_token, axis=1)

In [109]:
name_mapping = {
 'Aleksandar Dragović': 'Aleksandar Dragovic',
 'Aleš Matějů': 'Ales Mateju',
 'Alex Král': 'Alex Kral',
 'Anatoliy Trubin': 'Anatolii Trubin',
 'András Schäfer': 'Andras Schafer',
 'Dean Cornelius': 'Andreas Cornelius',
 'Andrej Kramarić': 'Andrej Kramaric',
 'Ante Rebić': 'Ante Rebic',
 'Bartosz Bereszyński': 'Bartosz Bereszynski',
 'Bećir Omeragić': 'Becir Omeragic',
 'Bogdan Mykhaylychenko': 'Bogdan Mykhaylichenko',
 'Borna Barišić': 'Borna Barisic',
 'B. Embolo': 'Breel Embolo',
 'Bruno Petković': 'Bruno Petkovic',
 'Burak Yılmaz': 'Burak Yilmaz',
 'Che Adams': 'Che Adams',
 'D.Rice': "Declan Rice",
 'Christian Günter': 'Chris Gunter',
 'Liam Craig Gordon': 'Craig Gordon',
 'Azpilicueta': 'César Azpilicueta',
 'Anga Dedryck Boyata': 'Dedryck Boyata',
 'Davor Lovren': 'Dejan Lovren',
 'Lemi Zakaria': 'Denis Zakaria',
 'Diego Javier Llorente': 'Diego Llorente',
 'Dmitriy Barinov': 'Dimitri Barinov',
 'Domagoj Bradarić': 'Domagoj Bradaric',
 'Dominik Livaković': 'Dominik Livakovic',
 'van de Beek': 'Donny van de Beek',
 'Dorukhan Toköz': 'Dorukhan Tokoz',
 'Duje Ćaleta-Car': 'Duje Caleta-Car',
 'Dušan Kuciak': 'Dusan Kuciak',
 'Miklós Sigér': 'Dávid Miklós Sigér',
 'Eray Ervin Cömert': 'Eray Cömert',
 'Frederik Rønnow': 'Frederik Rönnow',
 'Georgiy Bushchan': 'Georgi Bushchan',
 'Georgiy Dzhikiya': 'Georgi Dzhikiya',
 'Glen Adjei Kamara': 'Glen Kamara',
 'Greg Taylor': 'Greg Taylor',
 'Hakan Çalhanoğlu': 'Hakan Calhanoglu',
 'Hakan Calhanoglu':'Hakan Calhanoglu',
 'Haris Seferović': 'Haris Seferovic',
 'İlkay Gündoğan': 'Ilkay Gündogan',
 'İrfan Can Kahveci': 'Irfan Kahveci',
 'Ivan Perišić': 'Ivan Perisic',
 'Jakub Holúbek': 'Jakub Holubek',
 'Jamal Musiala': 'Jamal Musiala',
 'Alexander Lawrence': 'James Alexander Lawrence',
 'Jan Bořil': 'Jan Boril',
 'Jens Jønsson': 'Jens Jonsson',
 'Jere Juhani Uronen': 'Jere Uronen',
 'Jiří Pavlenka': 'Jirí Pavlenka',
 'Joakim Mæhle': 'Joakim Maehle',
 'Joseff Morrell': 'Joe Morrell',
 'Jordi Alba Ramos': 'Jordi Alba',
 'Josip Juranović': 'Josip Juranovic',
 'Palhinha': 'João Palhinha',
 'Jérémy Doku': 'Jéremy Doku',
 'Kamil Jóźwiak': 'Kamil Jozwiak',
 'Karol Świderski': 'Karol Swiderski',
 'Stefan Ristovski': 'Stefan Spirovski',
 'Kurt Happy Zouma': 'Kurt Zouma',
 'Lasse Schøne': 'Lasse Schöne',
 'Lovre Kalinić': 'Lovre Kalinic',
 'Lucas Hernández Pi': 'Lucas Hernández',
 'Luka Modrić': 'Luka Modric',
 'Lukáš Haraslín': 'Lukas Haraslin',
 'Lukáš Masopust': 'Lukas Masopust',
 'Łukasz Fabiański': 'Lukasz Fabianski',
 'Lukáš Hrádecký': 'Lukás Hrádecky',
 'Manuel Viana': 'Manuel Akanji',
 'Marcelo Brozović': 'Marcelo Brozovic',
 'Marcus Danielsson': 'Marcus Danielson',
 'Marek Hamšík': 'Marek Hamsik',
 'Marko Arnautović': 'Marko Arnautovic',
 'Martin Dúbravka': 'Martin Dubravka',
 'Matěj Vydra': 'Matej Vydra',
 'Mateo Kovačić': 'Mateo Kovacic',
 'Matúš Bero': 'Matús Bero',
 'Michael Krmenčík': 'Michal Krmencik',
 'Michael Gurski': 'Michal Duris',
 'Michał Helik': 'Michal Helik',
 'Carl Mikael Lustig': 'Mikael Lustig',
 'Oyarzabal': 'Mikel Oyarzabal',
 'Milan Škriniar': 'Milan Skriniar',
 'Mile Svilar': 'Mile Skoric',
 'Mislav Oršić': 'Mislav Orsic',
 'M. Kean': 'Moise Kean',
 'Mykola Matvienko': 'Mykola Matvyenko',
 'Nemanja Nikolić': 'Nemanja Nikolics',
 'N. Hämäläinen': 'Niko Hämäläinen',
 'Nikola Vlašić': 'Nikola Vlasic',
 'Nélson Cabral Semedo': 'Nélson Semedo',
 'Okay Yokuşlu': 'Okay Yokuslu',
 'Aleksandr Zhirov': 'Oleksandr Zubkov',
 'Ondřej Čelůstka': 'Ondrej Celustka',
 'Ondřej Kúdela': 'Ondrej Kudela',
 'Orkun Kökçü': 'Orkun Kökcü',
 'O. Kabak': 'Ozan Kabak',
 'Patrik Hrošovský': 'Patrik Hrosovsky',
 'Pavel Kadeřábek': 'Pavel Kaderábek',
 'Petr Ševčík': 'Petr Sevcik',
 'Philip Foden': 'Phil Foden',
 'Leo Bengtsson': 'Pierre Bengtsson',
 'Piotr Zieliński': 'Piotr Zielinski',
 'Przemysław Frankowski': 'Przemyslaw Frankowski',
 'Przemysław Płacheta': 'Przemyslaw Placheta',
 'Raphaël Varane': 'Raphael Varane',
 'Renato Júnior Luz Sanches': 'Renato Sanches',
 'Róbert Boženík': 'Robert Bozenik',
 'Ruslan Malinovskyi': 'Ruslan Malinovskiy',
 'Ryan Jiro Gravenberch': 'Ryan Gravenberch',
 'Saša Kalajdžić': 'Sasa Kalajdzic',
 'Sergiy Kryvtsov': 'Serhii Kryvtsov',
 'Šime Vrsaljko': 'Sime Vrsaljko',
 'Tamás Cseri': 'Tamas Cseri',
 'Taylan Antalyalı': 'Taylan Antalyali',
 'Tomáš Pekhart': 'Tomas Pekhart',
 'Tomáš Souček': 'Tomas Soucek',
 'Tomáš Suslov': 'Tomas Suslov',
 'Tomasz Kędziora': 'Tomasz Kedziora',
 'Thomas Holmes': 'Tomás Holes',
 'Tomáš Vaclík': 'Tomás Vaclik',
 'Uğurcan Çakır': 'Ugurcan Çakir',
 'Umut Meraş': 'Umut Meras',
 'Cengiz Umut Meraş': 'Umut Meras',
 'Vitaliy Mykolenko': 'Vitalii Mykolenko',
 'Vladimír Coufal': 'Vladimir Coufal',
 'Vladimír Darida': 'Vladimir Darida',
 'William Silva de Carvalho': 'William Carvalho',
 'Yuriy Zhirkov': 'Yuri Zhirkov',
 'Yusuf Yazıcı': 'Yusuf Yazici',
 'Çağlar Söyüncü': 'Çaglar Söyüncü',
 'C. Eriksen': "Christian Eriksen",
 'Alexander Walke': 'Alexander Isak',
 'Aleksandr Sobolev': 'Alexander Sobolev',
 'Antonín Barák': 'Antonin Barak',
 'Benjamin Cabango': 'Ben Cabango',
 'Bogdan Mykhaylichenko': 'Bogdan Mykhaylichenko',
 'Borna Barisic': 'Borna Barisic',
 'Mikael Lustig': 'Carl Mikael Lustig',
 'Che Adams': 'Che Adams',
 'Chris Gunter': 'Chris Gunter',
 'Christian Gentner': 'Christian Günter',
 'Daniel Avramovski': 'Daniel Avramovski',
 'Declan Rice': 'Declan Rice',
 'Dejan Kulusevski': 'Dejan Kulusevski',
 'Diogo José': 'Diogo Jota',
 'Domagoj Vida': 'Domagoj Vida',
 'Dominik Livakovic': 'Dominik Livakovic',
 'Dylan Levitt': 'Dylan Levitt',
 'Dávid Sigér': 'Dávid Sigér',
 'Eduard Sobol': 'Eduard Sobol',
 'Eljif Elmas': 'Eljif Elmas',
 'Eric García Martret': 'Eric García',
 'Ethan Ampadu': 'Ethan Ampadu',
 'Ferhan Hasani': 'Ferhan Hasani',
 'Filip Helander': 'Filip Holender',
 'Greg Taylor': 'Greg Taylor',
 'Halil Dervişoğlu': 'Halil Dervisoglu',
 'Irfan Kahveci': 'Irfan Can Kahveci',
 'Ivan Trickovski': 'Ivan Trickovski',
 'Jakub Świerczok': 'Jakub Swierczok',
 'Jamal Musiala': 'Jamal Musiala',
 'James Lawrence': 'Jamie Lawrence',
 'Jens-Lys Cajuste': 'Jens Cajuste',
 'Josip Juranovic': 'Josip Juranovic',
 'Jude Bellingham': 'Jude Bellingham',
 'Kacper Trelowski': 'Kacper Kozlowski',
 'Kamil Piątkowski': 'Kamil Piatkowski',
 'Leo Väisänen': 'Leo Väisänen',
 'Łukasz Skorupski': 'Lukasz Skorupski',
 'Lukáš Provod': 'Lukáš Provod',
 'Lyndon Dykes': 'Lyndon Dykes',
 'Magomed Ozdoev': 'Magomed Ozdoev',
 'Mário Fernandes': 'Mario Fernandes',
 'Mehmet Zeki Çelik': 'Mehmet Zeki Çelik',
 'Merih Demiral': 'Merih Demiral',
 'Mert Müldür': 'Mert Müldür',
 'Paweł Dawidowicz': 'Pawel Dawidowicz',
 'Petr Sevcik': 'Petr Sevcik',
 'Pyry Soiri': 'Pyry Soiri',
 'Rabbi Matondo': 'Rabbi Matondo',
 'Rıdvan Yılmaz': 'Ridvan Yilmaz',
 'Robert Bozenik': 'Robert Bozenik',
 'Robert Sanchez': 'Robert Sánchez',
 'Serhiy Sydorchuk': 'Serhiy Sydorchuk',
 'Tamas Cseri': 'Tamas Cseri',
 'Tomáš Kalas': 'Tomas Kalas',
 'Tomás Holes': 'Tomás Holes',
 'Tomáš Koubek': 'Tomáš Souček',
 'Ugurcan Çakir': 'Ugurcan Cakir',
 'Vitalii Mykolenko': 'Vitaliy Mykolenko',
 'Vladimir Coufal': 'Vladimir Coufal',
 'Vladimir Darida': 'Vladimír Darida',
 'Vlatko Stojanovski': 'Vlatko Stojanovski',
 'Wojciech Szczęsny': 'Wojciech Szczesny',
 'Simon Thorup Kjær': "Simon Kjaer",
 'Simon Kjær': "Simon Kjaer",
 "Simon Kjær": "Simon Kjaer",
 'Ádám Lang': 'Ádám Lang',
 'Luís Gayà': 'José Gayá',
 'João Félix Sequeira': 'João Félix',
 'De Gea':'David de Gea',
 'Ferrán Torres': 'Ferran Torres',
 'Mehmet Çelik': 'Mehmet Zeki Çelik',
 'Can Kahveci': 'Irfan Can Kahveci',
 'Mert Günok': 'Fehmi Mert Günok',
 'J. Stryger Larsen': 'Jens Stryger Larsen',
 'Jens Larsen': 'Jens Stryger Larsen',
 'José Guerreiro': 'Raphael Guerreiro',
 'D. Sow': 'Djibril Sow',
 'Ben Yedder': 'Wissam Ben Yedder',
 'Lukás Hrádecky': 'Lukas Hradecky',
 'Mikael Lustig' : 'Carl Mikael Lustig',
 'Thiago':'Thiago Alcántara',
 'Vladimír Darida' : "Vladimír Darida",
 'Tomáš Hubočan': 'Tomas Hubocan',
 'Anga Boyata': 'Dedryck Boyata',
 'Ilkay Gündogan': 'İlkay Gündoğan',
 "Morata":'Álvaro Morata',
 "I. Perišić" :"Ivan Perišić",
 "Andrew Robertson": "Andy Robertson",
 "Peter McLaughlin": "Jon McLaughlin",
 "Iván Rodríguez": "Ricardo Rodríguez",
 "Landry Mvogo":"Yvon Mvogo",
 "Alexander Granlund": "Albin Granlund"
}

In [110]:
def heuristic_match(df):
    if df["len_short_name"] == 1:
        return df["short_name"]
    if len(df["closest_match"].split()) == 0:
        return df["short_name"]
    elif df["min_char_in_short_name"] >= 4:
        return df["short_name"]
    elif df["len_name"] > 3:
        return df["closest_match"]
    elif df["min_char_in_name"] >= 3:
        return df["closest_match"]
    else:
        return df["long_name"]

In [111]:
fifa_21["player"] = fifa_21.apply(heuristic_match, axis=1)
fifa_20["player"] = fifa_20.apply(heuristic_match, axis=1)
fifa_19["player"] = fifa_19.apply(heuristic_match, axis=1)
fifa_18["player"] = fifa_18.apply(heuristic_match, axis=1)

In [112]:
def map_name(name):
    global name_mapping
    if name in name_mapping.keys():
        return name_mapping[name]
    else:
        return name

In [113]:
fifa_21['player'] = fifa_21['player'].apply(map_name)
fifa_20['player'] = fifa_20['player'].apply(map_name)
fifa_19['player'] = fifa_19['player'].apply(map_name)
fifa_18['player'] = fifa_18['player'].apply(map_name)

In [114]:
col = ["player", "nationality", "work_rate", "age", "height_cm", "weight_kg", "league_rank", "overall", "potential", "wage_eur", 
       "international_reputation", "pace", "shooting", "passing", "dribbling", "defending",  'nation_position', 'nation_jersey_number',
       "physic", "attacking_crossing", "attacking_finishing", "attacking_heading_accuracy", "attacking_short_passing",
       "attacking_volleys", "skill_dribbling", "skill_curve", "skill_fk_accuracy", "skill_long_passing", "skill_ball_control",
       "movement_acceleration", "movement_sprint_speed", "movement_agility", "movement_reactions", "movement_balance", "power_shot_power",
       "power_jumping", "power_stamina","power_strength", "power_long_shots", "mentality_aggression", "mentality_interceptions",
       "mentality_positioning", "mentality_vision", "mentality_penalties", "mentality_composure", "defending_standing_tackle",
       "defending_sliding_tackle", "goalkeeping_diving", "goalkeeping_handling", "goalkeeping_kicking", "goalkeeping_positioning", "goalkeeping_reflexes"]

In [115]:
fifa_21 = fifa_21[col]
fifa_20 = fifa_20[col]
fifa_19 = fifa_19[col]
fifa_18 = fifa_18[col]

In [116]:
fifa_21.rename(columns={"nationality": "team_name"}, inplace=True)
fifa_20.rename(columns={"nationality": "team_name"}, inplace=True)
fifa_19.rename(columns={"nationality": "team_name"}, inplace=True)
fifa_18.rename(columns={"nationality": "team_name"}, inplace=True)

In [117]:
fifa_21 = fifa_21.drop_duplicates(subset=["player", "team_name"], keep="first")
fifa_20 = fifa_20.drop_duplicates(subset=["player", "team_name"], keep="first")
fifa_19 = fifa_19.drop_duplicates(subset=["player", "team_name"], keep="first")
fifa_18 = fifa_18.drop_duplicates(subset=["player", "team_name"], keep="first")

In [118]:
fifa_21["year"] = 2021
fifa_20["year"] = 2020
fifa_19["year"] = 2019
fifa_18["year"] = 2018

In [119]:
fifa = pd.concat([fifa_21, fifa_20, fifa_19, fifa_18])

In [120]:
main_df = pd.merge(main_df, fifa, how="left", on=["player", "team_name", "year"])

## Feature Engineering

In [121]:
main_df.head()

Unnamed: 0,player,date,league_name,team_name,opponent_name,fantasy_points,min,position,year,month,day,points,value,skill,last_md_points,last_md_goals,last_md_assists,prev_mean_points,prev_mean_goals,prev_mean_assists,prev_max_points,prev_std_points,prev_std_goals,prev_std_assists,prev_median_min,prev_ratio_starter,count_played,goal_consistency,assist_consistency,clean_sheet_consistency,count_team_played,prev_max_goal_to_specific_opp,prev_max_points_to_specific_opp,prev_mean_points_to_specific_opp,prev_ratio_played,prev_team_highest_rank,team_rank,team_total_points,prev_opponent_highest_rank,opponent_rank,opponent_total_points,hth_team_win,hth_opp_win,hth_draw,hth_team_score,hth_opp_score,htt_max_margin,team_mean_squad_age,team_market_value,team_mean_market_value,opponent_mean_squad_age,opponent_market_value,opponent_mean_market_value,work_rate,age,height_cm,weight_kg,league_rank,overall,potential,wage_eur,international_reputation,pace,shooting,passing,dribbling,defending,nation_position,nation_jersey_number,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,Aaron Ramsey,2018-09-06,UEFA Nations League,Wales,Republic of Ireland,21.0,90.0,M,2018,9,6,7.0,8.5,3.0,,,,,,,,,,,,,,,,,,,,,,18,19,1536,26,29,1484,0.0,2.0,2.0,0.0,4.0,3.0,25.6,1767500000.0,68000000.0,25.5,776500000.0,29900000.0,High/High,26.0,183.0,76.0,1.0,82.0,83.0,130000.0,3.0,68.0,75.0,79.0,81.0,65.0,RF,10.0,75.0,75.0,72.0,58.0,84.0,79.0,81.0,70.0,70.0,80.0,82.0,67.0,68.0,76.0,81.0,75.0,81.0,67.0,89.0,69.0,74.0,73.0,69.0,82.0,80.0,75.0,81.0,70.0,67.0,6.0,11.0,5.0,10.0,8.0
1,Aaron Ramsey,2018-09-06,UEFA Nations League,Wales,Republic of Ireland,21.0,90.0,M,2018,9,6,7.0,4.0,1.0,7.0,1.0,0.0,,,,,,,,,,,,,,,,,,,18,19,1536,26,29,1484,0.0,2.0,2.0,0.0,4.0,3.0,25.6,1767500000.0,68000000.0,25.5,776500000.0,29900000.0,High/High,26.0,183.0,76.0,1.0,82.0,83.0,130000.0,3.0,68.0,75.0,79.0,81.0,65.0,RF,10.0,75.0,75.0,72.0,58.0,84.0,79.0,81.0,70.0,70.0,80.0,82.0,67.0,68.0,76.0,81.0,75.0,81.0,67.0,89.0,69.0,74.0,73.0,69.0,82.0,80.0,75.0,81.0,70.0,67.0,6.0,11.0,5.0,10.0,8.0
2,Aaron Ramsey,2018-09-09,UEFA Nations League,Wales,Denmark,13.5,90.0,M,2018,9,9,2.0,8.5,3.0,7.0,1.0,0.0,7.0,1.0,0.0,7.0,0.0,0.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,1.0,,,,1.0,18,19,1536,9,9,1580,0.0,0.0,0.0,0.0,0.0,,25.6,1767500000.0,68000000.0,27.8,3107000000.0,119500000.0,High/High,26.0,183.0,76.0,1.0,82.0,83.0,130000.0,3.0,68.0,75.0,79.0,81.0,65.0,RF,10.0,75.0,75.0,72.0,58.0,84.0,79.0,81.0,70.0,70.0,80.0,82.0,67.0,68.0,76.0,81.0,75.0,81.0,67.0,89.0,69.0,74.0,73.0,69.0,82.0,80.0,75.0,81.0,70.0,67.0,6.0,11.0,5.0,10.0,8.0
3,Aaron Ramsey,2018-09-09,UEFA Nations League,Wales,Denmark,13.5,90.0,M,2018,9,9,2.0,4.0,1.0,2.0,0.0,0.0,7.0,1.0,0.0,7.0,0.0,0.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,1.0,,,,1.0,18,19,1536,9,9,1580,0.0,0.0,0.0,0.0,0.0,,25.6,1767500000.0,68000000.0,27.8,3107000000.0,119500000.0,High/High,26.0,183.0,76.0,1.0,82.0,83.0,130000.0,3.0,68.0,75.0,79.0,81.0,65.0,RF,10.0,75.0,75.0,72.0,58.0,84.0,79.0,81.0,70.0,70.0,80.0,82.0,67.0,68.0,76.0,81.0,75.0,81.0,67.0,89.0,69.0,74.0,73.0,69.0,82.0,80.0,75.0,81.0,70.0,67.0,6.0,11.0,5.0,10.0,8.0
4,Aaron Ramsey,2018-10-11,International Friendlies,Wales,Spain,1.42,90.0,M,2018,10,11,2.0,8.5,3.0,2.0,0.0,0.0,4.5,0.5,0.0,7.0,2.672612,0.534522,0.0,90.0,1.0,2.0,0.5,0.0,0.0,2.0,,,,1.0,18,19,1536,6,9,1597,0.0,0.0,0.0,0.0,0.0,,,,,,,,High/High,26.0,183.0,76.0,1.0,82.0,83.0,130000.0,3.0,68.0,75.0,79.0,81.0,65.0,RF,10.0,75.0,75.0,72.0,58.0,84.0,79.0,81.0,70.0,70.0,80.0,82.0,67.0,68.0,76.0,81.0,75.0,81.0,67.0,89.0,69.0,74.0,73.0,69.0,82.0,80.0,75.0,81.0,70.0,67.0,6.0,11.0,5.0,10.0,8.0


In [122]:
main_df["diff_team_points"] = main_df['team_total_points'] - main_df['opponent_total_points']
main_df["diff_team_ranking"]= main_df['team_rank'] - main_df['opponent_rank']
main_df["diff_team_market_value"] = main_df['team_market_value'] - main_df['opponent_market_value']
main_df["diff_team_mean_market_value"] = main_df['team_mean_market_value'] - main_df['opponent_mean_market_value']
main_df["diff_team_mean_squad_age"] = main_df['team_mean_squad_age'] - main_df['opponent_mean_squad_age']
main_df["diff_team_ranking"]= main_df['team_rank'] - main_df['opponent_rank']
main_df["is_senior"] = main_df["age"] > main_df["team_mean_squad_age"]
main_df["is_imbalanced"]= main_df['diff_team_ranking'].apply(lambda x: abs(x) > 10)
main_df["gap_to_potential"] = main_df["potential"] - main_df["overall"]

main_df["roi"] = main_df["points"] / main_df["value"]
main_df["more_likely_to_win"] = (main_df["hth_team_win"] - main_df["hth_opp_win"]) >= 2

main_df["work_rate"] = main_df['work_rate'].fillna("")
main_df[["attacking_work_rate", "defending_work_rate"]] = main_df["work_rate"].apply(lambda x: pd.Series(x.split("/")))
main_df = main_df.drop(["work_rate"], axis=1)

In [123]:
main_df.drop_duplicates(subset=["player", "date"], inplace=True)

In [124]:
main_df.to_csv("{}/processed/dataset_md3.csv".format(DATA_DIR), index=False)