# Building Dataset for UEFA Euro 2020 Fantasy Football

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import json

In [2]:
from nltk import everygrams
import difflib

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
DATA_DIR = '../data'

## Euro 2020 Dataset

In [5]:
with open('{}/raw/players.json'.format(DATA_DIR))as f:
    data = json.load(f)
    players = data["data"]["value"]["playerList"]

In [6]:
def construct_dataframe(list_of_players):
    player_name, team_name, opponent_name, value, skill, selected_perc, date = [], [] ,[], [], [], [], []
    for player in list_of_players:
        player_name.append(player["pFName"])
        date.append(player["currentMatchesList"][0]["matchDate"])
        team_name.append(player["tName"])
        opponent_name.append(player["currentMatchesList"][0]["vsTSCode"])
        value.append(player["value"])
        skill.append(player["skill"])
        selected_perc.append(player["selPer"])
    return pd.DataFrame({
        "Player": player_name,
        "Date": date,
        "LeagueName": "European Championship 2020",
        "TeamName": team_name,
        "OpponentName": opponent_name,
        "Price": value,
        "Skill": skill,
        "SelectedPercentage": selected_perc
    })
        

In [7]:
player_df = construct_dataframe(players)

In [8]:
player_df['Date'] = pd.to_datetime(player_df["Date"])

In [9]:
name_mapping = {'Matúš Bero': 'Matús Bero',
 'Ondřej Kúdela': 'Ondrej Kudela',
 'Nicholas Hämäläinen': 'Niko Hämäläinen',
 'Antonín Barák': 'Antonin Barak',
 'Mislav Oršić': 'Mislav Orsic',
 'Jan Bořil': 'Jan Boril',
 'Bruno Petković': 'Bruno Petkovic',
 'Willi Orbán': 'Willi Orban',
 'Ondřej Čelůstka': 'Ondrej Celustka',
 'Mario Gavranović': 'Mario Gavranovic',
 'Egzijan Alioski': 'Ezgjan Alioski',
 'Feran Hasani': 'Ferhan Hasani',
 'Jakub Pešek': 'Jakub Pesek',
 'Michael Krmenčík': 'Michael Krmencik',
 'Yusuf Yazıcı': 'Yusuf Yazici',
 'Jakub Holúbek': 'Jakub Holubek',
 'Mathias Jørgensen': 'Mathias Jensen',
 'Karol Świderski': 'Karol Swiderski',
 'Haris Seferović': 'Haris Seferovic',
 'Mykola Matviyenko': 'Mykola Matvyenko',
 'Joakim Mæhle': 'Joakim Maehle',
 'Umut Meraş': 'Umut Meras',
 'Kamil Jóźwiak': 'Kamil Jozwiak',
 'Przemysław Płacheta': 'Przemyslaw Placheta',
 'Pavel Kadeřábek': 'Pavel Kaderábek',
 'Matěj Vydra': 'Matej Vydra',
 'Orkun Kökçü': 'Orkun Kökcü',
 'Raphaël Guerreiro': 'Raphael Guerreiro',
 'Dorukhan Toköz': 'Dorukhan Tokoz',
 'Ivan Perišić': 'Ivan Perisic',
 'Aleksandar Dragović': 'Aleksandar Dragovic',
 'Alex Král': 'Alex Kral',
 'Andrei Mostovoy': 'Andrey Mostovoy',
 'Tomáš Pekhart': 'Tomas Pekhart',
 'Marko Arnautović': 'Marko Arnautovic',
 'Alexander Bah': 'Alexander Isak',
 'Viktor Kornienko': 'Viktor Kovalenko',
 'Jens-Lys Cajuste': 'Jens Cajuste',
 'Aleksandr Sobolev': 'Alexander Sobolev',
 'Jens Jønsson': 'Jens Jonsson',
 'Lukáš Masopust': 'Lukas Masopust',
 'Andy Robertson': 'Andrew Robertson',
 'Ché Adams': 'Che Adams',
 'Burak Yılmaz': 'Burak Yilmaz',
 'Mateo Kovačić': 'Mateo Kovacic',
 'Taylan Antalyalı': 'Taylan Antalyali',
 'Aleksei Ionov': 'Aleksey Ionov',
 'Marcelo Brozović': 'Marcelo Brozovic',
 'Andrei Semenov': 'Andrey Semenov',
 'Aleksei Miranchuk': 'Aleksey Miranchuk',
 'Vladimír Coufal': 'Vladimir Coufal',
 'Krste Velkovski': 'Krste Velkoski',
 'Bartosz Bereszyński': 'Bartosz Bereszynski',
 'Kacper Kozłowski': 'Kacper Kozlowski',
 'Nemanja Nikolić': 'Nemanja Nikolics',
 'Yevhen Makarenko': 'Yevhenii Makarenko',
 'Marek Hamšík': 'Marek Hamsik',
 'Mile Škorić': 'Mile Skoric',
 'Duje Ćaleta-Car': 'Duje Caleta-Car',
 'Tomáš Suslov': 'Tomas Suslov',
 'Ante Rebić': 'Ante Rebic',
 'Okay Yokuşlu': 'Okay Yokuslu',
 'Tomasz Kędziora': 'Tomasz Kedziora',
 'Milan Škriniar': 'Milan Skriniar',
 'Michał Helik': 'Michal Helik',
 'Šime Vrsaljko': 'Sime Vrsaljko',
 'Ivan Tričkovski': 'Ivan Trickovski',
 'Aaron Ramsdale': 'Aaron Ramsey',
 'Tomáš Holeš': 'Tomás Holes',
 'Tomáš Kalas': 'Tomas Kalas',
 'Róbert Mak': 'Robert Mak',
 'Josip Juranović': 'Josip Juranovic',
 'Luka Modrić': 'Luka Modric',
 'Rodri': 'Rodrigo',
 'Hakan Çalhanoğlu': 'Hakan Calhanoglu',
 'Gergő Lovrencsics': 'Gergo Lovrencsics',
 'Vladimír Weiss': 'Vladimir Weiss',
 'Mario Pašalić': 'Mario Pasalic',
 'Simon Kjær': 'Simon Kjaer',
 'Patrik Hrošovský': 'Patrik Hrosovsky',
 'Piotr Zieliński': 'Piotr Zielinski',
 'Dmitri Barinov': 'Dimitri Barinov',
 'Kamil Piątkowski': 'Kamil Piatkowski',
 'Tamás Cseri': 'Tamas Cseri',
 'Sebastian Andersson': 'Sebastian Larsson',
 'İrfan Can Kahveci': 'Irfan Can Kahveci',
 'Mário Fernandes': 'Mario Fernandes',
 'Dániel Gazdag': 'Daniel Gazdag',
 'Enis Bardi': 'Enis Bardhi',
 'Danilo': 'Dani Olmo',
 'Nikola Vlašić': 'Nikola Vlasic',
 'Ruslan Malinovskyi': 'Ruslan Malinovskiy',
 'Anders Christiansen': 'Andreas Christensen',
 'András Schäfer': 'Andras Schafer',
 'Óscar Rodríguez': 'Ricardo Rodríguez',
 'Çağlar Söyüncü': 'Çaglar Söyüncü',
 'Przemysław Frankowski': 'Przemyslaw Frankowski',
 'Jonny Williams': 'Jonathan Williams',
 'Serhiy Kryvtsov': 'Serhii Kryvtsov',
 'Andrej Kramarić': 'Andrej Kramaric',
 'Michal Ďuriš': 'Michal Duris',
 'Raphaël Varane': 'Raphael Varane',
 'Domagoj Bradarić': 'Domagoj Bradaric',
 'Borna Barišić': 'Borna Barisic',
 'İlkay Gündoğan': 'Ilkay Gündogan',
 'Dénes Dibusz': 'Dénes Dibusz',
 'Pavao Pervan': 'Pavao Pervan',
 'Lovre Kalinić': 'Lovre Kalinic',
 'Simon Sluga': 'Simon Sluga',
 'Martin Dúbravka': 'Martin Dubravka',
 'Jasper Cillessen': 'Jasper Cillessen',
 'Marco Bizot': 'Marco Bizot',
 'Thibaut Courtois': 'Thibaut Courtois',
 'Marek Rodák': 'Marek Rodák',
 'Salvatore Sirigu': 'Salvatore Sirigu',
 'Anatolii Trubin': 'Anatolii Trubin',
 'Kasper Schmeichel': 'Kasper Schmeichel',
 'Dean Henderson': 'Dean Henderson',
 'Wojciech Szczęsny': 'Wojciech Szczesny',
 'Anton Shunin': 'Anton Shunin',
 'Andrei Lunev': 'Andrey Lunev',
 'Anthony Lopes': 'Anthony Lopes',
 'Steve Mandanda': 'Steve Mandanda',
 'Alessio Cragno': 'Alessio Cragno',
 'Stole Dimitrievski': 'Stole Dimitrievski',
 'Mike Maignan': 'Mike Maignan',
 'Jiří Pavlenka': 'Jirí Pavlenka',
 'Aleš Mandous': 'Ales Mandous',
 'Manuel Neuer': 'Manuel Neuer',
 'Robin Olsen': 'Robin Olsen',
 'Craig Gordon': 'Craig Gordon',
 'Hugo Lloris': 'Hugo Lloris',
 'Tim Krul': 'Tim Krul',
 'Andriy Lunin': 'Andriy Lunin',
 'Uğurcan Çakır': 'Ugurcan Çakir',
 'Jordan Pickford': 'Jordan Pickford',
 'Lukas Hradecky': 'Lukás Hrádecky',
 'Alex Meret': 'Alex Meret',
 'Péter Gulácsi': 'Péter Gulácsi',
 'Unai Simón': 'Unai Simón',
 'Yvon Mvogo': 'Yvon Mvogo',
 'Jesse Joronen': 'Jesse Joronen',
 'Damjan Siskovski': 'Damjan Siskovski',
 'Jonas Omlin': 'Jonas Omlin',
 'Yann Sommer': 'Yann Sommer',
 'Kristoffer Nordfeldt': 'Kristoffer Nordfeldt',
 'Nick Pope': 'Nick Pope',
 'Karl-Johan Johnsson': 'Karl-Johan Johnsson',
 'Frederik Rønnow': 'Frederik Rönnow',
 'Adam Davies': 'Adam Davies',
 'Gianluigi Donnarumma': 'Gianluigi Donnarumma',
 'Wayne Hennessey': 'Wayne Hennessey',
 'Dušan Kuciak': 'Dusan Kuciak',
 'Bernd Leno': 'Bernd Leno',
 'Georgiy Bushchan': 'Georgi Bushchan',
 'Łukasz Fabiański': 'Lukasz Fabianski',
 'Rui Patrício': 'Rui Patrício',
 'Simon Mignolet': 'Simon Mignolet',
 'Dominik Livaković': 'Dominik Livakovic',
 'David Marshall': 'David Marshall',
 'David de Gea': 'David de Gea',
 'Łukasz Skorupski': 'Lukasz Skorupski',
 'Tomáš Vaclík': 'Tomás Vaclik',
 'Kevin Trapp': 'Kevin Trapp',
 'Andriy Pyatov': 'Andriy Pyatov',
 'Danny Ward': 'Danny Ward',
 'Alexander Schlager': 'Alexander Schlager',
 'Jon McLaughlin': 'Jon McLaughlin'}

In [10]:
def map_name(name):
    global name_mapping
    if name in name_mapping.keys():
        return name_mapping[name]
    else:
        return name

In [11]:
player_df['Player'] = player_df['Player'].apply(map_name)

In [12]:
player_df.to_csv("{}/interim/fantasy_euro.csv".format(DATA_DIR), index=False)

In [13]:
player_df

Unnamed: 0,Player,Date,LeagueName,TeamName,OpponentName,Price,Skill,SelectedPercentage
0,Cristiano Ronaldo,2021-06-15 18:00:00,European Championship 2020,Portugal,Hungary,12.0,4,26.0
1,Kylian Mbappé,2021-06-15 21:00:00,European Championship 2020,France,Germany,12.0,4,34.0
2,Harry Kane,2021-06-13 15:00:00,European Championship 2020,England,Croatia,11.5,4,43.0
3,Robert Lewandowski,2021-06-14 18:00:00,European Championship 2020,Poland,Slovakia,11.5,4,16.0
4,Romelu Lukaku,2021-06-12 21:00:00,European Championship 2020,Belgium,Russia,11.0,4,42.0
...,...,...,...,...,...,...,...,...
1052,Dénes Dibusz,2021-06-15 18:00:00,European Championship 2020,Hungary,Portugal,4.0,1,1.0
1053,Igor Diveev,2021-06-12 21:00:00,European Championship 2020,Russia,Belgium,4.0,2,1.0
1054,Yuri Dyupin,2021-06-12 21:00:00,European Championship 2020,Russia,Belgium,4.0,1,1.0
1055,Gjoko Zajkov,2021-06-13 18:00:00,European Championship 2020,North Macedonia,Austria,4.0,2,0.0


## Main Dataset

In [14]:
players = pd.read_excel("{}/raw/fantasy_data.xlsx".format(DATA_DIR), sheet_name=0)
goalie = pd.read_excel("{}/raw/fantasy_data.xlsx".format(DATA_DIR), sheet_name=1)

In [15]:
players["Player"] = players["Player"].apply(lambda x: x.strip())
goalie["Player"] = goalie["Player"].apply(lambda x: x.strip())

In [16]:
players.loc[players["Position"] == "M/F", "Position"] = "M"

In [17]:
position = players.groupby("Player").agg(Position=('Position',lambda x: x.value_counts().index[0])).reset_index()

In [18]:
players = pd.merge(players, position, how="left", on="Player")

In [19]:
players["Position_x"] = players["Position_y"]

In [20]:
players.rename(columns={"Position_x": "Position"}, inplace=True)
players.drop(["Position_y"], axis=1, inplace=True)

In [21]:
def euro_fantasy_score(df):
    score = 1
    if df["Game Started"] > 0:
        score += 1
    if df["Min"] >= 60:
        score += 1
    if df["Assists"] > 0:
        score += (df["Assists"] * 3)
    if df["Penalty Kick Miss"] > 0:
        score -= (df["Penalty Kick Miss"] * 2)
    if df["Yellow Cards"] > 0:
        score -= 1
    if df["Red Cards"] > 0:
        score -= 3
    if df["Position"] == "F":
        score += (df["Goals"] * 4)
    if df["Position"] == "M" or df['Position'] == "M/F":
        score += (df["Goals"] * 5)
        if df["Min"] >= 60 and df["Clean Sheet"] > 0:
            score += 1
    if df["Position"] == "D":
        score += (df["Goals"] * 6)
        if df["Min"] >= 60 and df["Clean Sheet"] > 0:
            score += 4
        score -= (df['Goals Allowed'] // 2)
    if df["Position"] == "GK":
        score += (df["Goals"] * 6)
        score += (df["Penalty Kick Saved"] * 5)
        if df["Min"] >= 60 and df["Clean Sheet"] > 0:
            score += 4
        score += (df["Saves"] // 3)
        score -= (df['Goals Allowed'] // 2)
    return score

In [22]:
def get_agg_before(df):
    df_new = df.copy()
    merged_df = pd.merge(df, df_new, on=["Player", "TeamName"])
    merged_df = merged_df[merged_df['Date_y'] < merged_df["Date_x"]]
    merged_df["is_scoring"] = merged_df["Goals_y"] > 0
    merged_df["is_assisting"] = merged_df["Assists_y"] > 0
    merged_df = merged_df.groupby(["Player", "TeamName", "Date_x"]).agg(prev_mean_euro_score=("euro_score_y", "mean"),
                                                                        prev_mean_goals=("Goals_y", "median"),
                                                                        prev_mean_assists=("Assists_y", "mean"),
                                                                        prev_median_min=("Min_y", "median"),
                                                                        prev_starter_rate=("Game Started_y", "mean"),
                                                                        count_play=("Date_y","nunique"),
                                                                        goal_consistency=("is_scoring", "mean"),
                                                                        assist_consistency=("is_assisting", "mean"),
                                                                        clean_sheet_consistency=("Clean Sheet_y", "mean")
                                                                       )
    merged_df = merged_df.reset_index()
    merged_df.rename(columns={"Date_x": "Date"}, inplace=True)
    return merged_df

In [23]:
players['euro_score'] = players.apply(euro_fantasy_score, axis=1)
goalie["euro_score"] = goalie.apply(euro_fantasy_score, axis=1)

In [24]:
temp_player = players[["Player", "Date", "LeagueName", "TeamName", "OpponentName", "Position", "euro_score", "Goals", "Assists", "Clean Sheet", "Min", "Game Started"]]
temp_goalie = goalie[["Player", "Date", "LeagueName", "TeamName", "OpponentName", "Position", "euro_score", "Goals", "Assists", "Clean Sheet", "Min", "Game Started"]]

In [25]:
df = pd.concat([temp_player, temp_goalie])

In [26]:
position = df[["Player", "Position"]]
position = position.drop_duplicates(subset=["Player", "Position"])

In [27]:
player_df = pd.merge(player_df, position, how="left", on="Player")

In [28]:
df = pd.concat([df, player_df.drop(["Price", "Skill", "SelectedPercentage"], axis=1)])

In [29]:
df

Unnamed: 0,Player,Date,LeagueName,TeamName,OpponentName,Position,euro_score,Goals,Assists,Clean Sheet,Min,Game Started
0,Artem Dzyuba,2019-06-08 00:00:00,European Championship Qualifiers,Russia,San Marino,F,20.0,4.0,1.0,1.0,90.0,1.0
1,Cristiano Ronaldo,2019-09-10 00:00:00,European Championship Qualifiers,Portugal,Lithuania,F,19.0,4.0,0.0,0.0,79.0,1.0
2,Memphis Depay,2019-03-21 00:00:00,European Championship Qualifiers,Netherlands,Belarus,F,17.0,2.0,2.0,1.0,90.0,1.0
3,Denis Cheryshev,2019-10-13 00:00:00,European Championship Qualifiers,Russia,Cyprus,M,20.0,2.0,2.0,1.0,90.0,1.0
4,Cristiano Ronaldo,2019-11-14 00:00:00,European Championship Qualifiers,Portugal,Lithuania,F,15.0,3.0,0.0,1.0,83.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1052,Dénes Dibusz,2021-06-15 18:00:00,European Championship 2020,Hungary,Portugal,GK,,,,,,
1053,Igor Diveev,2021-06-12 21:00:00,European Championship 2020,Russia,Belgium,D,,,,,,
1054,Yuri Dyupin,2021-06-12 21:00:00,European Championship 2020,Russia,Belgium,,,,,,,
1055,Gjoko Zajkov,2021-06-13 18:00:00,European Championship 2020,North Macedonia,Austria,D,,,,,,


In [30]:
agg = get_agg_before(df)

In [31]:
df = df.drop(["Goals", "Assists", "Min", "Game Started", "Clean Sheet"], axis=1)
df = pd.merge(df, agg, how="left", on=["Player", "TeamName", "Date"])

In [32]:
df["is_friendlies"] = df["LeagueName"] == "International Friendlies"

In [33]:
df.columns

Index(['Player', 'Date', 'LeagueName', 'TeamName', 'OpponentName', 'Position',
       'euro_score', 'prev_mean_euro_score', 'prev_mean_goals',
       'prev_mean_assists', 'prev_median_min', 'prev_starter_rate',
       'count_play', 'goal_consistency', 'assist_consistency',
       'clean_sheet_consistency', 'is_friendlies'],
      dtype='object')

In [34]:
df.columns = ["player_name", "date", "league_name", "team_name", "opponent_name", "position", "euro_score", 'prev_mean_euro_score', 'prev_mean_goals', 'prev_mean_assists', 'prev_median_min',
              'prev_starter_rate', "count_play", "goal_consistency", "assist_consistency", "clean_sheet_consistency", "is_friendlies"]
df = df.sort_values(by=["player_name","date"])

## FIFA Rank Dataset

In [35]:
df["date"].min()

Timestamp('2018-09-05 00:00:00')

In [36]:
fifa_rank = pd.read_csv("{}/raw/fifa_ranking-2021-05-27.csv".format(DATA_DIR))

In [37]:
fifa_rank = fifa_rank[fifa_rank["rank_date"] > '2016-09-09']
fifa_rank = fifa_rank[["country_full", "rank", "total_points", "rank_date"]]
fifa_rank = fifa_rank.sort_values(by=["country_full", "rank_date"])

In [38]:
fifa_rank

Unnamed: 0,country_full,rank,total_points,rank_date
53348,Afghanistan,149,201,2016-09-15
53563,Afghanistan,145,208,2016-10-20
53770,Afghanistan,147,189,2016-11-24
53984,Afghanistan,146,189,2016-12-22
54190,Afghanistan,151,179,2017-01-12
...,...,...,...,...
62107,Zimbabwe,108,1181,2020-11-26
62241,Zimbabwe,108,1181,2020-12-10
62455,Zimbabwe,112,1176,2021-02-18
62794,Zimbabwe,107,1175,2021-04-07


In [39]:
df_with_rank = pd.merge(df[["team_name", "date"]], fifa_rank, how="left", left_on="team_name", right_on="country_full")
df_with_rank["rank_date"] = pd.to_datetime(df_with_rank['rank_date'])
df_with_rank["time_diff"] = df_with_rank.apply(lambda x: (x['date']-x['rank_date']).total_seconds(), axis=1)
df_with_rank = df_with_rank[df_with_rank["time_diff"] > 0]
df_with_rank = df_with_rank.sort_values(by=["team_name", "time_diff"], ascending=False)
df_with_rank = df_with_rank.groupby(["team_name", "date"]).agg(team_rank=("rank", "last"),
                                                               team_total_points=("total_points", "last")).reset_index()

In [40]:
df = pd.merge(df, df_with_rank, on=["team_name", "date"], how="left")

In [41]:
df_with_rank = pd.merge(df[["opponent_name", "date"]], fifa_rank, how="left", left_on="opponent_name", right_on="country_full")
df_with_rank["rank_date"] = pd.to_datetime(df_with_rank['rank_date'])
df_with_rank["time_diff"] = df_with_rank.apply(lambda x: (x['date']-x['rank_date']).total_seconds(), axis=1)
df_with_rank = df_with_rank[df_with_rank["time_diff"] > 0]
df_with_rank = df_with_rank.sort_values(by=["opponent_name", "time_diff"], ascending=False)
df_with_rank = df_with_rank.groupby(["opponent_name", "date"]).agg(opponent_rank=("rank", "last"),
                                                                   opponent_total_points=("total_points", "last")).reset_index()

In [42]:
df = pd.merge(df, df_with_rank, on=["opponent_name", "date"], how="left")

In [43]:
df.head()

Unnamed: 0,player_name,date,league_name,team_name,opponent_name,position,euro_score,prev_mean_euro_score,prev_mean_goals,prev_mean_assists,prev_median_min,prev_starter_rate,count_play,goal_consistency,assist_consistency,clean_sheet_consistency,is_friendlies,team_rank,team_total_points,opponent_rank,opponent_total_points
0,Aaron Ramsey,2018-09-06,UEFA Nations League,Wales,Republic of Ireland,M,8.0,,,,,,,,,,False,19,1536,29,1484
1,Aaron Ramsey,2018-09-09,UEFA Nations League,Wales,Denmark,M,3.0,8.0,1.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,False,19,1536,9,1580
2,Aaron Ramsey,2018-10-11,International Friendlies,Wales,Spain,M,3.0,5.5,0.5,0.0,90.0,1.0,2.0,0.5,0.0,0.0,True,19,1536,9,1597
3,Aaron Ramsey,2018-11-16,UEFA Nations League,Wales,Denmark,M,3.0,4.666667,0.0,0.0,90.0,1.0,3.0,0.333333,0.0,0.0,False,18,1538,10,1584
4,Aaron Ramsey,2018-11-20,International Friendlies,Wales,Albania,M,1.0,4.25,0.0,0.0,90.0,1.0,4.0,0.25,0.0,0.0,True,18,1538,60,1372


In [44]:
df["fifa_rank_diff"] = df["team_rank"] - df["opponent_rank"]
df["fifa_points_diff"] = df["team_total_points"] - df["opponent_total_points"]

## TransferMarkt Market Value Data

In [45]:
df['league_name'].unique()

array(['UEFA Nations League', 'International Friendlies',
       'European Championship Qualifiers', 'European Championship 2020',
       'European World Cup Qualifiers'], dtype=object)

In [46]:
euro = pd.read_excel("{}/raw/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=0)

In [47]:
df = df.merge(euro[["Club", "EURO participations"]], how="left", left_on="team_name", right_on="Club")
df.rename(columns={"EURO participations" : "team_euro_participations"}, inplace=True)
df.drop(["Club"], axis=1, inplace=True)
df = df.merge(euro[["Club","EURO participations"]], how="left", left_on="opponent_name", right_on="Club")
df.rename(columns={"EURO participations" : "opponents_euro_participations"}, inplace=True)
df.drop(["Club"], axis=1, inplace=True)

In [48]:
df['opponents_euro_participations'] = df['opponents_euro_participations'].fillna(0)

In [49]:
nations_league = pd.read_excel("{}/raw/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=1)
euro_qual = pd.read_excel("{}/raw/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=2)
wc_euro_qual = pd.read_excel("{}/raw/transfermarkt-market-value.xlsx".format(DATA_DIR), sheet_name=3)

In [50]:
nations_league["league_name"] = "UEFA Nations League"
euro_qual["league_name"] = "European Championship Qualifiers"
wc_euro_qual["league_name"] = "European World Cup Qualifiers"
euro["league_name"] = "European Championship 2020"

In [51]:
euro.rename(columns={"Average Age": "Age"}, inplace=True)

In [52]:
mv_df = pd.concat([nations_league, euro_qual, wc_euro_qual, euro.drop(["EURO participations"], axis=1)])

In [53]:
import re
def preprocess_market_value(text):
    match = re.sub("€", "", text)
    match = re.search("(\d+(?:\.\d+)?)", text)
    val = float(match.group())
    num = text[match.end():]
    if num == "bn":
        val *= 10e9
    elif num == "m":
        val *= 10e6
    elif num == "Th.":
        val *= 10e3
    return val
    

In [54]:
mv_df["market_value"] = mv_df["Market Value"].apply(preprocess_market_value)
mv_df["mean_market_value"] = mv_df["Average Market Value"].apply(preprocess_market_value)

In [55]:
mv_df.rename(columns={"Age": "mean_squad_age", "Club": "team_name"}, inplace=True)

In [56]:
mv_df = mv_df.drop_duplicates(subset=["team_name", "league_name"], keep="first")

In [57]:
df = df.merge(mv_df.drop(["Squad", "Market Value", "Average Market Value"], axis=1), how="left", on=["team_name", "league_name"])
df.rename(columns={"mean_squad_age": "team_mean_squad_age", "market_value": "team_market_value", "mean_market_value": "team_mean_market_value"}, inplace=True)

In [58]:
mv_df.rename(columns={"team_name": "opponent_name"}, inplace=True)

In [59]:
df = df.merge(mv_df.drop(["Squad", "Market Value", "Average Market Value"], axis=1), how="left", on=["opponent_name", "league_name"])
df.rename(columns={"mean_squad_age": "opponent_mean_squad_age", "market_value": "opponent_market_value", "mean_market_value": "opponent_mean_market_value"}, inplace=True)

In [60]:
df["diff_market_value"] = df["team_market_value"] - df["opponent_market_value"]
df["diff_mean_market_value"] = df["team_mean_market_value"] - df["opponent_mean_market_value"]
df["diff_mean_squad_age"] = df["team_mean_squad_age"] - df["opponent_mean_squad_age"]

## FIFA Players

In [61]:
fifa =  pd.read_csv("{}/raw/fifa-players_21.csv".format(DATA_DIR))

In [62]:
fifa = fifa[fifa['nationality'].isin(df['team_name'].unique())]

In [63]:
def join_tuple_string(strings_tuple):
    return ' '.join(strings_tuple)

def create_unigram_bigram_trigram_quadgram(text, x):
    tuple_gram = list(everygrams(text.split(" "), 2, x))
    result = map(join_tuple_string, tuple_gram)
    return list(result)

def calculate_closest_token(df):
    everygram = create_unigram_bigram_trigram_quadgram(df["long_name"], df['len_name'])
    closest = difflib.get_close_matches(df["short_name"], everygram, n=1)
    return ''.join(closest)

In [64]:
fifa['len_name'] = fifa["long_name"].apply(lambda x: len(x.split(" ")))
fifa['len_short_name'] = fifa["short_name"].apply(lambda x: len(x.split(" ")))

In [65]:
df['player_name'] = df['player_name'].apply(lambda x: x.strip())

In [66]:
fifa['closest_name'] = fifa.apply(calculate_closest_token, axis=1)

In [67]:
fifa['min_char_in_name'] = fifa['long_name'].apply(lambda x: min(len(y) for y in x.split()))

In [68]:
name_mapping = {
 'Aleksandar Dragović': 'Aleksandar Dragovic',
 'Aleš Matějů': 'Ales Mateju',
 'Alex Král': 'Alex Kral',
 'Anatoliy Trubin': 'Anatolii Trubin',
 'András Schäfer': 'Andras Schafer',
 'Dean Cornelius': 'Andreas Cornelius',
 'Andrej Kramarić': 'Andrej Kramaric',
 'Ante Rebić': 'Ante Rebic',
 'Bartosz Bereszyński': 'Bartosz Bereszynski',
 'Bećir Omeragić': 'Becir Omeragic',
 'Bogdan Mykhaylychenko': 'Bogdan Mykhaylichenko',
 'Borna Barišić': 'Borna Barisic',
 'B. Embolo': 'Breel Embolo',
 'Mango Fernandes': 'Mario Fernandes',
 'Bruno Petković': 'Bruno Petkovic',
 'Burak Yılmaz': 'Burak Yilmaz',
 'Che Adams': 'Che Adams',
 'Christian Günter': 'Chris Gunter',
 'Liam Craig Gordon': 'Craig Gordon',
 'Azpilicueta': 'César Azpilicueta',
 'David Allan': 'David Alaba',
 'Anga Dedryck Boyata': 'Dedryck Boyata',
 'Davor Lovren': 'Dejan Lovren',
 'Lemi Zakaria': 'Denis Zakaria',
 'Diego Javier Llorente': 'Diego Llorente',
 'Dmitriy Barinov': 'Dimitri Barinov',
 'Domagoj Bradarić': 'Domagoj Bradaric',
 'Dominik Livaković': 'Dominik Livakovic',
 'van de Beek': 'Donny van de Beek',
 'Dorukhan Toköz': 'Dorukhan Tokoz',
 'Duje Ćaleta-Car': 'Duje Caleta-Car',
 'Dušan Kuciak': 'Dusan Kuciak',
 'Miklós Sigér': 'Dávid Miklós Sigér',
 'Eray Ervin Cömert': 'Eray Cömert',
 'Filip Holender': 'Filip Helander',
 'Frederik Rønnow': 'Frederik Rönnow',
 'Georgiy Bushchan': 'Georgi Bushchan',
 'Georgiy Dzhikiya': 'Georgi Dzhikiya',
 'Glen Adjei Kamara': 'Glen Kamara',
 'Greg Taylor': 'Greg Taylor',
 'Hakan Çalhanoğlu': 'Hakan Calhanoglu',
 'Haris Seferović': 'Haris Seferovic',
 'İlkay Gündoğan': 'Ilkay Gündogan',
 'İrfan Can Kahveci': 'Irfan Kahveci',
 'Ivan Perišić': 'Ivan Perisic',
 'Ivan Fiolić': 'Ivan Trickovski',
 'Jakub Holúbek': 'Jakub Holubek',
 'Jamal Musiala': 'Jamal Musiala',
 'Alexander Lawrence': 'James Alexander Lawrence',
 'Jan Bořil': 'Jan Boril',
 'Jens Jønsson': 'Jens Jonsson',
 'Jere Juhani Uronen': 'Jere Uronen',
 'Jiří Pavlenka': 'Jirí Pavlenka',
 'Joakim Mæhle': 'Joakim Maehle',
 'Joseff Morrell': 'Joe Morrell',
 'Jordi Alba Ramos': 'Jordi Alba',
 'Josip Juranović': 'Josip Juranovic',
 'Palhinha': 'João Palhinha',
 'Jérémy Doku': 'Jéremy Doku',
 'Kamil Jóźwiak': 'Kamil Jozwiak',
 'Karol Świderski': 'Karol Swiderski',
 'Stefan Ristovski': 'Stefan Spirovski',
 'Kurt Happy Zouma': 'Kurt Zouma',
 'Lasse Schøne': 'Lasse Schöne',
 'Lovre Kalinić': 'Lovre Kalinic',
 'Lucas Hernández Pi': 'Lucas Hernández',
 'Luka Modrić': 'Luka Modric',
 'Lukáš Haraslín': 'Lukas Haraslin',
 'Lukáš Masopust': 'Lukas Masopust',
 'Łukasz Fabiański': 'Lukasz Fabianski',
 'Lukáš Hrádecký': 'Lukás Hrádecky',
 'Manuel Viana': 'Manuel Akanji',
 'Marcelo Brozović': 'Marcelo Brozovic',
 'Marcus Danielsson': 'Marcus Danielson',
 'Marek Hamšík': 'Marek Hamsik',
 'Mario Gavranović': 'Mario Gavranovic',
 'Mario Pašalić': 'Mario Pasalic',
 'Marcin Kamiński': 'Marjan Radeski',
 'Marko Arnautović': 'Marko Arnautovic',
 'Martin Dúbravka': 'Martin Dubravka',
 'Matěj Vydra': 'Matej Vydra',
 'Mateo Kovačić': 'Mateo Kovacic',
 'Matúš Bero': 'Matús Bero',
 'Michael Krmenčík': 'Michal Krmencik',
 'Michael Gurski': 'Michal Duris',
 'Michał Helik': 'Michal Helik',
 'Carl Mikael Lustig': 'Mikael Lustig',
 'Oyarzabal': 'Mikel Oyarzabal',
 'Milan Škriniar': 'Milan Skriniar',
 'Mile Svilar': 'Mile Skoric',
 'Mislav Oršić': 'Mislav Orsic',
 'M. Kean': 'Moise Kean',
 'Mykola Matvienko': 'Mykola Matvyenko',
 'Nemanja Nikolić': 'Nemanja Nikolics',
 'N. Hämäläinen': 'Niko Hämäläinen',
 'Nikola Vlašić': 'Nikola Vlasic',
 'Nélson Cabral Semedo': 'Nélson Semedo',
 'Okay Yokuşlu': 'Okay Yokuslu',
 'Aleksandr Zhirov': 'Oleksandr Zubkov',
 'Ondřej Čelůstka': 'Ondrej Celustka',
 'Ondřej Kúdela': 'Ondrej Kudela',
 'Orkun Kökçü': 'Orkun Kökcü',
 'O. Kabak': 'Ozan Kabak',
 'Patrik Hrošovský': 'Patrik Hrosovsky',
 'Pavel Kadeřábek': 'Pavel Kaderábek',
 'Petr Ševčík': 'Petr Sevcik',
 'Philip Foden': 'Phil Foden',
 'Leo Bengtsson': 'Pierre Bengtsson',
 'Piotr Zieliński': 'Piotr Zielinski',
 'Przemysław Frankowski': 'Przemyslaw Frankowski',
 'Przemysław Płacheta': 'Przemyslaw Placheta',
 'Raphaël Varane': 'Raphael Varane',
 'Renato Júnior Luz Sanches': 'Renato Sanches',
 'Róbert Boženík': 'Robert Bozenik',
 'Ruslan Malinovskyi': 'Ruslan Malinovskiy',
 'Ryan Jiro Gravenberch': 'Ryan Gravenberch',
 'Saša Kalajdžić': 'Sasa Kalajdzic',
 'Sergiy Kryvtsov': 'Serhii Kryvtsov',
 'Šime Vrsaljko': 'Sime Vrsaljko',
 'Tamás Cseri': 'Tamas Cseri',
 'Taylan Antalyalı': 'Taylan Antalyali',
 'Tomáš Pekhart': 'Tomas Pekhart',
 'Tomáš Souček': 'Tomas Soucek',
 'Tomáš Suslov': 'Tomas Suslov',
 'Tomasz Kędziora': 'Tomasz Kedziora',
 'Thomas Holmes': 'Tomás Holes',
 'Tomáš Vaclík': 'Tomás Vaclik',
 'Uğurcan Çakır': 'Ugurcan Çakir',
 'Cengiz Umut Meraş': 'Umut Meras',
 'Vitaliy Mykolenko': 'Vitalii Mykolenko',
 'Vladimír Coufal': 'Vladimir Coufal',
 'Vladimír Darida': 'Vladimir Darida',
 'Alain Wiss': 'Vladimir Weiss',
 'Will Dean': 'Willi Orban',
 'William Silva de Carvalho': 'William Carvalho',
 'Yuriy Zhirkov': 'Yuri Zhirkov',
 'Yusuf Yazıcı': 'Yusuf Yazici',
 'Álvaro Traver': 'Álvaro Morata',
 'Çağlar Söyüncü': 'Çaglar Söyüncü'}

In [69]:
def heuristic_match(df):
    if df["len_short_name"] == 1:
        return df["short_name"]
    if len(df["closest_name"].split()) == 0:
        return df["short_name"]
    elif df["len_name"] > 3:
        return df["closest_name"]
    elif df["len_name"] == 3 and df["min_char_in_name"] > 4:
        return df["closest_name"]
    else:
        return df["long_name"]

In [70]:
fifa['closest_name'] = fifa.apply(heuristic_match, axis=1)

In [71]:
def map_name(name):
    global name_mapping
    if name in name_mapping.keys():
        return name_mapping[name]
    else:
        return name

In [72]:
fifa['closest_name'] = fifa['closest_name'].apply(map_name)

In [73]:
fifa.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,league_name,league_rank,overall,potential,value_eur,wage_eur,player_positions,preferred_foot,international_reputation,weak_foot,skill_moves,work_rate,body_type,real_face,release_clause_eur,player_tags,team_position,team_jersey_number,loaned_from,joined,contract_valid_until,nation_position,nation_jersey_number,pace,shooting,passing,dribbling,defending,physic,gk_diving,gk_handling,gk_kicking,gk_reflexes,gk_speed,gk_positioning,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,len_name,len_short_name,closest_name,min_char_in_name
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,35,1985-02-05,187,83,Portugal,Juventus,Italian Serie A,1.0,92,92,46000000,220000,"ST, LW",Right,5,4,5,High/Low,C. Ronaldo,Yes,75900000.0,"#Aerial Threat, #Dribbler, #Distance Shooter, ...",LS,7.0,,2018-07-10,2022.0,LS,7.0,89.0,93.0,81.0,89.0,35.0,77.0,,,,,,,"Power Free-Kick, Flair, Long Shot Taker (AI), ...",84,95,90,82,86,88,81,76,77,92,87,91,87,95,71,94,95,84,78,93,63,29,95,82,84,95,,32,24,7,11,15,14,11,91+1,91+1,91+1,89+0,91+0,91+0,91+0,89+0,88+3,88+3,88+3,88+3,81+3,81+3,81+3,88+3,65+3,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3,5,2,Cristiano Ronaldo,3
3,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,31,1988-08-21,184,80,Poland,FC Bayern München,German 1. Bundesliga,1.0,91,91,80000000,240000,ST,Right,4,4,4,High/Medium,PLAYER_BODY_TYPE_276,Yes,132000000.0,"#Distance Shooter, #Clinical Finisher",ST,9.0,,2014-07-01,2023.0,,,78.0,91.0,78.0,85.0,43.0,82.0,,,,,,,"Solid Player, Finesse Shot, Outside Foot Shot,...",71,94,85,84,89,85,79,85,70,88,77,78,77,93,82,89,84,76,86,85,81,49,94,79,88,88,,42,19,15,6,12,8,10,89+2,89+2,89+2,85+0,87+0,87+0,87+0,85+0,85+3,85+3,85+3,83+3,79+3,79+3,79+3,83+3,64+3,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3,2,2,Robert Lewandowski,6
5,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,29,1991-06-28,181,70,Belgium,Manchester City,English Premier League,1.0,91,91,87000000,370000,"CAM, CM",Right,4,5,4,High/High,PLAYER_BODY_TYPE_321,Yes,161000000.0,"#Dribbler, #Playmaker, #Engine, #Distance Shoo...",RCM,17.0,,2015-08-30,2023.0,RCM,7.0,76.0,86.0,93.0,88.0,64.0,78.0,,,,,,,"Injury Prone, Leadership, Early Crosser, Long ...",94,82,55,94,82,88,85,83,93,92,77,76,78,91,76,91,63,89,74,91,76,66,88,94,84,91,,65,53,15,13,5,10,13,83+3,83+3,83+3,88+0,88+0,88+0,88+0,88+0,89+2,89+2,89+2,89+2,89+2,89+2,89+2,89+2,79+3,80+3,80+3,80+3,79+3,75+3,69+3,69+3,69+3,75+3,3,3,Kevin De Bruyne,2
6,231747,https://sofifa.com/player/231747/kylian-mbappe...,K. Mbappé,Kylian Mbappé Lottin,21,1998-12-20,178,73,France,Paris Saint-Germain,French Ligue 1,1.0,90,95,105500000,160000,"ST, LW, RW",Right,3,4,5,High/Low,PLAYER_BODY_TYPE_343,Yes,203100000.0,"#Speedster, #Dribbler, #Acrobat",LS,7.0,,2018-07-01,2022.0,RM,10.0,96.0,86.0,78.0,91.0,39.0,76.0,,,,,,,"Finesse Shot, Flair, Speed Dribbler (AI), Outs...",78,91,73,83,83,92,79,63,70,90,96,96,92,92,82,86,77,86,76,79,62,38,91,80,70,84,,34,32,13,5,7,11,6,88+3,88+3,88+3,89+0,89+0,89+0,89+0,89+0,87+3,87+3,87+3,87+3,79+3,79+3,79+3,87+3,67+3,63+3,63+3,63+3,67+3,63+3,55+3,55+3,55+3,63+3,3,2,Kylian Mbappé,6
7,192448,https://sofifa.com/player/192448/marc-andre-te...,M. ter Stegen,Marc-André ter Stegen,28,1992-04-30,187,85,Germany,FC Barcelona,Spain Primera Division,1.0,90,93,69500000,260000,GK,Right,3,4,1,Medium/Medium,PLAYER_BODY_TYPE_262,Yes,147700000.0,,GK,1.0,,2014-07-01,2022.0,SUB,22.0,,,,,,,88.0,85.0,88.0,90.0,45.0,88.0,"Rushes Out Of Goal, Comes For Crosses, Saves w...",18,14,11,61,14,21,18,12,63,30,38,50,37,86,43,66,79,35,78,10,43,22,11,70,25,70,,13,10,88,85,88,88,90,35+3,35+3,35+3,34+0,38+0,38+0,38+0,34+0,42+3,42+3,42+3,39+3,45+3,45+3,45+3,39+3,33+3,41+3,41+3,41+3,33+3,31+3,33+3,33+3,33+3,31+3,3,3,Marc-André ter Stegen,3


In [74]:
fifa = fifa[["closest_name", "nationality", "age", "height_cm", "weight_kg", "league_rank", "overall", "potential", "wage_eur", 
             "international_reputation", "pace", "shooting", "passing", "dribbling", "defending",  'nation_position', 'nation_jersey_number',
             "physic", "attacking_crossing", "attacking_finishing", "attacking_heading_accuracy", "attacking_short_passing",
             "attacking_volleys", "skill_dribbling", "skill_curve", "skill_fk_accuracy", "skill_long_passing", "skill_ball_control",
             "movement_acceleration", "movement_sprint_speed", "movement_agility", "movement_reactions", "movement_balance", "power_shot_power",
             "power_jumping", "power_stamina","power_strength", "power_long_shots", "mentality_aggression", "mentality_interceptions",
             "mentality_positioning", "mentality_vision", "mentality_penalties", "mentality_composure", "defending_standing_tackle",
             "defending_sliding_tackle", "goalkeeping_diving", "goalkeeping_handling", "goalkeeping_kicking", "goalkeeping_positioning", "goalkeeping_reflexes"]]

In [75]:
fifa.rename(columns={"closest_name": "player_name", "nationality": "team_name"}, inplace=True)

In [76]:
fifa = fifa.drop_duplicates(subset=["player_name", "team_name"])

In [77]:
df = pd.merge(df, fifa, how="left", on=["player_name", "team_name"])

In [78]:
df.head()

Unnamed: 0,player_name,date,league_name,team_name,opponent_name,position,euro_score,prev_mean_euro_score,prev_mean_goals,prev_mean_assists,prev_median_min,prev_starter_rate,count_play,goal_consistency,assist_consistency,clean_sheet_consistency,is_friendlies,team_rank,team_total_points,opponent_rank,opponent_total_points,fifa_rank_diff,fifa_points_diff,team_euro_participations,opponents_euro_participations,team_mean_squad_age,team_market_value,team_mean_market_value,opponent_mean_squad_age,opponent_market_value,opponent_mean_market_value,diff_market_value,diff_mean_market_value,diff_mean_squad_age,age,height_cm,weight_kg,league_rank,overall,potential,wage_eur,international_reputation,pace,shooting,passing,dribbling,defending,nation_position,nation_jersey_number,physic,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,Aaron Ramsey,2018-09-06,UEFA Nations League,Wales,Republic of Ireland,M,8.0,,,,,,,,,,False,19,1536,29,1484,-10,52,2,0.0,25.6,1767500000.0,68000000.0,25.5,776500000.0,29900000.0,991000000.0,38100000.0,0.1,29.0,183.0,76.0,1.0,82.0,82.0,98000.0,3.0,68.0,77.0,79.0,80.0,68.0,LDM,10.0,73.0,75.0,75.0,58.0,84.0,79.0,81.0,70.0,66.0,80.0,83.0,67.0,68.0,72.0,81.0,74.0,81.0,66.0,89.0,66.0,74.0,74.0,69.0,84.0,81.0,75.0,81.0,72.0,68.0,6.0,11.0,5.0,10.0,8.0
1,Aaron Ramsey,2018-09-09,UEFA Nations League,Wales,Denmark,M,3.0,8.0,1.0,0.0,90.0,1.0,1.0,1.0,0.0,0.0,False,19,1536,9,1580,10,-44,2,9.0,25.6,1767500000.0,68000000.0,27.8,3107000000.0,119500000.0,-1339500000.0,-51500000.0,-2.2,29.0,183.0,76.0,1.0,82.0,82.0,98000.0,3.0,68.0,77.0,79.0,80.0,68.0,LDM,10.0,73.0,75.0,75.0,58.0,84.0,79.0,81.0,70.0,66.0,80.0,83.0,67.0,68.0,72.0,81.0,74.0,81.0,66.0,89.0,66.0,74.0,74.0,69.0,84.0,81.0,75.0,81.0,72.0,68.0,6.0,11.0,5.0,10.0,8.0
2,Aaron Ramsey,2018-10-11,International Friendlies,Wales,Spain,M,3.0,5.5,0.5,0.0,90.0,1.0,2.0,0.5,0.0,0.0,True,19,1536,9,1597,10,-61,2,11.0,,,,,,,,,,29.0,183.0,76.0,1.0,82.0,82.0,98000.0,3.0,68.0,77.0,79.0,80.0,68.0,LDM,10.0,73.0,75.0,75.0,58.0,84.0,79.0,81.0,70.0,66.0,80.0,83.0,67.0,68.0,72.0,81.0,74.0,81.0,66.0,89.0,66.0,74.0,74.0,69.0,84.0,81.0,75.0,81.0,72.0,68.0,6.0,11.0,5.0,10.0,8.0
3,Aaron Ramsey,2018-11-16,UEFA Nations League,Wales,Denmark,M,3.0,4.666667,0.0,0.0,90.0,1.0,3.0,0.333333,0.0,0.0,False,18,1538,10,1584,8,-46,2,9.0,25.6,1767500000.0,68000000.0,27.8,3107000000.0,119500000.0,-1339500000.0,-51500000.0,-2.2,29.0,183.0,76.0,1.0,82.0,82.0,98000.0,3.0,68.0,77.0,79.0,80.0,68.0,LDM,10.0,73.0,75.0,75.0,58.0,84.0,79.0,81.0,70.0,66.0,80.0,83.0,67.0,68.0,72.0,81.0,74.0,81.0,66.0,89.0,66.0,74.0,74.0,69.0,84.0,81.0,75.0,81.0,72.0,68.0,6.0,11.0,5.0,10.0,8.0
4,Aaron Ramsey,2018-11-20,International Friendlies,Wales,Albania,M,1.0,4.25,0.0,0.0,90.0,1.0,4.0,0.25,0.0,0.0,True,18,1538,60,1372,-42,166,2,0.0,,,,,,,,,,29.0,183.0,76.0,1.0,82.0,82.0,98000.0,3.0,68.0,77.0,79.0,80.0,68.0,LDM,10.0,73.0,75.0,75.0,58.0,84.0,79.0,81.0,70.0,66.0,80.0,83.0,67.0,68.0,72.0,81.0,74.0,81.0,66.0,89.0,66.0,74.0,74.0,69.0,84.0,81.0,75.0,81.0,72.0,68.0,6.0,11.0,5.0,10.0,8.0


In [79]:
df = df.drop_duplicates()

In [80]:
df.to_csv("{}/interim/all_data.csv".format(DATA_DIR), index=False)