# ToDos from previous notebook

#### thoughts for feature engineering

1. aggregate nationality into continents
2. make some sort of ranking with club, like "top 50 in market value", etc. maybe use for "superstars"
3. convert all positions into some sort of "the more positions able to play the more flexible the player is"
4. investigate what work rate, international reputation and skill_moves refer to
5. investigate upon player traits and aggregate into broader categories
6. investigate upon national team appearance

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_fp = pd.read_csv("data/fp_data_v2.csv", index_col=0)
df_gk = pd.read_csv("data/gk_data_v2.csv", index_col=0)
pd.set_option('display.max_columns', None)

In [3]:
df_fp.head(3)

Unnamed: 0,market_value_€,player_name,long_name,year_of_birth,player_age,height_cm,weight_kg,nationality,main_position,all_positions,club,league,divison,national_team_current_or_past,national_team_country,current_national_player,no_current_national_player,national_team_appearances,international_reputation,overall,potential,preferred_foot,weak_foot,skill_moves,work_rate,pace,shooting,passing,dribbling,defending,physic,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,112000000.0,Lionel Messi,Lionel Andrés Messi Cuccittini,1987,32,170,72,Argentina,Rechtsaußen,"RW, CF, ST",FC Barcelona,LaLiga,1,1,Argentinien,1,0,138,5,94,94,Left,4,4,Medium/Low,87,92,92,96,39,66,"Beat Offside Trap, Argues with Officials, Earl...",88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,89,89,89,93,93,93,93,93,93,93,93,92,87,87,87,92,68,66,66,66,68,63,52,52,52,63
1,60000000.0,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,1985,35,187,83,Portugal,Linksaußen,"ST, LW",Juventus Turin,Serie A,1,1,Portugal,1,0,164,5,93,93,Right,4,5,High/Low,90,93,82,89,35,78,"Long Throw-in, Selfish, Argues with Officials,...",84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,91,91,91,89,90,90,90,89,88,88,88,88,81,81,81,88,65,61,61,61,65,61,53,53,53,61
2,128000000.0,Neymar,Neymar da Silva Santos Junior,1992,28,175,68,Brazil,Linksaußen,"LW, CAM",FC Paris Saint-Germain,Ligue 1,1,1,Brasilien,1,0,102,5,92,92,Right,5,5,High/Medium,91,85,87,95,32,58,"Power Free-Kick, Injury Free, Selfish, Early C...",87,87,62,87,87,96,88,87,81,95,94,89,96,92,84,80,61,81,49,84,51,36,87,90,90,94,27,26,29,84,84,84,90,89,89,89,90,90,90,90,89,82,82,82,89,66,61,61,61,66,61,46,46,46,61


## Nationality and continent

In [4]:
# 1. Nationality and National team country
translations = pd.read_csv("data/countries_translations_ger_eng.csv")
translations_dict = pd.Series(translations.tm_english.values,index=translations.tm_deutsch).to_dict()
translations_dict


{'Algerien': 'Algeria',
 'Angola': 'Angola',
 'Benin': 'Benin',
 'Burkina Faso': 'Burkina Faso',
 'Burundi': 'Burundi',
 'Kamerun': 'Cameroon',
 'Kap Verde': 'Cape Verde',
 'Komoren': 'Comoros',
 'Ägypten': 'Egypt',
 'Eritrea': 'Eritrea',
 'Gabun': 'Gabon',
 'Ghana': 'Ghana',
 'Guinea': 'Guinea',
 'Kenia': 'Kenya',
 'Liberia': 'Liberia',
 'Libyen': 'Libya',
 'Madagaskar': 'Madagascar',
 'Malawi': 'Malawi',
 'Mali': 'Mali',
 'Mauretanien': 'Mauritania',
 'Marokko': 'Morocco',
 'Mosambik': 'Mozambique',
 'Namibia': 'Namibia',
 'Niger': 'Niger',
 'Nigeria': 'Nigeria',
 'Ruanda': 'Rwanda',
 'Senegal': 'Senegal',
 'Sierra Leone': 'Sierra Leone',
 'Südafrika': 'South Africa',
 'Südsudan': 'South Sudan',
 'Togo': 'Togo',
 'Tunesien': 'Tunisia',
 'Uganda': 'Uganda',
 'Sambia': 'Zambia',
 'Simbabwe': 'Zimbabwe',
 'Äquatorialguinea': 'Equatorial Guinea',
 'Afghanistan': 'Afghanistan',
 'Armenien': 'Armenia',
 'Aserbaidschan': 'Azerbaijan',
 'Bahrain': 'Bahrain',
 'China': 'China PR',
 'Georgien'

In [5]:
df_fp["national_team_country"] = df_fp["national_team_country"].replace(translations_dict)
df_gk["national_team_country"] = df_gk["national_team_country"].replace(translations_dict)

df_fp.head(3)

Unnamed: 0,market_value_€,player_name,long_name,year_of_birth,player_age,height_cm,weight_kg,nationality,main_position,all_positions,club,league,divison,national_team_current_or_past,national_team_country,current_national_player,no_current_national_player,national_team_appearances,international_reputation,overall,potential,preferred_foot,weak_foot,skill_moves,work_rate,pace,shooting,passing,dribbling,defending,physic,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,112000000.0,Lionel Messi,Lionel Andrés Messi Cuccittini,1987,32,170,72,Argentina,Rechtsaußen,"RW, CF, ST",FC Barcelona,LaLiga,1,1,Argentina,1,0,138,5,94,94,Left,4,4,Medium/Low,87,92,92,96,39,66,"Beat Offside Trap, Argues with Officials, Earl...",88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,89,89,89,93,93,93,93,93,93,93,93,92,87,87,87,92,68,66,66,66,68,63,52,52,52,63
1,60000000.0,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,1985,35,187,83,Portugal,Linksaußen,"ST, LW",Juventus Turin,Serie A,1,1,Portugal,1,0,164,5,93,93,Right,4,5,High/Low,90,93,82,89,35,78,"Long Throw-in, Selfish, Argues with Officials,...",84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,91,91,91,89,90,90,90,89,88,88,88,88,81,81,81,88,65,61,61,61,65,61,53,53,53,61
2,128000000.0,Neymar,Neymar da Silva Santos Junior,1992,28,175,68,Brazil,Linksaußen,"LW, CAM",FC Paris Saint-Germain,Ligue 1,1,1,Brazil,1,0,102,5,92,92,Right,5,5,High/Medium,91,85,87,95,32,58,"Power Free-Kick, Injury Free, Selfish, Early C...",87,87,62,87,87,96,88,87,81,95,94,89,96,92,84,80,61,81,49,84,51,36,87,90,90,94,27,26,29,84,84,84,90,89,89,89,90,90,90,90,89,82,82,82,89,66,61,61,61,66,61,46,46,46,61


In [6]:
country_continents1 = pd.read_csv("data/countries_continent.csv")
country_continents1 = country_continents1.rename(columns={"country": "nationality"})

df_fp = pd.merge(df_fp,country_continents1,on='nationality')
df_gk = pd.merge(df_gk,country_continents1,on='nationality')

df_fp.head(3)

Unnamed: 0,market_value_€,player_name,long_name,year_of_birth,player_age,height_cm,weight_kg,nationality,main_position,all_positions,club,league,divison,national_team_current_or_past,national_team_country,current_national_player,no_current_national_player,national_team_appearances,international_reputation,overall,potential,preferred_foot,weak_foot,skill_moves,work_rate,pace,shooting,passing,dribbling,defending,physic,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,geographical_continent
0,112000000.0,Lionel Messi,Lionel Andrés Messi Cuccittini,1987,32,170,72,Argentina,Rechtsaußen,"RW, CF, ST",FC Barcelona,LaLiga,1,1,Argentina,1,0,138,5,94,94,Left,4,4,Medium/Low,87,92,92,96,39,66,"Beat Offside Trap, Argues with Officials, Earl...",88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,89,89,89,93,93,93,93,93,93,93,93,92,87,87,87,92,68,66,66,66,68,63,52,52,52,63,South America
1,52000000.0,Sergio Agüero,Sergio Leonel Agüero del Castillo,1988,32,173,70,Argentina,Mittelstürmer,ST,Manchester City,Premier League,1,1,Argentina,1,0,97,4,89,89,Right,4,4,High/Medium,80,90,77,88,33,74,"Avoids Using Weaker Foot, Outside Foot Shot",70,93,78,83,85,88,83,73,64,89,82,78,84,92,91,89,81,79,74,84,65,24,93,83,83,90,30,29,24,87,87,87,85,87,87,87,85,85,85,85,83,77,77,77,83,60,58,58,58,60,56,50,50,50,56,South America
2,72000000.0,Paulo Dybala,Paulo Bruno Exequiel Dybala,1993,26,177,75,Argentina,Hängende Spitze,"CAM, RW",Juventus Turin,Serie A,1,1,Argentina,1,0,29,3,88,92,Left,3,4,Medium/Medium,83,82,84,90,43,64,"Beat Offside Trap, Selfish, Finesse Shot, Spee...",82,80,64,87,88,90,88,88,75,93,86,81,91,84,85,80,75,79,61,86,48,42,80,87,86,84,32,48,40,80,80,80,85,85,85,85,85,86,86,86,85,80,80,80,85,68,64,64,64,68,64,54,54,54,64,South America


In [8]:
def movecol(df, cols_to_move=[], ref_col='', place='After'):
    
    cols = df.columns.tolist()    
    if place == 'After':
        seg1 = cols[:list(cols).index(ref_col) + 1]
        seg2 = cols_to_move
    if place == 'Before':
        seg1 = cols[:list(cols).index(ref_col)]
        seg2 = cols_to_move + [ref_col]
    
    seg1 = [i for i in seg1 if i not in seg2]
    seg3 = [i for i in cols if i not in seg1 + seg2]
    
    return(df[seg1 + seg2 + seg3])

df_fp = movecol(df_fp, cols_to_move=["geographical_continent"], ref_col="nationality", place="After")
df_gk = movecol(df_gk, cols_to_move=["geographical_continent"], ref_col="nationality", place="After")

df_fp = df_fp.sort_values(by='overall', ascending=False)
df_gk = df_gk.sort_values(by='overall', ascending=False)
df_fp = df_fp.reset_index(drop=True)
df_gk = df_gk.reset_index(drop=True)

df_fp.head(3)

Unnamed: 0,market_value_€,player_name,long_name,year_of_birth,player_age,height_cm,weight_kg,nationality,geographical_continent,main_position,all_positions,club,league,divison,national_team_current_or_past,national_team_country,current_national_player,no_current_national_player,national_team_appearances,international_reputation,overall,potential,preferred_foot,weak_foot,skill_moves,work_rate,pace,shooting,passing,dribbling,defending,physic,player_traits,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,skill_dribbling,skill_curve,skill_fk_accuracy,skill_long_passing,skill_ball_control,movement_acceleration,movement_sprint_speed,movement_agility,movement_reactions,movement_balance,power_shot_power,power_jumping,power_stamina,power_strength,power_long_shots,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,ls,st,rs,lw,lf,cf,rf,rw,lam,cam,ram,lm,lcm,cm,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,112000000.0,Lionel Messi,Lionel Andrés Messi Cuccittini,1987,32,170,72,Argentina,South America,Rechtsaußen,"RW, CF, ST",FC Barcelona,LaLiga,1,1,Argentina,1,0,138,5,94,94,Left,4,4,Medium/Low,87,92,92,96,39,66,"Beat Offside Trap, Argues with Officials, Earl...",88,95,70,92,88,97,93,94,92,96,91,84,93,95,95,86,68,75,68,94,48,40,94,94,75,96,33,37,26,89,89,89,93,93,93,93,93,93,93,93,92,87,87,87,92,68,66,66,66,68,63,52,52,52,63
1,60000000.0,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,1985,35,187,83,Portugal,Europe,Linksaußen,"ST, LW",Juventus Turin,Serie A,1,1,Portugal,1,0,164,5,93,93,Right,4,5,High/Low,90,93,82,89,35,78,"Long Throw-in, Selfish, Argues with Officials,...",84,94,89,83,87,89,81,76,77,92,89,91,87,96,71,95,95,85,78,93,63,29,95,82,85,95,28,32,24,91,91,91,89,90,90,90,89,88,88,88,88,81,81,81,88,65,61,61,61,65,61,53,53,53,61
2,128000000.0,Neymar,Neymar da Silva Santos Junior,1992,28,175,68,Brazil,South America,Linksaußen,"LW, CAM",FC Paris Saint-Germain,Ligue 1,1,1,Brazil,1,0,102,5,92,92,Right,5,5,High/Medium,91,85,87,95,32,58,"Power Free-Kick, Injury Free, Selfish, Early C...",87,87,62,87,87,96,88,87,81,95,94,89,96,92,84,80,61,81,49,84,51,36,87,90,90,94,27,26,29,84,84,84,90,89,89,89,90,90,90,90,89,82,82,82,89,66,61,61,61,66,61,46,46,46,61


#### Work to do next notebook
- truncate dataframe
- do second EDA, after dataframe has been truncated and features have been engineered

#### further work
- build preprocessing pipeline for numerical and categorical vars separately