In [14]:
import pandas as pd
import numpy as np

import pandas_profiling
from pandas_profiling.utils.cache import cache_file

from pathlib import Path

import datetime

In [2]:
# Loading in csv files into DataFrames to explore
# First, we will load our data sets containing transfer data

transfers_2000_19 = pd.read_csv('transfers.csv', delimiter=';')
transfers_2007_17 = pd.read_csv('transfer_data.csv')
transfers_top_250 = pd.read_csv('top250-00-19.csv')

In [3]:
transfers_2000_19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111602 entries, 0 to 111601
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   111602 non-null  int64  
 1   player_id            111602 non-null  int64  
 2   player_name          111602 non-null  object 
 3   season               111602 non-null  int64  
 4   date                 111530 non-null  object 
 5   from_club_id         111602 non-null  int64  
 6   from_club_name       111602 non-null  object 
 7   to_club_id           111602 non-null  int64  
 8   to_club_name         111602 non-null  object 
 9   market_value         72589 non-null   float64
 10  fee                  42799 non-null   float64
 11  from_coach_name      38687 non-null   object 
 12  to_coach_name        38690 non-null   object 
 13  from_sport_dir_name  17854 non-null   object 
 14  to_sport_dir_name    18226 non-null   object 
 15  contract_was_till

In [4]:
transfers_2007_17.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6237 entries, 0 to 6236
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PLAYER       6236 non-null   object 
 1   WINDOW       6236 non-null   object 
 2   POSITION     5109 non-null   object 
 3   COUNTRY      5411 non-null   object 
 4   FROM         6235 non-null   object 
 5   TO           6236 non-null   object 
 6   DESCRIPTION  6236 non-null   object 
 7   PRICE        6236 non-null   float64
 8   LEAGUE       6236 non-null   object 
 9   SEASON       6236 non-null   object 
dtypes: float64(1), object(9)
memory usage: 487.4+ KB


In [5]:
transfers_top_250.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4700 entries, 0 to 4699
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          4700 non-null   object 
 1   Position      4700 non-null   object 
 2   Age           4700 non-null   int64  
 3   Team_from     4700 non-null   object 
 4   League_from   4700 non-null   object 
 5   Team_to       4700 non-null   object 
 6   League_to     4700 non-null   object 
 7   Season        4700 non-null   object 
 8   Market_value  3440 non-null   float64
 9   Transfer_fee  4700 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 367.3+ KB


In [6]:
transfers_2000_19.head()

Unnamed: 0,id,player_id,player_name,season,date,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,from_coach_name,to_coach_name,from_sport_dir_name,to_sport_dir_name,contract_was_till,is_loan,is_end_of_loan,is_future_transfer
0,1,1,Jermaine Beckford,2017,2017-07-01,391,Preston NE,392,Bury,500000.0,0.0,,,,,,0,0,0
1,2,1,Jermaine Beckford,2015,2015-07-01,289,Bolton,391,Preston NE,750000.0,0.0,,,,,,0,0,0
2,3,1,Jermaine Beckford,2014,2015-06-30,391,Preston NE,289,Bolton,750000.0,,,,,,,0,1,0
3,4,1,Jermaine Beckford,2014,2014-11-20,289,Bolton,391,Preston NE,1200000.0,,Neil Lennon,Simon Grayson,,,2015-06-30,1,0,0
4,5,1,Jermaine Beckford,2013,2013-07-17,271,Leicester,289,Bolton,1500000.0,,Nigel Pearson,Dougie Freedman,,,2015-06-30,0,0,0


In [7]:
transfers_2000_19.columns

Index(['id', 'player_id', 'player_name', 'season', 'date', 'from_club_id',
       'from_club_name', 'to_club_id', 'to_club_name', 'market_value', 'fee',
       'from_coach_name', 'to_coach_name', 'from_sport_dir_name',
       'to_sport_dir_name', 'contract_was_till', 'is_loan', 'is_end_of_loan',
       'is_future_transfer'],
      dtype='object')

We see that the dataframe 'transfers_2000_19' contains columns that we will likely not need. These include:
* from_coach_name
* to_coach_name
* from_sport_dir_name
* to_sport_dir_name
* contract_was_till

Additionally, we may need some information from the following columns to narrow our search for transfers only.
* is_loan
* is_end_of_loan
* is_future_transfer

In [13]:
cols = ['from_coach_name', 'to_coach_name', 'from_sport_dir_name', 'to_sport_dir_name', 'contract_was_till']
transfers_2000_19 = transfers_2000_19.drop(cols, axis=1)
transfers_2000_19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111602 entries, 0 to 111601
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  111602 non-null  int64  
 1   player_id           111602 non-null  int64  
 2   player_name         111602 non-null  object 
 3   season              111602 non-null  int64  
 4   date                111530 non-null  object 
 5   from_club_id        111602 non-null  int64  
 6   from_club_name      111602 non-null  object 
 7   to_club_id          111602 non-null  int64  
 8   to_club_name        111602 non-null  object 
 9   market_value        72589 non-null   float64
 10  fee                 42799 non-null   float64
 11  is_loan             111602 non-null  int64  
 12  is_end_of_loan      111602 non-null  int64  
 13  is_future_transfer  111602 non-null  int64  
dtypes: float64(2), int64(8), object(4)
memory usage: 11.9+ MB


We now see that the largest null values occur in the fee and market value columns, as well as a handfull missing the date information. Since the season is included for every entry in the dataframe, we will remove the date column, as the information it provides is not necessary.

In [24]:
transfers_2000_19 = transfers_2000_19.drop('date',axis=1)

In [29]:
transfers_2000_19[(transfers_2000_19['fee'].isnull())&(transfers_2000_19['is_loan'] == True)].count()

id                    15296
player_id             15296
player_name           15296
season                15296
from_club_id          15296
from_club_name        15296
to_club_id            15296
to_club_name          15296
market_value          11475
fee                       0
is_loan               15296
is_end_of_loan        15296
is_future_transfer    15296
dtype: int64

In [30]:
transfers_2000_19[transfers_2000_19['player_name'].str.contains('Coutinho')]

Unnamed: 0,id,player_id,player_name,season,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,is_loan,is_end_of_loan,is_future_transfer
6828,6829,828,Philippe Coutinho,2017,267,Liverpool,79,FC Barcelona,90000000.0,135000000.0,0,0,0
6829,6830,828,Philippe Coutinho,2012,198,Inter,267,Liverpool,9000000.0,13000000.0,0,0,0
6830,6831,828,Philippe Coutinho,2011,91,Espanyol,198,Inter,7000000.0,,0,1,0
6831,6832,828,Philippe Coutinho,2011,198,Inter,91,Espanyol,8000000.0,,1,0,0
6832,6833,828,Philippe Coutinho,2009,1024,Vasco da Gama,198,Inter,2500000.0,,0,1,0
6833,6834,828,Philippe Coutinho,2008,198,Inter,1024,Vasco da Gama,,,1,0,0
6834,6835,828,Philippe Coutinho,2008,1932,Vasco U17,198,Inter,,3800000.0,0,0,0
109985,109986,11214,Douglas Coutinho,2018,2444,Fortaleza,863,Athletico-PR,700000.0,,0,1,0
109986,109987,11214,Douglas Coutinho,2018,863,Athletico-PR,2444,Fortaleza,700000.0,,1,0,0
109987,109988,11214,Douglas Coutinho,2018,2818,Ceará SC,863,Athletico-PR,700000.0,,0,1,0


In [33]:
transfers_2000_19.groupby('is_future_transfer')['is_future_transfer'].count()

is_future_transfer
0    110669
1       933
Name: is_future_transfer, dtype: int64

In [36]:
transfers_2000_19.groupby('from_club_name')['from_club_name'].count().sort_values(ascending=False).head(10)

from_club_name
Without Club    3685
Inter            469
Juventus         467
FC Porto         455
Genoa            449
Benfica          424
AS Roma          410
Sporting CP      405
Chelsea          382
Atalanta         372
Name: from_club_name, dtype: int64

In [38]:
transfers_2000_19.groupby('from_club_name')['fee'].mean().sort_values(ascending=False).head(10)

from_club_name
TJ Tianhai         2.000000e+07
AC Parma           1.167955e+07
Atlanta United     8.000000e+06
Real Madrid        7.969754e+06
FC Barcelona       7.866585e+06
Shakhtar D.        7.246489e+06
Chelsea            6.336169e+06
Monaco             6.327912e+06
Liverpool          6.180260e+06
BJ Sinobo Guoan    6.110000e+06
Name: fee, dtype: float64

In [39]:
transfers_2000_19[transfers_2000_19['from_club_name'] == 'TJ Tianhai']

Unnamed: 0,id,player_id,player_name,season,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,is_loan,is_end_of_loan,is_future_transfer
5428,5429,653,Axel Witsel,2018,1464,TJ Tianhai,11,Bor. Dortmund,20000000.0,20000000.0,0,0,0
16672,16673,1943,Anthony Modeste,2018,1464,TJ Tianhai,425,Without Club,16000000.0,,0,0,0


In [41]:
transfers_2000_19.groupby('player_name')['fee'].count().sort_values(ascending=False).head(10)

player_name
Júlio César           35
Marcelinho Paraíba    24
Paulinho              24
Danilo                24
Adriano               23
Zé Roberto            23
Aílton                22
Marcinho              20
Elias                 20
Pelé                  20
Name: fee, dtype: int64

In [56]:
transfers_2000_19[transfers_2000_19['season'] >= 2009].groupby('season')['fee'].mean().sort_values(ascending=False)

season
2019    8.076136e+06
2020    3.000000e+06
2018    2.521420e+06
2017    2.428274e+06
2016    1.940652e+06
2015    1.574316e+06
2013    1.386053e+06
2014    1.294176e+06
2009    1.253007e+06
2012    1.113612e+06
2011    1.106257e+06
2010    1.046262e+06
Name: fee, dtype: float64

In [58]:
IM_sales = transfers_2000_19[transfers_2000_19['from_club_name'] == 'Inter']
IM_sales.sort_values('fee',ascending=False).head()

Unnamed: 0,id,player_id,player_name,season,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,is_loan,is_end_of_loan,is_future_transfer
878,879,98,Zlatan Ibrahimovic,2009,198,Inter,79,FC Barcelona,45000000.0,69500000.0,0,0,0
5358,5359,644,Ronaldo,2002,198,Inter,82,Real Madrid,,45000000.0,0,0,0
5332,5333,641,Mateo Kovacic,2015,198,Inter,82,Real Madrid,22000000.0,38000000.0,0,0,0
2586,2587,308,Mario Balotelli,2010,198,Inter,286,Man City,26000000.0,29500000.0,0,0,0
2289,2290,272,Samuel Eto'o,2011,198,Inter,312,Anzhi,42000000.0,27000000.0,0,0,0


In [61]:
IM_sales[(IM_sales['fee'] < 15000000) & (IM_sales['fee'] >= 5000000)].head()

Unnamed: 0,id,player_id,player_name,season,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,is_loan,is_end_of_loan,is_future_transfer
1813,1814,217,Maxwell,2009,198,Inter,79,FC Barcelona,13500000.0,5000000.0,0,0,0
2150,2151,254,Éver Banega,2017,198,Inter,97,Sevilla FC,16000000.0,7000000.0,0,0,0
2448,2449,292,Emre Belözoglu,2005,198,Inter,274,Newcastle,12000000.0,5000000.0,0,0,0
2548,2549,303,Andy van der Meyde,2005,198,Inter,276,Everton,12500000.0,9000000.0,0,0,0
2784,2785,333,Ricardo Quaresma,2010,198,Inter,121,Besiktas,12000000.0,7300000.0,0,0,0


Now, we read in the individual and team stats files to have a look.

In [66]:
player_stats = pd.read_csv('stats_of_players.csv', delimiter=';')
clubs_df = pd.read_csv('clubs_in_leagues.csv', delimiter=';')

player_dict = pd.read_csv('dict_players.csv', delimiter=';')
club_dict = pd.read_csv('dict_clubs.csv', delimiter=';')

In [63]:
player_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231379 entries, 0 to 231378
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                231379 non-null  int64  
 1   player_id         231379 non-null  int64  
 2   player_name       231379 non-null  object 
 3   season            231379 non-null  int64  
 4   league_id         231379 non-null  int64  
 5   league_name       231379 non-null  object 
 6   club_id           231379 non-null  int64  
 7   club_name         231379 non-null  object 
 8   apps              231379 non-null  int64  
 9   points_per_match  225422 non-null  float64
 10  goals             166957 non-null  float64
 11  assists           159543 non-null  float64
 12  conceded_goals    118416 non-null  float64
 13  clean_sheets      117263 non-null  float64
 14  yellow_card       190724 non-null  float64
 15  two_yellow_cards  117566 non-null  float64
 16  red_card          11

In [64]:
clubs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3578 entries, 0 to 3577
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       3578 non-null   int64  
 1   id               3578 non-null   int64  
 2   club_id          3578 non-null   int64  
 3   club_name        3578 non-null   object 
 4   league_id        3578 non-null   int64  
 5   season           3578 non-null   int64  
 6   matches_played   3578 non-null   int64  
 7   matches_overall  3578 non-null   int64  
 8   wins             3578 non-null   int64  
 9   draws            3578 non-null   int64  
 10  loses            3578 non-null   int64  
 11  goals_scored     3578 non-null   int64  
 12  goals_cons       3578 non-null   int64  
 13  goals_diff       3578 non-null   int64  
 14  points           3578 non-null   int64  
 15  place            3578 non-null   int64  
 16  qualified_to     1283 non-null   object 
 17  is_champion   

In [65]:
player_stats.head()

Unnamed: 0,id,player_id,player_name,season,league_id,league_name,club_id,club_name,apps,points_per_match,goals,assists,conceded_goals,clean_sheets,yellow_card,two_yellow_cards,red_card,minutes_played
0,1,1,Jermaine Beckford,2011,1,Premier League,276,Everton FC,2,1.5,,,,,,,,68
1,2,1,Jermaine Beckford,2010,1,Premier League,276,Everton FC,32,1.34,8.0,2.0,,,3.0,,,1320
2,3,1,Jermaine Beckford,2018,59,Others,392,Bury FC,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
3,4,1,Jermaine Beckford,2017,59,Others,392,Bury FC,16,0.68,8.0,3.0,0.0,0.0,2.0,0.0,0.0,1303
4,5,1,Jermaine Beckford,2016,59,Others,391,Preston North End,18,0.89,1.0,3.0,0.0,0.0,0.0,0.0,2.0,543


In [67]:
player_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11382 entries, 0 to 11381
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           11382 non-null  int64  
 1   name                         11382 non-null  object 
 2   original_name                6456 non-null   object 
 3   club_id                      11382 non-null  int64  
 4   club_name                    11382 non-null  object 
 5   position_main                11362 non-null  object 
 6   other_positions              8043 non-null   object 
 7   nationality_name             11382 non-null  object 
 8   nationality_code             11088 non-null  object 
 9   other_nationality_name       3427 non-null   object 
 10  other_nationality_code       3215 non-null   object 
 11  date_of_birth                11362 non-null  object 
 12  place_of_birth_name          11278 non-null  object 
 13  place_of_birth_c

In [68]:
club_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9381 entries, 0 to 9380
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             9381 non-null   int64  
 1   name           9381 non-null   object 
 2   off_name       390 non-null    object 
 3   country_id     390 non-null    object 
 4   city           517 non-null    object 
 5   stadium        517 non-null    object 
 6   stadium_id     517 non-null    float64
 7   is_first_team  9381 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 586.4+ KB


In [69]:
club_dict.head()

Unnamed: 0,id,name,off_name,country_id,city,stadium,stadium_id,is_first_team
0,1,Bayern Munich,Bayern Munich,DEU,,,,1
1,2,Bay. Leverkusen,Bayer 04 Leverkusen,DEU,,,,1
2,3,Hamburger SV,Hamburger SV,DEU,,,,1
3,4,1860 Munich,TSV 1860 Munich,DEU,,,,1
4,5,1.FC K'lautern,1.FC Kaiserslautern,DEU,,,,1


In [70]:
player_dict.head()

Unnamed: 0,id,name,original_name,club_id,club_name,position_main,other_positions,nationality_name,nationality_code,other_nationality_name,...,place_of_birth_country_code,foot,height,player_agent,joined,contract_until,outfiter,last_extention,contract_options,current_market_value
0,1,Jermaine Beckford,Jermaine Paul Alexander Beckford,289,Bolton,Centre-Forward,Right Winger,Jamaica,JAM,England,...,GBR,right,188.0,,2017-07-01,2019-06-30,,,,500000.0
1,2,Harry Charsley,Henry William James Charsley,289,Bolton,Central Midfield,"Right Midfield, Left Midfield",Ireland,IRL,England,...,GBR,right,,,2014-07-01,2019-06-30,,2016-07-15,,250000.0
2,3,Mark Davies,Mark Nicholas Davies,289,Bolton,Central Midfield,"Attacking Midfield, Right Midfield",England,GBR,,...,GBR,right,180.0,,2017-07-01,,,,,750000.0
3,4,Alex McQuade,Alexander Michael McQuade,289,Bolton,Centre-Back,Left-Back,England,GBR,,...,GBR,left,,,2018-07-01,,,,,50000.0
4,5,Przemyslaw Kazimierczak,Przemysław Kazimierczak,289,Bolton,Goalkeeper,,Poland,POL,,...,POL,right,191.0,SOLSPORT,2018-08-08,,,,,50000.0


We can definitely drop several columns from the player_dict to help condense what we would like to see.

In [72]:
player_dict.columns

Index(['id', 'name', 'original_name', 'club_id', 'club_name', 'position_main',
       'other_positions', 'nationality_name', 'nationality_code',
       'other_nationality_name', 'other_nationality_code', 'date_of_birth',
       'place_of_birth_name', 'place_of_birth_country_name',
       'place_of_birth_country_code', 'foot', 'height', 'player_agent',
       'joined', 'contract_until', 'outfiter', 'last_extention',
       'contract_options', 'current_market_value'],
      dtype='object')

In [73]:
cols = ['other_nationality_name', 'other_nationality_code', 'player_agent', 'contract_until', 'outfiter', 'last_extention',
       'contract_options']

player_dict = player_dict.drop(cols, axis=1)

player_dict.head()

Unnamed: 0,id,name,original_name,club_id,club_name,position_main,other_positions,nationality_name,nationality_code,date_of_birth,place_of_birth_name,place_of_birth_country_name,place_of_birth_country_code,foot,height,joined,current_market_value
0,1,Jermaine Beckford,Jermaine Paul Alexander Beckford,289,Bolton,Centre-Forward,Right Winger,Jamaica,JAM,1983-12-09,London,England,GBR,right,188.0,2017-07-01,500000.0
1,2,Harry Charsley,Henry William James Charsley,289,Bolton,Central Midfield,"Right Midfield, Left Midfield",Ireland,IRL,1996-11-01,Wirral,England,GBR,right,,2014-07-01,250000.0
2,3,Mark Davies,Mark Nicholas Davies,289,Bolton,Central Midfield,"Attacking Midfield, Right Midfield",England,GBR,1988-02-18,Willenhall,England,GBR,right,180.0,2017-07-01,750000.0
3,4,Alex McQuade,Alexander Michael McQuade,289,Bolton,Centre-Back,Left-Back,England,GBR,1992-11-07,Manchester,England,GBR,left,,2018-07-01,50000.0
4,5,Przemyslaw Kazimierczak,Przemysław Kazimierczak,289,Bolton,Goalkeeper,,Poland,POL,1988-05-05,Łódź,Poland,POL,right,191.0,2018-08-08,50000.0


In [76]:
player_dict = player_dict[player_dict['position_main'].notnull()]

In [77]:
player_stats.columns

Index(['id', 'player_id', 'player_name', 'season', 'league_id', 'league_name',
       'club_id', 'club_name', 'apps', 'points_per_match', 'goals', 'assists',
       'conceded_goals', 'clean_sheets', 'yellow_card', 'two_yellow_cards',
       'red_card', 'minutes_played'],
      dtype='object')

In [78]:
player_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231379 entries, 0 to 231378
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                231379 non-null  int64  
 1   player_id         231379 non-null  int64  
 2   player_name       231379 non-null  object 
 3   season            231379 non-null  int64  
 4   league_id         231379 non-null  int64  
 5   league_name       231379 non-null  object 
 6   club_id           231379 non-null  int64  
 7   club_name         231379 non-null  object 
 8   apps              231379 non-null  int64  
 9   points_per_match  225422 non-null  float64
 10  goals             166957 non-null  float64
 11  assists           159543 non-null  float64
 12  conceded_goals    118416 non-null  float64
 13  clean_sheets      117263 non-null  float64
 14  yellow_card       190724 non-null  float64
 15  two_yellow_cards  117566 non-null  float64
 16  red_card          11

In [79]:
player_stats[player_stats['goals'].isnull()].head()

Unnamed: 0,id,player_id,player_name,season,league_id,league_name,club_id,club_name,apps,points_per_match,goals,assists,conceded_goals,clean_sheets,yellow_card,two_yellow_cards,red_card,minutes_played
0,1,1,Jermaine Beckford,2011,1,Premier League,276,Everton FC,2,1.5,,,,,,,,68
16,17,2,Harry Charsley,2017,57,Europa League,276,Everton FC,1,3.0,,,,,,,,90
25,26,3,Mark Davies,2009,1,Premier League,289,Bolton Wanderers,17,0.71,,,,,1.0,,,553
26,27,3,Mark Davies,2008,1,Premier League,289,Bolton Wanderers,10,1.2,,2.0,,,3.0,,,758
51,52,6,Jaroslaw Fojut,2017,19,Ekstraklasa,423,Pogon Szczecin,17,0.94,,,,,5.0,,,1526
