In [225]:
import pandas as pd
import numpy as np

import pandas_profiling
from pandas_profiling.utils.cache import cache_file

from pathlib import Path

import datetime

In [226]:
# Read the semicolon-separated transfer records into a DataFrame so they can
# be explored in the cells below.
transfers = pd.read_csv('transfers.csv', sep=';')

In [227]:
# Summarise the dtypes and non-null counts of the freshly loaded transfers table.
transfers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111602 entries, 0 to 111601
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   111602 non-null  int64  
 1   player_id            111602 non-null  int64  
 2   player_name          111602 non-null  object 
 3   season               111602 non-null  int64  
 4   date                 111530 non-null  object 
 5   from_club_id         111602 non-null  int64  
 6   from_club_name       111602 non-null  object 
 7   to_club_id           111602 non-null  int64  
 8   to_club_name         111602 non-null  object 
 9   market_value         72589 non-null   float64
 10  fee                  42799 non-null   float64
 11  from_coach_name      38687 non-null   object 
 12  to_coach_name        38690 non-null   object 
 13  from_sport_dir_name  17854 non-null   object 
 14  to_sport_dir_name    18226 non-null   object 
 15  contract_was_till

In [228]:
# Preview the first rows of the raw transfers table.
transfers.head()

Unnamed: 0,id,player_id,player_name,season,date,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,from_coach_name,to_coach_name,from_sport_dir_name,to_sport_dir_name,contract_was_till,is_loan,is_end_of_loan,is_future_transfer
0,1,1,Jermaine Beckford,2017,2017-07-01,391,Preston NE,392,Bury,500000.0,0.0,,,,,,0,0,0
1,2,1,Jermaine Beckford,2015,2015-07-01,289,Bolton,391,Preston NE,750000.0,0.0,,,,,,0,0,0
2,3,1,Jermaine Beckford,2014,2015-06-30,391,Preston NE,289,Bolton,750000.0,,,,,,,0,1,0
3,4,1,Jermaine Beckford,2014,2014-11-20,289,Bolton,391,Preston NE,1200000.0,,Neil Lennon,Simon Grayson,,,2015-06-30,1,0,0
4,5,1,Jermaine Beckford,2013,2013-07-17,271,Leicester,289,Bolton,1500000.0,,Nigel Pearson,Dougie Freedman,,,2015-06-30,0,0,0


In [229]:
# List the column labels so that unneeded columns can be picked out by name.
transfers.columns

Index(['id', 'player_id', 'player_name', 'season', 'date', 'from_club_id',
       'from_club_name', 'to_club_id', 'to_club_name', 'market_value', 'fee',
       'from_coach_name', 'to_coach_name', 'from_sport_dir_name',
       'to_sport_dir_name', 'contract_was_till', 'is_loan', 'is_end_of_loan',
       'is_future_transfer'],
      dtype='object')

We see that the dataframe 'transfers' contains columns that we will likely not need. These include:
* from_coach_name
* to_coach_name
* from_sport_dir_name
* to_sport_dir_name
* contract_was_till

Additionally, we may need some information from the following columns to narrow our search for transfers only.
* is_loan
* is_end_of_loan
* is_future_transfer

In [230]:
# Coach, sporting-director and contract-expiry columns are not needed for the
# analysis, so they are removed from the transfers table.
cols = [
    'from_coach_name',
    'to_coach_name',
    'from_sport_dir_name',
    'to_sport_dir_name',
    'contract_was_till',
]
transfers = transfers.drop(columns=cols)
transfers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111602 entries, 0 to 111601
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  111602 non-null  int64  
 1   player_id           111602 non-null  int64  
 2   player_name         111602 non-null  object 
 3   season              111602 non-null  int64  
 4   date                111530 non-null  object 
 5   from_club_id        111602 non-null  int64  
 6   from_club_name      111602 non-null  object 
 7   to_club_id          111602 non-null  int64  
 8   to_club_name        111602 non-null  object 
 9   market_value        72589 non-null   float64
 10  fee                 42799 non-null   float64
 11  is_loan             111602 non-null  int64  
 12  is_end_of_loan      111602 non-null  int64  
 13  is_future_transfer  111602 non-null  int64  
dtypes: float64(2), int64(8), object(4)
memory usage: 11.9+ MB


We now see that most of the null values occur in the fee and market value columns, and that a handful of rows are missing the date. Since the season is included for every entry in the dataframe, we will remove the date column, as the information it provides is not necessary.

In [231]:
# `season` is present on every row, so the partly-missing `date` column is dropped.
transfers = transfers.drop(columns=['date'])

Next, we will remove any data relating to loans or free-agent signings, as we are only concerned with the sales of players to get an idea for pricing.

In [232]:
# Keep only the rows that are not loans (is_loan == 0), then discard the flag,
# which is constant after the filter.
# The drop is chained and reassigned instead of using inplace=True: mutating a
# filtered selection in place can raise SettingWithCopyWarning and makes the
# cell harder to reason about on re-runs.
transfers = (
    transfers
    .loc[transfers['is_loan'] == 0]
    .drop(columns='is_loan')
)
transfers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94312 entries, 0 to 111601
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  94312 non-null  int64  
 1   player_id           94312 non-null  int64  
 2   player_name         94312 non-null  object 
 3   season              94312 non-null  int64  
 4   from_club_id        94312 non-null  int64  
 5   from_club_name      94312 non-null  object 
 6   to_club_id          94312 non-null  int64  
 7   to_club_name        94312 non-null  object 
 8   market_value        59317 non-null  float64
 9   fee                 40805 non-null  float64
 10  is_end_of_loan      94312 non-null  int64  
 11  is_future_transfer  94312 non-null  int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 9.4+ MB


In [233]:
# Exclude the rows flagged as the end of a loan (is_end_of_loan != 0).
transfers = transfers.loc[transfers['is_end_of_loan'].eq(0)]
transfers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77045 entries, 0 to 111601
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  77045 non-null  int64  
 1   player_id           77045 non-null  int64  
 2   player_name         77045 non-null  object 
 3   season              77045 non-null  int64  
 4   from_club_id        77045 non-null  int64  
 5   from_club_name      77045 non-null  object 
 6   to_club_id          77045 non-null  int64  
 7   to_club_name        77045 non-null  object 
 8   market_value        45199 non-null  float64
 9   fee                 40805 non-null  float64
 10  is_end_of_loan      77045 non-null  int64  
 11  is_future_transfer  77045 non-null  int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 7.6+ MB


In [234]:
# The previous cell kept only rows with is_end_of_loan == 0, so the flag no
# longer carries information and is removed.
# Reassign rather than drop in place: `transfers` is a filtered selection, and
# an inplace drop on it can raise SettingWithCopyWarning.
transfers = transfers.drop(columns='is_end_of_loan')
transfers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77045 entries, 0 to 111601
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  77045 non-null  int64  
 1   player_id           77045 non-null  int64  
 2   player_name         77045 non-null  object 
 3   season              77045 non-null  int64  
 4   from_club_id        77045 non-null  int64  
 5   from_club_name      77045 non-null  object 
 6   to_club_id          77045 non-null  int64  
 7   to_club_name        77045 non-null  object 
 8   market_value        45199 non-null  float64
 9   fee                 40805 non-null  float64
 10  is_future_transfer  77045 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 7.1+ MB


We notice that there are still several transfers that don't involve a fee, which could be due to players being signed from free agency. We will explore this by examining the column labelled 'from_club_name'.

In [235]:
# Distinct origin clubs among the rows that have no fee recorded.
transfers.loc[transfers['fee'].isna(), 'from_club_name'].unique()

array(['Leicester', 'Own U19', 'Everton U18', ..., 'Evry Essonne',
       'SC Cambuur Jgd.', 'Füchse U19'], dtype=object)

In [236]:
# Summarise the rows whose origin club name contains 'Own' (e.g. 'Own U19').
transfers.loc[transfers['from_club_name'].str.contains('Own')].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 14 to 111491
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  145 non-null    int64  
 1   player_id           145 non-null    int64  
 2   player_name         145 non-null    object 
 3   season              145 non-null    int64  
 4   from_club_id        145 non-null    int64  
 5   from_club_name      145 non-null    object 
 6   to_club_id          145 non-null    int64  
 7   to_club_name        145 non-null    object 
 8   market_value        1 non-null      float64
 9   fee                 1 non-null      float64
 10  is_future_transfer  145 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 13.6+ KB


In [237]:
# Show the 'Own ...' origin-club rows that nevertheless have a fee recorded.
transfers.loc[
    transfers['from_club_name'].str.contains('Own') & transfers['fee'].notna()
]

Unnamed: 0,id,player_id,player_name,season,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,is_future_transfer
50660,50661,5449,Jackie McNamara,1990,396,Own U19,5942,Gairdoch United,,0.0,0


In [238]:
# Discard every row whose origin club name contains 'Own'.
transfers = transfers.loc[lambda df: ~df['from_club_name'].str.contains('Own')]

In [239]:
# Keep only the rows that have a fee recorded.
transfers = transfers.dropna(subset=['fee'])

In [240]:
# Check the row count and remaining nulls after removing 'Own ...' clubs and fee-less rows.
transfers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40804 entries, 0 to 111601
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  40804 non-null  int64  
 1   player_id           40804 non-null  int64  
 2   player_name         40804 non-null  object 
 3   season              40804 non-null  int64  
 4   from_club_id        40804 non-null  int64  
 5   from_club_name      40804 non-null  object 
 6   to_club_id          40804 non-null  int64  
 7   to_club_name        40804 non-null  object 
 8   market_value        29755 non-null  float64
 9   fee                 40804 non-null  float64
 10  is_future_transfer  40804 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 3.7+ MB


In [241]:
# Keep only the rows with a strictly positive fee (rows recorded with a fee of 0 are dropped).
transfers = transfers.loc[transfers['fee'].gt(0)]
transfers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15753 entries, 7 to 111597
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  15753 non-null  int64  
 1   player_id           15753 non-null  int64  
 2   player_name         15753 non-null  object 
 3   season              15753 non-null  int64  
 4   from_club_id        15753 non-null  int64  
 5   from_club_name      15753 non-null  object 
 6   to_club_id          15753 non-null  int64  
 7   to_club_name        15753 non-null  object 
 8   market_value        12329 non-null  float64
 9   fee                 15753 non-null  float64
 10  is_future_transfer  15753 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 1.4+ MB


In [242]:
# Where no market value is recorded, fall back to the fee of the same row.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `transfers['market_value']` is chained assignment on a column selection,
# which emits a FutureWarning in pandas 2.x and leaves `transfers` unchanged
# once Copy-on-Write is enabled.
transfers = transfers.assign(
    market_value=transfers['market_value'].fillna(transfers['fee'])
)
transfers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15753 entries, 7 to 111597
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  15753 non-null  int64  
 1   player_id           15753 non-null  int64  
 2   player_name         15753 non-null  object 
 3   season              15753 non-null  int64  
 4   from_club_id        15753 non-null  int64  
 5   from_club_name      15753 non-null  object 
 6   to_club_id          15753 non-null  int64  
 7   to_club_name        15753 non-null  object 
 8   market_value        15753 non-null  float64
 9   fee                 15753 non-null  float64
 10  is_future_transfer  15753 non-null  int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 1.4+ MB


In [243]:
# Preview the cleaned transfers table (paid, non-loan moves only).
transfers.head()

Unnamed: 0,id,player_id,player_name,season,from_club_id,from_club_name,to_club_id,to_club_name,market_value,fee,is_future_transfer
7,8,1,Jermaine Beckford,2011,276,Everton,271,Leicester,4000000.0,3400000.0,0
19,20,3,Mark Davies,2008,293,Wolves,289,Bolton,300000.0,600000.0,0
42,43,5,Przemyslaw Kazimierczak,2007,289,Bolton,419,Darlington,125000.0,125000.0,0
52,53,6,Jaroslaw Fojut,2008,289,Bolton,426,Slask Wroclaw,200000.0,125000.0,0
57,58,6,Jaroslaw Fojut,2004,429,MSP Szamotuly,289,Bolton,25000.0,25000.0,0


We now have a dataframe for transfers that only contains transfer information for players that were sold from one club to another.

Now, we read in the individual and team stats files to have a look.

In [244]:
# Per-season statistics tables for players and for clubs.
player_stats = pd.read_csv('stats_of_players.csv', sep=';')
clubs_df = pd.read_csv('clubs_in_leagues.csv', sep=';')

# Lookup tables describing each player and each club.
player_dict = pd.read_csv('dict_players.csv', sep=';')
club_dict = pd.read_csv('dict_clubs.csv', sep=';')

In [245]:
# Summarise the dtypes and non-null counts of the per-season player statistics.
player_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231379 entries, 0 to 231378
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                231379 non-null  int64  
 1   player_id         231379 non-null  int64  
 2   player_name       231379 non-null  object 
 3   season            231379 non-null  int64  
 4   league_id         231379 non-null  int64  
 5   league_name       231379 non-null  object 
 6   club_id           231379 non-null  int64  
 7   club_name         231379 non-null  object 
 8   apps              231379 non-null  int64  
 9   points_per_match  225422 non-null  float64
 10  goals             166957 non-null  float64
 11  assists           159543 non-null  float64
 12  conceded_goals    118416 non-null  float64
 13  clean_sheets      117263 non-null  float64
 14  yellow_card       190724 non-null  float64
 15  two_yellow_cards  117566 non-null  float64
 16  red_card          11

In [246]:
# Summarise the dtypes and non-null counts of the per-season club table.
# NOTE(review): the output lists an 'Unnamed: 0' column, which looks like a saved
# index that was read back as data — TODO confirm and drop it if so.
clubs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3578 entries, 0 to 3577
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       3578 non-null   int64  
 1   id               3578 non-null   int64  
 2   club_id          3578 non-null   int64  
 3   club_name        3578 non-null   object 
 4   league_id        3578 non-null   int64  
 5   season           3578 non-null   int64  
 6   matches_played   3578 non-null   int64  
 7   matches_overall  3578 non-null   int64  
 8   wins             3578 non-null   int64  
 9   draws            3578 non-null   int64  
 10  loses            3578 non-null   int64  
 11  goals_scored     3578 non-null   int64  
 12  goals_cons       3578 non-null   int64  
 13  goals_diff       3578 non-null   int64  
 14  points           3578 non-null   int64  
 15  place            3578 non-null   int64  
 16  qualified_to     1283 non-null   object 
 17  is_champion   

In [247]:
# Preview the first rows of the per-season player statistics.
player_stats.head()

Unnamed: 0,id,player_id,player_name,season,league_id,league_name,club_id,club_name,apps,points_per_match,goals,assists,conceded_goals,clean_sheets,yellow_card,two_yellow_cards,red_card,minutes_played
0,1,1,Jermaine Beckford,2011,1,Premier League,276,Everton FC,2,1.5,,,,,,,,68
1,2,1,Jermaine Beckford,2010,1,Premier League,276,Everton FC,32,1.34,8.0,2.0,,,3.0,,,1320
2,3,1,Jermaine Beckford,2018,59,Others,392,Bury FC,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
3,4,1,Jermaine Beckford,2017,59,Others,392,Bury FC,16,0.68,8.0,3.0,0.0,0.0,2.0,0.0,0.0,1303
4,5,1,Jermaine Beckford,2016,59,Others,391,Preston North End,18,0.89,1.0,3.0,0.0,0.0,0.0,0.0,2.0,543


In [248]:
# Summarise the dtypes and non-null counts of the player lookup table.
player_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11382 entries, 0 to 11381
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           11382 non-null  int64  
 1   name                         11382 non-null  object 
 2   original_name                6456 non-null   object 
 3   club_id                      11382 non-null  int64  
 4   club_name                    11382 non-null  object 
 5   position_main                11362 non-null  object 
 6   other_positions              8043 non-null   object 
 7   nationality_name             11382 non-null  object 
 8   nationality_code             11088 non-null  object 
 9   other_nationality_name       3427 non-null   object 
 10  other_nationality_code       3215 non-null   object 
 11  date_of_birth                11362 non-null  object 
 12  place_of_birth_name          11278 non-null  object 
 13  place_of_birth_c

In [249]:
# Summarise the dtypes and non-null counts of the club lookup table.
club_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9381 entries, 0 to 9380
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             9381 non-null   int64  
 1   name           9381 non-null   object 
 2   off_name       390 non-null    object 
 3   country_id     390 non-null    object 
 4   city           517 non-null    object 
 5   stadium        517 non-null    object 
 6   stadium_id     517 non-null    float64
 7   is_first_team  9381 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 586.4+ KB


In [250]:
# Preview the first rows of the club lookup table.
club_dict.head()

Unnamed: 0,id,name,off_name,country_id,city,stadium,stadium_id,is_first_team
0,1,Bayern Munich,Bayern Munich,DEU,,,,1
1,2,Bay. Leverkusen,Bayer 04 Leverkusen,DEU,,,,1
2,3,Hamburger SV,Hamburger SV,DEU,,,,1
3,4,1860 Munich,TSV 1860 Munich,DEU,,,,1
4,5,1.FC K'lautern,1.FC Kaiserslautern,DEU,,,,1


In [251]:
# Preview the first rows of the player lookup table.
player_dict.head()

Unnamed: 0,id,name,original_name,club_id,club_name,position_main,other_positions,nationality_name,nationality_code,other_nationality_name,...,place_of_birth_country_code,foot,height,player_agent,joined,contract_until,outfiter,last_extention,contract_options,current_market_value
0,1,Jermaine Beckford,Jermaine Paul Alexander Beckford,289,Bolton,Centre-Forward,Right Winger,Jamaica,JAM,England,...,GBR,right,188.0,,2017-07-01,2019-06-30,,,,500000.0
1,2,Harry Charsley,Henry William James Charsley,289,Bolton,Central Midfield,"Right Midfield, Left Midfield",Ireland,IRL,England,...,GBR,right,,,2014-07-01,2019-06-30,,2016-07-15,,250000.0
2,3,Mark Davies,Mark Nicholas Davies,289,Bolton,Central Midfield,"Attacking Midfield, Right Midfield",England,GBR,,...,GBR,right,180.0,,2017-07-01,,,,,750000.0
3,4,Alex McQuade,Alexander Michael McQuade,289,Bolton,Centre-Back,Left-Back,England,GBR,,...,GBR,left,,,2018-07-01,,,,,50000.0
4,5,Przemyslaw Kazimierczak,Przemysław Kazimierczak,289,Bolton,Goalkeeper,,Poland,POL,,...,POL,right,191.0,SOLSPORT,2018-08-08,,,,,50000.0


We can definitely drop several columns from the player_dict to help condense what we would like to see.

To combine this data with player_stats, we will really only need columns 'position_main', 'other_positions', 'nationality_name', 'nationality_code', 'date_of_birth', 'foot', 'height'. All other data is held in player_stats in more detail, as the stats cover season to season, whereas player_dict only covers the most recent updated version of each player.

In [252]:
# Player attributes to carry forward; every other column of player_dict is discarded.
# NOTE(review): 'id' is not kept here, yet the later merge with player_stats
# uses player_dict's 'id' as a join key — TODO confirm whether it should be retained.
cols = [
    'name',
    'position_main',
    'other_positions',
    'nationality_name',
    'nationality_code',
    'date_of_birth',
    'foot',
    'height',
]

player_dict = player_dict.loc[:, cols]

player_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11382 entries, 0 to 11381
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              11382 non-null  object 
 1   position_main     11362 non-null  object 
 2   other_positions   8043 non-null   object 
 3   nationality_name  11382 non-null  object 
 4   nationality_code  11088 non-null  object 
 5   date_of_birth     11362 non-null  object 
 6   foot              10903 non-null  object 
 7   height            11046 non-null  float64
dtypes: float64(1), object(7)
memory usage: 711.5+ KB


Before we combine this data set with our player_stats data set, we need to make sure we have no null values. These occur in 'position_main', 'other_positions', 'nationality_code', 'date_of_birth', 'foot', and 'height'.

In [253]:
# Players with no secondary position listed get their main position instead.
# The filled column is assigned back explicitly: fillna(inplace=True) on
# `player_dict['other_positions']` is chained assignment on a column selection,
# which emits a FutureWarning in pandas 2.x and leaves `player_dict` unchanged
# once Copy-on-Write is enabled.
player_dict = player_dict.assign(
    other_positions=player_dict['other_positions'].fillna(player_dict['position_main'])
)
player_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11382 entries, 0 to 11381
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              11382 non-null  object 
 1   position_main     11362 non-null  object 
 2   other_positions   11362 non-null  object 
 3   nationality_name  11382 non-null  object 
 4   nationality_code  11088 non-null  object 
 5   date_of_birth     11362 non-null  object 
 6   foot              10903 non-null  object 
 7   height            11046 non-null  float64
dtypes: float64(1), object(7)
memory usage: 711.5+ KB


In [254]:
# Distinct nationality names among the rows that have no nationality code.
player_dict.loc[player_dict['nationality_code'].isna(), 'nationality_name'].unique()

array(['Bosnia-Herzegovina', "Cote d'Ivoire", 'DR Congo', 'Curacao',
       'Tahiti', 'Cape Verde', 'Kosovo', 'Korea, North', 'Palästina',
       'Chinese Taipei (Taiwan)'], dtype=object)

In [255]:
# [nationality_name, code] pairs for the nationalities whose code is missing
# in the source data.
# DR Congo is 'COD'; the previously used 'CGO' is the FIFA/IOC code of the
# Republic of the Congo, a different country.
country_codes = [['Bosnia-Herzegovina', 'BIH'], ["Cote d'Ivoire", 'CIV'], ['DR Congo', 'COD'],
                 ['Curacao', 'CUW'], ['Tahiti', 'TAH'], ['Cape Verde', 'CPV'],
                 ['Kosovo', 'KVX'], ['Korea, North', 'PRK'], ['Palästina', 'PLE'], ['Chinese Taipei (Taiwan)', 'TPE']]

# Vectorised replacement for the former country-by-row double loop: every row
# whose nationality_name appears in the table gets the table's code, and all
# other rows keep the code they already had.
player_dict = player_dict.assign(
    nationality_code=player_dict['nationality_name']
    .map(dict(country_codes))
    .fillna(player_dict['nationality_code'])
)

player_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11382 entries, 0 to 11381
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              11382 non-null  object 
 1   position_main     11362 non-null  object 
 2   other_positions   11362 non-null  object 
 3   nationality_name  11382 non-null  object 
 4   nationality_code  11382 non-null  object 
 5   date_of_birth     11362 non-null  object 
 6   foot              10903 non-null  object 
 7   height            11046 non-null  float64
dtypes: float64(1), object(7)
memory usage: 711.5+ KB


In [256]:
# Show the players that have no main position recorded.
player_dict.loc[player_dict['position_main'].isna()]

Unnamed: 0,name,position_main,other_positions,nationality_name,nationality_code,date_of_birth,foot,height
3210,Hasan Yurt,,,Turkey,TUR,1978-06-13,right,182.0
3249,Aleksandar Yordanov Aleksandrov,,,Bulgaria,BGR,1975-01-19,both,176.0
4968,Devran Ayhan,,,Turkey,TUR,1978-05-25,right,179.0
5848,Lubos Pecka,,,Czech Republic,CZE,1978-02-19,left,178.0
5997,Cristiano,,,Brazil,BRA,1981-06-03,both,175.0
6689,Serkan Bensol,,,Turkey,TUR,1973-07-01,right,182.0
7187,Gastón Curbelo,,,France,FRA,1976-04-08,right,176.0
7219,Pini Balili,,,Israel,ISR,1979-06-18,both,175.0
7509,Laurentiu Rosu,,,Romania,ROU,1975-10-26,left,175.0
7520,Goran Stavrevski,,,Macedonia,MKD,1974-01-02,right,183.0


We see that many of these players were born before 1981, with the exception of Emra Tahirovic. We can likely remove these players from our data, since many would be retired and have not played in most of the years we are interested in, namely after 2008. As for Emra Tahirovic, we find that he was a striker and has also retired, as of 2013. We could add his position to the data set, but it will be easier to just drop him with the rest of the players in this subset.

In [257]:
# Remove the players that have no main position recorded.
player_dict = player_dict.dropna(subset=['position_main'])
player_dict.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11362 entries, 0 to 11381
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              11362 non-null  object 
 1   position_main     11362 non-null  object 
 2   other_positions   11362 non-null  object 
 3   nationality_name  11362 non-null  object 
 4   nationality_code  11362 non-null  object 
 5   date_of_birth     11342 non-null  object 
 6   foot              10884 non-null  object 
 7   height            11027 non-null  float64
dtypes: float64(1), object(7)
memory usage: 798.9+ KB


In [258]:
# Show the players that have no date of birth recorded.
player_dict.loc[player_dict['date_of_birth'].isna()]

Unnamed: 0,name,position_main,other_positions,nationality_name,nationality_code,date_of_birth,foot,height
20,Karim Matmour,Right Winger,"Centre-Forward, Left Winger",Algeria,DZA,,right,181.0
3408,Mineiro,Defensive Midfield,Defensive Midfield,Brazil,BRA,,right,169.0
3947,David Odonkor,Right Winger,"Right Midfield, Left Winger",Germany,DEU,,right,172.0
5485,Carsten Ramelow,Defensive Midfield,"Centre-Back, Central Midfield",Germany,DEU,,right,186.0
7209,Faruk Namdar,Attacking Midfield,Attacking Midfield,Turkey,TUR,,both,184.0
7549,Markus Bollmann,Centre-Back,Right-Back,Germany,DEU,,right,190.0
7841,Markus Kurth,Centre-Forward,"Centre-Back, Defensive Midfield",Germany,DEU,,right,180.0
7922,Philipp Bönig,Left-Back,Left-Back,Germany,DEU,,left,175.0
8378,Daniel Halfar,Attacking Midfield,"Central Midfield, Left Winger",Germany,DEU,,left,173.0
8631,Moses Sichone,Centre-Back,Centre-Back,Zambia,ZMB,,right,187.0


In [95]:
# Re-inspect the null counts of player_stats before combining it with player_dict.
player_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231379 entries, 0 to 231378
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                231379 non-null  int64  
 1   player_id         231379 non-null  int64  
 2   player_name       231379 non-null  object 
 3   season            231379 non-null  int64  
 4   league_id         231379 non-null  int64  
 5   league_name       231379 non-null  object 
 6   club_id           231379 non-null  int64  
 7   club_name         231379 non-null  object 
 8   apps              231379 non-null  int64  
 9   points_per_match  225422 non-null  float64
 10  goals             166957 non-null  float64
 11  assists           159543 non-null  float64
 12  conceded_goals    118416 non-null  float64
 13  clean_sheets      117263 non-null  float64
 14  yellow_card       190724 non-null  float64
 15  two_yellow_cards  117566 non-null  float64
 16  red_card          11

In [96]:
# Preview the stat rows that have no goals value recorded.
player_stats.loc[player_stats['goals'].isna()].head()

Unnamed: 0,id,player_id,player_name,season,league_id,league_name,club_id,club_name,apps,points_per_match,goals,assists,conceded_goals,clean_sheets,yellow_card,two_yellow_cards,red_card,minutes_played
0,1,1,Jermaine Beckford,2011,1,Premier League,276,Everton FC,2,1.5,,,,,,,,68
16,17,2,Harry Charsley,2017,57,Europa League,276,Everton FC,1,3.0,,,,,,,,90
25,26,3,Mark Davies,2009,1,Premier League,289,Bolton Wanderers,17,0.71,,,,,1.0,,,553
26,27,3,Mark Davies,2008,1,Premier League,289,Bolton Wanderers,10,1.2,,2.0,,,3.0,,,758
51,52,6,Jaroslaw Fojut,2017,19,Ekstraklasa,423,Pogon Szczecin,17,0.94,,,,,5.0,,,1526


In [115]:
# Left-join player_dict onto player_stats so every stat row is kept.
# NOTE(review): the join keys look inconsistent with the cells above. The trimmed
# player_dict keeps neither 'id' nor (as far as its visible columns show) 'season',
# so on a fresh Restart & Run All this merge would presumably raise a KeyError.
# The execution count (In [115]) and the 35-column output suggest it was run
# against an earlier, untrimmed player_dict — TODO confirm the intended keys.
player_df = pd.merge(player_stats, player_dict, how='left', left_on=['player_id','season'], right_on=['id','season'])
player_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231379 entries, 0 to 231378
Data columns (total 35 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id_x                         231379 non-null  int64  
 1   player_id                    231379 non-null  int64  
 2   player_name                  231379 non-null  object 
 3   season                       231379 non-null  int64  
 4   league_id                    231379 non-null  int64  
 5   league_name                  231379 non-null  object 
 6   club_id_x                    231379 non-null  int64  
 7   club_name_x                  231379 non-null  object 
 8   apps                         231379 non-null  int64  
 9   points_per_match             225422 non-null  float64
 10  goals                        166957 non-null  float64
 11  assists                      159543 non-null  float64
 12  conceded_goals               118416 non-null  float64
 13 

In [116]:
# List the merged column labels; the _x / _y suffixes mark names present in both inputs.
player_df.columns

Index(['id_x', 'player_id', 'player_name', 'season', 'league_id',
       'league_name', 'club_id_x', 'club_name_x', 'apps', 'points_per_match',
       'goals', 'assists', 'conceded_goals', 'clean_sheets', 'yellow_card',
       'two_yellow_cards', 'red_card', 'minutes_played', 'id_y', 'name',
       'original_name', 'club_id_y', 'club_name_y', 'position_main',
       'other_positions', 'nationality_name', 'nationality_code',
       'date_of_birth', 'place_of_birth_name', 'place_of_birth_country_name',
       'place_of_birth_country_code', 'foot', 'height', 'joined',
       'current_market_value'],
      dtype='object')

In [126]:
# Show both club names for the rows where the two merged club ids disagree.
player_df.loc[
    player_df['club_id_x'].ne(player_df['club_id_y']),
    ['club_name_x', 'club_name_y'],
]

Unnamed: 0,club_name_x,club_name_y
0,Everton FC,Bolton
1,Everton FC,Bolton
2,Bury FC,Bolton
3,Bury FC,Bolton
4,Preston North End,Bolton
...,...,...
231372,Bayer 04 Leverkusen,Hertha BSC
231373,Reinickendorfer Füchse,Hertha BSC
231376,Southampton FC U23,Southampton
231377,Southampton FC U23,Southampton


In [None]:
# Column names from the player_dict side of the merged frame.
# NOTE(review): this cell was never executed (In [None]) and `cols` is not used by
# any visible cell afterwards; presumably these columns were meant to be dropped
# from player_df — TODO confirm.
cols = ['name','id_y','place_of_birth_name', 'place_of_birth_country_name', 'place_of_birth_country_code']

With a quick check on Wikipedia, we were able to retrieve each missing DOB and store it in the `dob_list` list of [name, date of birth] pairs (currently commented out below).

In [104]:
# dob_list = [['Henri Lansbury','1990-10-12'], 
#             ['Arturo Lupoli','1987-06-24'], 
#             ['Henrik Dalsgaard','1989-07-27'],
#             ['Andrea La Mantia','1991-05-06'],
#             ['Ferjani Sassi','1992-03-18'],
#             ['Gal Alberman','1983-04-17'],
#             ['Pavel Drsek','1976-09-22']]

In [105]:
# for i in dob_list:
#     for ind in merge_tra_pla.index:
#         if i[0] == merge_tra_pla.loc[ind,'player_name_x']:
#             merge_tra_pla.loc[ind,'date_of_birth'] = i[1]

