# **CLEAN PLAYER DATA**

## CLEAN PLAYERS' WEEKLY DATA

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import glob
# Lift pandas' display limits so that no column or row is truncated
# when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Merge data from the chunked csv files into one dataframe, or open the complete csv file
chunk_pattern = os.path.join("Dirty data/PlayerData/Player_inChunks", "Playersdata_chunk*.csv")
# glob returns matches in arbitrary order; sorting makes the concatenated row
# order (and therefore the preview below) reproducible between runs.
chunk_paths = sorted(glob.glob(chunk_pattern))
try:
    if not chunk_paths:
        raise FileNotFoundError(f"no files match {chunk_pattern!r}")
    pl_weekly = pd.concat(map(pd.read_csv, chunk_paths), ignore_index=True)
except Exception as err:
    # Deliberate best-effort fallback to the single complete file, but say why
    # the chunked load was abandoned instead of hiding the error.
    print(f"Could not load chunked player data ({err!r}); reading the complete csv file instead")
    pl_weekly = pd.read_csv('Dirty data/playerdata_weekly.csv')
pl_weekly.head()

Unnamed: 0.1,Unnamed: 0,Age,KitNumber,BallControl,Dribbling,Marking,SlideTackle,StandTackle,Aggression,Reactions,Att.Position,Interceptions,Vision,Composure,Crossing,ShortPass,LongPass,Acceleration,Stamina,Strength,Balance,SprintSpeed,Agility,Jumping,Heading,ShotPower,Finishing,LongShots,Curve,FKAcc.,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes,Name,ObservationDate,Club,NationalTeam,FavoritePosition,Value,Wage,Height,Weight,Tackling,Reflexes,Handling,OnloanfromFCSchalke,OnloanfromHannover,Onloanfrom,OnloanfromFCIngolstadt
0,0,27,4,80,74,73,76.0,80.0,80,79,76.0,79.0,82.0,78.0,81,85,83,73,93,76,71.0,71,73.0,81.0,65,75,67,71,80.0,74,64.0,72.0,9,7,13.0,7.0,13.0,Jordan Henderson,"Oct. 2, 2017",Liverpool,England,CDM,21500000.0,115000.0,182,67,,,,,,,
1,1,27,7,72,74,18,27.0,35.0,76,74,70.0,28.0,59.0,64.0,70,68,62,86,68,77,64.0,91,67.0,78.0,67,86,68,70,68.0,54,60.0,68.0,12,7,16.0,10.0,11.0,Alexander Esswein,"Dec. 7, 2017",Hertha BSC Berlin,,RM,4200000.0,26500.0,183,85,,,,,,,
2,2,22,3,64,40,78,78.0,77.0,76,75,32.0,77.0,34.0,64.0,34,60,40,82,78,69,62.0,79,59.0,69.0,73,45,27,25,32.0,35,46.0,29.0,6,12,7.0,10.0,12.0,Ibañez,"June 10, 2021",Roma,,CB,9000000.0,26000.0,185,73,,,,,,,
3,3,28,32,85,87,39,25.0,43.0,70,74,78.0,42.0,76.0,74.0,63,80,72,78,72,50,94.0,70,90.0,38.0,28,73,69,62,79.0,59,58.0,64.0,15,15,11.0,9.0,15.0,Amin Bella-Kotchap,"Sept. 29, 2021",Eintracht Frankfurt,,CAM,15000000.0,60000.0,168,70,,,,,,,
4,4,22,43,60,54,59,62.0,64.0,68,58,39.0,60.0,53.0,54.0,56,63,60,66,71,57,65.0,55,57.0,73.0,53,60,41,35,47.0,43,49.0,36.0,5,7,14.0,9.0,10.0,George Marsh,"April 15, 2021",Tottenham Hotspur,,RB,1000000.0,9300.0,178,68,,,,,,,


In [3]:
# Drop observations from FIFA10 and FIFA 11 (to avoid problems with different pages' formats).
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not act on a slice (avoids SettingWithCopyWarning).
excluded_dates = ['Sept. 1, 2009', 'Sept. 1, 2010']
pl_weekly = pl_weekly[~pl_weekly['ObservationDate'].isin(excluded_dates)].copy()

# Drop useless columns and Composure (too many NA)
pl_weekly = pl_weekly.drop(columns=[
    'Onloanfrom', 'OnloanfromFCIngolstadt', 'OnloanfromFCSchalke', 'OnloanfromHannover',
    'KitNumber', 'Tackling', 'Handling', 'Reflexes', 'Composure',
])

# Change ObservationDate to Date Format: first rewrite the month spellings that
# do not match the '%b.' pattern (e.g. 'Sept.' -> 'Sep.', 'May' -> 'May.'),
# then parse the whole column at once.
month_fixes = {
    'Sept': 'Sep',
    'April': 'Apr.',
    'June': 'Jun.',
    'March': 'Mar.',
    'May': 'May.',
    'July': 'Jul.',
}
observation_dates = pl_weekly['ObservationDate']
for raw_month, fixed_month in month_fixes.items():
    # regex=False: plain substring replacement, independent of the pandas default
    observation_dates = observation_dates.str.replace(raw_month, fixed_month, regex=False)
pl_weekly['ObservationDate'] = pd.to_datetime(observation_dates, format='%b. %d, %Y')

# Create Dummy for Players that play for National Teams (1 when NationalTeam is filled in)
pl_weekly['PlayNational'] = np.where(pl_weekly['NationalTeam'].isna(), 0, 1)

# Reorder columns order + Sort by Date and Name (also reset index)
info_cols = [
    'ObservationDate', 'Name', 'FavoritePosition', 'Age', 'Height', 'Weight', 'Club',
    'PlayNational', 'NationalTeam', 'Value', 'Wage',
]
attribute_cols = [
    'BallControl', 'Dribbling', 'Marking', 'SlideTackle', 'StandTackle', 'Aggression',
    'Reactions', 'Att.Position', 'Interceptions', 'Vision', 'Crossing', 'ShortPass',
    'LongPass', 'Acceleration', 'Stamina', 'Strength', 'Balance', 'SprintSpeed', 'Agility',
    'Jumping', 'Heading', 'ShotPower', 'Finishing', 'LongShots', 'Curve', 'FKAcc.',
    'Penalties', 'Volleys', 'GKPositioning', 'GKDiving', 'GKHandling', 'GKKicking',
    'GKReflexes',
]
pl_weekly = pl_weekly[info_cols + attribute_cols]
pl_weekly = pl_weekly.sort_values(by=['ObservationDate', 'Name']).reset_index(drop=True)

# Change all numeric columns to nullable Integer (to avoid Float type)
numeric_cols = pl_weekly.select_dtypes(np.number).columns
pl_weekly = pl_weekly.astype({col: 'Int64' for col in numeric_cols})
pl_weekly = pl_weekly.replace(pd.NA, np.nan)

# Clean all values of attributes' columns (keeping only last 2 digits).
# Going through the nullable 'string' dtype keeps missing values missing
# instead of turning them into text that cannot be converted back to integers.
for col in attribute_cols:
    pl_weekly[col] = pl_weekly[col].astype('string').str[-2:].astype('Int64')
pl_weekly.head(10)

Unnamed: 0,ObservationDate,Name,FavoritePosition,Age,Height,Weight,Club,PlayNational,NationalTeam,Value,Wage,BallControl,Dribbling,Marking,SlideTackle,StandTackle,Aggression,Reactions,Att.Position,Interceptions,Vision,Crossing,ShortPass,LongPass,Acceleration,Stamina,Strength,Balance,SprintSpeed,Agility,Jumping,Heading,ShotPower,Finishing,LongShots,Curve,FKAcc.,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes
0,2011-09-01,Aaron Hughes,CB,31,183,72,Fulham,1,Northern Ireland,2600000,30000,63,53,80,77,80,77,70,28,83,40,45,66,57,53,62,69,66,55,57,83,72,47,33,25,52,24,44,32,11,7,5,15,10
1,2011-09-01,Aaron Hunt,LM,24,183,74,Werder Bremen,0,,4200000,15000,75,79,30,29,27,55,74,71,57,76,75,75,64,77,72,66,72,77,79,70,66,76,69,73,77,70,68,77,15,14,11,6,14
2,2011-09-01,Aaron Lennon,RM,24,165,63,Tottenham Hotspur,0,,15000000,80000,88,85,18,14,23,56,84,79,36,79,79,77,53,96,86,52,94,95,94,64,27,61,68,56,62,54,61,69,15,13,6,6,10
3,2011-09-01,Aaron McCarey,GK,18,185,74,Wolverhampton Wanderers,0,,220000,2250,15,11,11,12,12,44,46,12,17,29,11,24,19,51,41,45,51,41,44,47,13,23,13,15,18,10,12,13,65,56,61,58,59
4,2011-09-01,Aaron Ramsey,CM,20,182,76,Arsenal,0,,11500000,50000,80,77,56,61,66,57,72,73,66,80,74,82,82,76,79,67,68,75,72,63,55,73,67,74,64,70,71,70,10,6,11,5,8
5,2011-09-01,Aaron Wilbraham,ST,31,191,72,Norwich City,0,,150000,2250,54,42,22,18,19,65,60,64,23,43,48,45,38,53,48,79,56,54,33,68,75,68,66,43,42,41,72,55,11,14,13,7,15
6,2011-09-01,Aatif Chahechouhe,CAM,25,176,77,AS Nancy Lorraine,0,,160000,3000,63,61,26,17,18,39,54,56,32,61,52,56,53,58,63,41,72,61,63,54,37,38,44,40,47,55,62,39,14,6,9,7,9
7,2011-09-01,Abdelaziz Barrada,RW,22,179,73,Getafe Club de Fútbol S.A.D.,0,,1200000,5000,72,71,34,32,41,69,73,52,35,67,62,58,59,78,65,62,72,79,77,60,46,63,45,60,70,64,48,62,15,10,6,5,6
8,2011-09-01,Abdelhamid El Kaoutari,CB,21,185,73,Montpellier Hérault Sport Club,0,,4300000,10000,58,41,70,75,71,84,73,53,71,55,61,64,58,66,64,77,59,70,44,75,66,58,16,37,22,22,31,23,7,8,10,9,8
9,2011-09-01,Abdelkader Ghezzal,CF,26,183,78,Cesena,0,,4500000,20000,78,75,29,38,51,65,74,70,30,78,70,70,62,82,86,75,57,84,65,68,69,70,60,74,64,64,68,72,6,10,6,11,7


In [4]:
# Make sure the output folder exists so this cell also works on a fresh checkout
os.makedirs('Clean data', exist_ok=True)
# Save the cleaned weekly player data (tab-separated, index included)
pl_weekly.to_csv('Clean data/players_weekly.csv', sep='\t', encoding='utf-8')

## CLEAN PLAYERS' YEARLY DATA

In [5]:
# Read the raw yearly player observations and preview the first five rows
pl_yearly = pd.read_csv(filepath_or_buffer='Dirty data/Playersdata_yearly.csv')
pl_yearly.head(n=5)

Unnamed: 0,Acceleration,Age,Aggression,Agility,Att.Position,Balance,BallControl,Club,Composure,Crossing,Curve,Dribbling,FKAcc.,FavoritePosition,Finishing,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Handling,Heading,Height,Interceptions,Jumping,KitNumber,LongPass,LongShots,Marking,Name,NationalTeam,ObservationDate,Onloanfrom,OnloanfromFCIngolstadt,OnloanfromFCSchalke,OnloanfromHannover,Penalties,Reactions,Reflexes,ShortPass,ShotPower,SlideTackle,SprintSpeed,Stamina,StandTackle,Strength,Tackling,Value,Vision,Volleys,Wage,Weight
0,1273,20,1077,63.0,1440.0,74.0,963,Manchester United,,2055,25.0,931,46,CB,23,79,14.0,15.0,814,14.0,,76,192,3274.0,88.0,12,3165,35,1278,Chris Smalling,,"Sept. 1, 2010",,,,,52.0,2076,,2466,4367,75.0,1975,1278,76.0,877,,,65.0,33.0,,81
1,1369,20,555,68.0,355.0,61.0,768,A.C. Cesena,,1270,59.0,1068,49,LM,442,9,8.0,9.0,1111,13.0,,52,175,950.0,59.0,29,258,60,1045,Nicolás Gorobsov,,"Sept. 1, 2010",,,,,58.0,565,,664,59,53.0,1069,868,56.0,62,,,48.0,60.0,,68
2,90,25,660,280.0,73.0,376.0,376,Borussia Dortmund,,274,73.0,181,1167,RM,553,14,8.0,9.0,13,15.0,,46,175,66.0,964.0,16,63,64,21,Jakub Błaszczykowski,Poland,"Sept. 1, 2011",,,,,58.0,174,,274,377,42.0,192,275,340.0,467,,5000000.0,569.0,176.0,35000.0,71
3,67,26,47,77.0,48.0,83.0,62,Stade Malherbe Caen,62.0,65,52.0,60,31,LWB,50,11,15.0,9.0,14,9.0,,53,172,67.0,63.0,32,55,49,62,Chaker Alhadhur,,"Sept. 20, 2018",,,,,38.0,66,,66,56,71.0,67,76,72.0,51,,925000.0,48.0,38.0,8300.0,65
4,78,24,73,68.0,74.0,66.0,77,Hertha BSC,72.0,76,69.0,77,71,RB,72,12,15.0,7.0,16,8.0,,69,188,69.0,61.0,30,62,67,69,Marius Wolf,,"Sept. 26, 2019",,,,,52.0,77,,75,75,67.0,84,87,67.0,75,,8000000.0,74.0,62.0,43000.0,81


In [6]:
# Drop observations from FIFA10 and FIFA 11 (to avoid problems with different pages' formats).
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not act on a slice (avoids SettingWithCopyWarning).
excluded_dates = ['Sept. 1, 2009', 'Sept. 1, 2010']
pl_yearly = pl_yearly[~pl_yearly['ObservationDate'].isin(excluded_dates)].copy()

# Drop useless columns and Composure (too many NA)
pl_yearly = pl_yearly.drop(columns=[
    'Onloanfrom', 'OnloanfromFCIngolstadt', 'OnloanfromFCSchalke', 'OnloanfromHannover',
    'KitNumber', 'Tackling', 'Handling', 'Reflexes', 'Composure',
])

# Change ObservationDate to Date Format. The same month spellings as in the
# weekly data are normalised here (not only 'Sept'), so both cells accept the
# same date strings; strings that already match '%b.' are left untouched.
month_fixes = {
    'Sept': 'Sep',
    'April': 'Apr.',
    'June': 'Jun.',
    'March': 'Mar.',
    'May': 'May.',
    'July': 'Jul.',
}
observation_dates = pl_yearly['ObservationDate']
for raw_month, fixed_month in month_fixes.items():
    # regex=False: plain substring replacement, independent of the pandas default
    observation_dates = observation_dates.str.replace(raw_month, fixed_month, regex=False)
pl_yearly['ObservationDate'] = pd.to_datetime(observation_dates, format='%b. %d, %Y')

# Create Dummy for Players that play for National Teams (1 when NationalTeam is filled in)
pl_yearly['PlayNational'] = np.where(pl_yearly['NationalTeam'].isna(), 0, 1)

# Reorder columns order + Sort by Date and Name (also reset index)
info_cols = [
    'ObservationDate', 'Name', 'FavoritePosition', 'Age', 'Height', 'Weight', 'Club',
    'PlayNational', 'NationalTeam', 'Value', 'Wage',
]
attribute_cols = [
    'BallControl', 'Dribbling', 'Marking', 'SlideTackle', 'StandTackle', 'Aggression',
    'Reactions', 'Att.Position', 'Interceptions', 'Vision', 'Crossing', 'ShortPass',
    'LongPass', 'Acceleration', 'Stamina', 'Strength', 'Balance', 'SprintSpeed', 'Agility',
    'Jumping', 'Heading', 'ShotPower', 'Finishing', 'LongShots', 'Curve', 'FKAcc.',
    'Penalties', 'Volleys', 'GKPositioning', 'GKDiving', 'GKHandling', 'GKKicking',
    'GKReflexes',
]
pl_yearly = pl_yearly[info_cols + attribute_cols]
pl_yearly = pl_yearly.sort_values(by=['ObservationDate', 'Name']).reset_index(drop=True)

# Change all numeric columns to nullable Integer (to avoid Float type)
numeric_cols = pl_yearly.select_dtypes(np.number).columns
pl_yearly = pl_yearly.astype({col: 'Int64' for col in numeric_cols})
pl_yearly = pl_yearly.replace(pd.NA, np.nan)

# Clean all values of attributes' columns (keeping only last 2 digits).
# Going through the nullable 'string' dtype keeps missing values missing
# instead of turning them into text that cannot be converted back to integers.
for col in attribute_cols:
    pl_yearly[col] = pl_yearly[col].astype('string').str[-2:].astype('Int64')
pl_yearly.head(10)

Unnamed: 0,ObservationDate,Name,FavoritePosition,Age,Height,Weight,Club,PlayNational,NationalTeam,Value,Wage,BallControl,Dribbling,Marking,SlideTackle,StandTackle,Aggression,Reactions,Att.Position,Interceptions,Vision,Crossing,ShortPass,LongPass,Acceleration,Stamina,Strength,Balance,SprintSpeed,Agility,Jumping,Heading,ShotPower,Finishing,LongShots,Curve,FKAcc.,Penalties,Volleys,GKPositioning,GKDiving,GKHandling,GKKicking,GKReflexes
0,2011-09-01,Aaron Hughes,CB,31,183,72,Fulham,1,Northern Ireland,2600000,30000,63,53,80,77,80,77,70,28,83,40,45,66,57,53,62,69,66,55,57,83,72,47,33,25,52,24,44,32,11,7,5,15,10
1,2011-09-01,Aaron Hunt,LM,24,183,74,Werder Bremen,0,,4200000,15000,75,79,30,29,27,55,74,71,57,76,75,75,64,77,72,66,72,77,79,70,66,76,69,73,77,70,68,77,15,14,11,6,14
2,2011-09-01,Aaron Lennon,RM,24,165,63,Tottenham Hotspur,0,,15000000,80000,88,85,18,14,23,56,84,79,36,79,79,77,53,96,86,52,94,95,94,64,27,61,68,56,62,54,61,69,15,13,6,6,10
3,2011-09-01,Aaron McCarey,GK,18,185,74,Wolverhampton Wanderers,0,,220000,2250,15,11,11,12,12,44,46,12,17,29,11,24,19,51,41,45,51,41,44,47,13,23,13,15,18,10,12,13,65,56,61,58,59
4,2011-09-01,Aaron Ramsey,CM,20,182,76,Arsenal,0,,11500000,50000,80,77,56,61,66,57,72,73,66,80,74,82,82,76,79,67,68,75,72,63,55,73,67,74,64,70,71,70,10,6,11,5,8
5,2011-09-01,Aaron Wilbraham,ST,31,191,72,Norwich City,0,,150000,2250,54,42,22,18,19,65,60,64,23,43,48,45,38,53,48,79,56,54,33,68,75,68,66,43,42,41,72,55,11,14,13,7,15
6,2011-09-01,Aatif Chahechouhe,CAM,25,176,77,AS Nancy Lorraine,0,,160000,3000,63,61,26,17,18,39,54,56,32,61,52,56,53,58,63,41,72,61,63,54,37,38,44,40,47,55,62,39,14,6,9,7,9
7,2011-09-01,Abdelaziz Barrada,RW,22,179,73,Getafe Club de Fútbol S.A.D.,0,,1200000,5000,72,71,34,32,41,69,73,52,35,67,62,58,59,78,65,62,72,79,77,60,46,63,45,60,70,64,48,62,15,10,6,5,6
8,2011-09-01,Abdelhamid El Kaoutari,CB,21,185,73,Montpellier Hérault Sport Club,0,,4300000,10000,58,41,70,75,71,84,73,53,71,55,61,64,58,66,64,77,59,70,44,75,66,58,16,37,22,22,31,23,7,8,10,9,8
9,2011-09-01,Abdelkader Ghezzal,CF,26,183,78,Cesena,0,,4500000,20000,78,75,29,38,51,65,74,70,30,78,70,70,62,82,86,75,57,84,65,68,69,70,60,74,64,64,68,72,6,10,6,11,7


In [7]:
# Make sure the output folder exists so this cell also works on a fresh checkout
os.makedirs('Clean data', exist_ok=True)
# Save the cleaned yearly player data (tab-separated, index included)
pl_yearly.to_csv('Clean data/players_yearly.csv', sep='\t', encoding='utf-8')

# **CLEAN TEAM DATA**

## CLEANING TEAMS WEEKLY DATA

In [8]:
# Read the raw weekly team observations and preview the first five rows
tm_weekly = pd.read_csv(filepath_or_buffer='Dirty data/Teamdata_weekly.csv')
tm_weekly.head(n=5)

Unnamed: 0,TeamName,ObservationDate,RivalTeam,Attack,Midfield,Defence,TransferBudget,Speed,Dribbling,BuildupPassing,Positioning,Passing,Crossing,Shooting,Pressure,Aggression,Team Width,Defender Line,Captain,Short Free Kick,Long Free Kick,Penalties,Left Corner,Right Corner,TeamRoster,LoanedPlayers,Defensive Style,DefensiveWidth,Depth,Offensive Style,Width,Players In Box,Corners,Free Kicks,Free Kick
0,Angers SCO,"April 14, 2016",Stade Lavallois Mayenne FC,69,72,71,2400000,62.0,52.0,56.0,Organised,55.0,46.0,66.0,39.0,47.0,51.0,Cover,N'Doye,Capelle,Capelle,Ketkeophomphone,Mangani,Capelle,"['Alexandre Letellier', 'Vincent Manceau', 'Is...",[],,,,,,,,,
1,SC Freiburg,"April 10, 2017",VfB Stuttgart,73,74,71,5500000,49.0,42.0,50.0,Free Form,55.0,31.0,48.0,56.0,61.0,36.0,Cover,Frantz,Grifo,Grifo,Grifo,Grifo,Grifo,"['Alexander Schwolow', 'Lukas Kübler', 'Çağlar...","['Mats Møller Dæhli', 'Sebastian Kerk', 'Jonas...",,,,,,,,,
2,Frosinone,"Dec. 17, 2018",Perugia,73,70,72,7000000,,,,,,,,,,,,Ciofani,Ciano,Ciano,Ciano,Ciano,Ciano,"['Marco SportielloL', 'Edoardo GoldanigaL', 'L...",['Nicola Citro'],Pressure On Heavy Touch,4.0,3.0,Balanced,4.0,4.0,3.0,3.0,
3,RCD Espanyol,"Dec. 16, 2021",FC Barcelona,78,77,76,6000000,,,,,,,,,,,,Diego López,Embarba,Embarba,De Tomás,Embarba,Embarba,"['Diego López', 'Sergi Gómez', 'Leandro Cabrer...","['Matías Vargas', 'Víctor Gómez', 'Pol Lozano'...",Balanced,60.0,50.0,Balanced,60.0,4.0,3.0,3.0,
4,SC Bastia,"Dec. 3, 2015",AC Ajaccio,72,70,71,2800000,61.0,61.0,52.0,Organised,56.0,51.0,49.0,44.0,55.0,48.0,Cover,Cahuzac,Ayité,Ayité,Ayité,Ayité,Ayité,"['Jean-Louis Leca', 'Gilles Cioni', 'Sébastien...",[],,,,,,,,,


In [9]:
# Drop the observations dated 'Sept. 1, 2009' and 'Sept. 1, 2010'.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below does not act on a slice (avoids SettingWithCopyWarning).
excluded_dates = ['Sept. 1, 2009', 'Sept. 1, 2010']
tm_weekly = tm_weekly[~tm_weekly['ObservationDate'].isin(excluded_dates)].copy()

# Change ObservationDate to Date Format: first rewrite the month spellings that
# do not match the '%b.' pattern (e.g. 'Sept.' -> 'Sep.', 'May' -> 'May.'),
# then parse the whole column at once.
month_fixes = {
    'Sept': 'Sep',
    'April': 'Apr.',
    'June': 'Jun.',
    'March': 'Mar.',
    'May': 'May.',
    'July': 'Jul.',
}
observation_dates = tm_weekly['ObservationDate']
for raw_month, fixed_month in month_fixes.items():
    # regex=False: plain substring replacement, independent of the pandas default
    observation_dates = observation_dates.str.replace(raw_month, fixed_month, regex=False)
tm_weekly['ObservationDate'] = pd.to_datetime(observation_dates, format='%b. %d, %Y')

# Keep only some columns (most columns dropped have too many NA) + Sort by Date and TeamName
kept_cols = ['ObservationDate', 'TeamName', 'Attack', 'Midfield', 'Defence',
             'TransferBudget', 'RivalTeam', 'TeamRoster']
tm_weekly = tm_weekly[kept_cols]
tm_weekly = tm_weekly.sort_values(by=['ObservationDate', 'TeamName']).reset_index(drop=True)
tm_weekly.head()

Unnamed: 0,ObservationDate,TeamName,Attack,Midfield,Defence,TransferBudget,RivalTeam,TeamRoster
0,2011-09-01,1. FC Kaiserslautern,74,69,71,5000000,1. FSV Mainz 05,"['Kevin Trapp', 'Florian Dick', 'Rodnei', 'Mar..."
1,2011-09-01,1. FC Köln,78,75,73,5000000,Borussia M'gladbach,"['Michael Rensing', 'Mišo Brečko', 'SerenoL', ..."
2,2011-09-01,1. FC Nürnberg,72,73,71,3000000,Bayern München,"['Raphael Schäfer', 'Timothy Chandler', 'Phili..."
3,2011-09-01,1. FSV Mainz 05,75,73,73,5500000,1. FC Kaiserslautern,"['Heinz Müller', 'Zdeněk Pospěch', 'Bo Svensso..."
4,2011-09-01,AC Ajaccio,73,67,63,750000,Sporting Club Bastia,"['Guillermo Ochoa', 'Anthony Lippini', ""Leyti ..."


In [10]:
# Make sure the output folder exists so this cell also works on a fresh checkout
os.makedirs('Clean data', exist_ok=True)
# Save the cleaned weekly team data (tab-separated, index included)
tm_weekly.to_csv('Clean data/teams_weekly.csv', sep='\t', encoding='utf-8')

## CLEANING TEAMS YEARLY DATA

In [11]:
# Read the raw yearly team observations and preview the first five rows
tm_yearly = pd.read_csv(filepath_or_buffer='Dirty data/Teamdata_yearly.csv')
tm_yearly.head(n=5)

Unnamed: 0,TeamName,ObservationDate,RivalTeam,Attack,Midfield,Defence,TransferBudget,Speed,Dribbling,BuildupPassing,Positioning,Passing,Crossing,Shooting,Pressure,Aggression,Team Width,Defender Line,Captain,Short Free Kick,Long Free Kick,Penalties,Left Corner,Right Corner,TeamRoster,LoanedPlayers,Defensive Style,DefensiveWidth,Depth,Offensive Style,Width,Players In Box,Corners,Free Kicks,Free Kick
0,Hull City,"Aug. 30, 2013",Leeds United,72,73,70,6500000,63.0,41.0,48.0,Organised,38.0,60.0,54.0,34.0,37.0,46.0,Cover,Koren,Brady,Brady,Brady,Koren,Brady,"['Allan McGregor', 'Ahmed Elmohamady', 'James ...","['Jack Hobbs', 'Tom Cairney', 'Mark Oxley']",,,,,,,,,
1,FC Nantes,"Sept. 20, 2016",Stade Rennais FC,71,72,73,6500000,63.0,48.0,56.0,Organised,52.0,68.0,39.0,39.0,41.0,52.0,Cover,Riou,Lucas Lima,Lucas Lima,Lucas Lima,Harit,Lucas Lima,"['Rémy Riou', 'Léo Dubois', 'Oswaldo Vizcarron...",['Kolbeinn Sigþórsson'],,,,,,,,,
2,Hellas Verona,"Sept. 26, 2019",Chievo Verona,73,73,69,8000000,,,,,,,,,,,,Miguel Veloso,Miguel Veloso,Miguel Veloso,Stępiński,Miguel Veloso,Miguel Veloso,"['Marco Silvestri', 'Amir Rrahmani', 'Marash K...","['Antonino Ragusa', 'Karim Laribi', 'Luca Marr...",Balanced,6.0,5.0,Balanced,6.0,3.0,3.0,3.0,
3,Paris Saint-Germain,"Sept. 25, 2017",Olympique de Marseille,85,83,83,130000000,36.0,34.0,36.0,Free Form,34.0,45.0,58.0,68.0,55.0,55.0,Cover,Thiago Silva,Neymar,Verratti,Cavani,Neymar,Dani Alves,"['Alphonse Areola', 'Dani Alves', 'Marquinhos'...","['Grzegorz Krychowiak', 'Jesé', 'Gonçalo Guede...",,,,,,,,,
4,FC Augsburg,"Sept. 1, 2011",Bayern München,70,68,70,2000000,59.0,,55.0,Organised,49.0,57.0,55.0,47.0,58.0,56.0,Cover,Möhrle,Baier,Baier,Baier,Baier,Baier,"['Simon Jentzsch', 'Paul Verhaegh', 'Uwe Möhrl...",['Dominic Peitz'],,,,,,,,,


In [12]:
# Drop observations from FIFA10 and FIFA 11 (to stay compatible with players yearly data)
tm_yearly = tm_yearly[(tm_yearly['ObservationDate'] != 'Sept. 1, 2009') & (tm_yearly['ObservationDate'] != 'Sept. 1, 2010')] 
# Change ObservationDate to Date Format
tm_yearly['ObservationDate'] = tm_yearly['ObservationDate'].str.replace('Sept', 'Sep').map(
    lambda x: datetime.strptime(x, '%b. %d, %Y'))
# Keep only some columns (most columns dropped have too many NA) + Sort by Date and TeamName
tm_yearly = tm_yearly[['ObservationDate', 'TeamName', 'Attack', 'Midfield', 'Defence', 'TransferBudget', 'RivalTeam', 'TeamRoster']]
tm_yearly = tm_yearly.sort_values(by = ['ObservationDate', 'TeamName']).reset_index(drop = True)
tm_yearly.head(10)

Unnamed: 0,ObservationDate,TeamName,Attack,Midfield,Defence,TransferBudget,RivalTeam,TeamRoster
0,2011-09-01,1. FC Kaiserslautern,74,69,71,5000000,1. FSV Mainz 05,"['Kevin Trapp', 'Florian Dick', 'Rodnei', 'Mar..."
1,2011-09-01,1. FC Köln,78,75,73,5000000,Borussia M'gladbach,"['Michael Rensing', 'Mišo Brečko', 'SerenoL', ..."
2,2011-09-01,1. FC Nürnberg,72,73,71,3000000,Bayern München,"['Raphael Schäfer', 'Timothy Chandler', 'Phili..."
3,2011-09-01,1. FSV Mainz 05,75,73,73,5500000,1. FC Kaiserslautern,"['Heinz Müller', 'Zdeněk Pospěch', 'Bo Svensso..."
4,2011-09-01,AC Ajaccio,73,67,63,750000,Sporting Club Bastia,"['Guillermo Ochoa', 'Anthony Lippini', ""Leyti ..."
5,2011-09-01,ACF Fiorentina,81,79,76,15000000,Juventus,"['Artur Boruc', 'Mattia CassaniL', 'Alessandro..."
6,2011-09-01,AJ Auxerre,70,74,72,4000000,Paris Saint-Germain,"['Olivier Sorin', 'Dariusz Dudka', 'Adama Coul..."
7,2011-09-01,AS Nancy Lorraine,72,71,71,3500000,FC Metz,"['Damien Grégorini', 'Jordan Lotiès', 'André L..."
8,2011-09-01,AS Saint-Etienne,72,71,72,9000000,Olympique Lyonnais,"['Stéphane Ruffier', 'Albin Ebondo', 'Sylvain ..."
9,2011-09-01,Arsenal,84,82,80,30000000,Tottenham Hotspur,"['Wojciech Szczęsny', 'Bacary Sagna', 'Per Mer..."


In [13]:
# Make sure the output folder exists so this cell also works on a fresh checkout
os.makedirs('Clean data', exist_ok=True)
# Save the cleaned yearly TEAM data (tab-separated, index included).
# The target is teams_yearly.csv: writing to players_yearly.csv would overwrite
# the cleaned yearly player data with team data.
tm_yearly.to_csv('Clean data/teams_yearly.csv', sep='\t', encoding='utf-8')