## Cleaning the Data
-----

### Setup:
---

In [1]:
# Import dependencies
import pandas as pd
import datetime as dt

In [2]:
# Load the raw, uncleaned player statistics from disk
athelete_stats_df = pd.read_csv(
    "initial_unclean_csv/complete_unclean_data.csv",
    encoding="utf8",
)

In [3]:
# Preview the first five rows of the raw data
athelete_stats_df.head(n=5)

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height,weight,birthdate,nationality
0,André Silva,Real Sociedad,LALIGA,Forward,21.0,0,0,0,0,2,1,0,0,0,https://assets.laliga.com/squad/2023/t188/p165...,"6' 1""",183 lbs,1995-11-06T08:00Z,Portugal
1,Pepe Reina,Villarreal,LALIGA,Goalkeeper,1.0,0,0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t449/p843...,"6' 2""",203 lbs,1982-08-31T07:00Z,Spain
2,Javi Guerra,Valencia,LALIGA,Midfielder,8.0,3,1,2,0,18,8,2,4,16,https://assets.laliga.com/squad/2023/t191/p546...,"6' 2""",170 lbs,2003-05-13T07:00Z,Spain
3,Alejandro Pozo,Almería,LALIGA,Defender,17.0,0,0,0,0,1,1,0,0,16,https://assets.laliga.com/squad/2023/t1564/p23...,"5' 9""",152 lbs,1999-02-22T08:00Z,Spain
4,Iván Cuéllar,Mallorca,LALIGA,Goalkeeper,25.0,0,0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t181/p162...,"6' 2""",168 lbs,1984-05-27T07:00Z,Spain


### Cleaning:
---

#### Convert Height

In [4]:
# Split each height string at the first apostrophe:
# column 0 holds the feet part, column 1 holds whatever follows it
split_heights = athelete_stats_df["height"].str.split(pat="'", n=1, expand=True)

In [5]:
# Keep only the feet part of the split heights,
# cast to float so it can be used in arithmetic
heights_feet = split_heights.loc[:, 0].astype("float")
heights_feet.head()

0    6.0
1    6.0
2    6.0
3    5.0
4    6.0
Name: 0, dtype: float64

In [6]:
# Keep only the numbers from the inches part: take the text after the
# apostrophe, cut it at the first double quote, and cast what precedes the
# quote to float so it can be used in arithmetic
split_heights_inches = (
    split_heights[1]
    .str.split('"', n=1, expand=True)[0]
    .astype("float")
)
heights_inches = split_heights_inches
heights_inches.head()

0    1.0
1    2.0
2    2.0
3    9.0
4    2.0
Name: 0, dtype: float64

In [7]:
# Express every inches value as a fraction of a foot (12 inches per foot)
inches_in_feet = [inches / 12 for inches in heights_inches]
   

In [8]:
# Put the two height components side by side, both expressed in feet:
# heights_ft1 is the whole-feet part, heights_ft2 the converted inches
split_heights_in_ft = dict(heights_ft1=heights_feet, heights_ft2=inches_in_feet)
split_heights_in_ft_df = pd.DataFrame(data=split_heights_in_ft)
split_heights_in_ft_df.head()

Unnamed: 0,heights_ft1,heights_ft2
0,6.0,0.083333
1,6.0,0.166667
2,6.0,0.166667
3,5.0,0.75
4,6.0,0.166667


In [9]:
# Total height in decimal feet: whole feet plus the converted inches,
# rounded to two decimal places
new_height = (
    split_heights_in_ft_df["heights_ft1"] + split_heights_in_ft_df["heights_ft2"]
).round(2)

In [10]:
# Preview the cleaned heights (decimal feet)
new_height.head()

0    6.08
1    6.17
2    6.17
3    5.75
4    6.17
dtype: float64

#### Convert Weight

In [11]:
# Pull the raw weight strings out of the stats dataframe
uncleaned_weight = athelete_stats_df.loc[:, "weight"]
uncleaned_weight.head()

0    183 lbs
1    203 lbs
2    170 lbs
3    152 lbs
4    168 lbs
Name: weight, dtype: object

In [12]:
# Split each weight string at the first space:
# column 0 is the number, column 1 the unit text
split_weights = uncleaned_weight.str.split(pat=" ", n=1, expand=True)
split_weights.head()

Unnamed: 0,0,1
0,183,lbs
1,203,lbs
2,170,lbs
3,152,lbs
4,168,lbs


In [13]:
# Keep only the numeric part of the weight (still stored as text at this point)
new_weight = split_weights.loc[:, 0]
new_weight.head()

0    183
1    203
2    170
3    152
4    168
Name: 0, dtype: object

#### Convert Dates

In [14]:
# Split each birthdate string at the first "T":
# column 0 is the calendar date, column 1 the time portion
split_birthdates = athelete_stats_df["birthdate"].str.split(pat="T", n=1, expand=True)

In [15]:
# Keep only the calendar-date column of the split
split_birthdates = split_birthdates.loc[:, 0]
split_birthdates.head()

0    1995-11-06
1    1982-08-31
2    2003-05-13
3    1999-02-22
4    1984-05-27
Name: 0, dtype: object

In [16]:
# Parse the date strings into pandas datetimes
split_birthdates = pd.to_datetime(arg=split_birthdates)
split_birthdates.head()

0   1995-11-06
1   1982-08-31
2   2003-05-13
3   1999-02-22
4   1984-05-27
Name: 0, dtype: datetime64[ns]

In [17]:
# Confirm a single element is now a pandas Timestamp
type(split_birthdates.loc[0])

pandas._libs.tslibs.timestamps.Timestamp

In [18]:
# Name the cleaned birthdates for use when rebuilding the dataframe, then preview
new_birthdates = split_birthdates
new_birthdates.head()

0   1995-11-06
1   1982-08-31
2   2003-05-13
3   1999-02-22
4   1984-05-27
Name: 0, dtype: datetime64[ns]

#### Remove "Goals Against" and "Clean Sheets" for Non-Goalies

In [19]:
# Remove goals against and clean sheets for non-goalies
# Build the goalkeeper mask once and let pandas zero out every other row in a
# single vectorized pass instead of walking the frame with iterrows().
is_goalkeeper = athelete_stats_df["position"] == "Goalkeeper"

# Series.where keeps the value where the mask is True and substitutes 0
# elsewhere. Rows whose position is missing compare False, so they are zeroed.
# The results are converted to plain lists, matching what the loop produced.
new_goals_against = athelete_stats_df["goals_against"].where(is_goalkeeper, 0).tolist()
new_clean_sheets = athelete_stats_df["clean_sheets"].where(is_goalkeeper, 0).tolist()


#### Creating Cleaned Dataframe

In [20]:
# Drop the raw columns that are about to be replaced by their cleaned versions
athelete_stats_df = athelete_stats_df.drop(
    columns=["birthdate", "height", "weight", "goals_against", "clean_sheets"]
)
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,photo_url,nationality
0,André Silva,Real Sociedad,LALIGA,Forward,21.0,0,0,0,0,2,1,0,https://assets.laliga.com/squad/2023/t188/p165...,Portugal
1,Pepe Reina,Villarreal,LALIGA,Goalkeeper,1.0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t449/p843...,Spain
2,Javi Guerra,Valencia,LALIGA,Midfielder,8.0,3,1,2,0,18,8,2,https://assets.laliga.com/squad/2023/t191/p546...,Spain
3,Alejandro Pozo,Almería,LALIGA,Defender,17.0,0,0,0,0,1,1,0,https://assets.laliga.com/squad/2023/t1564/p23...,Spain
4,Iván Cuéllar,Mallorca,LALIGA,Goalkeeper,25.0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t181/p162...,Spain


In [21]:
# Attach the cleaned columns; dict order determines the order they are appended
cleaned_columns = {
    "birthdate": new_birthdates,
    "height_(ft)": new_height,
    "weight_(lbs)": new_weight,
    "goals_against": new_goals_against,
    "clean_sheets": new_clean_sheets,
}
athelete_stats_df = athelete_stats_df.assign(**cleaned_columns)
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,photo_url,nationality,birthdate,height_(ft),weight_(lbs),goals_against,clean_sheets
0,André Silva,Real Sociedad,LALIGA,Forward,21.0,0,0,0,0,2,1,0,https://assets.laliga.com/squad/2023/t188/p165...,Portugal,1995-11-06,6.08,183,0,0
1,Pepe Reina,Villarreal,LALIGA,Goalkeeper,1.0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t449/p843...,Spain,1982-08-31,6.17,203,0,0
2,Javi Guerra,Valencia,LALIGA,Midfielder,8.0,3,1,2,0,18,8,2,https://assets.laliga.com/squad/2023/t191/p546...,Spain,2003-05-13,6.17,170,0,0
3,Alejandro Pozo,Almería,LALIGA,Defender,17.0,0,0,0,0,1,1,0,https://assets.laliga.com/squad/2023/t1564/p23...,Spain,1999-02-22,5.75,152,0,0
4,Iván Cuéllar,Mallorca,LALIGA,Goalkeeper,25.0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t181/p162...,Spain,1984-05-27,6.17,168,0,0


In [22]:
# Put the columns back in their original order; any column not listed is dropped
stats_column_order = [
    "player_name", "team", "league", "position", "jersey_number",
    "goals", "assists", "yellow_cards", "red_cards",
    "shots", "shots_on_goal", "saves", "clean_sheets", "goals_against",
    "photo_url", "height_(ft)", "weight_(lbs)", "birthdate", "nationality",
]
athelete_stats_df = athelete_stats_df[stats_column_order]
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality
0,André Silva,Real Sociedad,LALIGA,Forward,21.0,0,0,0,0,2,1,0,0,0,https://assets.laliga.com/squad/2023/t188/p165...,6.08,183,1995-11-06,Portugal
1,Pepe Reina,Villarreal,LALIGA,Goalkeeper,1.0,0,0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t449/p843...,6.17,203,1982-08-31,Spain
2,Javi Guerra,Valencia,LALIGA,Midfielder,8.0,3,1,2,0,18,8,2,0,0,https://assets.laliga.com/squad/2023/t191/p546...,6.17,170,2003-05-13,Spain
3,Alejandro Pozo,Almería,LALIGA,Defender,17.0,0,0,0,0,1,1,0,0,0,https://assets.laliga.com/squad/2023/t1564/p23...,5.75,152,1999-02-22,Spain
4,Iván Cuéllar,Mallorca,LALIGA,Goalkeeper,25.0,0,0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t181/p162...,6.17,168,1984-05-27,Spain


### Creating Dataframes for SQL Tables:
---

#### Creating Leagues Dataframe

In [23]:
# One row per distinct league, keyed "lg1", "lg2", ... in order of first appearance
leagues = athelete_stats_df["league"].unique()
leagues_df = pd.DataFrame(
    {
        "league_id": ["lg" + str(number) for number in range(1, len(leagues) + 1)],
        "league": leagues,
    }
)
leagues_df

Unnamed: 0,league_id,league
0,lg1,LALIGA
1,lg2,Bundesliga
2,lg3,Liga MX
3,lg4,Ligue 1
4,lg5,Serie A
5,lg6,MLS
6,lg7,NWSL
7,lg8,Premier League


#### Creating Teams Dataframe

In [24]:
# One row per distinct team name, keyed "tm1", "tm2", ... in order of first appearance
teams = athelete_stats_df["team"].unique()
teams_df = pd.DataFrame(
    {
        "team_id": ["tm" + str(number) for number in range(1, len(teams) + 1)],
        "team": teams,
    }
)
teams_df.head()

Unnamed: 0,team_id,team
0,tm1,Real Sociedad
1,tm2,Villarreal
2,tm3,Valencia
3,tm4,Almería
4,tm5,Mallorca


#### Creating Positions Dataframe

In [25]:
# One row per distinct position, keyed "pos1", "pos2", ... in order of first appearance
positions = athelete_stats_df["position"].unique()
positions_df = pd.DataFrame(
    {
        "position_id": ["pos" + str(number) for number in range(1, len(positions) + 1)],
        "position": positions,
    }
)
positions_df

Unnamed: 0,position_id,position
0,pos1,Forward
1,pos2,Goalkeeper
2,pos3,Midfielder
3,pos4,Defender


#### Creating Nationalities Dataframe

In [26]:
# One row per distinct nationality, keyed "nat1", "nat2", ... in order of first appearance
nationalities = athelete_stats_df["nationality"].unique()
nationalities_df = pd.DataFrame(
    {
        "nationality_id": [
            "nat" + str(number) for number in range(1, len(nationalities) + 1)
        ],
        "nationality": nationalities,
    }
)
nationalities_df.head()

Unnamed: 0,nationality_id,nationality
0,nat1,Portugal
1,nat2,Spain
2,nat3,Guinea
3,nat4,Argentina
4,nat5,Uruguay


In [27]:
# Label a missing nationality as "N/A" in the lookup table
nationalities_df = nationalities_df.fillna({"nationality": "N/A"})

#### Creating Player Data Dataframe

In [28]:
# Give every player a "pl<N>" identifier based on its 1-based row position
player_data_df = athelete_stats_df.reset_index().rename(columns={"index": "player_id"})
player_data_df["player_id"] = [
    "pl" + str(row_number + 1) for row_number in player_data_df["player_id"]
]
player_data_df.head()

Unnamed: 0,player_id,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality
0,pl1,André Silva,Real Sociedad,LALIGA,Forward,21.0,0,0,0,0,2,1,0,0,0,https://assets.laliga.com/squad/2023/t188/p165...,6.08,183,1995-11-06,Portugal
1,pl2,Pepe Reina,Villarreal,LALIGA,Goalkeeper,1.0,0,0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t449/p843...,6.17,203,1982-08-31,Spain
2,pl3,Javi Guerra,Valencia,LALIGA,Midfielder,8.0,3,1,2,0,18,8,2,0,0,https://assets.laliga.com/squad/2023/t191/p546...,6.17,170,2003-05-13,Spain
3,pl4,Alejandro Pozo,Almería,LALIGA,Defender,17.0,0,0,0,0,1,1,0,0,0,https://assets.laliga.com/squad/2023/t1564/p23...,5.75,152,1999-02-22,Spain
4,pl5,Iván Cuéllar,Mallorca,LALIGA,Goalkeeper,25.0,0,0,0,0,0,0,0,0,0,https://assets.laliga.com/squad/2023/t181/p162...,6.17,168,1984-05-27,Spain


In [29]:
# Add ID columns
# The nationalities lookup stores missing values under the label "N/A", while
# the player rows still hold NaN. Apply the same label here first; otherwise
# the inner merge on "nationality" finds no match for those players and
# silently drops them from the dataset.
player_data_df = player_data_df.assign(
    nationality=player_data_df["nationality"].fillna("N/A")
)
expected_player_count = len(player_data_df)

# Each merge attaches the matching *_id column from its lookup table
player_data_df = pd.merge(player_data_df, teams_df, on=["team"], how="inner")
player_data_df = pd.merge(player_data_df, leagues_df, on=["league"], how="inner")
player_data_df = pd.merge(player_data_df, positions_df, on=["position"], how="inner")
player_data_df = pd.merge(player_data_df, nationalities_df, on=["nationality"], how="inner")

# Attaching IDs must neither lose nor duplicate players
assert len(player_data_df) == expected_player_count, (
    "ID merges changed the number of player rows"
)
player_data_df

Unnamed: 0,player_id,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,...,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality,team_id,league_id,position_id,nationality_id
0,pl1,André Silva,Real Sociedad,LALIGA,Forward,21.0,0,0,0,0,...,0,https://assets.laliga.com/squad/2023/t188/p165...,6.08,183,1995-11-06,Portugal,tm1,lg1,pos1,nat1
1,pl80,Bebé,Rayo Vallecano,LALIGA,Forward,10.0,2,0,1,0,...,0,https://assets.laliga.com/squad/2023/t184/p843...,6.25,183,1990-07-12,Portugal,tm8,lg1,pos1,nat1
2,pl368,João Félix,Barcelona,LALIGA,Forward,14.0,1,2,1,0,...,0,https://assets.laliga.com/squad/2023/t178/p428...,5.92,152,1999-11-10,Portugal,tm19,lg1,pos1,nat1
3,pl731,Gonçalo Paciência,VfL Bochum,Bundesliga,Forward,9.0,1,0,2,0,...,0,https://assets.bundesliga.com/player/dfl-obj-0...,6.00,179,1994-08-01,Portugal,tm35,lg2,pos1,nat1
4,pl949,Tiago Tomas,VfL Wolfsburg,Bundesliga,Forward,11.0,1,0,1,0,...,0,https://assets.bundesliga.com/player/dfl-obj-j...,5.92,150,2002-06-16,Portugal,tm36,lg2,pos1,nat1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4323,pl3145,Steven Sserwadda,New York Red Bulls,MLS,Midfielder,65.0,0,0,0,0,...,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.42,161,2002-08-28,Uganda,tm122,lg6,pos3,nat109
4324,pl3466,Ibrahim Kasule,New York Red Bulls,MLS,Midfielder,31.0,0,0,0,0,...,0,,,,2004-02-17,Uganda,tm122,lg6,pos3,nat109
4325,pl3548,Wang Shuang,Racing Louisville FC,NWSL,Midfielder,77.0,2,2,2,0,...,0,https://d2nkt8hgeld8zj.cloudfront.net\images\2...,5.42,128,1995-01-23,China PR,tm126,lg7,pos3,nat111
4326,pl4160,Shandon Baptiste,Brentford,Premier League,Midfielder,26.0,0,0,0,0,...,0,https://resources.premierleague.com/premierlea...,5.75,148,1998-04-08,Grenada,tm154,lg8,pos3,nat114


In [30]:
# The descriptive columns are now represented by their *_id keys, so remove them
player_data_df = player_data_df.drop(
    columns=["team", "league", "position", "nationality"]
)

In [31]:
# Arrange the player table columns: identifiers first, then stats, then profile details
player_column_order = [
    "player_id", "player_name", "team_id", "league_id", "position_id",
    "jersey_number", "goals", "assists", "yellow_cards", "red_cards",
    "shots", "shots_on_goal", "saves", "clean_sheets", "goals_against",
    "photo_url", "height_(ft)", "weight_(lbs)", "birthdate", "nationality_id",
]
player_data_df = player_data_df[player_column_order]
player_data_df.head()

Unnamed: 0,player_id,player_name,team_id,league_id,position_id,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality_id
0,pl1,André Silva,tm1,lg1,pos1,21.0,0,0,0,0,2,1,0,0,0,https://assets.laliga.com/squad/2023/t188/p165...,6.08,183,1995-11-06,nat1
1,pl80,Bebé,tm8,lg1,pos1,10.0,2,0,1,0,9,4,0,0,0,https://assets.laliga.com/squad/2023/t184/p843...,6.25,183,1990-07-12,nat1
2,pl368,João Félix,tm19,lg1,pos1,14.0,1,2,1,0,24,10,1,0,0,https://assets.laliga.com/squad/2023/t178/p428...,5.92,152,1999-11-10,nat1
3,pl731,Gonçalo Paciência,tm35,lg2,pos1,9.0,1,0,2,0,10,3,0,0,0,https://assets.bundesliga.com/player/dfl-obj-0...,6.0,179,1994-08-01,nat1
4,pl949,Tiago Tomas,tm36,lg2,pos1,11.0,1,0,1,0,13,6,0,0,0,https://assets.bundesliga.com/player/dfl-obj-j...,5.92,150,2002-06-16,nat1


In [32]:
# Attach each team's league_id by looking it up from the player rows;
# the merge yields one row per player, so collapse the repeats afterwards
teams_df = (
    teams_df
    .merge(player_data_df[["team_id", "league_id"]], on=["team_id"], how="left")
    .drop_duplicates()
    .reset_index(drop=True)
)
teams_df

Unnamed: 0,team_id,team,league_id
0,tm1,Real Sociedad,lg1
1,tm2,Villarreal,lg1
2,tm3,Valencia,lg1
3,tm4,Almería,lg1
4,tm5,Mallorca,lg1
...,...,...,...
150,tm151,Burnley,lg8
151,tm152,Chelsea,lg8
152,tm153,Crystal Palace,lg8
153,tm154,Brentford,lg8


#### Correct Data Types

In [33]:
# Check data types
# Inspect the dtype of every player_data_df column to see which ones
# still need converting
player_data_df.dtypes

player_id                 object
player_name               object
team_id                   object
league_id                 object
position_id               object
jersey_number            float64
goals                      int64
assists                    int64
yellow_cards               int64
red_cards                  int64
shots                      int64
shots_on_goal              int64
saves                      int64
clean_sheets               int64
goals_against              int64
photo_url                 object
height_(ft)              float64
weight_(lbs)              object
birthdate         datetime64[ns]
nationality_id            object
dtype: object

In [34]:
# Set data type for jersey numbers
# Values that cannot be cast to int are recorded with the sentinel -1.
jersey_int = []
for jersey_number in player_data_df["jersey_number"]:
    try:
        new_type = int(jersey_number)
    except (TypeError, ValueError, OverflowError):
        # Catch only conversion failures (int(NaN) raises ValueError); a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        new_type = -1

    jersey_int.append(new_type)

print(jersey_int)

[21, 10, 14, 9, 11, 9, 19, 9, 67, 10, 47, 20, 70, 16, 14, 28, 20, 7, 9, 12, 3, 6, 2, 4, 22, 24, 22, 13, 13, 6, 14, 3, 22, 24, 20, 17, 3, 10, 12, 14, 18, 15, 17, 10, 29, 20, 8, 10, 7, 21, 26, 20, 27, 8, 21, 25, 13, 13, 31, 1, 1, 12, 1, 9, 10, 7, 7, 15, 27, 30, 9, 16, 36, 10, 9, 19, 9, 10, 7, 20, 12, 11, 29, 22, 19, 34, 9, 21, 7, 19, 16, 19, 32, 9, 22, 33, 14, 10, 9, 38, 29, 10, 7, 7, 19, 14, 11, 20, 23, 24, 30, 31, 9, 7, 19, 7, 37, 27, 38, 15, 11, 17, 9, 42, 32, 48, 33, 7, 11, 77, 21, 9, 9, 19, 9, 9, 11, 31, 11, 6, 24, 2, 3, 20, 26, 3, 17, 18, 5, 37, 26, 14, 21, 6, 3, 29, 31, 34, 17, 3, 15, 21, 20, 18, 27, 11, 21, 3, 6, 29, 27, 31, 16, 4, 3, 27, 18, 17, 5, 15, 23, 3, 4, 24, 19, 32, 27, 5, 4, 3, 23, 2, 22, 20, 3, 5, 15, 30, 3, 22, 17, 28, 16, 14, 5, 4, 15, 12, 28, 33, 32, 34, 6, 2, 20, 15, 2, 3, 24, 34, 20, 4, 3, 23, 28, 32, 16, 18, 21, 27, 3, 15, 4, 5, 2, 24, 12, 28, 25, 15, 5, 11, 3, 4, 4, 14, 6, 3, 2, 15, 31, 17, 20, 5, 3, 39, 33, 3, 2, 7, 27, 16, 39, 26, 2, 45, 20, 20, 3, 14, 74, 22,

In [35]:
# Set data type for weight
# Values that cannot be cast to int are recorded with the sentinel -1.
weight_int = []
for weight in player_data_df["weight_(lbs)"]:
    try:
        new_type = int(weight)
    except (TypeError, ValueError, OverflowError):
        # Catch only conversion failures (int(NaN) raises ValueError); a bare
        # except would also swallow KeyboardInterrupt and SystemExit.
        new_type = -1

    weight_int.append(new_type)

print(weight_int)

[183, 183, 152, 179, 150, 172, 161, 161, -1, 179, 161, 185, -1, 150, 192, 183, 150, 137, 163, 150, 150, 174, 161, 152, 157, 146, -1, 170, 172, 148, 148, 168, 148, 159, 168, 148, 163, 137, 163, 183, 137, 183, 139, 139, 161, 152, 159, -1, 168, 183, 183, 139, 172, 150, 128, 181, 172, 201, 192, 179, 183, 183, 183, 159, 172, 163, 170, 152, -1, -1, 152, 137, -1, 146, 179, -1, 190, 141, 161, 190, 170, 148, -1, 159, 159, 150, 157, 161, 161, 172, 174, 183, -1, 161, 161, 161, 174, 159, 190, 174, 161, 148, 163, 172, 161, 190, 150, 152, 152, 146, -1, -1, 157, 168, 152, 170, 170, -1, -1, 172, 150, 150, -1, 159, -1, 174, 150, 174, 168, 161, -1, 141, 168, 181, 181, 119, 130, 146, 190, 157, 174, 146, 150, 170, 174, 161, 132, 141, 163, -1, 137, 146, 168, 137, 172, -1, 139, -1, 152, 172, 152, 172, 161, 190, -1, 137, 174, 139, 161, -1, -1, -1, 130, 181, 150, -1, 170, 172, 172, 152, 179, 172, 168, -1, 139, -1, -1, 170, 172, 159, 163, 152, 170, 150, 183, 170, 159, -1, 168, 163, 152, 172, 157, 181, 172, 190

In [36]:
# Swap in the integer versions of jersey number and weight (column positions unchanged)
player_data_df = player_data_df.assign(
    **{"jersey_number": jersey_int, "weight_(lbs)": weight_int}
)

In [37]:
# Remove league_id from the player table
player_data_df = player_data_df.drop(columns=["league_id"])

In [38]:
# Replace missing values column by column with a fixed placeholder
na_fill_values = {
    "goals": 0,
    "assists": 0,
    "yellow_cards": 0,
    "red_cards": 0,
    "shots": 0,
    "shots_on_goal": 0,
    "saves": 0,
    "clean_sheets": 0,
    "goals_against": 0,
    "photo_url": "N/A",
    "height_(ft)": 0,
    "weight_(lbs)": 0,
    # Placeholder date for an unknown birthdate; it is a plain datetime.date,
    # not a pandas Timestamp
    "birthdate": dt.date(1111,11,11),
}
for column_name, fill_value in na_fill_values.items():
    player_data_df[column_name] = player_data_df[column_name].fillna(fill_value)

In [39]:
# Preview the first ten rows of the finished player table
player_data_df.head(n=10)

Unnamed: 0,player_id,player_name,team_id,position_id,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality_id
0,pl1,André Silva,tm1,pos1,21,0,0,0,0,2,1,0,0,0,https://assets.laliga.com/squad/2023/t188/p165...,6.08,183,1995-11-06 00:00:00,nat1
1,pl80,Bebé,tm8,pos1,10,2,0,1,0,9,4,0,0,0,https://assets.laliga.com/squad/2023/t184/p843...,6.25,183,1990-07-12 00:00:00,nat1
2,pl368,João Félix,tm19,pos1,14,1,2,1,0,24,10,1,0,0,https://assets.laliga.com/squad/2023/t178/p428...,5.92,152,1999-11-10 00:00:00,nat1
3,pl731,Gonçalo Paciência,tm35,pos1,9,1,0,2,0,10,3,0,0,0,https://assets.bundesliga.com/player/dfl-obj-0...,6.0,179,1994-08-01 00:00:00,nat1
4,pl949,Tiago Tomas,tm36,pos1,11,1,0,1,0,13,6,0,0,0,https://assets.bundesliga.com/player/dfl-obj-j...,5.92,150,2002-06-16 00:00:00,nat1
5,pl1848,Goncalo Ramos,tm59,pos1,9,2,0,0,0,19,5,0,0,0,https://www.ligue1.com/-/media/Project/LFP/sha...,6.08,172,2001-06-20 00:00:00,nat1
6,pl1974,Diego Moreira,tm61,pos1,19,0,0,0,0,2,1,0,0,0,https://www.ligue1.com/-/media/Project/LFP/sha...,5.83,161,2004-08-06 00:00:00,nat1
7,pl1881,Vitinha,tm68,pos1,9,2,0,1,0,14,4,1,0,0,https://www.ligue1.com/-/media/Project/LFP/sha...,5.83,161,2000-03-15 00:00:00,nat1
8,pl2623,João Gabriel E Costa Cesco,tm80,pos1,67,0,0,0,0,0,0,0,0,0,https://img.legaseriea.it/vimages/,5.83,-1,2005-03-28 00:00:00,nat1
9,pl2364,Rafael Leão,tm81,pos1,10,3,3,1,0,21,7,0,0,0,https://img.legaseriea.it/vimages/64e4967d/LEA...,6.17,179,1999-06-10 00:00:00,nat1


In [40]:
# Re-check the player table's column dtypes after the conversions and NA fills
player_data_df.dtypes

player_id          object
player_name        object
team_id            object
position_id        object
jersey_number       int64
goals               int64
assists             int64
yellow_cards        int64
red_cards           int64
shots               int64
shots_on_goal       int64
saves               int64
clean_sheets        int64
goals_against       int64
photo_url          object
height_(ft)       float64
weight_(lbs)        int64
birthdate          object
nationality_id     object
dtype: object

In [41]:
# Check the teams table's column dtypes
teams_df.dtypes

team_id      object
team         object
league_id    object
dtype: object

In [42]:
# Check the leagues table's column dtypes
leagues_df.dtypes

league_id    object
league       object
dtype: object

In [43]:
# Check the positions table's column dtypes
positions_df.dtypes

position_id    object
position       object
dtype: object

In [44]:
# Check the nationalities table's column dtypes
nationalities_df.dtypes

nationality_id    object
nationality       object
dtype: object

### Exports:
---

In [45]:
# Export CSV 1 - player data, written without the dataframe index
player_data_df.to_csv(
    path_or_buf="../2-database_building/cleaned_data_exports/complete_player_data.csv",
    index=False,
)

In [46]:
# Export CSV 2 - teams data, written without the dataframe index
teams_df.to_csv(
    path_or_buf="../2-database_building/cleaned_data_exports/complete_teams.csv",
    index=False,
)

In [47]:
# Export CSV 3 - leagues data, written without the dataframe index
leagues_df.to_csv(
    path_or_buf="../2-database_building/cleaned_data_exports/complete_leagues.csv",
    index=False,
)

In [48]:
# Export CSV 4 - positions data, written without the dataframe index
positions_df.to_csv(
    path_or_buf="../2-database_building/cleaned_data_exports/complete_positions.csv",
    index=False,
)

In [49]:
# Export CSV 5 - nationalities data, written without the dataframe index
nationalities_df.to_csv(
    path_or_buf="../2-database_building/cleaned_data_exports/complete_nationalities.csv",
    index=False,
)