## Cleaning the Data
-----

### Setup:
---

In [1]:
# Import dependencies
import pandas as pd
import datetime as dt

In [2]:
# Grab the CSV
athelete_stats_df = pd.read_csv("initial_unclean_csv/complete_unclean_data.csv", encoding="utf8")

In [3]:
# Display as dataframe
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height,weight,birthdate,nationality
0,Patrick Agyemang,Charlotte FC,MLS,Forward,33.0,2,0,3,0,8,3,0,0,6,https://a.espncdn.com/i/headshots/soccer/playe...,"6' 4""",196 lbs,2000-11-07T08:00Z,Ghana
1,David Bingham,Portland Timbers,MLS,Goalkeeper,1.0,0,0,2,0,0,0,49,6,29,https://a.espncdn.com/i/headshots/soccer/playe...,"6' 2""",183 lbs,1989-10-19T07:00Z,USA
2,Ashley Westwood,Charlotte FC,MLS,Midfielder,8.0,3,4,6,0,18,3,6,6,45,https://a.espncdn.com/i/headshots/soccer/playe...,"5' 9""",174 lbs,1990-04-01T08:00Z,England
3,Sean Davis,Nashville SC,MLS,Midfielder,54.0,0,1,0,0,11,1,18,8,21,https://a.espncdn.com/i/headshots/soccer/playe...,"6' 0""",163 lbs,1993-02-08T08:00Z,USA
4,Steve Clark,Houston Dynamo FC,MLS,Goalkeeper,12.0,0,2,5,0,0,0,90,13,40,https://a.espncdn.com/i/headshots/soccer/playe...,"6' 2""",190 lbs,1986-04-14T08:00Z,USA


### Cleaning:
---

#### Convert Height

In [4]:
# Split current heights
split_heights = athelete_stats_df["height"]
split_heights = split_heights.str.split("'", n=1, expand=True)

In [5]:
# Remove everthing but the numbers from the feet
# Convert to float for math
heights_feet = split_heights[0].astype("float")
heights_feet.head(5)

0    6.0
1    6.0
2    5.0
3    6.0
4    6.0
Name: 0, dtype: float64

In [6]:
# Remove everything but the nubmers from the inches
heights_inches = split_heights[1]

split_heights_inches = heights_inches.str.split('"', n=1, expand=True)

# Convert to float for math
split_heights_inches = split_heights_inches[0].astype("float")
heights_inches = split_heights_inches
heights_inches.head(5)

0    4.0
1    2.0
2    9.0
3    0.0
4    2.0
Name: 0, dtype: float64

In [7]:
# Converting inches to feet
inches_in_feet = []
for measurement in heights_inches:
    cur_feet = measurement/12
    inches_in_feet.append(cur_feet)
   

In [8]:
# Create dataframe with new heights colums, both in feet
split_heights_in_ft = {
    "heights_ft1": heights_feet,
    "heights_ft2": inches_in_feet
}
split_heights_in_ft_df = pd.DataFrame(split_heights_in_ft)
split_heights_in_ft_df.head(5)

Unnamed: 0,heights_ft1,heights_ft2
0,6.0,0.333333
1,6.0,0.166667
2,5.0,0.75
3,6.0,0.0
4,6.0,0.166667


In [9]:
# Adding feet and converted inches column to get new total feet measurments column with decimals
new_height = split_heights_in_ft_df["heights_ft1"] + split_heights_in_ft_df["heights_ft2"]

# Round the combined height
new_height = round(new_height,2)

In [10]:
# Display cleaned data
new_height.head(5)

0    6.33
1    6.17
2    5.75
3    6.00
4    6.17
dtype: float64

#### Convert Weight

In [11]:
# Grab uncleaned weight
uncleaned_weight = athelete_stats_df["weight"]
uncleaned_weight.head(5)

0    196 lbs
1    183 lbs
2    174 lbs
3    163 lbs
4    190 lbs
Name: weight, dtype: object

In [12]:
# Split weights
split_weights = uncleaned_weight.str.split(' ', n=1, expand=True)
split_weights.head(5)

Unnamed: 0,0,1
0,196,lbs
1,183,lbs
2,174,lbs
3,163,lbs
4,190,lbs


In [13]:
# Display cleaned data
new_weight = split_weights[0]
new_weight.head(5)

0    196
1    183
2    174
3    163
4    190
Name: 0, dtype: object

#### Convert Dates

In [14]:
# Split Current Dates
split_birthdates = athelete_stats_df["birthdate"]
split_birthdates = split_birthdates.str.split("T", n=1, expand=True)

In [15]:
# Select only the part we want to keep
split_birthdates = split_birthdates [0]
split_birthdates.head(5)

0    2000-11-07
1    1989-10-19
2    1990-04-01
3    1993-02-08
4    1986-04-14
Name: 0, dtype: object

In [16]:
# Convert the dates to datetime
split_birthdates = pd.to_datetime(split_birthdates)
split_birthdates.head(5)

0   2000-11-07
1   1989-10-19
2   1990-04-01
3   1993-02-08
4   1986-04-14
Name: 0, dtype: datetime64[ns]

In [17]:
# Check dates converted to timestamps
type (split_birthdates[0])

pandas._libs.tslibs.timestamps.Timestamp

In [18]:
# Display cleaned data
new_birthdates = split_birthdates
new_birthdates.head(5)

0   2000-11-07
1   1989-10-19
2   1990-04-01
3   1993-02-08
4   1986-04-14
Name: 0, dtype: datetime64[ns]

#### Remove "Goals Against" and "Clean Sheets" for Non-Goalies

In [20]:
# Remove goals against and clean sheets for non-goalies
new_goals_against = []
new_clean_sheets = []

for index, row in athelete_stats_df.iterrows():
    cur_position = row["position"]
    cur_goals_against = row["goals_against"]
    cur_clean_sheets = row["clean_sheets"]
    
    if cur_position == "Goalkeeper":
        new_goals_against.append(cur_goals_against)
        new_clean_sheets.append(cur_clean_sheets)
    else:
        new_goals_against.append(0)
        new_clean_sheets.append(0)


#### Creating Cleaned Dataframe

In [21]:
# Remove unclean sections from dataframe
athelete_stats_df = athelete_stats_df.drop(["birthdate"], axis=1)
athelete_stats_df = athelete_stats_df.drop(["height"], axis=1)
athelete_stats_df = athelete_stats_df.drop(["weight"], axis=1)
athelete_stats_df = athelete_stats_df.drop(["goals_against"], axis=1)
athelete_stats_df = athelete_stats_df.drop(["clean_sheets"], axis=1)
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,photo_url,nationality
0,Patrick Agyemang,Charlotte FC,MLS,Forward,33.0,2,0,3,0,8,3,0,https://a.espncdn.com/i/headshots/soccer/playe...,Ghana
1,David Bingham,Portland Timbers,MLS,Goalkeeper,1.0,0,0,2,0,0,0,49,https://a.espncdn.com/i/headshots/soccer/playe...,USA
2,Ashley Westwood,Charlotte FC,MLS,Midfielder,8.0,3,4,6,0,18,3,6,https://a.espncdn.com/i/headshots/soccer/playe...,England
3,Sean Davis,Nashville SC,MLS,Midfielder,54.0,0,1,0,0,11,1,18,https://a.espncdn.com/i/headshots/soccer/playe...,USA
4,Steve Clark,Houston Dynamo FC,MLS,Goalkeeper,12.0,0,2,5,0,0,0,90,https://a.espncdn.com/i/headshots/soccer/playe...,USA


In [22]:
# Add clean sections to dataframe
athelete_stats_df["birthdate"] = new_birthdates
athelete_stats_df["height_(ft)"] = new_height
athelete_stats_df["weight_(lbs)"] = new_weight
athelete_stats_df["goals_against"] = new_goals_against
athelete_stats_df["clean_sheets"] = new_clean_sheets
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,photo_url,nationality,birthdate,height_(ft),weight_(lbs),goals_against,clean_sheets
0,Patrick Agyemang,Charlotte FC,MLS,Forward,33.0,2,0,3,0,8,3,0,https://a.espncdn.com/i/headshots/soccer/playe...,Ghana,2000-11-07,6.33,196,0,0
1,David Bingham,Portland Timbers,MLS,Goalkeeper,1.0,0,0,2,0,0,0,49,https://a.espncdn.com/i/headshots/soccer/playe...,USA,1989-10-19,6.17,183,29,6
2,Ashley Westwood,Charlotte FC,MLS,Midfielder,8.0,3,4,6,0,18,3,6,https://a.espncdn.com/i/headshots/soccer/playe...,England,1990-04-01,5.75,174,0,0
3,Sean Davis,Nashville SC,MLS,Midfielder,54.0,0,1,0,0,11,1,18,https://a.espncdn.com/i/headshots/soccer/playe...,USA,1993-02-08,6.0,163,0,0
4,Steve Clark,Houston Dynamo FC,MLS,Goalkeeper,12.0,0,2,5,0,0,0,90,https://a.espncdn.com/i/headshots/soccer/playe...,USA,1986-04-14,6.17,190,40,13


In [23]:
# Reorder dataframe
athelete_stats_df = athelete_stats_df[["player_name",
                                       "team",
                                       "league",
                                       "position",
                                       "jersey_number",
                                       "goals",
                                       "assists",
                                       "yellow_cards",
                                       "red_cards",
                                       "shots",
                                       "shots_on_goal",
                                       "saves",
                                       "clean_sheets",
                                       "goals_against",
                                       "photo_url",
                                       "height_(ft)",
                                       "weight_(lbs)",
                                       "birthdate",
                                       "nationality"
                                       ]]
athelete_stats_df.head()

Unnamed: 0,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality
0,Patrick Agyemang,Charlotte FC,MLS,Forward,33.0,2,0,3,0,8,3,0,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.33,196,2000-11-07,Ghana
1,David Bingham,Portland Timbers,MLS,Goalkeeper,1.0,0,0,2,0,0,0,49,6,29,https://a.espncdn.com/i/headshots/soccer/playe...,6.17,183,1989-10-19,USA
2,Ashley Westwood,Charlotte FC,MLS,Midfielder,8.0,3,4,6,0,18,3,6,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.75,174,1990-04-01,England
3,Sean Davis,Nashville SC,MLS,Midfielder,54.0,0,1,0,0,11,1,18,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.0,163,1993-02-08,USA
4,Steve Clark,Houston Dynamo FC,MLS,Goalkeeper,12.0,0,2,5,0,0,0,90,13,40,https://a.espncdn.com/i/headshots/soccer/playe...,6.17,190,1986-04-14,USA


### Creating Dataframes for SQL Tables:
---

#### Creating Leagues Dataframe

In [24]:
leagues = athelete_stats_df["league"].unique()
leagues_df = pd.DataFrame(leagues)
leagues_df = leagues_df.reset_index()
leagues_df = leagues_df.rename(columns={"index":"league_id", 0:"league"})
leagues_df["league_id"] = leagues_df["league_id"]+1
leagues_df["league_id"] = ["lg" + str(league_id) for league_id in leagues_df["league_id"]]
leagues_df

Unnamed: 0,league_id,league
0,lg1,MLS
1,lg2,NWSL
2,lg3,Premier League
3,lg4,LALIGA
4,lg5,Bundesliga
5,lg6,Liga MX
6,lg7,Ligue 1
7,lg8,Serie A


#### Creating Teams Dataframe

In [25]:
teams = athelete_stats_df["team"].unique()
teams_df = pd.DataFrame(teams)
teams_df = teams_df.reset_index()
teams_df = teams_df.rename(columns={"index":"team_id", 0:"team"})
teams_df["team_id"] = teams_df["team_id"]+1
teams_df["team_id"] = ["tm" + str(team_id) for team_id in teams_df["team_id"]]
teams_df.head(5)

Unnamed: 0,team_id,team
0,tm1,Charlotte FC
1,tm2,Portland Timbers
2,tm3,Nashville SC
3,tm4,Houston Dynamo FC
4,tm5,Inter Miami CF


#### Creating Positions Dataframe

In [26]:
positions = athelete_stats_df["position"].unique()
positions_df = pd.DataFrame(positions)
positions_df = positions_df.reset_index()
positions_df = positions_df.rename(columns={"index":"position_id", 0:"position"})
positions_df["position_id"] = positions_df["position_id"]+1
positions_df["position_id"] = ["pos" + str(position_id) for position_id in positions_df["position_id"]]
positions_df

Unnamed: 0,position_id,position
0,pos1,Forward
1,pos2,Goalkeeper
2,pos3,Midfielder
3,pos4,Defender


#### Creating Nationalities Dataframe

In [27]:
nationalities = athelete_stats_df["nationality"].unique()
nationalities_df = pd.DataFrame(nationalities)
nationalities_df = nationalities_df.reset_index()
nationalities_df = nationalities_df.rename(columns={"index":"nationality_id", 0:"nationality"})
nationalities_df["nationality_id"] = nationalities_df["nationality_id"]+1
nationalities_df["nationality_id"] = ["nat" + str(nationality_id) for nationality_id in nationalities_df["nationality_id"]]
nationalities_df.head(5)

Unnamed: 0,nationality_id,nationality
0,nat1,Ghana
1,nat2,USA
2,nat3,England
3,nat4,Costa Rica
4,nat5,Mexico


In [28]:
# Fill na values for nationalities
nationalities_df["nationality"] = nationalities_df["nationality"].fillna("N/A")

#### Creating Player Data Dataframe

In [29]:
# Add player ID
player_data_df = athelete_stats_df
player_data_df = player_data_df.reset_index()
player_data_df = player_data_df.rename(columns={"index":"player_id"})
player_data_df["player_id"] = player_data_df["player_id"]+1
player_data_df["player_id"] = ["pl" + str(player_id) for player_id in player_data_df["player_id"]]
player_data_df.head(5)

Unnamed: 0,player_id,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality
0,pl1,Patrick Agyemang,Charlotte FC,MLS,Forward,33.0,2,0,3,0,8,3,0,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.33,196,2000-11-07,Ghana
1,pl2,David Bingham,Portland Timbers,MLS,Goalkeeper,1.0,0,0,2,0,0,0,49,6,29,https://a.espncdn.com/i/headshots/soccer/playe...,6.17,183,1989-10-19,USA
2,pl3,Ashley Westwood,Charlotte FC,MLS,Midfielder,8.0,3,4,6,0,18,3,6,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.75,174,1990-04-01,England
3,pl4,Sean Davis,Nashville SC,MLS,Midfielder,54.0,0,1,0,0,11,1,18,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.0,163,1993-02-08,USA
4,pl5,Steve Clark,Houston Dynamo FC,MLS,Goalkeeper,12.0,0,2,5,0,0,0,90,13,40,https://a.espncdn.com/i/headshots/soccer/playe...,6.17,190,1986-04-14,USA


In [30]:
# Add ID columns
player_data_df = pd.merge(player_data_df,teams_df, on=["team"], how="inner")
player_data_df = pd.merge(player_data_df,leagues_df, on=["league"], how="inner")
player_data_df = pd.merge(player_data_df,positions_df, on=["position"], how="inner")
player_data_df = pd.merge(player_data_df,nationalities_df, on=["nationality"], how="inner")
player_data_df

Unnamed: 0,player_id,player_name,team,league,position,jersey_number,goals,assists,yellow_cards,red_cards,...,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality,team_id,league_id,position_id,nationality_id
0,pl1,Patrick Agyemang,Charlotte FC,MLS,Forward,33.0,2,0,3,0,...,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.33,196,2000-11-07,Ghana,tm1,lg1,pos1,nat1
1,pl697,Kwadwo Opoku,CF Montréal,MLS,Forward,90.0,6,3,1,1,...,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.58,152,2001-07-13,Ghana,tm17,lg1,pos1,nat1
2,pl825,Joshua Bolma,New England Revolution,MLS,Forward,16.0,0,0,0,0,...,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.75,132,2002-04-10,Ghana,tm24,lg1,pos1,nat1
3,pl223,Eugene Ansah,FC Dallas,MLS,Forward,31.0,1,0,0,0,...,0,,,,1994-12-16,Ghana,tm27,lg1,pos1,nat1
4,pl1556,Antoine Semenyo,AFC Bournemouth,Premier League,Forward,24.0,2,0,1,0,...,0,https://resources.premierleague.com/premierlea...,6.08,172,2000-01-07,Ghana,tm44,lg3,pos1,nat1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4323,pl4032,Eljif Elmas,Napoli,Serie A,Midfielder,7.0,1,0,1,0,...,0,https://img.legaseriea.it/vimages/64e38272/ELM...,6.00,150,1999-09-24,North Macedonia,tm155,lg8,pos3,nat98
4324,pl1834,Stole Dimitrievski,Rayo Vallecano,LALIGA,Goalkeeper,1.0,0,0,1,0,...,17,https://assets.laliga.com/squad/2023/t184/p948...,6.17,183,1993-12-25,North Macedonia,tm69,lg4,pos2,nat98
4325,pl3494,Youssouf Ndayishimiye,Nice,Ligue 1,Defender,55.0,0,0,2,1,...,0,https://www.ligue1.com/-/media/Project/LFP/sha...,6.00,,1998-10-27,Burundi,tm119,lg7,pos4,nat111
4326,pl3407,Yoann Salmier,Le Havre AC,Ligue 1,Defender,22.0,0,1,1,0,...,0,https://www.ligue1.com/-/media/Project/LFP/sha...,6.17,185,1992-11-21,French Guiana,tm130,lg7,pos4,nat109


In [31]:
# Drop sections no longer needed
player_data_df = player_data_df.drop(["team"], axis=1)
player_data_df = player_data_df.drop(["league"], axis=1)
player_data_df = player_data_df.drop(["position"], axis=1)
player_data_df = player_data_df.drop(["nationality"], axis=1)

In [32]:
# Reorder dataframe
player_data_df = player_data_df[[
                                "player_id",
                                "player_name",
                                "team_id",
                                "league_id",
                                "position_id",
                                "jersey_number",
                                "goals",
                                "assists",
                                "yellow_cards",
                                "red_cards",
                                "shots",
                                "shots_on_goal",
                                "saves",
                                "clean_sheets",
                                "goals_against",
                                "photo_url",
                                "height_(ft)",
                                "weight_(lbs)",
                                "birthdate",
                                "nationality_id"
                            ]]
player_data_df.head()

Unnamed: 0,player_id,player_name,team_id,league_id,position_id,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality_id
0,pl1,Patrick Agyemang,tm1,lg1,pos1,33.0,2,0,3,0,8,3,0,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.33,196.0,2000-11-07,nat1
1,pl697,Kwadwo Opoku,tm17,lg1,pos1,90.0,6,3,1,1,51,18,2,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.58,152.0,2001-07-13,nat1
2,pl825,Joshua Bolma,tm24,lg1,pos1,16.0,0,0,0,0,1,0,0,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.75,132.0,2002-04-10,nat1
3,pl223,Eugene Ansah,tm27,lg1,pos1,31.0,1,0,0,0,8,4,0,0,0,,,,1994-12-16,nat1
4,pl1556,Antoine Semenyo,tm44,lg3,pos1,24.0,2,0,1,0,19,8,0,0,0,https://resources.premierleague.com/premierlea...,6.08,172.0,2000-01-07,nat1


In [33]:
# Complete teams dataframe
teams_df = pd.merge(teams_df,player_data_df[["team_id","league_id"]], on=["team_id"], how ="left")
teams_df = teams_df.drop_duplicates()
teams_df = teams_df.reset_index(drop=True)
teams_df

Unnamed: 0,team_id,team,league_id
0,tm1,Charlotte FC,lg1
1,tm2,Portland Timbers,lg1
2,tm3,Nashville SC,lg1
3,tm4,Houston Dynamo FC,lg1
4,tm5,Inter Miami CF,lg1
...,...,...,...
150,tm151,Frosinone,lg8
151,tm152,Empoli,lg8
152,tm153,Torino,lg8
153,tm154,Lecce,lg8


#### Correct Data Types

In [34]:
# Check data types
player_data_df.dtypes

player_id                 object
player_name               object
team_id                   object
league_id                 object
position_id               object
jersey_number            float64
goals                      int64
assists                    int64
yellow_cards               int64
red_cards                  int64
shots                      int64
shots_on_goal              int64
saves                      int64
clean_sheets               int64
goals_against              int64
photo_url                 object
height_(ft)              float64
weight_(lbs)              object
birthdate         datetime64[ns]
nationality_id            object
dtype: object

In [35]:
# Set data type for jersey numbers
jersey_int=[]
for jersey_number in player_data_df["jersey_number"]:
    
    try:
        new_type = int(jersey_number)
    except:
       new_type = -1
    
    jersey_int.append(new_type)
    
print(jersey_int)

[33, 90, 16, 31, 24, 9, 9, 22, 17, 37, 18, 11, 14, 11, 18, 14, 14, 5, 15, 4, 7, 6, 32, 25, 30, 6, 15, 4, 22, 2, 16, 15, 5, 36, 37, 11, 11, 11, 9, 9, 14, 19, 7, 32, 19, 43, 15, 16, 29, 24, 82, 59, 99, 77, 20, 2, 15, 2, 27, 23, 48, 19, 4, 1, 1, 1, 28, 1, 16, 9, 10, 10, 22, 22, 7, 37, 17, 11, 7, 5, 9, 20, 7, 36, 19, 17, 10, 7, 9, 9, 21, 9, 11, 7, 27, 9, 29, 9, 9, 20, 10, 9, 21, 10, 19, 18, 18, 24, 11, 11, 7, 10, 15, 27, 7, 10, 5, 10, 29, 10, 31, 7, 22, 19, 5, 10, 31, 18, 40, 10, 8, 16, 5, 17, 5, 8, 17, 5, 32, 5, 11, 8, 25, 10, 11, 14, 19, 32, 27, 22, 11, 7, 11, 5, 22, 8, 10, 20, 19, 16, 18, 17, 21, 37, 32, 7, 45, 5, 2, 33, 6, 15, 3, 3, 17, 25, 6, 29, 8, 15, 19, 2, 16, 6, 3, 14, 4, 22, 2, 6, 19, 5, 22, 2, 3, 14, 5, 2, 28, 31, 6, 11, 18, 17, 1, 1, 1, 13, 30, 33, 1, 23, 1, 22, 1, 21, 80, 43, 10, 9, 9, 19, 11, 10, 9, 19, 9, 9, 9, 20, 30, 19, 12, 7, 21, 9, 11, 36, 11, 12, 14, 12, 7, 11, 12, 11, 16, 11, 17, 22, 9, 11, 2, 9, 47, 10, 23, 30, 9, 27, 20, 6, 7, 26, 8, 21, 20, 51, 8, 12, 6, 23, 93, 1

In [36]:
# Set data type for weight
weight_int=[]
for weight in player_data_df["weight_(lbs)"]:
    
    try:
        new_type = int(weight)
    except:
       new_type = -1
    
    weight_int.append(new_type)
    
print(weight_int)

[196, 152, 132, -1, 172, 174, 179, 159, 141, 157, 174, 152, 159, 139, 150, 146, 152, 170, 159, 161, 152, 150, 172, -1, -1, 181, 161, 183, 168, 119, 172, 174, 201, 150, -1, 170, 170, 192, 179, 163, 174, 150, 152, 117, 150, 183, 172, 139, 152, 152, -1, 150, 179, 161, 168, 172, 163, 157, 174, 170, -1, 170, -1, 183, -1, 172, 183, 196, 161, 161, 150, 159, 159, 179, 150, 163, 159, 161, 159, 168, 146, 139, 119, 170, 157, 174, 150, 179, 179, 181, 139, 141, 132, 190, 172, 150, 161, 179, 190, 163, 159, 163, 163, 159, 152, -1, 179, 137, 152, 128, 192, 174, 150, 115, 150, 150, 172, 139, 139, 161, 150, 174, 161, 148, 161, 132, 146, 150, 146, 150, 168, 161, 152, 172, 183, 172, 168, 152, 161, 174, 148, 150, 148, 157, 150, 161, 174, 130, 139, 150, 139, 152, 170, 150, 174, 141, 150, 161, 168, 163, -1, 150, -1, 170, 174, 161, 179, 150, 159, 152, 159, 181, 181, 157, 172, 174, 170, 150, 183, 152, 150, 172, 152, 181, 161, 161, 172, 174, 161, 172, 185, 181, 170, 183, 146, 172, 185, 183, 163, 172, 192, -1, 1

In [37]:
# Replace converted sections
player_data_df["jersey_number"] = jersey_int
player_data_df["weight_(lbs)"] = weight_int

In [38]:
# Drop league ID
player_data_df = player_data_df.drop(["league_id"], axis=1)

In [39]:
# Deal with na values
player_data_df["goals"] = player_data_df["goals"].fillna(0)
player_data_df["assists"] = player_data_df["assists"].fillna(0)
player_data_df["yellow_cards"] = player_data_df["yellow_cards"].fillna(0)
player_data_df["red_cards"] = player_data_df["red_cards"].fillna(0)
player_data_df["shots"] = player_data_df["shots"].fillna(0)
player_data_df["shots_on_goal"] = player_data_df["shots_on_goal"].fillna(0)
player_data_df["saves"] = player_data_df["saves"].fillna(0)
player_data_df["clean_sheets"] = player_data_df["clean_sheets"].fillna(0)
player_data_df["goals_against"] = player_data_df["goals_against"].fillna(0)
player_data_df["photo_url"] = player_data_df["photo_url"].fillna("N/A")
player_data_df["height_(ft)"] = player_data_df["height_(ft)"].fillna(0)
player_data_df["weight_(lbs)"] = player_data_df["weight_(lbs)"].fillna(0)
player_data_df["birthdate"] = player_data_df["birthdate"].fillna(dt.date(1111,11,11))

In [40]:
# Display Player Data dataframe
player_data_df.head(10)

Unnamed: 0,player_id,player_name,team_id,position_id,jersey_number,goals,assists,yellow_cards,red_cards,shots,shots_on_goal,saves,clean_sheets,goals_against,photo_url,height_(ft),weight_(lbs),birthdate,nationality_id
0,pl1,Patrick Agyemang,tm1,pos1,33,2,0,3,0,8,3,0,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,6.33,196,2000-11-07 00:00:00,nat1
1,pl697,Kwadwo Opoku,tm17,pos1,90,6,3,1,1,51,18,2,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.58,152,2001-07-13 00:00:00,nat1
2,pl825,Joshua Bolma,tm24,pos1,16,0,0,0,0,1,0,0,0,0,https://a.espncdn.com/i/headshots/soccer/playe...,5.75,132,2002-04-10 00:00:00,nat1
3,pl223,Eugene Ansah,tm27,pos1,31,1,0,0,0,8,4,0,0,0,,0.0,-1,1994-12-16 00:00:00,nat1
4,pl1556,Antoine Semenyo,tm44,pos1,24,2,0,1,0,19,8,0,0,0,https://resources.premierleague.com/premierlea...,6.08,172,2000-01-07 00:00:00,nat1
5,pl1217,Jordan Ayew,tm59,pos1,9,1,3,4,0,21,5,1,0,0,https://resources.premierleague.com/premierlea...,6.0,174,1991-09-11 00:00:00,nat1
6,pl1947,Iñaki Williams,tm68,pos1,9,5,3,2,0,36,17,3,0,0,https://assets.laliga.com/squad/2023/t174/p197...,6.08,179,1994-06-15 00:00:00,nat1
7,pl2558,Christopher Antwi-Adjei,tm96,pos1,22,0,0,2,0,11,4,1,0,0,https://assets.bundesliga.com/player/dfl-obj-0...,5.67,159,1994-02-07 00:00:00,nat1
8,pl3424,Benjamin Tetteh,tm118,pos1,17,0,0,1,0,3,1,0,0,0,https://www.ligue1.com/-/media/Project/LFP/sha...,6.33,141,1997-07-10 00:00:00,nat1
9,pl3683,Ernest Nuamah,tm122,pos1,37,1,0,1,0,14,6,0,0,0,https://www.ligue1.com/-/media/Project/LFP/sha...,5.83,157,2003-11-01 00:00:00,nat1


In [41]:
player_data_df.dtypes

player_id          object
player_name        object
team_id            object
position_id        object
jersey_number       int64
goals               int64
assists             int64
yellow_cards        int64
red_cards           int64
shots               int64
shots_on_goal       int64
saves               int64
clean_sheets        int64
goals_against       int64
photo_url          object
height_(ft)       float64
weight_(lbs)        int64
birthdate          object
nationality_id     object
dtype: object

In [42]:
teams_df.dtypes

team_id      object
team         object
league_id    object
dtype: object

In [43]:
leagues_df.dtypes

league_id    object
league       object
dtype: object

In [44]:
positions_df.dtypes

position_id    object
position       object
dtype: object

In [45]:
nationalities_df.dtypes

nationality_id    object
nationality       object
dtype: object

### Exports:
---

In [46]:
# Export CSV 1-Player Data
player_data_df.to_csv("../2-database_building/cleaned_data_exports/complete_player_data.csv", index=False)

In [47]:
# Export CSV 2-Teams Data
teams_df.to_csv("../2-database_building/cleaned_data_exports/complete_teams.csv", index=False)

In [48]:
# Export CSV 3-Leagues Data
leagues_df.to_csv("../2-database_building/cleaned_data_exports/complete_leagues.csv", index=False)

In [49]:
# Export CSV 4-Positions Data
positions_df.to_csv("../2-database_building/cleaned_data_exports/complete_positions.csv", index=False)

In [50]:
# Export CSV 5-Nationalities Data
nationalities_df.to_csv("../2-database_building/cleaned_data_exports/complete_nationalities.csv", index=False)