# ETL Project - MLS

In [337]:
# Import Dependencies
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

from sqlalchemy import create_engine
from sqlalchemy import inspect

from config import PH_User_Name
from config import PH_Password

### Extract Data

In [338]:
# Load the two raw CSV exports into DataFrames; both are decoded as ISO-8859-1.

# Player statistics: header=5 takes the column labels from row index 5 of the file.
mmls_path = 'mens_major_league_soccer.csv'
mmls_data_df = pd.read_csv(filepath_or_buffer=mmls_path, encoding="ISO-8859-1", header=5)

# 2017 salaries: the header is on the first row (pandas default).
mls_salaries_path = 'mls-salaries-2017.csv'
mls_salaries = pd.read_csv(filepath_or_buffer=mls_salaries_path, encoding="ISO-8859-1")

### Transform Data

In [339]:
# Give the 22 columns read from the CSV snake_case labels.
# The assignment is positional, so the order here must match the column order in the file.
mmls_data_df.columns = [
    'rank', 'player', 'nation', 'position', 'squad', 'age', 'born',
    'matches_played', 'starts', 'min_played',
    'goals', 'assist', 'penalty_kicks', 'pk_attempts', 'yell_card', 'red_card',
    'goals_90', 'assist_90', 'goals_assist_90', 'goals_pk_90', 'goals_assist_pk_90',
    'Matches',  # label kept as-is; this column is dropped in the player-name cleanup cell
]

mmls_data_df

Unnamed: 0,rank,player,nation,position,squad,age,born,matches_played,starts,min_played,...,penalty_kicks,pk_attempts,yell_card,red_card,goals_90,assist_90,goals_assist_90,goals_pk_90,goals_assist_pk_90,Matches
0,1,Saad Abdul-Salaam\Saad-Abdul-Salaam,us USA,DF,Sporting KC,25,1991,12,10,902,...,0,0,1,0,0.10,0.10,0.20,0.10,0.20,Matches
1,2,Mohammed Abu\Mohammed-Abu,gh GHA,MF,Columbus,25,1991,20,14,1279,...,0,0,3,0,0.00,0.00,0.00,0.00,0.00,Matches
2,3,Lalas Abubakar\Lalas-Abubakar,gh GHA,DF,Columbus,22,1994,7,6,529,...,0,0,2,1,0.17,0.00,0.17,0.17,0.17,Matches
3,4,David Accam\David-Accam,gh GHA,MFFW,Chicago,26,1990,30,24,2194,...,2,2,2,0,0.57,0.25,0.82,0.49,0.74,Matches
4,5,Danilo Acosta\Danilo-Acosta,us USA,DF,Real Salt Lake,19,1997,17,16,1408,...,0,0,2,1,0.00,0.06,0.06,0.00,0.06,Matches
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,584,Eriq Zavaleta\Eriq-Zavaleta,us USA,DF,Toronto FC,24,1992,29,27,2353,...,0,0,4,0,0.04,0.00,0.04,0.04,0.04,Matches
584,585,Ben Zemanski\Ben-Zemanski,us USA,MFDF,Portland,28,1988,7,4,439,...,0,0,1,0,0.00,0.21,0.21,0.00,0.21,Matches
585,586,Walker Zimmerman\Walker-Zimmerman,us USA,DF,FC Dallas,23,1993,22,18,1602,...,0,0,0,0,0.06,0.06,0.11,0.06,0.11,Matches
586,587,Sal Zizzo\Sal-Zizzo,us USA,DF,NY Red Bulls,29,1987,23,18,1648,...,0,0,5,2,0.00,0.11,0.11,0.00,0.11,Matches


In [340]:
# Clean up the player names and drop the columns that are no longer needed.
#
# Each raw `player` value has the form "<display name>\<hyphenated name>"
# (e.g. "Saad Abdul-Salaam\Saad-Abdul-Salaam"). Keep the part after the first
# backslash and turn its hyphens into spaces.
#
# `n` is passed by keyword (positional use was removed in pandas 2.0) and
# `expand=True` returns the parts as columns directly, so missing values stay
# missing instead of breaking a list -> DataFrame round-trip.
player_parts = mmls_data_df['player'].str.split(r'\\', n=1, expand=True)

# Drop the raw `player` column first so the cleaned one is appended as the last
# column, and drop the constant `Matches` column along with it.
mmls_data_df = (
    mmls_data_df
    .drop(columns=['player', 'Matches'])
    .assign(player=player_parts[1].str.replace('-', ' ', regex=False))
)
mmls_data_df

Unnamed: 0,rank,nation,position,squad,age,born,matches_played,starts,min_played,goals,...,penalty_kicks,pk_attempts,yell_card,red_card,goals_90,assist_90,goals_assist_90,goals_pk_90,goals_assist_pk_90,player
0,1,us USA,DF,Sporting KC,25,1991,12,10,902,1,...,0,0,1,0,0.10,0.10,0.20,0.10,0.20,Saad Abdul Salaam
1,2,gh GHA,MF,Columbus,25,1991,20,14,1279,0,...,0,0,3,0,0.00,0.00,0.00,0.00,0.00,Mohammed Abu
2,3,gh GHA,DF,Columbus,22,1994,7,6,529,1,...,0,0,2,1,0.17,0.00,0.17,0.17,0.17,Lalas Abubakar
3,4,gh GHA,MFFW,Chicago,26,1990,30,24,2194,14,...,2,2,2,0,0.57,0.25,0.82,0.49,0.74,David Accam
4,5,us USA,DF,Real Salt Lake,19,1997,17,16,1408,0,...,0,0,2,1,0.00,0.06,0.06,0.00,0.06,Danilo Acosta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,584,us USA,DF,Toronto FC,24,1992,29,27,2353,1,...,0,0,4,0,0.04,0.00,0.04,0.04,0.04,Eriq Zavaleta
584,585,us USA,MFDF,Portland,28,1988,7,4,439,0,...,0,0,1,0,0.00,0.21,0.21,0.00,0.21,Ben Zemanski
585,586,us USA,DF,FC Dallas,23,1993,22,18,1602,1,...,0,0,0,0,0.06,0.06,0.11,0.06,0.11,Walker Zimmerman
586,587,us USA,DF,NY Red Bulls,29,1987,23,18,1648,0,...,0,0,5,2,0.00,0.11,0.11,0.00,0.11,Sal Zizzo


In [341]:
# Separate `nation` into its two space-delimited codes, e.g. "us USA" -> "us" / "USA":
# `nation_2` receives the part before the first space, `nation_3` the part after it.
#
# `n` is passed by keyword (positional use was removed in pandas 2.0) and
# `expand=True` yields the two parts as columns, so a missing `nation` stays
# missing instead of breaking a list -> DataFrame round-trip.
mmls_data_df[['nation_2', 'nation_3']] = mmls_data_df['nation'].str.split(' ', n=1, expand=True)
mmls_data_df

Unnamed: 0,rank,nation,position,squad,age,born,matches_played,starts,min_played,goals,...,yell_card,red_card,goals_90,assist_90,goals_assist_90,goals_pk_90,goals_assist_pk_90,player,nation_2,nation_3
0,1,us USA,DF,Sporting KC,25,1991,12,10,902,1,...,1,0,0.10,0.10,0.20,0.10,0.20,Saad Abdul Salaam,us,USA
1,2,gh GHA,MF,Columbus,25,1991,20,14,1279,0,...,3,0,0.00,0.00,0.00,0.00,0.00,Mohammed Abu,gh,GHA
2,3,gh GHA,DF,Columbus,22,1994,7,6,529,1,...,2,1,0.17,0.00,0.17,0.17,0.17,Lalas Abubakar,gh,GHA
3,4,gh GHA,MFFW,Chicago,26,1990,30,24,2194,14,...,2,0,0.57,0.25,0.82,0.49,0.74,David Accam,gh,GHA
4,5,us USA,DF,Real Salt Lake,19,1997,17,16,1408,0,...,2,1,0.00,0.06,0.06,0.00,0.06,Danilo Acosta,us,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,584,us USA,DF,Toronto FC,24,1992,29,27,2353,1,...,4,0,0.04,0.00,0.04,0.04,0.04,Eriq Zavaleta,us,USA
584,585,us USA,MFDF,Portland,28,1988,7,4,439,0,...,1,0,0.00,0.21,0.21,0.00,0.21,Ben Zemanski,us,USA
585,586,us USA,DF,FC Dallas,23,1993,22,18,1602,1,...,0,0,0.06,0.06,0.11,0.06,0.11,Walker Zimmerman,us,USA
586,587,us USA,DF,NY Red Bulls,29,1987,23,18,1648,0,...,5,2,0.00,0.11,0.11,0.00,0.11,Sal Zizzo,us,USA


In [342]:
# Assign an explicit dtype to every column, grouped by target type.
mmls_data_df = mmls_data_df.astype({
    # whole-number columns
    **dict.fromkeys(
        ['rank', 'age', 'born', 'matches_played', 'starts', 'min_played',
         'goals', 'assist', 'penalty_kicks', 'pk_attempts', 'yell_card', 'red_card'],
        'int',
    ),
    # fractional columns
    **dict.fromkeys(
        ['goals_90', 'assist_90', 'goals_assist_90', 'goals_pk_90', 'goals_assist_pk_90'],
        'float',
    ),
    # text columns
    **dict.fromkeys(
        ['nation', 'position', 'squad', 'player', 'nation_2', 'nation_3'],
        'str',
    ),
})

mmls_data_df.head()

Unnamed: 0,rank,nation,position,squad,age,born,matches_played,starts,min_played,goals,...,yell_card,red_card,goals_90,assist_90,goals_assist_90,goals_pk_90,goals_assist_pk_90,player,nation_2,nation_3
0,1,us USA,DF,Sporting KC,25,1991,12,10,902,1,...,1,0,0.1,0.1,0.2,0.1,0.2,Saad Abdul Salaam,us,USA
1,2,gh GHA,MF,Columbus,25,1991,20,14,1279,0,...,3,0,0.0,0.0,0.0,0.0,0.0,Mohammed Abu,gh,GHA
2,3,gh GHA,DF,Columbus,22,1994,7,6,529,1,...,2,1,0.17,0.0,0.17,0.17,0.17,Lalas Abubakar,gh,GHA
3,4,gh GHA,MFFW,Chicago,26,1990,30,24,2194,14,...,2,0,0.57,0.25,0.82,0.49,0.74,David Accam,gh,GHA
4,5,us USA,DF,Real Salt Lake,19,1997,17,16,1408,0,...,2,1,0.0,0.06,0.06,0.0,0.06,Danilo Acosta,us,USA


In [343]:
# Label the unnamed seventh column of the salary file as `Player` (renamed in place on
# `mls_salaries`), then build `mls_salaries_df` without the separate name parts and position.
mls_salaries.rename({'Unnamed: 6': 'Player'}, axis='columns', inplace=True)
mls_salaries_df = mls_salaries.drop(columns=['last_name', 'first_name', 'position'])
mls_salaries_df.head()

Unnamed: 0,club,base_salary,guaranteed_compensation,Player
0,ATL,1912500.0,2297000.0,Miguel Almiron
1,ATL,65625.0,65625.0,Mikey Ambrose
2,ATL,150000.0,150000.0,Yamil Asad
3,ATL,99225.0,106573.89,Mark Bloom
4,ATL,65000.0,77400.0,Andrew Carleton


#### Create Tables

In [367]:
# Build the de-duplicated tables from the cleaned player frame.
# drop_duplicates() keeps the first occurrence of each row/value by default.
# `mmls_player` and `mmls_stats` are DataFrames; `mmls_club` and `mmls_positions`
# are single-column selections and therefore Series.
mmls_player = mmls_data_df.loc[
    :, ['player', 'born', 'age', 'nation', 'nation_2', 'nation_3']
].drop_duplicates()
mmls_club = mmls_data_df.loc[:, 'squad'].drop_duplicates()
mmls_positions = mmls_data_df.loc[:, 'position'].drop_duplicates()
mmls_stats = mmls_data_df.loc[:, ['player', 'position']].drop_duplicates()

### Load Data

In [None]:
# Create the SQLAlchemy engine for the local PostgreSQL database `ETL_Team007_Project`
# (localhost, port 5432). The URL has the form
#   postgresql://<user name>:<password>@<host>:<port>/<database>
# and the user name / password are imported from `config` rather than typed here.
engine = create_engine(
    f'postgresql://{PH_User_Name}:{PH_Password}@localhost:5432/ETL_Team007_Project'
)

In [None]:
# List the tables that currently exist in the target database.
# `Engine.table_names()` is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of names and works on old and new versions.
inspect(engine).get_table_names()

In [None]:
# Append the rows of `mls_df` to the `mls` table, without writing the DataFrame index.
# NOTE(review): `mls_df` is not assigned in any cell visible in this notebook, so this
# raises NameError on a fresh kernel — confirm which prepared frame is meant to be loaded.
mls_df.to_sql(
    name='mls',
    con=engine,
    if_exists='append',
    index=False,
)

Next Steps

Next Steps 2