### DEPENDENCIES

In [1]:
import pandas as pd
import requests
import json
import csv
import os
from sqlalchemy import create_engine
import config

### Extract CSVs into DataFrames

In [2]:
#https://www.kaggle.com/abecklas/fifa-world-cup/version/5#WorldCups.csv
#All the countries that have hosted the world cup from the first one until 2014
#world = "\\Users\dharti.patel\Documents\Repositories\ETL-project\Resources\WorldCups.csv"
world = "Resources/WorldCups.csv"
world_df = pd.read_csv(world)
world_df

Unnamed: 0,Year,Country,Winner,Runners-Up,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819.810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893.172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1.563.135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1.603.975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1.865.753


In [3]:
#https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv 
#"Every player available in FIFA 15, 16, 17, 18, 19, and also FIFA 20"
players = "Resources/players_20.csv"
players_df = pd.read_csv(players)
players_df

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,32,6/24/1987,170,72,Argentina,FC Barcelona,...,68+2,66+2,66+2,66+2,68+2,63+2,52+2,52+2,52+2,63+2
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,2/5/1985,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,53+3,53+3,53+3,61+3
2,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Junior,27,2/5/1992,175,68,Brazil,Paris Saint-Germain,...,66+3,61+3,61+3,61+3,66+3,61+3,46+3,46+3,46+3,61+3
3,200389,https://sofifa.com/player/200389/jan-oblak/20/...,J. Oblak,Jan Oblak,26,1/7/1993,188,87,Slovenia,Atlético Madrid,...,,,,,,,,,,
4,183277,https://sofifa.com/player/183277/eden-hazard/2...,E. Hazard,Eden Hazard,28,1/7/1991,175,74,Belgium,Real Madrid,...,66+3,63+3,63+3,63+3,66+3,61+3,49+3,49+3,49+3,61+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18273,245006,https://sofifa.com/player/245006/shuai-shao/20...,Shao Shuai,邵帅,22,3/10/1997,186,79,China PR,Beijing Renhe FC,...,43+2,42+2,42+2,42+2,43+2,45+2,46+2,46+2,46+2,45+2
18274,250995,https://sofifa.com/player/250995/mingjie-xiao/...,Xiao Mingjie,Mingjie Xiao,22,1/1/1997,177,66,China PR,Shanghai SIPG FC,...,44+2,43+2,43+2,43+2,44+2,46+2,47+2,47+2,47+2,46+2
18275,252332,https://sofifa.com/player/252332/wei-zhang/20/...,Zhang Wei,张威,19,5/16/2000,186,75,China PR,Hebei China Fortune FC,...,47+2,49+2,49+2,49+2,47+2,47+2,49+2,49+2,49+2,47+2
18276,251110,https://sofifa.com/player/251110/haijian-wang/...,Wang Haijian,汪海健,18,8/2/2000,185,74,China PR,Shanghai Greenland Shenhua FC,...,48+2,48+2,48+2,48+2,48+2,48+2,49+2,49+2,49+2,48+2


In [4]:
url = pd.read_html("https://en.wikipedia.org/wiki/FIFA_Club_World_Cup")
url

[                          0                          1
 0                       NaN                        NaN
 1                   Founded         2000; 20 years ago
 2                    Region       International (FIFA)
 3           Number of teams  7 (from 6 confederations)
 4         Current champions       Liverpool(1st title)
 5   Most successful club(s)      Real Madrid(4 titles)
 6   Television broadcasters       List of broadcasters
 7                   Website           Official website
 8  2020 FIFA Club World Cup   2020 FIFA Club World Cup,
    0                                                  1  2
 0  “  The CSF is the entity in charge of controlling...  ”,
     0                                     1  \
 0 NaN       Match was won during extra time   
 1 NaN  Match was won on a penalty shoot-out   
 
                                       2                                     3  
 0       Match was won during extra time       Match was won during extra time  
 1  Match 

### Transform World Cup DataFrame

In [5]:
world_df[['Year','Country']]

Unnamed: 0,Year,Country
0,1930,Uruguay
1,1934,Italy
2,1938,France
3,1950,Brazil
4,1954,Switzerland
5,1958,Sweden
6,1962,Chile
7,1966,England
8,1970,Mexico
9,1974,Germany


In [6]:
#Duplicating year 2002 when two countries hosted the World Cup
dup = world_df['Country'] == 'Korea/Japan'
dup_try = world_df[dup]
world_df_dup=world_df.append([dup_try]*1,ignore_index=True)
world_df_dup

Unnamed: 0,Year,Country,Winner,Runners-Up,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819.810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893.172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1.563.135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1.603.975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1.865.753


In [7]:
#Segregating the year when Korea and Japan hosted the World Cup by renaming each row
world_df_dup.loc[16,'Country'] = 'Korea Republic'
world_df_dup.loc[20,'Country'] = 'Japan'
world_df_dup

Unnamed: 0,Year,Country,Winner,Runners-Up,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819.810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893.172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1.563.135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1.603.975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1.865.753


In [8]:
#Checking the Country Names
world_df_dup['Country'].unique()

array(['Uruguay', 'Italy', 'France', 'Brazil', 'Switzerland', 'Sweden',
       'Chile', 'England', 'Mexico', 'Germany', 'Argentina', 'Spain',
       'USA', 'Korea Republic', 'South Africa', 'Japan'], dtype=object)

In [9]:
world_df_dup.columns = map(str.lower, world_df_dup.columns)
world_df_dup = world_df_dup.rename(columns={"runners-up": "runners_up", "goalsscored": "goals_scored", "qualifiedteams" : "qualified_teams","matchesplayed":"matches_played"})
world_df_dup

Unnamed: 0,year,country,winner,runners_up,third,fourth,goals_scored,qualified_teams,matches_played,attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819.810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893.172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1.563.135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1.603.975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1.865.753


### Transform Players DataFrame

In [10]:
players_df.loc[players_df['nationality']=='USA',:] 

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb


In [11]:
# players_df[players_df['nationality'] == 'United States'].replace("USA")

In [12]:
players_df.loc[players_df.nationality == 'United States', 'nationality'] = "USA"
players_df['nationality'].unique()

array(['Argentina', 'Portugal', 'Brazil', 'Slovenia', 'Belgium',
       'Germany', 'Netherlands', 'Croatia', 'Egypt', 'France', 'Senegal',
       'England', 'Spain', 'Italy', 'Uruguay', 'Poland', 'Denmark',
       'Gabon', 'Korea Republic', 'Costa Rica', 'Slovakia',
       'Bosnia Herzegovina', 'Serbia', 'Scotland', 'Hungary',
       'Switzerland', 'Greece', 'Austria', 'Morocco', 'Sweden', 'Wales',
       'Colombia', 'Czech Republic', 'Chile', 'Algeria', 'Ivory Coast',
       'Togo', 'Norway', 'Mexico', 'Iceland', 'Finland', 'Jamaica',
       'Albania', 'Guinea', 'Cameroon', 'Ghana', 'Montenegro', 'Ukraine',
       'Russia', 'DR Congo', 'Central African Rep.', 'Venezuela',
       'Nigeria', 'Armenia', 'Israel', 'Ecuador', 'Paraguay', 'Australia',
       'Turkey', 'Romania', 'Japan', 'Mali', 'USA', 'Kosovo',
       'Dominican Republic', 'Tanzania', 'China PR', 'Northern Ireland',
       'Republic of Ireland', 'Tunisia', 'Cape Verde', 'FYR Macedonia',
       'Burkina Faso', 'Kenya', 'Ang

In [13]:
# Selecting only the columns that I want
players_df = players_df[["long_name", "club", "joined","nationality","sofifa_id"]]
players_df

Unnamed: 0,long_name,club,joined,nationality,sofifa_id
0,Lionel Andrés Messi Cuccittini,FC Barcelona,7/1/2004,Argentina,158023
1,Cristiano Ronaldo dos Santos Aveiro,Juventus,7/10/2018,Portugal,20801
2,Neymar da Silva Santos Junior,Paris Saint-Germain,8/3/2017,Brazil,190871
3,Jan Oblak,Atlético Madrid,7/16/2014,Slovenia,200389
4,Eden Hazard,Real Madrid,7/1/2019,Belgium,183277
...,...,...,...,...,...
18273,邵帅,Beijing Renhe FC,7/13/2018,China PR,245006
18274,Mingjie Xiao,Shanghai SIPG FC,2/1/2019,China PR,250995
18275,张威,Hebei China Fortune FC,1/1/2019,China PR,252332
18276,汪海健,Shanghai Greenland Shenhua FC,2/1/2019,China PR,251110


### Transform Clubs DataFrame

In [14]:
clubs_country = url[3]
# clubs_country = pd.DataFrame(clubs_country) # Alap: This is not needed since read_html gives you DataFrame object.
clubs_country = clubs_country[["Season", "Hosts", "Champions"]]
clubs_country = clubs_country.rename(columns={"Season": "season", "Hosts": "host", "Champions" : "club"})
clubs_country

Unnamed: 0,season,host,club
0,2000,Brazil,Corinthians[n 2]
1,2001,Spain,Tournament cancelled
2,2005,Japan,São Paulo
3,2006,Japan,Internacional
4,2007,Japan,Milan
5,2008,Japan,Manchester United
6,2009,UAE,Barcelona[n 5]
7,2010,UAE,Internazionale
8,2011,Japan,Barcelona
9,2012,Japan,Corinthians


In [15]:
# Creating a Roster of 2018 winning team and when the players join the team

# Merging DataFrames clubs_country and players_df
pyr_ctry_m = pd.merge(clubs_country, players_df, on="club")

# Filtering for 2018
pyr_ctry_m = pyr_ctry_m.loc[pyr_ctry_m["season"] == 2018,:]

# Renaming long_name header to Roster
pyr_ctry_m = pyr_ctry_m.rename(columns={"long_name" : "roster", "joined" : "joined"})
pyr_ctry_m

Unnamed: 0,season,host,club,roster,joined,nationality,sofifa_id
148,2018,UAE,Real Madrid,Eden Hazard,7/1/2019,Belgium,183277
149,2018,UAE,Real Madrid,Luka Modrić,8/1/2012,Croatia,177003
150,2018,UAE,Real Madrid,Sergio Ramos García,8/1/2005,Spain,155862
151,2018,UAE,Real Madrid,Thibaut Courtois,8/9/2018,Belgium,192119
152,2018,UAE,Real Madrid,Toni Kroos,7/17/2014,Germany,182521
153,2018,UAE,Real Madrid,Carlos Henrique Venancio Casimiro,7/11/2013,Brazil,200145
154,2018,UAE,Real Madrid,Karim Benzema,7/9/2009,France,165153
155,2018,UAE,Real Madrid,Keylor Navas,8/3/2014,Costa Rica,193041
156,2018,UAE,Real Madrid,Francisco Román Alarcón Suárez,7/3/2013,Spain,197781
157,2018,UAE,Real Madrid,Raphaël Varane,7/1/2011,France,201535


### Create database connection

In [16]:
username = config.username
password = config.password
database = config.database

rds_connection_string = f"{username}:{password}@localhost:5432/{database}"
engine = create_engine(f'postgresql://{rds_connection_string}')
engine

Engine(postgresql://postgres:***@localhost:5432/etl)

### Load DataFrames into database

In [17]:
world_df_dup.to_sql(name='world_cup', con=engine, if_exists='replace', index=False
#                    dtype={"Year": Integer(),
#                        "Country": String(),
#                        "Country": String(),
#                        "Winner": String(),
#                        "Runners-Up": String(),
#                        "Third": String(),
#                        "Fourth": String(),
#                        "GoalsScored": Integer(),
#                        "QualifiedTeams": Integer(),
#                        "MatchesPlayed": Integer(),
#                        "Attendance": String()
                   )
#with engine.connect() as con:
#    con.execute('ALTER TABLE "world_cup" ADD PRIMARY KEY ("Year");')
world=pd.read_sql_query('select * from world_cup', con=engine).head()
world

Unnamed: 0,year,country,winner,runners_up,third,fourth,goals_scored,qualified_teams,matches_played,attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607


In [18]:
players_df.to_sql(name='players', con=engine, if_exists='replace', index=False)
players=pd.read_sql_query('select * from players', con=engine).head()
players

Unnamed: 0,long_name,club,joined,nationality,sofifa_id
0,Lionel Andrés Messi Cuccittini,FC Barcelona,7/1/2004,Argentina,158023
1,Cristiano Ronaldo dos Santos Aveiro,Juventus,7/10/2018,Portugal,20801
2,Neymar da Silva Santos Junior,Paris Saint-Germain,8/3/2017,Brazil,190871
3,Jan Oblak,Atlético Madrid,7/16/2014,Slovenia,200389
4,Eden Hazard,Real Madrid,7/1/2019,Belgium,183277


### Load DataFrames into CSV and Database

In [19]:
clubs_file = "Resources/clubs.csv"
# Outputting DataFrame to csv
with open(clubs_file, 'w') as csv_file:
    clubs_country.to_csv(csv_file, index=False)

In [20]:
clubs_country.to_sql(name='clubs_country', con=engine, if_exists='replace', index=False)
clubs_country=pd.read_sql_query('select * from clubs_country', con=engine).head()
clubs_country

Unnamed: 0,season,host,club
0,2000,Brazil,Corinthians[n 2]
1,2001,Spain,Tournament cancelled
2,2005,Japan,São Paulo
3,2006,Japan,Internacional
4,2007,Japan,Milan


### Analytics questions: What is the ratio of players nationality and the number of times those countries have hosted the World Cup?

In [21]:
analytics=pd.read_sql_query("""select *
                            from (
                                select b.nationality, count(b.sofifa_id) as no_of_players_from_country
                                from players as b
                                group by b.nationality
                                ) a
                            full outer join (
                                select distinct a."country", count(*) as no_times_host 
                                from world_cup as a
                                group by a."country"
                                ) b
                            on a.nationality = b."country"
                            order by no_times_host asc""", con=engine)

analytics.head(18)

Unnamed: 0,nationality,no_of_players_from_country,country,no_times_host
0,England,1667,England,1.0
1,Sweden,358,Sweden,1.0
2,Argentina,886,Argentina,1.0
3,Japan,453,Japan,1.0
4,Uruguay,164,Uruguay,1.0
5,Chile,370,Chile,1.0
6,USA,347,USA,1.0
7,Korea Republic,322,Korea Republic,1.0
8,Spain,1035,Spain,1.0
9,Switzerland,229,Switzerland,1.0
