### DEPENDENCIES

In [None]:
import pandas as pd
import requests
import json
import csv
import os
from sqlalchemy import create_engine
import config

### Extract CSVs into DataFrames

In [None]:
# Source dataset: https://www.kaggle.com/abecklas/fifa-world-cup/version/5#WorldCups.csv
# Per the dataset description: the countries that hosted the World Cup,
# from the first tournament through 2014.
# The path is relative, so the notebook must run from the repository root.
world = "Resources/WorldCups.csv"
world_df = pd.read_csv(filepath_or_buffer=world)

In [None]:
# Source dataset: https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv
# Dataset description: "Every player available in FIFA 15, 16, 17, 18, 19, and also FIFA 20"
# Only the FIFA 20 file (players_20.csv) is loaded here.
players = "Resources/players_20.csv"
players_df = pd.read_csv(filepath_or_buffer=players)

In [None]:
# Parse the HTML tables found on the FIFA Club World Cup Wikipedia page.
# NOTE: despite its name, `url` holds the list of DataFrames returned by
# read_html (one per parsed table), not a URL string.
url = pd.read_html(io="https://en.wikipedia.org/wiki/FIFA_Club_World_Cup")

### Transform World Cup DataFrame

In [None]:
# Preview the host country recorded for each tournament year.
world_df.loc[:, ['Year', 'Country']]

In [None]:
# The 2002 tournament is stored as a single 'Korea/Japan' row. Duplicate that
# row so that each co-host can be given its own record in a later step.
dup = world_df['Country'] == 'Korea/Japan'
dup_try = world_df[dup]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True renumbers the rows 0..n-1, so the copy
# is placed at the end of the frame with a fresh label.
world_df_dup = pd.concat([world_df, dup_try], ignore_index=True)
world_df_dup

In [None]:
# Give each 2002 co-host its own row by renaming the two 'Korea/Japan' rows.
# The rows are located by value rather than by hard-coded labels (16 and 20):
# fixed labels silently rename the wrong tournament if the CSV order changes,
# and assigning to a missing label with .loc would append a new, mostly-empty row.
is_cohost = world_df_dup['Country'] == 'Korea/Japan'
cohost_rows = world_df_dup[is_cohost].index
if len(cohost_rows) == 2:
    # First match is the original row, second is the duplicated copy.
    world_df_dup.loc[cohost_rows, 'Country'] = ['Korea Republic', 'Japan']
elif len(cohost_rows) != 0:
    # Zero matches means this cell already ran; anything else is unexpected data.
    raise ValueError(
        f"Expected exactly 2 'Korea/Japan' rows, found {len(cohost_rows)}"
    )
world_df_dup

In [None]:
# Inspect the distinct host-country names present after the transformations.
world_df_dup.Country.unique()

### Transform Players DataFrame

In [None]:
# Preview the first five player records.
players_df.head(n=5)

In [None]:
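# Abandoned first attempt, kept for reference only; the next cell performs the rename with .loc instead:
# players_df[players_df['nationality'] == 'United States'].replace("USA")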

In [None]:
# Relabel 'United States' as "USA" in the nationality column, then list the
# distinct nationalities to confirm the result.
is_us = players_df['nationality'].eq('United States')
players_df.loc[is_us, 'nationality'] = "USA"
players_df['nationality'].unique()

In [None]:
# Keep only the columns used downstream:
#   - long_name, club, joined -> roster of the winning club
#   - sofifa_id, nationality  -> read by the analytics SQL query against the
#     `players` table, which is loaded from this DataFrame. Dropping them here
#     makes that query fail on a missing column.
players_df = players_df[["sofifa_id", "long_name", "nationality", "club", "joined"]]
players_df

### Transform Clubs DataFrame

In [None]:
# Take the fourth table parsed from the Wikipedia page and keep the season,
# host and champion columns. "Champions" is renamed to "club" so the frame
# shares a key column with players_df.
# read_html already returns DataFrames, so no pd.DataFrame(...) wrapping is needed.
# NOTE(review): url[3] depends on the page's table order at scrape time — confirm
# the selected table still has these columns if the page layout changes.
clubs_country = (
    url[3][["Season", "Hosts", "Champions"]]
    .rename(columns={"Champions": "club"})
)
clubs_country

In [None]:
# Build the roster of the 2018 winning club, including when each player joined.
# Steps: join players to clubs on "club" -> keep the 2018 season -> relabel
# the player-name and joined columns for presentation.
# NOTE(review): the filter compares Season to the integer 2018 — confirm the
# scraped column is numeric, otherwise the result will be empty.
pyr_ctry_m = (
    clubs_country
    .merge(players_df, on="club")
    .loc[lambda merged: merged["Season"] == 2018, :]
    .rename(columns={"long_name": "Roster", "joined": "Joined"})
)
pyr_ctry_m

### Create database connection

In [None]:
from urllib.parse import quote

# Credentials and database name come from the local, untracked config module.
username = config.username
password = config.password
database = config.database

# Percent-encode the credentials before embedding them in the URL: characters
# such as "@", ":" or "/" in a username or password would otherwise be parsed
# as URL delimiters and produce a wrong host or a failed login. safe='' makes
# sure "/" is escaped as well.
rds_connection_string = (
    f"{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    f"@localhost:5432/{database}"
)
engine = create_engine(f'postgresql://{rds_connection_string}')
engine

### Load DataFrames into database

In [None]:
# Write the World Cup hosts to the `world_cup` table (replacing any existing
# table), then read the table back and show its first rows as a sanity check.
world_df_dup.to_sql('world_cup', engine, if_exists='replace', index=False)
world = pd.read_sql_query(sql='select * from world_cup', con=engine).head()
world

In [None]:
# Write the players to the `players` table (replacing any existing table),
# then read the table back and show its first rows as a sanity check.
players_df.to_sql('players', engine, if_exists='replace', index=False)
players = pd.read_sql_query(sql='select * from players', con=engine).head()
players

### Analytics question: How does the number of players of each nationality compare with the number of times that country has hosted the World Cup?

In [None]:
# Count players per nationality and World Cup hostings per country, then
# full-outer-join the two so that countries appearing on only one side are
# still listed. The players subquery reads the `nationality` and `sofifa_id`
# columns of the `players` table.
# NOTE(review): the query returns the two counts side by side; it does not
# compute a ratio column.
analytics_sql = """select *
                            from (
                                select b.nationality, count(b.sofifa_id) as no_of_playersfromcountry
                                from players as b
                                group by b.nationality
                                ) a
                            full outer join (
                                select distinct a."Country", count(*) as no_times_host 
                                from world_cup as a
                                group by a."Country"
                                ) b
                            on a.nationality = b."Country"
                            order by no_times_host asc"""
analytics = pd.read_sql_query(sql=analytics_sql, con=engine)

analytics.head(n=18)

### Load DataFrames into CSV

In [None]:
clubs_file = "Resources/clubs.csv"
# Write the clubs DataFrame to CSV without the index column.
# The path is passed straight to pandas instead of a handle from open(..., 'w'):
# a text handle opened without newline='' makes the csv writer emit blank rows
# on Windows, and it would use the platform's locale encoding. With a path,
# pandas opens and closes the file itself and writes UTF-8 by default.
clubs_country.to_csv(clubs_file, index=False)