### DEPENDENCIES

In [2]:
import pandas as pd
import requests
from sqlalchemy import create_engine
from sqlalchemy.types import Integer, Float, String, Date
import Resources.config as config

### Extract CSVs into DataFrames

In [3]:
# https://www.kaggle.com/abecklas/fifa-world-cup/version/5#WorldCups.csv
# All the countries that have hosted the world cup from the first one until 2014
world_df = pd.read_csv("Resources/WorldCups.csv", index_col = False)
world_df.head()

Unnamed: 0,Year,Country,Winner,Runners-Up,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607


In [43]:
# https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv 
# "Every player available in FIFA 15, 16, 17, 18, 19, and also FIFA 20"
players_df = pd.read_csv("Resources/players_20.csv", index_col = False)
players_df.head()

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,32,6/24/1987,170,72,Argentina,FC Barcelona,...,68+2,66+2,66+2,66+2,68+2,63+2,52+2,52+2,52+2,63+2
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,2/5/1985,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,53+3,53+3,53+3,61+3
2,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Junior,27,2/5/1992,175,68,Brazil,Paris Saint-Germain,...,66+3,61+3,61+3,61+3,66+3,61+3,46+3,46+3,46+3,61+3
3,200389,https://sofifa.com/player/200389/jan-oblak/20/...,J. Oblak,Jan Oblak,26,1/7/1993,188,87,Slovenia,Atlético Madrid,...,,,,,,,,,,
4,183277,https://sofifa.com/player/183277/eden-hazard/2...,E. Hazard,Eden Hazard,28,1/7/1991,175,74,Belgium,Real Madrid,...,66+3,63+3,63+3,63+3,66+3,61+3,49+3,49+3,49+3,61+3


In [44]:
# Reading the html table from wikipedia
fifa_wiki_tables = pd.read_html("https://en.wikipedia.org/wiki/FIFA_Club_World_Cup")
clubs_country = fifa_wiki_tables[3]
clubs_country.head()

Unnamed: 0,Edition,Season,Hosts,Champions,Score,Runners-up,Third place,Score.1,Fourth place,Ref
0,1.0,2000,Brazil,Corinthians[n 2],0–0,Vasco da Gama,Necaxa[n 3],1–1,Real Madrid,[138][139]
1,,2001,Spain,Tournament cancelled,Tournament cancelled,Tournament cancelled,Tournament cancelled,Tournament cancelled,Tournament cancelled,[140]
2,2.0,2005,Japan,São Paulo,1–0,Liverpool,Saprissa,3–2,Al-Ittihad,[141][142]
3,3.0,2006,Japan,Internacional,1–0,Barcelona,Al Ahly,2–1,América,[143][144]
4,4.0,2007,Japan,Milan,4–2,Boca Juniors,Urawa Red Diamonds[n 4],2–2,Étoile du Sahel,[146][147]


### Transform World Cup DataFrame

In [45]:
world_df[['Year','Country']]

Unnamed: 0,Year,Country
0,1930,Uruguay
1,1934,Italy
2,1938,France
3,1950,Brazil
4,1954,Switzerland
5,1958,Sweden
6,1962,Chile
7,1966,England
8,1970,Mexico
9,1974,Germany


In [46]:
# Duplicating year 2002 when two countries hosted the World Cup
dup = world_df['Country'] == 'Korea/Japan'
dup_try = world_df[dup]
world_df_dup=world_df.append([dup_try]*1,ignore_index=True)
world_df_dup

Unnamed: 0,Year,Country,Winner,Runners-Up,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819.810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893.172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1.563.135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1.603.975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1.865.753


In [47]:
# Segregating the year when Korea and Japan hosted the World Cup by renaming each row
world_df_dup.loc[16,'Country'] = 'Korea Republic'
world_df_dup.loc[20,'Country'] = 'Japan'
world_df_dup

Unnamed: 0,Year,Country,Winner,Runners-Up,Third,Fourth,GoalsScored,QualifiedTeams,MatchesPlayed,Attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607
5,1958,Sweden,Brazil,Sweden,France,Germany FR,126,16,35,819.810
6,1962,Chile,Brazil,Czechoslovakia,Chile,Yugoslavia,89,16,32,893.172
7,1966,England,England,Germany FR,Portugal,Soviet Union,89,16,32,1.563.135
8,1970,Mexico,Brazil,Italy,Germany FR,Uruguay,95,16,32,1.603.975
9,1974,Germany,Germany FR,Netherlands,Poland,Brazil,97,16,38,1.865.753


In [48]:
# Checking the Country Names
world_df_dup['Country'].unique()

array(['Uruguay', 'Italy', 'France', 'Brazil', 'Switzerland', 'Sweden',
       'Chile', 'England', 'Mexico', 'Germany', 'Argentina', 'Spain',
       'USA', 'Korea Republic', 'South Africa', 'Japan'], dtype=object)

In [49]:
# Column names - convert to lower case and rename
world_df_dup.columns = map(str.lower, world_df_dup.columns)
world_df_dup = world_df_dup.rename(columns={"runners-up": "runners_up", "goalsscored": "goals_scored", "qualifiedteams" : "qualified_teams","matchesplayed":"matches_played"})
world_df_dup.head()

Unnamed: 0,year,country,winner,runners_up,third,fourth,goals_scored,qualified_teams,matches_played,attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590.549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363.000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375.700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1.045.246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768.607


In [50]:
# attendace column - convert data type from string to float
world_df_dup['attendance'] = world_df_dup['attendance'].str.replace('.', '', regex=False)
world_df_dup['attendance'].astype(int)
world_df_dup.head()

Unnamed: 0,year,country,winner,runners_up,third,fourth,goals_scored,qualified_teams,matches_played,attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1045246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768607


### Transform Players DataFrame

In [51]:
players_df.loc[players_df['nationality']=='USA',:] 

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb


In [52]:
players_df.loc[players_df.nationality == 'United States', 'nationality'] = "USA"
players_df['nationality'].unique()

array(['Argentina', 'Portugal', 'Brazil', 'Slovenia', 'Belgium',
       'Germany', 'Netherlands', 'Croatia', 'Egypt', 'France', 'Senegal',
       'England', 'Spain', 'Italy', 'Uruguay', 'Poland', 'Denmark',
       'Gabon', 'Korea Republic', 'Costa Rica', 'Slovakia',
       'Bosnia Herzegovina', 'Serbia', 'Scotland', 'Hungary',
       'Switzerland', 'Greece', 'Austria', 'Morocco', 'Sweden', 'Wales',
       'Colombia', 'Czech Republic', 'Chile', 'Algeria', 'Ivory Coast',
       'Togo', 'Norway', 'Mexico', 'Iceland', 'Finland', 'Jamaica',
       'Albania', 'Guinea', 'Cameroon', 'Ghana', 'Montenegro', 'Ukraine',
       'Russia', 'DR Congo', 'Central African Rep.', 'Venezuela',
       'Nigeria', 'Armenia', 'Israel', 'Ecuador', 'Paraguay', 'Australia',
       'Turkey', 'Romania', 'Japan', 'Mali', 'USA', 'Kosovo',
       'Dominican Republic', 'Tanzania', 'China PR', 'Northern Ireland',
       'Republic of Ireland', 'Tunisia', 'Cape Verde', 'FYR Macedonia',
       'Burkina Faso', 'Kenya', 'Ang

In [53]:
# Selecting only the columns that we want
players_df = players_df[["long_name", "club", "joined","nationality","sofifa_id","player_traits"]]
players_df = players_df.rename(columns={"long_name": "name"})
players_df

Unnamed: 0,name,club,joined,nationality,sofifa_id,player_traits
0,Lionel Andrés Messi Cuccittini,FC Barcelona,7/1/2004,Argentina,158023,"Beat Offside Trap, Argues with Officials, Earl..."
1,Cristiano Ronaldo dos Santos Aveiro,Juventus,7/10/2018,Portugal,20801,"Long Throw-in, Selfish, Argues with Officials,..."
2,Neymar da Silva Santos Junior,Paris Saint-Germain,8/3/2017,Brazil,190871,"Power Free-Kick, Injury Free, Selfish, Early C..."
3,Jan Oblak,Atlético Madrid,7/16/2014,Slovenia,200389,"Flair, Acrobatic Clearance"
4,Eden Hazard,Real Madrid,7/1/2019,Belgium,183277,"Beat Offside Trap, Selfish, Finesse Shot, Spee..."
...,...,...,...,...,...,...
18273,邵帅,Beijing Renhe FC,7/13/2018,China PR,245006,
18274,Mingjie Xiao,Shanghai SIPG FC,2/1/2019,China PR,250995,
18275,张威,Hebei China Fortune FC,1/1/2019,China PR,252332,
18276,汪海健,Shanghai Greenland Shenhua FC,2/1/2019,China PR,251110,


### Transform Clubs DataFrame

In [54]:
clubs_country = clubs_country[["Season", "Hosts", "Champions"]]
clubs_country = clubs_country.rename(columns={"Season": "season", "Hosts": "host", "Champions" : "club"})
clubs_country

Unnamed: 0,season,host,club
0,2000,Brazil,Corinthians[n 2]
1,2001,Spain,Tournament cancelled
2,2005,Japan,São Paulo
3,2006,Japan,Internacional
4,2007,Japan,Milan
5,2008,Japan,Manchester United
6,2009,UAE,Barcelona[n 5]
7,2010,UAE,Internazionale
8,2011,Japan,Barcelona
9,2012,Japan,Corinthians


In [55]:
# Creating a Roster of 2018 winning team and when the players join the team

# Merging DataFrames clubs_country and players_df
pyr_ctry_m = pd.merge(clubs_country, players_df, on="club")

# Filtering for 2018
pyr_ctry_m = pyr_ctry_m.loc[pyr_ctry_m["season"] == 2018,:]

# Renaming long_name header to Roster
pyr_ctry_m = pyr_ctry_m.rename(columns={"name" : "roster"})
pyr_ctry_m

Unnamed: 0,season,host,club,roster,joined,nationality,sofifa_id,player_traits
148,2018,UAE,Real Madrid,Eden Hazard,7/1/2019,Belgium,183277,"Beat Offside Trap, Selfish, Finesse Shot, Spee..."
149,2018,UAE,Real Madrid,Luka Modrić,8/1/2012,Croatia,177003,"Argues with Officials, Finesse Shot, Speed Dri..."
150,2018,UAE,Real Madrid,Sergio Ramos García,8/1/2005,Spain,155862,"Injury Prone, Avoids Using Weaker Foot, Leader..."
151,2018,UAE,Real Madrid,Thibaut Courtois,8/9/2018,Belgium,192119,"Flair, Acrobatic Clearance"
152,2018,UAE,Real Madrid,Toni Kroos,7/17/2014,Germany,182521,"Argues with Officials, Finesse Shot, Crowd Fav..."
153,2018,UAE,Real Madrid,Carlos Henrique Venancio Casimiro,7/11/2013,Brazil,200145,"Long Throw-in, Injury Prone, Argues with Offic..."
154,2018,UAE,Real Madrid,Karim Benzema,7/9/2009,France,165153,"Finesse Shot, Speed Dribbler (CPU AI Only), Gi..."
155,2018,UAE,Real Madrid,Keylor Navas,8/3/2014,Costa Rica,193041,"Flair, Second Wind"
156,2018,UAE,Real Madrid,Francisco Román Alarcón Suárez,7/3/2013,Spain,197781,"Injury Free, Beat Offside Trap, Selfish, Fines..."
157,2018,UAE,Real Madrid,Raphaël Varane,7/1/2011,France,201535,


### Create database connection

In [56]:
username = config.username
password = config.password
database = config.database

rds_connection_string = f"{username}:{password}@localhost:5432/{database}"
engine = create_engine(f'postgresql://{rds_connection_string}')
engine

Engine(postgresql://postgres:***@localhost:5432/etl)

### Load DataFrames into database

In [57]:
world_df_dup.to_sql(
  name = 'world_cup',
  con = engine,
  if_exists = 'replace',
  index = False,
  dtype = {
    "year": Integer(),
    "country": String(),
    "winner": String(),
    "runners_up": String(),
    "third": String(),
    "fourth": String(),
    "goals_scored": Integer(),
    "qualified_teams": Integer(),
    "matches_played": Integer(),
    "attendance": Integer()
  }
)
# with engine.connect() as con:
#     con.execute('ALTER TABLE "world_cup" ADD PRIMARY KEY ("year");')
pd.read_sql_query('select * from world_cup', con=engine).head()

Unnamed: 0,year,country,winner,runners_up,third,fourth,goals_scored,qualified_teams,matches_played,attendance
0,1930,Uruguay,Uruguay,Argentina,USA,Yugoslavia,70,13,18,590549
1,1934,Italy,Italy,Czechoslovakia,Germany,Austria,70,16,17,363000
2,1938,France,Italy,Hungary,Brazil,Sweden,84,15,18,375700
3,1950,Brazil,Uruguay,Brazil,Sweden,Spain,88,13,22,1045246
4,1954,Switzerland,Germany FR,Hungary,Austria,Uruguay,140,16,26,768607


In [58]:
players_df.to_sql(
  name = 'players',
  con = engine,
  if_exists = 'replace',
  index = False,
  dtype = {
    "name": String(),
    "club": String(),
    "joined": Date(),
    "nationality": String(),
    "sofifa_id": Integer(),
    "player_traits": String() 
  }
)

pd.read_sql_query('select * from players', con=engine).head()
# Other ways to query all columns from the table
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html#pandas.read_sql
# pd.read_sql_table('players', engine).head()
# pd.read_sql('players', engine).head()
# pd.read_sql('select * from players', engine).head()

Unnamed: 0,name,club,joined,nationality,sofifa_id,player_traits
0,Lionel Andrés Messi Cuccittini,FC Barcelona,2004-07-01,Argentina,158023,"Beat Offside Trap, Argues with Officials, Earl..."
1,Cristiano Ronaldo dos Santos Aveiro,Juventus,2018-07-10,Portugal,20801,"Long Throw-in, Selfish, Argues with Officials,..."
2,Neymar da Silva Santos Junior,Paris Saint-Germain,2017-08-03,Brazil,190871,"Power Free-Kick, Injury Free, Selfish, Early C..."
3,Jan Oblak,Atlético Madrid,2014-07-16,Slovenia,200389,"Flair, Acrobatic Clearance"
4,Eden Hazard,Real Madrid,2019-07-01,Belgium,183277,"Beat Offside Trap, Selfish, Finesse Shot, Spee..."


In [59]:
clubs_country.to_sql(
  name = 'clubs_country',
  con = engine,
  if_exists = 'replace',
  index = False,
  dtype = {
    "season": Integer(),
    "host": String(),
    "club": String()
  }
)

pd.read_sql_query('select * from clubs_country', con=engine).head()

Unnamed: 0,season,host,club
0,2000,Brazil,Corinthians[n 2]
1,2001,Spain,Tournament cancelled
2,2005,Japan,São Paulo
3,2006,Japan,Internacional
4,2007,Japan,Milan


### Output DataFrame to CSV

In [60]:
# Outputting DataFrame to csv
clubs_file = "Resources/clubs.csv"
with open(clubs_file, 'w') as csv_file:
    clubs_country.to_csv(csv_file, index=False)

### Analytics questions: What is the ratio of players nationality and the number of times those countries have hosted the World Cup?

In [61]:
pd.read_sql_query("""
        select 
        p.nationality as country, 
        count(DISTINCT p.sofifa_id) as no_of_players_from_country,
        COALESCE((select count(country) from world_cup where country = p.nationality group by country),0) as no_times_host
        from players as p left outer join world_cup as wc
        on p.nationality = wc.country
        group by p.nationality, wc.country
        order by no_times_host desc""", con=engine).head(25)

Unnamed: 0,country,no_of_players_from_country,no_times_host
0,Germany,1216,2
1,Brazil,824,2
2,Italy,732,2
3,France,984,2
4,Mexico,340,2
5,USA,347,1
6,Chile,370,1
7,Argentina,886,1
8,England,1667,1
9,Uruguay,164,1


### Player questions: 2019 Winners and Players Traits

In [62]:
# Query Clubs Country DB Table for Club Champions in 2019     
pd.read_sql_query("SELECT * FROM clubs_country WHERE season = 2019 ",con=engine)

Unnamed: 0,season,host,club
0,2019,Qatar,Liverpool[n 11]


In [63]:
# Query Players DB to find out what which players are injury free?
pd.read_sql_query("SELECT club, name, player_traits FROM players WHERE player_traits like '%%Injury Free%%'",con=engine)

Unnamed: 0,club,name,player_traits
0,Paris Saint-Germain,Neymar da Silva Santos Junior,"Power Free-Kick, Injury Free, Selfish, Early C..."
1,Tottenham Hotspur,Harry Kane,"Injury Free, Avoids Using Weaker Foot, Argues ..."
2,Real Madrid,Francisco Román Alarcón Suárez,"Injury Free, Beat Offside Trap, Selfish, Fines..."
3,FC Bayern München,Philippe Coutinho Correia,"Injury Free, Selfish, Argues with Officials, C..."
4,Manchester City,Riyad Mahrez,"Injury Free, Beat Offside Trap, Selfish, Argue..."
...,...,...,...
372,Macclesfield Town,Emmanuel Osadebe,Injury Free
373,AS Nancy Lorraine,Mons Bassouamina,Injury Free
374,Willem II,James McGarry,"Injury Free, Speed Dribbler (CPU AI Only)"
375,Newport County,Adebayo Azeez,"Injury Free, Early Crosser"


### What Were the Personal Traits of the Liverpool Champions, Season 2019?

In [66]:
#Total the frequency of persona traits in the "player_traits" columns
liverpool_players_traits = pd.read_sql_query("SELECT club, name, player_traits FROM players WHERE club='Liverpool'",con=engine)
liverpool_players_traits.player_traits.str.replace("(u\'|[\[\]]|\')",'').str.lower().str.split(',',expand=True).stack().str.strip().value_counts()

selfish                             9
argues with officials               5
crowd favourite                     4
power free-kick                     4
early crosser                       4
leadership                          4
finesse shot                        3
dives into tackles (cpu ai only)    3
diver                               3
outside foot shot                   2
giant throw-in                      2
avoids using weaker foot            2
injury prone                        2
beat offside trap                   1
flair                               1
swerve pass                         1
skilled dribbling                   1
flair passes                        1
long passer (cpu ai only)           1
inflexible                          1
injury free                         1
speed dribbler (cpu ai only)        1
dtype: int64