In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import libraries
import pandas as pd
from sqlalchemy import create_engine
from config import password
import psycopg2

In [3]:
# Import data
data_file = 'Resources/Raw/WDIData_raw.csv'
df = pd.read_csv(data_file)

In [4]:
# Inspect data
print(df.shape)
df.head(5)

(422136, 64)


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
0,Arab World,ARB,"2005 PPP conversion factor, GDP (LCU per inter...",PA.NUS.PPP.05,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"2005 PPP conversion factor, private consumptio...",PA.NUS.PRVT.PP.05,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.368101,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,
3,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,86.00762,86.428272,87.070576,88.176836,87.342739,89.130121,89.678685,90.273687,,
4,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,73.466653,73.942103,75.244104,77.162305,75.538976,78.741152,79.665635,80.749293,,


In [5]:
# Inspect available indicators
for i in df['Indicator Name'].unique():
    print(i)

2005 PPP conversion factor, GDP (LCU per international $)
2005 PPP conversion factor, private consumption (LCU per international $)
Access to clean fuels and technologies for cooking (% of population)
Access to electricity (% of population)
Access to electricity, rural (% of rural population)
Access to electricity, urban (% of urban population)
Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)
Account ownership at a financial institution or with a mobile-money-service provider, poorest 40% (% of population ages 15+)
Account ownership at a financial institution or with

In [6]:
# Subset indicators
indicators = ['GDP growth (annual %)', 'GDP per capita (current US$)', 'GDP per capita growth (annual %)', 'GINI index (World Bank estimate)', 'Population density (people per sq. km of land area)', 'Population growth (annual %)', 'Population, total']
indicators

['GDP growth (annual %)',
 'GDP per capita (current US$)',
 'GDP per capita growth (annual %)',
 'GINI index (World Bank estimate)',
 'Population density (people per sq. km of land area)',
 'Population growth (annual %)',
 'Population, total']

In [7]:
# Filter dataset to keep only indicators of interest
df_subset = df[df['Indicator Name'].isin(indicators)]
df_subset.head(10)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
519,Arab World,ARB,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,,,,,,...,4.772079,3.629086,6.656678,3.166151,2.455201,3.307618,3.247325,0.9998818,2.096747,
523,Arab World,ARB,GDP per capita (current US$),NY.GDP.PCAP.CD,,,,,,,...,5945.679,6889.092,7503.174,7551.283,7497.556,6459.109,6202.89,6285.215,6626.135,
524,Arab World,ARB,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,,,,,,...,2.333285,1.269584,4.27776,0.921317,0.2888594,1.189303,1.20393,-0.9309965,0.177435,
540,Arab World,ARB,GINI index (World Bank estimate),SI.POV.GINI,,,,,,,...,,,,,,,,,,
1130,Arab World,ARB,Population density (people per sq. km of land ...,EN.POP.DNST,,8.43086,8.663154,8.903441,9.152526,9.410965,...,31.59402,32.33012,33.06767,33.80379,34.53398,35.2569,35.96876,36.6698,37.37237,
1131,Arab World,ARB,Population growth (annual %),SP.POP.GROW,,2.740584,2.755287,2.773671,2.797625,2.823683,...,2.383187,2.329922,2.281329,2.224341,2.160102,2.093418,2.019087,1.949024,1.915912,
1142,Arab World,ARB,"Population, total",SP.POP.TOTL,92197750.0,94724510.0,97334440.0,100034200.0,102832800.0,105736400.0,...,354890000.0,363158700.0,371443500.0,379705700.0,387907700.0,396028300.0,404024400.0,411899000.0,419790600.0,
2118,Caribbean small states,CSS,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,,,,,,,...,1.462283,1.11818,1.323901,1.220145,0.3300793,1.168251,-2.050514,0.2945761,1.638644,
2122,Caribbean small states,CSS,GDP per capita (current US$),NY.GDP.PCAP.CD,447.1016,475.6719,493.4611,515.4108,546.5377,579.4484,...,9075.706,9743.62,10007.37,10090.03,10149.2,9916.667,9242.035,9442.913,9910.398,
2123,Caribbean small states,CSS,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,,,,,,...,0.7674477,0.4100026,0.6036123,0.5001837,-0.3706781,0.4849864,-2.686148,-0.3306925,1.031281,


In [8]:
# Drop unnecessary column
df_subset.drop('Indicator Code', axis=1, inplace=True)
df_subset[df_subset['Country Code'] == 'USA'].head(5)

Unnamed: 0,Country Name,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
405066,United States,USA,GDP growth (annual %),,2.3,6.1,4.4,5.8,6.4,6.5,...,2.563767,1.550836,2.249546,1.842081,2.451973,2.88091,1.567215,2.21701,2.856988,
405070,United States,USA,GDP per capita (current US$),3007.123445,3066.562869,3243.843078,3374.515171,3573.941185,3827.52711,4146.316646,...,48466.823375,49883.113984,51603.497261,53106.90977,55032.957998,56803.472433,57904.201961,59927.929834,62641.01457,
405071,United States,USA,GDP per capita growth (annual %),,0.618121,4.480669,2.908272,4.340549,5.078098,5.277114,...,1.716748,0.816232,1.502171,1.138497,1.702634,2.125123,0.835128,1.564444,2.221829,
405087,United States,USA,GINI index (World Bank estimate),,,,,,,,...,40.4,,,41.0,,,41.5,,,
405677,United States,USA,Population density (people per sq. km of land ...,,20.05588,20.366723,20.661953,20.950959,21.214527,21.460952,...,33.815664,34.062064,34.312868,34.55157,34.806144,35.063731,35.318302,35.545227,35.766089,


In [9]:
# Use melt to transform array of years (1960-2018) to rows (one row per year per indicator)
df_melt = df_subset.melt(id_vars=["Country Name", "Country Code", "Indicator Name"], 
        var_name="Date", 
        value_name="Value")

In [10]:
df_melt[df_melt['Country Code'] == 'USA'].head(10)

Unnamed: 0,Country Name,Country Code,Indicator Name,Date,Value
1771,United States,USA,GDP growth (annual %),1960,
1772,United States,USA,GDP per capita (current US$),1960,3007.123
1773,United States,USA,GDP per capita growth (annual %),1960,
1774,United States,USA,GINI index (World Bank estimate),1960,
1775,United States,USA,Population density (people per sq. km of land ...,1960,
1776,United States,USA,Population growth (annual %),1960,1.701993
1777,United States,USA,"Population, total",1960,180671000.0
3619,United States,USA,GDP growth (annual %),1961,2.3
3620,United States,USA,GDP per capita (current US$),1961,3066.563
3621,United States,USA,GDP per capita growth (annual %),1961,0.6181212


In [11]:
# Use pivot_table to get a column per indicator
df_pivot = pd.pivot_table(df_melt, values = 'Value', index=['Country Name','Country Code', 'Date'], columns = 'Indicator Name').reset_index()
df_pivot[df_pivot['Country Code'] == 'USA'].head(10)

Indicator Name,Country Name,Country Code,Date,GDP growth (annual %),GDP per capita (current US$),GDP per capita growth (annual %),GINI index (World Bank estimate),Population density (people per sq. km of land area),Population growth (annual %),"Population, total"
14674,United States,USA,1960,,3007.123445,,,,1.701993,180671000.0
14675,United States,USA,1961,2.3,3066.562869,0.618121,,20.05588,1.65773,183691000.0
14676,United States,USA,1962,6.1,3243.843078,4.480669,,20.366723,1.537997,186538000.0
14677,United States,USA,1963,4.4,3374.515171,2.908272,,20.661953,1.439165,189242000.0
14678,United States,USA,1964,5.8,3573.941185,4.340549,,20.950959,1.389046,191889000.0
14679,United States,USA,1965,6.4,3827.52711,5.078098,,21.214527,1.250172,194303000.0
14680,United States,USA,1966,6.5,4146.316646,5.277114,,21.460952,1.154893,196560000.0
14681,United States,USA,1967,2.5,4336.426587,1.389951,,21.695913,1.088881,198712000.0
14682,United States,USA,1968,4.8,4695.92339,3.758819,,21.913623,0.998461,200706000.0
14683,United States,USA,1969,3.1,5032.144743,2.09737,,22.128822,0.977243,202677000.0


In [12]:
# Format values
for col in ['GDP growth (annual %)',
       'GDP per capita (current US$)', 'GDP per capita growth (annual %)',
       'GINI index (World Bank estimate)',
       'Population density (people per sq. km of land area)',
       'Population growth (annual %)']:

    df_pivot[col] = round(df_pivot[col],2)

df_pivot['Population, total'] = df_pivot['Population, total'].astype('int', errors='ignore')

In [13]:
# Check output
df_pivot[df_pivot['Country Code'] == 'USA'].head()

Indicator Name,Country Name,Country Code,Date,GDP growth (annual %),GDP per capita (current US$),GDP per capita growth (annual %),GINI index (World Bank estimate),Population density (people per sq. km of land area),Population growth (annual %),"Population, total"
14674,United States,USA,1960,,3007.12,,,,1.7,180671000.0
14675,United States,USA,1961,2.3,3066.56,0.62,,20.06,1.66,183691000.0
14676,United States,USA,1962,6.1,3243.84,4.48,,20.37,1.54,186538000.0
14677,United States,USA,1963,4.4,3374.52,2.91,,20.66,1.44,189242000.0
14678,United States,USA,1964,5.8,3573.94,4.34,,20.95,1.39,191889000.0


In [14]:
# Fix columns name to match DB Scheme
df_pivot.columns = [
    'country_name', 'country_code', 'year', 'gdp_growth_annual', 
    'gdp_per_cap', 'gdp_per_cap_growth_annual', 'gini_index', 
    'pop_density', 'pop_growth_annual', 'pop_total']

df_pivot.head()

Unnamed: 0,country_name,country_code,year,gdp_growth_annual,gdp_per_cap,gdp_per_cap_growth_annual,gini_index,pop_density,pop_growth_annual,pop_total
0,Afghanistan,AFG,1960,,59.77,,,,1.83,8996973.0
1,Afghanistan,AFG,1961,,59.86,,,14.04,1.9,9169410.0
2,Afghanistan,AFG,1962,,58.46,,,14.32,1.97,9351441.0
3,Afghanistan,AFG,1963,,78.71,,,14.62,2.03,9543205.0
4,Afghanistan,AFG,1964,,82.1,,,14.93,2.09,9744781.0


In [15]:
df_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15411 entries, 0 to 15410
Data columns (total 10 columns):
country_name                 15411 non-null object
country_code                 15411 non-null object
year                         15411 non-null object
gdp_growth_annual            11662 non-null float64
gdp_per_cap                  12086 non-null float64
gdp_per_cap_growth_annual    11659 non-null float64
gini_index                   1486 non-null float64
pop_density                  14830 non-null float64
pop_growth_annual            15358 non-null float64
pop_total                    15409 non-null float64
dtypes: float64(7), object(3)
memory usage: 1.2+ MB


In [16]:
df_test = df_pivot
df_test.head()

Unnamed: 0,country_name,country_code,year,gdp_growth_annual,gdp_per_cap,gdp_per_cap_growth_annual,gini_index,pop_density,pop_growth_annual,pop_total
0,Afghanistan,AFG,1960,,59.77,,,,1.83,8996973.0
1,Afghanistan,AFG,1961,,59.86,,,14.04,1.9,9169410.0
2,Afghanistan,AFG,1962,,58.46,,,14.32,1.97,9351441.0
3,Afghanistan,AFG,1963,,78.71,,,14.62,2.03,9543205.0
4,Afghanistan,AFG,1964,,82.1,,,14.93,2.09,9744781.0


In [22]:
# Convert pop_total to integer
df_test['pop_total'] = df_test['pop_total'].fillna(0.0).astype(int)
df_test.head()

Unnamed: 0,country_name,country_code,year,gdp_growth_annual,gdp_per_cap,gdp_per_cap_growth_annual,gini_index,pop_density,pop_growth_annual,pop_total
0,Afghanistan,AFG,1960,,59.77,,,,1.83,8996973
1,Afghanistan,AFG,1961,,59.86,,,14.04,1.9,9169410
2,Afghanistan,AFG,1962,,58.46,,,14.32,1.97,9351441
3,Afghanistan,AFG,1963,,78.71,,,14.62,2.03,9543205
4,Afghanistan,AFG,1964,,82.1,,,14.93,2.09,9744781


## Connect & Load Into Database

In [17]:
# Create connection
engine = create_engine(f'postgresql+psycopg2://postgres:{password}@localhost:5432/olympic_data')

# Confirm table
engine.table_names()

['summer', 'soccer', 'winter', 'regions', 'country', 'athlete']

In [24]:
# Load data frame to database
df_test.to_sql(name='country', con=engine, if_exists='append', index=False)

In [25]:
# Confirm data has been loaded
pd.read_sql_query('SELECT * FROM country', con=engine).head()

Unnamed: 0,country_name,country_code,year,gdp_growth_annual,gdp_per_cap,gdp_per_cap_growth_annual,gini_index,pop_density,pop_growth_annual,pop_total
0,Afghanistan,AFG,1960,,59.77,,,,1.83,8996973
1,Afghanistan,AFG,1961,,59.86,,,14.04,1.9,9169410
2,Afghanistan,AFG,1962,,58.46,,,14.32,1.97,9351441
3,Afghanistan,AFG,1963,,78.71,,,14.62,2.03,9543205
4,Afghanistan,AFG,1964,,82.1,,,14.93,2.09,9744781


## Export to CSV

In [26]:
df_test.to_csv("Resources/WDIData_clean.csv", index=False)