# World Bank Data

## This data was taken from the World Development Indicators dataset on [Kaggle](https://www.kaggle.com/worldbank/world-development-indicators)

In [2]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [3]:
def readData(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or path-like or file-like
        Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first row used as the header.
    """
    # Hand the path straight to pandas instead of opening it ourselves:
    # read_csv decodes as UTF-8 by default, whereas a bare open() uses the
    # platform locale and can mis-decode non-ASCII text on some systems.
    return pd.read_csv(filename)

In [4]:
# Read the complete indicators table once; each section below pulls one indicator out of it.
all_df = readData(filename='world-development-indicators/indicators.csv')

In [14]:
def getSpecificDf(all_df, indicatorcode):
    """Return one indicator as a wide table: one row per country, one column per year.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Long-format table with at least the columns ``CountryCode``,
        ``CountryName``, ``IndicatorCode``, ``Year`` and ``Value``.
    indicatorcode : str
        Value of ``IndicatorCode`` to keep.

    Returns
    -------
    pandas.DataFrame
        Columns ``countrycode``, ``countryname`` and one ``y<year>`` column for
        every year that has at least one value for this indicator, in
        ascending year order. Rows are sorted by country code. Country/year
        cells without an observation are NaN. If no row matches
        ``indicatorcode`` the result has only the two country columns and no
        rows.
    """
    key_cols = ['CountryCode', 'CountryName']
    subset = all_df.loc[all_df.IndicatorCode == indicatorcode, key_cols + ['Year', 'Value']]
    if subset.empty:
        # Nothing to pivot; return the documented empty shape instead of
        # failing later on a column-count mismatch.
        return pd.DataFrame(columns=['countrycode', 'countryname'])

    # Duplicate (country, year) rows are summed. min_count=1 keeps a
    # country/year with no observation as NaN -- a plain sum() reports 0,
    # which would be indistinguishable from a real zero and would stop the
    # all-NaN year columns from being dropped below.
    wide = (
        subset.groupby(key_cols + ['Year'])['Value']
        .sum(min_count=1)
        .unstack('Year')
        .sort_index(axis=1)
        .dropna(axis=1, how='all')
        .reset_index()
    )

    # Prefix the year labels with 'y' and lower-case the country column names.
    wide.columns = ['countrycode', 'countryname'] + ['y' + str(year) for year in wide.columns[2:]]
    return wide

## Life Expectancy

In [15]:
# Wide per-country, per-year table for indicator SP.DYN.LE00.IN.
life_expectancy = getSpecificDf(all_df=all_df, indicatorcode='SP.DYN.LE00.IN')

In [16]:
# Export without the positional row index.
life_expectancy.to_csv(path_or_buf='world_bank_data/life_expectancy.csv', index=False)

## Population Size

In [17]:
# Wide per-country, per-year table for indicator SP.POP.TOTL.
population_total = getSpecificDf(all_df=all_df, indicatorcode='SP.POP.TOTL')

In [18]:
# Export without the positional row index.
population_total.to_csv(path_or_buf='../refugee_data_dv/population_size.csv', index=False)

## Refugees by origin

In [19]:
# Wide per-country, per-year table for indicator SM.POP.REFG.OR.
refugees_by_origin = getSpecificDf(all_df=all_df, indicatorcode='SM.POP.REFG.OR')

In [20]:
# Export without the positional row index.
refugees_by_origin.to_csv(path_or_buf='../refugee_data_dv/refugees_by_origin.csv', index=False)

### Refugees as a fraction of population

In [22]:
# Index by country code so the division below aligns rows by country.
population_total2 = population_total.set_index(keys='countrycode')

In [24]:
# Index by country code so the division below aligns rows by country.
refugees_by_origin2 = refugees_by_origin.set_index(keys='countrycode')

In [25]:
# Country code/name lookup, indexed by country code while keeping the code
# as a regular column as well.
countries = refugees_by_origin.iloc[:, :2].set_index('countrycode', drop=False)

In [26]:
# Element-wise ratio of refugees to population (the country-name column is
# skipped on both sides); pandas aligns on country code and year label.
# Years and then countries with no ratio at all are discarded.
refugees_fraction = (
    (refugees_by_origin2.iloc[:, 1:] / population_total2.iloc[:, 1:])
    .dropna(axis='columns', how='all')
    .dropna(axis='index', how='all')
)

In [27]:
# Put the country code/name columns back in front of the ratios, matching rows by index.
refugees_fraction = pd.concat(objs=[countries, refugees_fraction], axis='columns', join='outer')

In [28]:
# Replace the country-code index with a plain 0..n-1 index (the code is still a column).
refugees_fraction = refugees_fraction.reset_index(drop=True)

In [29]:
# Export without the positional row index.
refugees_fraction.to_csv(path_or_buf='../refugee_data_dv/refugees_fraction.csv', index=False)

#### Z-scores

In [30]:
# Standardise every year column (all columns after the two country columns):
# subtract the column mean and divide by the population standard deviation
# (ddof=0). The source frame is left untouched.
refugees_fraction_z = refugees_fraction.copy(deep=True)
for col in refugees_fraction.columns[2:]:
    refugees_fraction_z[col] = (
        refugees_fraction[col]
        .sub(refugees_fraction[col].mean())
        .div(refugees_fraction[col].std(ddof=0))
    )

refugees_fraction_z.to_csv(path_or_buf='../refugee_data_dv/refugees_fraction_z.csv', index=False)

## Youth Literacy age 15-24

In [31]:
# Wide per-country, per-year table for indicator SE.ADT.1524.LT.ZS.
youth_literacy = getSpecificDf(all_df=all_df, indicatorcode='SE.ADT.1524.LT.ZS')

In [32]:
# Export without the positional row index.
youth_literacy.to_csv(path_or_buf='world_bank_data/youth_literacy.csv', index=False)

## Youth Literacy age 15-24

In [33]:
# NOTE(review): same indicator code as the preceding Youth Literacy section,
# so this rebuilds an identical table -- confirm whether this cell is needed.
youth_literacy = getSpecificDf(all_df=all_df, indicatorcode='SE.ADT.1524.LT.ZS')

In [34]:
# NOTE(review): writes to the same path as the preceding Youth Literacy
# section, overwriting that file -- confirm whether this cell is needed.
youth_literacy.to_csv(path_or_buf='world_bank_data/youth_literacy.csv', index=False)

## Under-5 mortality (per 1,000)

In [35]:
# Extract indicator SH.DYN.MORT as a wide table and export it without the row index.
under_5_mortality = getSpecificDf(all_df=all_df, indicatorcode='SH.DYN.MORT')
under_5_mortality.to_csv(path_or_buf='world_bank_data/under_5_mortality.csv', index=False)

## Population, ages 0-14 (% of total)

In [36]:
# Extract indicator SP.POP.0014.TO.ZS as a wide table and export it without the row index.
population_0_14 = getSpecificDf(all_df=all_df, indicatorcode='SP.POP.0014.TO.ZS')
population_0_14.to_csv(path_or_buf='world_bank_data/population_0_14.csv', index=False)

## Population, ages 15-64 (% of total)

In [37]:
# Extract indicator SP.POP.1564.TO.ZS as a wide table and export it without the row index.
population_15_64 = getSpecificDf(all_df=all_df, indicatorcode='SP.POP.1564.TO.ZS')
population_15_64.to_csv(path_or_buf='world_bank_data/population_15_64.csv', index=False)

## Population ages 65 and above (% of total)

In [38]:
# Extract indicator SP.POP.65UP.TO.ZS as a wide table and export it without the row index.
population_65_up = getSpecificDf(all_df=all_df, indicatorcode='SP.POP.65UP.TO.ZS')
population_65_up.to_csv(path_or_buf='world_bank_data/population_65_up.csv', index=False)

## Population, female (% of total)

In [39]:
# Extract indicator SP.POP.TOTL.FE.ZS as a wide table and export it without the row index.
population_female = getSpecificDf(all_df=all_df, indicatorcode='SP.POP.TOTL.FE.ZS')
population_female.to_csv(path_or_buf='world_bank_data/population_female.csv', index=False)

## Population growth (annual %)

In [40]:
# Extract indicator SP.POP.GROW as a wide table and export it without the row index.
population_growth = getSpecificDf(all_df=all_df, indicatorcode='SP.POP.GROW')
population_growth.to_csv(path_or_buf='world_bank_data/population_growth.csv', index=False)

## % of female legislators

In [41]:
# Extract indicator SG.GEN.PARL.ZS as a wide table and export it without the row index.
female_legislators = getSpecificDf(all_df=all_df, indicatorcode='SG.GEN.PARL.ZS')
female_legislators.to_csv(path_or_buf='world_bank_data/female_legislators.csv', index=False)
