# World Bank Data

## Data source: the World Development Indicators dataset on [Kaggle](https://www.kaggle.com/worldbank/world-development-indicators)

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [2]:
def readData(filename):
    """Read the CSV file at ``filename`` and return it as a pandas DataFrame.

    The file is opened in text mode with the platform default encoding and
    is closed again before the function returns.
    """
    with open(filename) as csv_file:
        return pd.read_csv(csv_file)

In [3]:
# Load the complete indicators table; the CSV path is relative to the
# current working directory.
all_df = readData('world-development-indicators/indicators.csv')

In [4]:
def getSpecificDf(all_df, indicatorcode):
    """Pivot one indicator from long format into a country-by-year table.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Long-format table containing at least the columns ``CountryCode``,
        ``CountryName``, ``IndicatorCode``, ``Year`` and ``Value``.
    indicatorcode : str
        The ``IndicatorCode`` value whose rows should be selected.

    Returns
    -------
    pandas.DataFrame
        One row per (``CountryCode``, ``CountryName``) pair, sorted by those
        keys. The first two columns are ``CountryCode`` and ``CountryName``;
        the remaining columns are labelled by year, in ascending order, and
        hold the indicator value (duplicate country/year rows are summed).
        A country/year combination without any observation is NaN, and a
        year with no observation for any country has no column. If no row
        matches ``indicatorcode`` the result is an empty frame with only the
        two identifier columns.
    """
    id_cols = ['CountryCode', 'CountryName']
    selected = all_df.loc[all_df.IndicatorCode == indicatorcode,
                          id_cols + ['Year', 'Value']]
    if selected.empty:
        return pd.DataFrame(columns=id_cols)

    # min_count=1 keeps a group NaN when it has no valid value; the default
    # (min_count=0) would report missing data as 0.0 and make the all-NaN
    # column drop below a no-op.
    wide = (
        selected.groupby(id_cols + ['Year'])['Value']
        .sum(min_count=1)
        .unstack('Year')
    )
    wide = wide.dropna(axis=1, how='all')
    # Remove the 'Year' label from the column axis so the flattened frame has
    # plain column labels.
    wide.columns.name = None

    return wide.reset_index()

## Life Expectancy

In [5]:
# Country-by-year table for indicator code 'SP.DYN.LE00.IN'
# (life expectancy, per the section heading).
life_expectancy = getSpecificDf(all_df, 'SP.DYN.LE00.IN')

In [6]:
# Export without the row index.
# NOTE(review): this is the only export written under 'world_bank_data/';
# the other exports in this notebook go to '../refugee_data_dv/' - confirm
# the target directory is intended.
life_expectancy.to_csv('world_bank_data/life_expectancy.csv', index=False)

## Population Size

In [7]:
# Country-by-year table for indicator code 'SP.POP.TOTL'
# (population size, per the section heading).
population_total = getSpecificDf(all_df, 'SP.POP.TOTL')

In [8]:
# Export the population table without the row index.
population_total.to_csv('../refugee_data_dv/population_size.csv', index=False)

## Refugees by origin

In [9]:
# Country-by-year table for indicator code 'SM.POP.REFG.OR'
# (refugees by origin, per the section heading).
refugees_by_origin = getSpecificDf(all_df, 'SM.POP.REFG.OR')

In [10]:
# Export the refugees-by-origin table without the row index.
refugees_by_origin.to_csv('../refugee_data_dv/refugees_by_origin.csv', index=False)

### Refugees as percent of population

In [11]:
# Index the population table by CountryCode so the division further down
# aligns rows by country rather than by position.
population_total2 = population_total.set_index('CountryCode')

In [12]:
# Index the refugee table by CountryCode so the division further down
# aligns rows by country rather than by position.
refugees_by_origin2 = refugees_by_origin.set_index('CountryCode')

In [13]:
# Lookup of the first two columns (CountryCode, CountryName), indexed by
# CountryCode while still keeping CountryCode as a regular column.
countries = refugees_by_origin.iloc[:, :2].set_index('CountryCode', drop=False)

In [14]:
# Refugees divided by population, aligned on CountryCode (rows) and year
# (columns). The leading CountryName column is skipped on both sides, then
# years and countries without a single defined ratio are removed.
refugees_fraction = (
    refugees_by_origin2.iloc[:, 1:]
    .divide(population_total2.iloc[:, 1:])
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='all')
)

In [15]:
# Put the CountryCode/CountryName columns back in front of the ratios.
# Both frames are indexed by CountryCode; the outer join keeps every
# country in `countries`, so countries whose ratio row was dropped
# reappear with NaN in all year columns.
refugees_fraction = pd.concat([countries, refugees_fraction], axis=1, join='outer')

In [16]:
# Discard the CountryCode index (the CountryCode column itself remains) and
# renumber the rows from 0, modifying the frame in place.
refugees_fraction.reset_index(drop=True, inplace=True)

In [18]:
# Export the refugees-per-population table without the row index.
refugees_fraction.to_csv('../refugee_data_dv/refugees_fraction.csv', index=False)

#### Z-scores

In [20]:
# Standardise every column after the two identifier columns to z-scores:
# subtract that column's mean and divide by its population standard
# deviation (ddof=0). The result is written to an independent copy so
# `refugees_fraction` itself is left untouched.
refugees_fraction_z = deepcopy(refugees_fraction)
for col in refugees_fraction.columns[2:]:
    yearly = refugees_fraction[col]
    refugees_fraction_z[col] = (yearly - yearly.mean()) / yearly.std(ddof=0)

refugees_fraction_z.to_csv('../refugee_data_dv/refugees_fraction_z.csv', index=False)