In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import dill

# Contents
* [Cleaning Economy](#economy)
* [Cleaning Health](#health)
* [Cleaning Education](#education)
* [Cleaning Development](#development)
* [Cleaning Finance](#finance)
* [Cleaning Environment](#environment)
* [Cleaning Poverty](#poverty)

* [Stacks](#stacks)
* [Train Stacks](#trainstacks)

In [2]:
def unwrap(indicator_name, compiled_df):
    df_index = compiled_df.at['Coverage_Range', indicator_name]
    df_index = string_to_list(df_index)
    df_index = [ind for ind in df_index if len(ind)==4]
    df_index = pd.to_datetime(df_index)
    countries = compiled_df.index
    countries = countries.drop('Coverage_Range')
    data = {}
    for country in countries:
        annual_data = compiled_df.at[country,indicator_name]
        
        if not pd.isnull(annual_data):
            time_series = string_to_list(annual_data)
            if len(time_series)==len(df_index):            
            #print country
                data[country]=time_series
            #print len(time_series), country, len(df_index)
        else:
            data[country]=np.nan
    unwrapping = pd.DataFrame(index=df_index, data=data, dtype=np.dtype(float) )
    return unwrapping
def string_to_list(string):
    list_ = string[1:-1]
    list_ = list_.split(',')
    list_ = [t.replace("'", '').strip() for t in list_]
    return list_
def data_to_list(string):
    list_ = string[1:-1]
    list_ = list_.split(',')
    list_ = [t.replace("'", '').strip() for t in list_]
    list_ = [float(t) for t in list_ if t != 'nan']
    list_ = [np.nan for t in list_ if t == 'nan']
    
    return list_

## Cleaning Economy <a class="anchor" id="economy"></a>

In [164]:
econ = pd.read_csv('Economy.csv', index_col=0)

In [4]:
page = requests.get('https://data.worldbank.org/indicator?tab=all')

In [5]:
soup4 = BeautifulSoup(page.text,'lxml')

In [6]:
def get_names(bsoup, react_id):
    chart = bsoup.find_all('ul',attrs={'data-reactid':'{}'.format(react_id)})
    s_list = chart[0].find_all('li')
    indicators = [attr.text for attr in s_list]
    return indicators

In [7]:
econ_ind = get_names(soup4, '599')

In [8]:
len(econ_ind)

261

In [9]:
econ_ind.remove(u'Portfolio Investment, net (BoP, current US$)')

In [10]:
econ_clean = econ[econ_ind]

In [11]:
econ_clean.shape

(346, 260)

In [12]:
econ_clean.head()

Unnamed: 0,"2005 PPP conversion factor, GDP (LCU per international $)","2005 PPP conversion factor, private consumption (LCU per international $)",Adjusted net national income (annual % growth),Adjusted net national income (constant 2010 US$),Adjusted net national income (current US$),Adjusted net national income per capita (annual % growth),Adjusted net national income per capita (constant 2010 US$),Adjusted net national income per capita (current US$),"Adjusted net savings, excluding particulate emission damage (% of GNI)","Adjusted net savings, excluding particulate emission damage (current US$)",...,Total debt service (% of GNI),"Total debt service (% of exports of goods, services and primary income)","Total reserves (includes gold, current US$)",Total reserves minus gold (current US$),Trade (% of GDP),Trade in services (% of GDP),"Transport services (% of service exports, BoP)","Transport services (% of service imports, BoP)","Travel services (% of service exports, BoP)","Travel services (% of service imports, BoP)"
Coverage_Range,"['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...",...,"['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965...","['1960', '1961', '1962', '1963', '1964', '1965..."
Canada,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[2012986400.1272302, 2296085100.2121396, 25619...","[1112520000.15252, 1345840000.21214, 185200000...","[35.101614925524395, 35.804682056991, 35.57546...","[6.50084104905362, 6.08834437311754, 5.8984837...","[19.4005839685265, 30.5744124947817, 29.964251...","[22.7608314600833, 33.8379871390855, 34.249798...","[37.7990178467193, 45.793325761953895, 48.0335...","[42.236438180705704, 36.6090458003959, 34.1302..."
EMDE ECA South Caucasus,,,,,,,,,,,...,,,,,,,,,,
OECD members,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[23.6192237137114, 23.4471522538317, 23.181608...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
Swaziland,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",...,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[82.618569621112, 76.7906705842924, 93.5812983...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


In [13]:
econ_clean.loc['Coverage_Range'].unique()

array([ "['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']"], dtype=object)

In [14]:
test_ind = unwrap(econ_clean.columns[0], econ_clean)

In [15]:
mean_test = test_ind.mean(axis=1)

In [16]:
econ_dict = {}
for col in econ_clean.columns:
    ind_df = unwrap(col,econ_clean)
    econ_dict[col]=ind_df.mean(axis=1)

In [18]:
econ_index = econ_dict[econ_clean.columns[0]].index

In [19]:
econ_df = pd.DataFrame(data=econ_dict, index=econ_index)

In [20]:
econ_df.dropna(axis=0, how='all', inplace=True)
econ_df.dropna(axis=1, how='all', inplace=True)

In [22]:
econ_df.shape

(57, 260)

In [23]:
econ_df.index

DatetimeIndex(['1960-01-01', '1961-01-01', '1962-01-01', '1963-01-01',
               '1964-01-01', '1965-01-01', '1966-01-01', '1967-01-01',
               '1968-01-01', '1969-01-01', '1970-01-01', '1971-01-01',
               '1972-01-01', '1973-01-01', '1974-01-01', '1975-01-01',
               '1976-01-01', '1977-01-01', '1978-01-01', '1979-01-01',
               '1980-01-01', '1981-01-01', '1982-01-01', '1983-01-01',
               '1984-01-01', '1985-01-01', '1986-01-01', '1987-01-01',
               '1988-01-01', '1989-01-01', '1990-01-01', '1991-01-01',
               '1992-01-01', '1993-01-01', '1994-01-01', '1995-01-01',
               '1996-01-01', '1997-01-01', '1998-01-01', '1999-01-01',
               '2000-01-01', '2001-01-01', '2002-01-01', '2003-01-01',
               '2004-01-01', '2005-01-01', '2006-01-01', '2007-01-01',
               '2008-01-01', '2009-01-01', '2010-01-01', '2011-01-01',
               '2012-01-01', '2013-01-01', '2014-01-01', '2015-01-01',
      

In [25]:
econ_clean.to_csv('ML_econ_clean.csv')

In [26]:
econ_df.to_csv('ML_econ_world__annual_avg.csv')

## Cleaning Health <a class="anchor" id="health"></a>

In [28]:
health = pd.read_csv('Health.csv', index_col=0)

In [40]:
health_ind = get_names(soup4, '2762')

In [44]:
health_ind.remove('Population, ages 0-14, male')
health_ind.remove('Proportion of women subjected to physical and/or sexual violence in the last 12 months ( % of women age 15-49)')
health_ind.remove(u'Total Population for Age 65 and above (only 2005 and 2010) (in number of people)')
health_ind.remove('Population, ages 0-14, total')

In [45]:
health_clean = health[health_ind]

In [46]:
health_clean.shape

(2218, 217)

In [47]:
health_clean.loc['Coverage_Range'].unique()

array([ "['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']"], dtype=object)

In [52]:
def make_clean_df(clean_this_df):
    temp_dict = {}
    for col in clean_this_df.columns:
        ind_df = unwrap(col,clean_this_df)
        temp_dict[col]=ind_df.mean(axis=1)
        
    temp_index = temp_dict[clean_this_df.columns[0]].index
    cleaned_df = pd.DataFrame(data=temp_dict, index=temp_index)
    cleaned_df.dropna(axis=0, how='all', inplace=True)
    cleaned_df.dropna(axis=1, how='all', inplace=True)
    return cleaned_df

In [53]:
health_df = make_clean_df(health_clean)

In [54]:
health_df.head()

Unnamed: 0,ARI treatment (% of children under 5 taken to a health provider),"Adolescent fertility rate (births per 1,000 women ages 15-19)",Adults (ages 15+) and children (ages 0-14) newly infected with HIV,Adults (ages 15+) newly infected with HIV,Age dependency ratio (% of working-age population),"Age dependency ratio, old (% of working-age population)","Age dependency ratio, young (% of working-age population)",Antiretroviral therapy coverage (% of people living with HIV),Antiretroviral therapy coverage for PMTCT (% of pregnant women living with HIV),"Birth rate, crude (per 1,000 people)",...,Vitamin A supplementation coverage rate (% of children ages 6-59 months),Wanted fertility rate (births per woman),Women who believe a husband is justified in beating his wife (any of five reasons) (%),Women who believe a husband is justified in beating his wife when she argues with him (%),Women who believe a husband is justified in beating his wife when she burns the food (%),Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Women who believe a husband is justified in beating his wife when she neglects the children (%),Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Women who were first married by age 18 (% of women ages 20-24),Women's share of population ages 15+ living with HIV (%)
1960-01-01,,101.288734,,,78.929377,8.203312,70.73916,,,38.186931,...,,,,,,,,,,
1961-01-01,,101.265981,,,79.826724,8.28194,71.560053,,,37.870258,...,,,,,,,,,,
1962-01-01,,101.217591,,,80.61804,8.345378,72.288102,,,37.959706,...,,,,,,,,,,
1963-01-01,,100.664081,,,81.268845,8.394246,72.88873,,,37.878483,...,,,,,,,,,,
1964-01-01,,100.11248,,,81.690605,8.429127,73.274599,,,37.42392,...,,,,,,,,,,


In [55]:
health_df.to_csv('ML_health_world__annual_avg.csv')
health_clean.to_csv('ML_health_clean.csv')

## Cleaning Education <a class="anchor" id="education"></a>

In [56]:
edu = pd.read_csv('Education.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [57]:
edu_ind = get_names(soup4, '1128')

In [60]:
edu_ind.remove('Gross enrollment ratio, primary, both sexes (%)')

In [61]:
edu_clean = edu[edu_ind]

In [67]:
edu_clean.loc['Coverage_Range'].unique()[1]

"['1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2020', '2025', '2030', '2035', '2040', '2045', '2050', '2055', '2060', '2065', '2070', '2075', '2080', '2085', '2090', '2095', '2100']"

In [68]:
long_index =edu_clean.loc['Coverage_Range'].unique()[1]

In [88]:
edu_clean.shape

(287, 158)

In [104]:
temp = (edu_clean.loc['Coverage_Range'] == long_index)
temp2 = (edu_clean.loc['Coverage_Range'] != long_index)
long_range = edu_clean.columns[temp]
short_range = edu_clean.columns[temp2]

In [106]:
edu_clean1 = edu_clean[short_range]
edu_clean2 = edu_clean[long_range]

In [107]:
edu_df1 = make_clean_df(edu_clean1)
edu_df2 = make_clean_df(edu_clean2)

In [110]:
edu_df = pd.concat([edu_df1, edu_df2], axis=1)
edu_df.shape

(58, 158)

In [111]:
edu_df.to_csv('ML_edu_world__annual_avg.csv')
edu_clean.to_csv('ML_edu_clean.csv')

## Cleaning Development <a class="anchor" id="development"></a>

In [142]:
dev = pd.read_csv('Development Indicator.csv', index_col=0)

In [145]:
dev_ind =  get_names(soup4, '182')
dev_ind2 =  get_names(soup4, '3209')
dev_ind3 =  get_names(soup4, '3927')
dev_ind4 =  get_names(soup4, '4647')

In [146]:
dev_ind.extend(dev_ind2)
dev_ind.extend(dev_ind3)
dev_ind.extend(dev_ind4)

In [147]:
dev_clean = dev[dev_ind]

In [160]:
def make_clean_df2(clean_this_df, main_df):
    temp_dict = {}
    for col in clean_this_df.columns:
        ind_df = unwrap(col,main_df)
        temp_dict[col]=ind_df.mean(axis=1)
        
    temp_index = temp_dict[clean_this_df.columns[0]].index
    cleaned_df = pd.DataFrame(data=temp_dict, index=temp_index)
    cleaned_df.dropna(axis=0, how='all', inplace=True)
    cleaned_df.dropna(axis=1, how='all', inplace=True)
    return cleaned_df

In [161]:
dev_df = make_clean_df2(dev_clean, dev)

In [236]:
dev_df.to_csv('ML_dev_world__annual_avg.csv')
dev_clean.to_csv('ML_dev_clean.csv')

## Cleaning Finance <a class="anchor" id="finance"></a>

In [162]:
fin = pd.read_csv('Finance.csv', index_col=0)

In [230]:
fin_ind =  get_names(soup4, '2251')

In [231]:
nonsense = """Account (% age 15+)
Account at a financial institution (% age 15+) [ts]
Account at a financial institution, female (% age 15+) [ts]
Account at a financial institution, income, poorest 40% (% ages 15+) [ts]
Account at a financial institution, income, richest 60% (% ages 15+) [ts]
Account at a financial institution, male (% age 15+) [ts]
Account, female (% age 15+)
Account, income, poorest 40% (% ages 15+)
Account, income, richest 60% (% ages 15+)
Account, male (% age 15+)
Account, older adults (% ages 25+)
Account, primary education or less (% ages 15+)
Account, secondary education or more (% ages 15+)
Account, young adults (% ages 15-24)
Mobile account (% age 15+) [w2]
Mobile account, female (% age 15+) [w2]
Mobile account, income, poorest 40% (% ages 15+) [w2]
Mobile account, income, richest 60% (% ages 15+) [w2]
Mobile account, male (% age 15+) [w2]
Portfolio Investment, net (BoP, current US$)
"""

In [232]:
nonsense_list = nonsense.split('\n')

In [233]:
for non in nonsense_list:
    print non
    fin_ind.remove(non)    

Account (% age 15+)
Account at a financial institution (% age 15+) [ts]
Account at a financial institution, female (% age 15+) [ts]
Account at a financial institution, income, poorest 40% (% ages 15+) [ts]
Account at a financial institution, income, richest 60% (% ages 15+) [ts]
Account at a financial institution, male (% age 15+) [ts]
Account, female (% age 15+)
Account, income, poorest 40% (% ages 15+)
Account, income, richest 60% (% ages 15+)
Account, male (% age 15+)
Account, older adults (% ages 25+)
Account, primary education or less (% ages 15+)
Account, secondary education or more (% ages 15+)
Account, young adults (% ages 15-24)
Mobile account (% age 15+) [w2]
Mobile account, female (% age 15+) [w2]
Mobile account, income, poorest 40% (% ages 15+) [w2]
Mobile account, income, richest 60% (% ages 15+) [w2]
Mobile account, male (% age 15+) [w2]
Portfolio Investment, net (BoP, current US$)



ValueError: list.remove(x): x not in list

In [234]:
fin_clean = econ[fin_ind]

In [235]:
fin_df = make_clean_df(fin_clean)

In [237]:
fin_df.to_csv('ML_fin_world__annual_avg.csv')
fin_clean.to_csv('ML_fin_clean.csv')

## Cleaning Environment <a class="anchor" id="environment"></a>

In [238]:
env = pd.read_csv('Development Indicator.csv',index_col=0)

In [239]:
env_ind =  get_names(soup4, '1452')
env_ind2 =  get_names(soup4, '1557')
env_ind3 =  get_names(soup4, '433')

In [259]:
env_ind.remove('Value lost due to electrical outages (% of sales)')

In [260]:
env_clean = env[env_ind]

In [261]:
env_df = make_clean_df(env_clean)

In [262]:
env_df.to_csv('ML_env_world__annual_avg.csv')
env_clean.to_csv('ML_env_clean.csv')

## Cleaning Poverty <a class="anchor" id="poverty"></a>

In [244]:
poverty = pd.read_csv('Poverty.csv',index_col=0)

In [245]:
pov_ind = get_names(soup4,'3319')

In [246]:
pov_clean = poverty[pov_ind]

In [247]:
pov_df = make_clean_df(pov_clean)

In [248]:
pov_df.to_csv('ML_pov_world__annual_avg.csv')
pov_clean.to_csv('ML_pov_clean.csv')

# Making stacks  <a class="anchor" id="stacks"></a>

In [2]:
def string_to_list(string):
    list_ = string[1:-1]
    list_ = list_.split(',')
    list_ = [t.replace("'", '').strip() for t in list_]
    return list_

def country_unwrap_no_index(country, compiled_df):
    df_index = compiled_df.loc['Coverage_Range'].unique()[0]
    df_index = string_to_list(df_index)
    country_df = compiled_df.loc[country]
    data = {}
    for indicator, values in country_df.iteritems():
        if not pd.isnull(values):            
            time_series = string_to_list(values)
            if len(time_series)==len(df_index):
                data[indicator]=time_series
        else:
            data[indicator]=[np.nan]*len(df_index)
    unwrapping = pd.DataFrame(data=data, dtype=np.dtype(float) )
    unwrapping.interpolate()
    return unwrapping

In [3]:
def get_stack(cleaned_df):
    df_list = []
    country_list = cleaned_df.index.tolist()
    country_list.remove('Coverage_Range')
    for country in country_list:
        #print country
        temp_df = country_unwrap_no_index(country, cleaned_df)
        df_list.append(temp_df)
    stack_df = pd.concat(df_list).reset_index()
    stack_df.drop(columns=['index'], inplace=True)
    return stack_df

In [4]:
econ_clean = pd.read_csv('ML_data/ML_econ_clean.csv', index_col=0)
fin_clean =pd.read_csv('ML_data/ML_fin_clean.csv', index_col=0)

In [15]:
econ_clean = pd.read_csv('ML_data/ML_econ_clean.csv', index_col=0)
fin_clean =pd.read_csv('ML_data/ML_fin_clean.csv', index_col=0)
health_clean =pd.read_csv('ML_data/ML_health_clean.csv', index_col=0)
dev_clean =pd.read_csv('ML_data/ML_dev_clean.csv', index_col=0)
env_clean =pd.read_csv('ML_data/ML_env_clean.csv', index_col=0)
edu_clean =pd.read_csv('ML_data/ML_edu_clean.csv', index_col=0)
pov_clean =pd.read_csv('ML_data/ML_pov_clean.csv', index_col=0)

In [6]:
fin_clean.shape

(346, 65)

In [7]:
econ_clean.shape

(346, 260)

In [16]:
econ_fin_dev_combo = pd.concat([econ_clean, fin_clean, dev_clean], axis=1)

In [17]:
econ_fin_dev_combo.shape

(346, 460)

In [18]:
econ_fin_dev_combo.dropna(axis=0, how='all', inplace=True)
econ_fin_dev_combo.dropna(axis=1, how='all', inplace=True)
econfindev_stack = get_stack(econ_fin_dev_combo)

Afghanistan
Albania
Algeria
American Samoa
Andorra
Angola
Antigua and Barbuda
Arab World
Argentina
Armenia
Aruba
Australia
Austria
Azerbaijan
Bahamas, The
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bermuda
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
British Virgin Islands
Brunei Darussalam
Bulgaria
Burkina Faso
Burundi
Cabo Verde
Cambodia
Cameroon
Canada
Cape Verde
Caribbean small states
Cayman Islands
Central African Republic
Central Europe and the Baltics
Chad
Channel Islands
Chile
China
Colombia
Comoros
Congo, Dem. Rep.
Congo, Rep.
Costa Rica
Cote d'Ivoire
Croatia
Cuba
Curacao
Cyprus
Czech Republic
Denmark
Djibouti
Dominica
Dominican Republic
Early-demographic dividend
East Asia & Pacific
East Asia & Pacific (IBRD only)
East Asia & Pacific (IDA & IBRD countries)
East Asia & Pacific (IDA & IBRD)
East Asia & Pacific (IDA total)
East Asia & Pacific (developing only)
East Asia & Pacific (excluding high income)
Ecuador
Egypt, Arab Rep.
El Salvador
Equatorial Guine

In [19]:
econfindev_stack.shape

(16994, 439)

In [20]:
econfindev_stack.dropna(axis=0, how='all', inplace=True)
econfindev_stack.dropna(axis=1, how='all', inplace=True)
econfindev_stack.shape

(15250, 439)

In [21]:
econfindev_stack.to_csv('ML_econfindev_stack.csv')

In [None]:
health_econ_dev_combo = pd.concat([econ_clean, health, dev_clean], axis=1)

In [421]:
health_clean.dropna(axis=0, how='all', inplace=True)
health_clean.dropna(axis=1, how='all', inplace=True)
health_stack = get_stack(health_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Burundi, Rutana
Panama, Herrera
Afghanistan, Nangarhar
Lao PDR, Phongsali
Romania, Bihor
Bulgaria, Sofia
India, Andhra Pradesh
Croatia, Primorje-gorski Kota
Nigeria, Akwa Ibom
Dominican Republic, San Jose de Ocoa
Iraq, Baghdad
East Asia & Pacific (IDA & IBRD countries)
Hungary, Hajdu-bihar
Russian Federation, Komi Rep.
Poland, Podkarpackie
Albania, Kukes
Guinea-Bissau, Bafata
Trinidad and Tobago, Sangre Grande
Croatia, Dubrovnik-neretva
Turkey, Hatay
Algeria, Saida
Algeria, Bordj Bou Arrer
Honduras, Francisco Morazan
Moldova, Rezina
Vietnam, Binh Thuan
Middle East & North Africa (IBRD only)
Madagascar, Haute Matsiatra
Honduras, Copan
Algeria, Tlemcen
Azerbaijan, Absheron
Iraq, Dahuk
Tunisia, Le Kef
Burundi, Cibitoke
Korea, Rep., Chungchongnam-do
Cuba, Guantanamo
Venezuela, RB, Barinas
Lesotho, Berea
Russian Federation, Tverskaya Oblast
China, Anhui Sheng
Turkey, Giresun
Algeria, Tindouf
Romania, Galati
India, Tripura
Peru, Cajamarca
Turkey, Osmaniye
Dominican Republic, La Vega
Indonesi

Nigeria, Anambra
China, Nei Mongol Zizhiqu
Thailand, Ranong
Colombia, Vaupes
Samoa
New Zealand
Turkey, Batman
El Salvador, La Paz
Iraq, Qadissiya
Bolivia, Pando
Guatemala, Sacatepequez
Korea, Dem. People’s Rep.
Kazakhstan, Mangistauskaya
Ethiopia, Amhara
China, Yunnan Sheng
Thailand, Singburi
Cameroon, Ouest
Iran, Islamic Rep., Kermanshah
Albania, Lezhe
Sri Lanka, North Western
Bangladesh
Kazakhstan, Akmolinskaya
Ecuador, Carchi
Ecuador, Esmeraldas
Iran, Islamic Rep., Ardebil
East Asia & Pacific (IDA total)
Turkey, Siirt
Cambodia, Preah Sihanouk
Russian Federation, Leningradskaya Oblast
Tunisia, Monastir
Cuba
Romania, Constanta
Brazil, Santa Catarina
Slovenia, Pomurska
Armenia
Nigeria, Ekiti
Romania, Caras-severin
Kenya, North Eastern
Moldova, Nisporeni
Tajikistan, Sogd
Burundi, Kayanza
North America
Sweden
British Virgin Islands
Europe & Central Asia (IDA & IBRD)
Zambia, Copperbelt
Honduras, Lempira
Nigeria, Nassarawa
Egypt, Arab Rep.
Vietnam, Thai Binh
Afghanistan, Ghazni
Cambodia, K

Nicaragua, Atlantico Sur
Tanzania, Kilimanjaro
Algeria, Oran
Tajikistan, Regions of Republican Subordination (RRS)
Middle East & North Africa (IDA & IBRD countries)
Mozambique, Zambezia
Colombia, Antioquia
Romania, Dolj
Mexico, Nuevo Leon
Cameroon, Littoral
Nigeria, Katsina
St. Lucia
Russian Federation, Bryanskaya Oblast
Russian Federation, Kostromskaya Oblast
Dominican Republic, Independencia
Croatia, Krapina-zagorje
Thailand, Surin
India, Uttarakhand
Lao PDR, Louangphabang
Moldova, Hincesti
Turkey, Tekirdag
Guatemala, El Progreso
Madagascar, Vatovavy Fitovinany
Tunisia, Nabeul
Nigeria, Bayelsa
Peru, Ayacucho
Moldova, Basarabeasca
Egypt, Arab Rep., Giza
Jordan, Madaba
India, Jammu/Kashmir
Guinea, Kankan
Russian Federation, Rostovskaya Oblast
Turkey, Denizli
Vietnam, Lam Dong
China, Jiangxi Sheng
Hong Kong SAR, China
Venezuela, RB, Anzoategui
Moldova, Floresti
Russian Federation, Kalmykiya Rep.
Congo, Dem. Rep., Nord-Kivu
Jamaica, Westmoreland
Iran, Islamic Rep., Hormozgan
Hungary, Fej

Jamaica, Hanover
Brazil, Paraiba
Turkey, Yalova
Nigeria, Lagos
Hungary, Gyor-moson-sopron
Egypt, Arab Rep., Ismailia
South Asia (IDA & IBRD)
Burkina Faso, Sahel
Uzbekistan, Jizzakh
American Samoa
Niger, Maradi
Madagascar, Sava
Argentina, Misiones
Timor-Leste, Lautem
Uzbekistan, Surkhandarya
Jamaica, Saint Mary
Tanzania, Mwanza
Egypt, Arab Rep., Gharbia
Cabo Verde
Armenia, Vayots Dzor
Chile, Arica y Painacota
Zambia, Northern
Yemen, Rep., Aden
IDA total
Thailand, Phatthalung
Cote d'Ivoire, Bas Sassandra
United Arab Emirates
Mexico, Chihuahua
Lebanon, Nabatiye
Algeria, Djelfa
Ethiopia, SNNPR
Czech Republic, Olomouc
Czech Republic, Zlin
Colombia, Valle Del Cauca
Bulgaria, Shumen
Solomon Islands
Uzbekistan, Sirdarya
Iran, Islamic Rep., Tehran
Vietnam, Bac Lieu
Timor-Leste, Dili
Gambia, The, Kuntaur
Venezuela, RB, Monagas
Bangladesh, Rajshahi
Estonia, Voru
Sudan, Red Sea
Tanzania, Rukwa
Panama, Darien
Ukraine
Russian Federation, Belgorodskaya Oblast
Tonga
Korea, Rep., Kyonggi-do
Thailand, P

Algeria, Medea
Turkey, Artvin
New Caledonia
Romania, Hunedoara
Indonesia, Sulawesi Utara, Prop.
Niger, Agadez
Benin, Couffo
Honduras, Valle
Moldova, Cantemir
Kazakhstan, Western
Vietnam, Binh Dinh
Tunisia
Azerbaijan, Nakhchivan
Kyrgyz Republic, Bishkek City
Gambia, The
Kazakhstan, Almaty City area
Romania, Dimbovita
Guinea-Bissau, Tombali
Ghana, Western
Timor-Leste, Ainaro
Myanmar, Kachin
Turkey, Adana
El Salvador, La Union
OECD members
Philippines, Davao Region
Tanzania, Kagera
Montenegro, Zabljak
Vietnam, Hai Duong
Burkina Faso, Centre
Algeria, Tissemsilt
Nicaragua, Carazo
Iran, Islamic Rep., North Khorassan
Turkey, Van
Colombia, Bogota
Croatia, Virovitica-podravina
Algeria, Souk-Ahras
Chile, Araucania
China, Sichuan Sheng
Montenegro, Mojkovac
Turkey, Karaman
Tanzania, Mara
Argentina, Jujuy
Upper middle income
Georgia, Kvemo Kartli
Turkey
Liberia, Grand Cape Mount
Indonesia, Jawa Barat, Prop.
Burkina Faso, Hauts-bassins
Cameroon, Nord-Ouest
Uzbekistan, Khorezm
Syrian Arab Republic, T

Montenegro, Kolasin
Guatemala, Santa Rosa
Honduras, Ocotepeque
Lao PDR, Xekong
South Sudan, Upper Nile
Tunisia, Mahdia
Guinea, Faranah
Thailand, Nakhon Pathom
Tanzania, Shinyanga
Mauritania, Guidimakha
Ecuador, Zona No Delimitada
Mongolia, To'v
Myanmar, Ayeyawaddy
Ecuador, Cotopaxi
Tunisia, Siliana
Thailand, Phuket
Panama, Embera
Bhutan, Haa
Romania, Gorj
Yemen, Rep., Al Mahwit
Slovak Republic
Aruba
Malawi, Northern Region
Argentina
Syrian Arab Republic, As-Suweida
Bahrain
Guinea, Kindia
Brazil, Rio Grande Do Norte
Iran, Islamic Rep., Gilan
Mongolia, Xentii
Cuba, Camaguey
Turkey, Bolu
Korea, Rep., Kyongsangnam-do
Syrian Arab Republic, Damascus
Burkina Faso, Sud-ouest
Vietnam, Ca Mau
South Asia (IDA total)
India, Meghalaya
Cuba, Sancti Spiritus
Iraq, Erbil
Syrian Arab Republic, Dayr-Az-Zor
West Bank and Gaza
Tunisia, Sidi Bouz
Turkey, Diyarbakir
Armenia, Gergharkunik
Venezuela, RB, Tachira
Ecuador, Los Rios
Timor-Leste, Bobonaro
Guam
Azerbaijan
Nigeria, Kogi
Georgia, Imereti
Slovak Repu

(128586, 217)

In [422]:
health_stack.shape

(128586, 217)

In [423]:
health_stack.dropna(axis=0, how='all', inplace=True)
health_stack.dropna(axis=1, how='all', inplace=True)
health_stack.shape

(15207, 217)

In [424]:
health_stack.to_csv('ML_health_stack.csv')

In [427]:
edu_clean.dropna(axis=0, how='all', inplace=True)
edu_clean.dropna(axis=1, how='all', inplace=True)
edu_stack = get_stack(edu_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Canada
Sao Tome and Principe
Turkmenistan
Lao PDR
Arab World
Lithuania
Cambodia
Switzerland
Ethiopia
OECD members
Swaziland
South Asia
Argentina
Bolivia
Bahamas, The
Burkina Faso
Bahrain
Saudi Arabia
Lebanon
South Asia (IDA & IBRD)
Japan
Channel Islands
American Samoa
Northern Mariana Islands
Slovenia
Guatemala
IDA total
Bosnia and Herzegovina
Guinea
Russian Federation
World
St. Lucia
Congo, Rep.
Dominica
Liberia
South Asia (IDA total)
Maldives
St. Martin (French part)
Pakistan
Oman
Tanzania
Early-demographic dividend
Cabo Verde
Mauritania
Greenland
Gabon
Monaco
New Zealand
Spain
European Union
Jamaica
Albania
Samoa
Korea, Dem. People’s Rep.
Slovak Republic
Kazakhstan
Guam
Uruguay
India
Azerbaijan
Lesotho
Middle East & North Africa
Europe & Central Asia (IDA & IBRD countries)
United Arab Emirates
Latin America & Caribbean
Aruba
Upper middle income
Tajikistan
Pacific island small states
Turkey
Afghanistan
Venezuela, RB
Bangladesh
East Asia & Pacific
Solomon Islands
Korea, Rep.
Palau
San

In [428]:
edu_stack.shape

(16588, 158)

In [429]:
edu_stack.dropna(axis=0, how='all', inplace=True)
edu_stack.dropna(axis=1, how='all', inplace=True)
edu_stack.shape

(14638, 112)

In [430]:
edu_stack.to_csv('ML_edu_stack.csv')

In [431]:
dev_clean.dropna(axis=0, how='all', inplace=True)
dev_clean.dropna(axis=1, how='all', inplace=True)
dev_stack = get_stack(dev_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Canada
Sao Tome and Principe
Turkmenistan
Lao PDR
Arab World
Lithuania
Cambodia
Switzerland
Ethiopia
OECD members
Swaziland
South Asia
Argentina
Bolivia
Bahamas, The
Burkina Faso
Bahrain
Saudi Arabia
Lebanon
South Asia (IDA & IBRD)
Japan
Channel Islands
American Samoa
Northern Mariana Islands
Slovenia
Guatemala
IDA total
Bosnia and Herzegovina
Guinea
Russian Federation
World
St. Lucia
Congo, Rep.
Dominica
Liberia
South Asia (IDA total)
Maldives
St. Martin (French part)
Pakistan
Oman
Tanzania
Early-demographic dividend
Cabo Verde
Mauritania
Greenland
Gabon
Monaco
New Zealand
Spain
European Union
Jamaica
Albania
Samoa
Korea, Dem. People’s Rep.
Slovak Republic
Kazakhstan
Guam
Uruguay
India
Azerbaijan
Lesotho
Middle East & North Africa
Europe & Central Asia (IDA & IBRD countries)
United Arab Emirates
Latin America & Caribbean
Aruba
Upper middle income
Tajikistan
Pacific island small states
Turkey
Afghanistan
Venezuela, RB
Bangladesh
East Asia & Pacific
Solomon Islands
Korea, Rep.
Palau
San

In [432]:
dev_stack.shape

(16588, 132)

In [433]:
dev_stack.dropna(axis=0, how='all', inplace=True)
dev_stack.dropna(axis=1, how='all', inplace=True)
dev_stack.shape

(15250, 132)

In [437]:
dev_stack.to_csv('ML_dev_stack.csv')

In [434]:
fin_clean.dropna(axis=0, how='all', inplace=True)
fin_clean.dropna(axis=1, how='all', inplace=True)
fin_stack = get_stack(fin_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Canada
OECD members
Swaziland
Cameroon
Burkina Faso
East Asia & Pacific (IDA & IBRD countries)
Sub-Saharan Africa
Russian Federation
Dominica
Early-demographic dividend
Middle East & North Africa (IBRD only)
Madagascar
Yemen, Rep.
Upper middle income
Tajikistan
Turkey
Nauru
Norway
Montenegro
Indonesia
Libya
Finland
Central African Republic
Liechtenstein
Micronesia, Fed. Sts.
United States
Portugal
Fiji
Kyrgyz Republic
Kuwait
Latin America & Caribbean (IBRD only)
Costa Rica
Nigeria
Ecuador
Australia
Tuvalu
IDA only
South Asia (IBRD only)
East Asia & Pacific (developing only)
Denmark
Post-demographic dividend
Morocco
Estonia
Kosovo
Lebanon
Colombia
Latin America & Caribbean (IDA total)
Palau
Nepal
Netherlands
Suriname
Middle East & North Africa (developing only)
Jordan
Eritrea
Not classified
IBRD only
Equatorial Guinea
Serbia
Greece
Heavily indebted poor countries (HIPC)
Sao Tome and Principe
Arab World
Bolivia
Ghana
Saudi Arabia
South Asia (IDA & IBRD)
American Samoa
Guatemala
World
Spa

In [435]:
fin_stack.shape

(16994, 65)

In [436]:
fin_stack.dropna(axis=0, how='all', inplace=True)
fin_stack.dropna(axis=1, how='all', inplace=True)
fin_stack.shape

(14396, 65)

In [438]:
fin_stack.to_csv('ML_fin_stack.csv')

In [439]:
env_clean.dropna(axis=0, how='all', inplace=True)
env_clean.dropna(axis=1, how='all', inplace=True)
env_stack = get_stack(env_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Canada
Sao Tome and Principe
Turkmenistan
Lao PDR
Arab World
Lithuania
Cambodia
Switzerland
Ethiopia
OECD members
Swaziland
South Asia
Argentina
Bolivia
Bahamas, The
Burkina Faso
Bahrain
Saudi Arabia
Lebanon
South Asia (IDA & IBRD)
Japan
Channel Islands
American Samoa
Northern Mariana Islands
Slovenia
Guatemala
IDA total
Bosnia and Herzegovina
Guinea
Russian Federation
World
St. Lucia
Congo, Rep.
Dominica
Liberia
South Asia (IDA total)
Maldives
St. Martin (French part)
Pakistan
Oman
Tanzania
Early-demographic dividend
Cabo Verde
Mauritania
Greenland
Gabon
Monaco
New Zealand
Spain
European Union
Jamaica
Albania
Samoa
Korea, Dem. People’s Rep.
Slovak Republic
Kazakhstan
Guam
Uruguay
India
Azerbaijan
Lesotho
Middle East & North Africa
Europe & Central Asia (IDA & IBRD countries)
United Arab Emirates
Latin America & Caribbean
Aruba
Upper middle income
Tajikistan
Pacific island small states
Turkey
Afghanistan
Venezuela, RB
Bangladesh
East Asia & Pacific
Solomon Islands
Korea, Rep.
Palau
San

In [440]:
env_stack.shape

(16588, 49)

In [441]:
env_stack.dropna(axis=0, how='all', inplace=True)
env_stack.dropna(axis=1, how='all', inplace=True)
env_stack.shape

(14669, 49)

In [442]:
env_stack.to_csv('ML_env_stack.csv')

In [443]:
pov_clean.dropna(axis=0, how='all', inplace=True)
pov_clean.dropna(axis=1, how='all', inplace=True)
pov_stack = get_stack(pov_clean)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Canada
Sao Tome and Principe
Turkmenistan
Lao PDR
Arab World
Lithuania
Cambodia
Switzerland
Ethiopia
OECD members
Swaziland
South Asia
Argentina
Bolivia
Bahamas, The
Burkina Faso
Bahrain
Saudi Arabia
Lebanon
South Asia (IDA & IBRD)
Japan
Channel Islands
American Samoa
Northern Mariana Islands
Slovenia
Guatemala
IDA total
Bosnia and Herzegovina
Guinea
Russian Federation
World
St. Lucia
Congo, Rep.
Dominica
Liberia
South Asia (IDA total)
Maldives
St. Martin (French part)
Pakistan
Oman
Tanzania
Early-demographic dividend
Cabo Verde
Mauritania
Greenland
Gabon
Monaco
New Zealand
Spain
European Union
Jamaica
Albania
Samoa
Korea, Dem. People’s Rep.
Slovak Republic
Kazakhstan
Guam
Uruguay
India
Azerbaijan
Lesotho
Middle East & North Africa
Europe & Central Asia (IDA & IBRD countries)
United Arab Emirates
Latin America & Caribbean
Aruba
Upper middle income
Tajikistan
Pacific island small states
Turkey
Afghanistan
Venezuela, RB
Bangladesh
East Asia & Pacific
Solomon Islands
Korea, Rep.
Palau
San

In [444]:
pov_stack.shape

(16588, 25)

In [445]:
pov_stack.dropna(axis=0, how='all', inplace=True)
pov_stack.dropna(axis=1, how='all', inplace=True)
pov_stack.shape

(2116, 25)

In [447]:
pov_stack.to_csv('ML_pov_stack.csv')

# Train Stacks <a class="anchor" id="trainstacks"></a>

In [4]:
econ_clean = pd.read_csv('ML_data/ML_econ_clean.csv', index_col=0)
fin_clean = pd.read_csv('ML_data/ML_fin_clean.csv', index_col=0)
health_clean = pd.read_csv('ML_data/ML_health_clean.csv', index_col=0)
dev_clean = pd.read_csv('ML_data/ML_dev_clean.csv', index_col=0)
env_clean = pd.read_csv('ML_data/ML_env_clean.csv', index_col=0)
edu_clean = pd.read_csv('ML_data/ML_edu_clean.csv', index_col=0)
pov_clean = pd.read_csv('ML_data/ML_pov_clean.csv', index_col=0)

In [7]:
fin_clean.columns

Index([u'Automated teller machines (ATMs) (per 100,000 adults)',
       u'Bank capital to assets ratio (%)',
       u'Bank liquid reserves to bank assets ratio (%)',
       u'Bank nonperforming loans to total gross loans (%)',
       u'Borrowers from commercial banks (per 1,000 adults)',
       u'Broad money (% of GDP)', u'Broad money (current LCU)',
       u'Broad money growth (annual %)',
       u'Broad money to total reserves ratio',
       u'Claims on central government (annual growth as % of broad money)',
       u'Claims on central government, etc. (% GDP)',
       u'Claims on other sectors of the domestic economy (% of GDP)',
       u'Claims on other sectors of the domestic economy (annual growth as % of broad money)',
       u'Claims on private sector (annual growth as % of broad money)',
       u'Commercial bank branches (per 100,000 adults)',
       u'Consumer price index (2010 = 100)',
       u'DEC alternative conversion factor (LCU per US$)',
       u'Deposit interest rate 

In [25]:
train_econ = pd.concat([econ_clean, fin_clean, dev_clean, pov_clean], axis=1)

In [26]:
train_econ.dropna(axis=0, how='all', inplace=True)
train_econ.dropna(axis=1, how='all', inplace=True)
train_econ_stack = get_stack(train_econ)
train_econ_stack.shape

(16994, 459)

In [27]:
train_econ_stack.dropna(axis=0, how='all', inplace=True)
train_econ_stack.dropna(axis=1, how='all', inplace=True)
train_econ_stack.shape

(15250, 459)

In [28]:
train_econ_stack.to_csv('ML_train_econ_stack.csv')

In [None]:
train_econ_stack = make_train_stack([health_clean, fin_clean, dev_clean, pov_clean)

In [5]:
def make_train_stack(clean_list):
    train_this = pd.concat(clean_list, axis=1)
    print 'step 1'
    train_this.dropna(axis=0, how='all', inplace=True)
    train_this.dropna(axis=1, how='all', inplace=True)
    print 'step 2'
    train_stack = get_stack(train_this)
    print train_stack.shape
    train_stack.dropna(axis=0, how='all', inplace=True)
    train_stack.dropna(axis=1, how='all', inplace=True)
    print train_stack.shape
    
    return train_stack

In [33]:
train_health_stack = make_train_stack([health_clean, econ_clean, dev_clean])

step 1
step 2
(128586, 600)
(15254, 600)


In [34]:
train_health_stack.to_csv('ML_train_health_stack.csv')

In [None]:
1+3

In [35]:
train_edu_stack = make_train_stack([edu_clean, dev_clean, econ_clean])
train_edu_stack.to_csv('ML_train_edu_stack.csv')

step 1
step 2
(16588, 547)
(15250, 501)


In [36]:
train_dev_stack = make_train_stack([dev_clean, econ_clean])
train_dev_stack.to_csv('ML_train_dev_stack.csv')

step 1
step 2
(16588, 389)
(15250, 389)


In [37]:
train_fin_stack = make_train_stack([fin_clean, dev_clean, econ_clean])
train_fin_stack.to_csv('ML_train_fin_stack.csv')

step 1
step 2
(16994, 439)
(15250, 439)


In [38]:
train_pov_stack = make_train_stack([pov_clean, dev_clean, econ_clean])
train_pov_stack.to_csv('ML_train_pov_stack.csv')

step 1
step 2
(16588, 409)
(15250, 409)


In [39]:
train_env_stack = make_train_stack([env_clean, dev_clean])
train_env_stack.to_csv('ML_train_env_stack.csv')

step 1
step 2
(16588, 172)
(15250, 172)
