In [50]:
import pandas as pd
import numpy as np

from sklearn import preprocessing as pp
from fancyimpute import MICE, SoftImpute, KNN, MatrixFactorization, BiScaler
%pylab inline

df = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/finalvars.csv')

# Supervised Learning Modules
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


Populating the interactive namespace from numpy and matplotlib


# Data Wrangling

In [51]:
#count non-NaN rows in each column
df.count()

Unnamed: 0     3945
country        3945
life           3699
bc              639
matdeath       3435
teen           3600
gdp            3644
healthspend    3523
eduspend       2100
dtype: int64

In [52]:
#Drop all rows with NaN values and count how many rows are left
dfdrop = df.dropna()
dfdrop.bc.count()

368

In [29]:
#split data into "Causes" and "Effects" of Contraceptive Use
dfcause = df[['bc', 'gdp', 'healthspend', 'eduspend']]
dfeffect = df[['country', 'bc', 'life', 'matdeath', 'teen']]

#count non-NaN values in "Cause" df
dfcause.dropna().count()

bc             370
gdp            370
healthspend    370
eduspend       370
dtype: int64

In [53]:
#Count non-NaN rows in "Effect" df
dfeffect.dropna().count()

country     624
bc          624
life        624
matdeath    624
teen        624
dtype: int64

In [54]:
#save "Effect" df without NaN rows because there are very few rows with NaNs
effect = dfeffect.dropna()

## Implementing multiple algorithms from the Fancy Impute package to see which works best

In [39]:
#use MICE algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
micefilled = MICE().complete(dfcause)
#turn micefilled nmpy array into pandas df
mice = pd.DataFrame(micefilled, columns= dfcause.columns)

[MICE] Completing matrix with shape (3945, 4)
[MICE] Starting imputation round 1/110, elapsed time 0.001
[MICE] Starting imputation round 2/110, elapsed time 0.014
[MICE] Starting imputation round 3/110, elapsed time 0.022
[MICE] Starting imputation round 4/110, elapsed time 0.028
[MICE] Starting imputation round 5/110, elapsed time 0.034
[MICE] Starting imputation round 6/110, elapsed time 0.040
[MICE] Starting imputation round 7/110, elapsed time 0.047
[MICE] Starting imputation round 8/110, elapsed time 0.053
[MICE] Starting imputation round 9/110, elapsed time 0.059
[MICE] Starting imputation round 10/110, elapsed time 0.065
[MICE] Starting imputation round 11/110, elapsed time 0.098
[MICE] Starting imputation round 12/110, elapsed time 0.106
[MICE] Starting imputation round 13/110, elapsed time 0.125
[MICE] Starting imputation round 14/110, elapsed time 0.152
[MICE] Starting imputation round 15/110, elapsed time 0.163
[MICE] Starting imputation round 16/110, elapsed time 0.175
[MI

In [40]:
mice

Unnamed: 0,bc,gdp,healthspend,eduspend
0,4.900000,-10756.178100,2.551312,4.130685
1,44.072898,119.899037,3.065070,4.442978
2,40.198648,192.153528,0.550723,4.161138
3,10.300000,203.651041,0.607900,4.204472
4,43.313948,224.914712,0.969459,4.133318
5,13.600000,257.175795,0.801919,4.463184
6,18.600000,280.245644,1.062420,4.438390
7,43.618894,380.400955,0.522136,4.454933
8,22.800000,384.131681,1.385584,4.612008
9,40.504821,458.955782,2.586127,4.465732


In [42]:
mice.to_csv('mice.csv')

In [45]:
cause = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/mice.csv')

In [46]:
cause.head()

Unnamed: 0.1,Unnamed: 0,bc,gdp,healthspend,eduspend
0,0,4.9,119.0,2.551312,4.130685
1,1,44.072898,119.899037,3.06507,4.442978
2,2,40.198648,192.153528,0.550723,4.161138
3,3,10.3,203.651041,0.6079,4.204472
4,4,43.313948,224.914712,0.969459,4.133318


## The MICE algorithm from Fancy Impute is interpolating negative numbers for GDP, so exploring other algorithms for imputing appears to be necessary

In [6]:
#use KNN algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
knnfilled = KNN(k=2).complete(dfcause)
#turn filled nmpy array into pandas df
dfknn = pd.DataFrame(knnfilled, columns= dfcause.columns)

Imputing row 1/3945 with 3 missing, elapsed time: 9.067
Imputing row 101/3945 with 2 missing, elapsed time: 9.076
Imputing row 201/3945 with 1 missing, elapsed time: 9.084
Imputing row 301/3945 with 1 missing, elapsed time: 9.091
Imputing row 401/3945 with 2 missing, elapsed time: 9.100
Imputing row 501/3945 with 1 missing, elapsed time: 9.108
Imputing row 601/3945 with 1 missing, elapsed time: 9.114
Imputing row 701/3945 with 2 missing, elapsed time: 9.122
Imputing row 801/3945 with 4 missing, elapsed time: 9.129
Imputing row 901/3945 with 0 missing, elapsed time: 9.137
Imputing row 1001/3945 with 1 missing, elapsed time: 9.145
Imputing row 1101/3945 with 1 missing, elapsed time: 9.153
Imputing row 1201/3945 with 0 missing, elapsed time: 9.162
Imputing row 1301/3945 with 1 missing, elapsed time: 9.169
Imputing row 1401/3945 with 0 missing, elapsed time: 9.177
Imputing row 1501/3945 with 3 missing, elapsed time: 9.185
Imputing row 1601/3945 with 1 missing, elapsed time: 9.193
Imputing 

In [37]:
dfknn.to_csv('knn.csv')

In [27]:
df

Unnamed: 0.1,Unnamed: 0,country,life,bc,matdeath,teen,gdp,healthspend,eduspend
0,0,Afghanistan,55.125878,4.9,1100.0,153.8456,,,
1,1,Afghanistan,55.487537,,1050.0,150.0468,119.899037,,
2,2,Afghanistan,55.857195,,996.0,146.2480,192.153528,0.550723,
3,3,Afghanistan,56.235293,10.3,941.0,140.4764,203.651041,0.607900,
4,4,Afghanistan,56.626317,,881.0,134.7048,224.914712,0.969459,
5,5,Afghanistan,57.027244,13.6,821.0,128.9332,257.175795,0.801919,
6,6,Afghanistan,57.432561,18.6,776.0,123.1616,280.245644,1.062420,
7,7,Afghanistan,57.833829,,724.0,117.3900,380.400955,0.522136,
8,8,Afghanistan,58.225024,22.8,676.0,111.4708,384.131681,1.385584,
9,9,Afghanistan,58.603683,,631.0,105.5516,458.955782,2.586127,


### KNN seems to have interpolated an excessively high number for GDP in the 

In [22]:
#use SoftImpute algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
softfilled = SoftImpute().complete(dfcause)
#turn filled nmpy array into pandas df
softdf = pd.DataFrame(softfilled, columns= dfcause.columns)

[SoftImpute] Max Singular Value of X_init = 1373789.282534
[SoftImpute] Iter 1: observed MAE=93.283127 rank=1
[SoftImpute] Iter 2: observed MAE=93.154539 rank=1
[SoftImpute] Iter 3: observed MAE=93.094391 rank=1
[SoftImpute] Iter 4: observed MAE=93.054360 rank=1
[SoftImpute] Iter 5: observed MAE=93.022870 rank=1
[SoftImpute] Iter 6: observed MAE=92.994990 rank=1
[SoftImpute] Iter 7: observed MAE=92.969587 rank=1
[SoftImpute] Iter 8: observed MAE=92.945994 rank=1
[SoftImpute] Iter 9: observed MAE=92.923982 rank=1
[SoftImpute] Iter 10: observed MAE=92.903403 rank=1
[SoftImpute] Iter 11: observed MAE=92.884149 rank=1
[SoftImpute] Iter 12: observed MAE=92.866125 rank=1
[SoftImpute] Iter 13: observed MAE=92.849251 rank=1
[SoftImpute] Iter 14: observed MAE=92.833558 rank=1
[SoftImpute] Iter 15: observed MAE=92.818977 rank=1
[SoftImpute] Iter 16: observed MAE=92.805323 rank=1
[SoftImpute] Iter 17: observed MAE=92.792563 rank=1
[SoftImpute] Iter 18: observed MAE=92.781024 rank=1
[SoftImpute] I

In [24]:
softdf

Unnamed: 0,bc,gdp,healthspend,eduspend
0,4.900000,0.274759,0.000037,0.000032
1,0.213342,119.899037,0.016004,0.013659
2,0.341907,192.153528,0.550723,0.021890
3,10.300000,203.651041,0.607900,0.023202
4,0.400201,224.914712,0.969459,0.025622
5,13.600000,257.175795,0.801919,0.029300
6,18.600000,280.245644,1.062420,0.031929
7,0.676864,380.400955,0.522136,0.043334
8,22.800000,384.131681,1.385584,0.043764
9,0.816640,458.955782,2.586127,0.052283


### Soft Impute seems to be interpolating excessively small numbers for all values

In [14]:
#use MatrixFactorization algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
mffilled = MatrixFactorization().complete(dfcause)
#turn filled nmpy array into pandas df
mfdf = pd.DataFrame(mffilled, columns= dfcause.columns)

train: 1 of 1 mini-batches from (3945, 4)
downhill: compiling evaluation function
downhill: compiling Adam optimizer
downhill: setting: rms_halflife = 14
downhill: setting: rms_regularizer = 1e-08
downhill: setting: patience = 5
downhill: setting: validate_every = 10
downhill: setting: min_improvement = 0.005
downhill: setting: max_gradient_norm = 5
downhill: setting: max_gradient_elem = 0
downhill: setting: learning_rate = TensorConstant{0.001}
downhill: setting: momentum = 0
downhill: setting: nesterov = False
downhill: validation 0 loss=119600978.912601 error=119600978.831751 grad(U)=145681.469725 grad(V)=350017.812784 *
downhill: Adam 1 loss=119600978.912601 error=119600978.831751 grad(U)=145681.469725 grad(V)=350017.812784
downhill: Adam 2 loss=119600942.509747 error=119600942.428949 grad(U)=145673.538692 grad(V)=346319.107871
downhill: Adam 3 loss=119600893.332482 error=119600893.251755 grad(U)=145664.698765 grad(V)=342303.074931
downhill: Adam 4 loss=119600835.751961 error=11960

In [15]:
mfdf.head()

Unnamed: 0,bc,gdp,healthspend,eduspend
0,4.9,1.48341,0.260645,-0.912306
1,-2.142803,119.899037,-4.608279,-1.949659
2,0.92532,192.153528,0.550723,0.69655
3,10.3,203.651041,0.6079,0.571148
4,-0.453075,224.914712,0.969459,1.825503


Matrix Factorization resulted in negative numbers for Health and Education Spending, which do not make sense.

In [56]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('RF', RandomForestClassifier(n_jobs = -1, n_estimators = 500)))

In [57]:
Y = df.bc
X = pd.concat(
    [cause[['gdp', 'healthspend', 'eduspend']]],
    axis = 1)

In [58]:
#a function to evaluate each model
def run_models(train_data):
    results = []
    names = []

    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=11)
        cv_results = model_selection.cross_val_score(model, train_data, Y, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
'''
    # boxplot algorithm comparison
    fig = pyplot.figure()
    fig.suptitle(title)
    ax = fig.add_subplot(111)
    pyplot.boxplot(results)
    ax.set_xticklabels(names)
    pyplot.ylim(0,1)
    pyplot.show()
    '''

'\n    # boxplot algorithm comparison\n    fig = pyplot.figure()\n    fig.suptitle(title)\n    ax = fig.add_subplot(111)\n    pyplot.boxplot(results)\n    ax.set_xticklabels(names)\n    pyplot.ylim(0,1)\n    pyplot.show()\n    '

In [59]:
run_models(X)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').