In [3]:
import pandas as pd
import numpy as np

from sklearn import preprocessing as pp
from fancyimpute import MICE
%pylab inline

df = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/finalvars.csv')

# Supervised Learning Modules
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


Populating the interactive namespace from numpy and matplotlib


# Data Wrangling

In [4]:
#count non-NaN rows in each column
df.count()

Unnamed: 0     3945
country        3945
life           3699
bc              639
matdeath       3435
teen           3600
gdp            3644
healthspend    3523
eduspend       2100
dtype: int64

In [5]:
#Drop all rows with NaN values and count how many rows are left
dfdrop = df.dropna()
dfdrop.bc.count()

368

In [6]:
#split data into "Causes" and "Effects" of Contraceptive Use
dfcause = df[['bc', 'gdp', 'healthspend', 'eduspend']]
dfeffect = df[['country', 'bc', 'life', 'matdeath', 'teen']]

#count non-NaN values in "Cause" df
dfcause.dropna().count()

bc             370
gdp            370
healthspend    370
eduspend       370
dtype: int64

In [7]:
#Count non-NaN rows in "Effect" df
dfeffect.dropna().count()

country     624
bc          624
life        624
matdeath    624
teen        624
dtype: int64

In [8]:
#save "Effect" df without NaN rows because there are very few rows with NaNs
effect = dfeffect.dropna()

## Using Fancy Impute to interpolate NaN values in the "Causes" df because just dropping them eliminates too much data. The MICE algorithm was chosen because its method looks at each values rather than just a blanketed approach.

In [17]:
#use MICE algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
micefilled = MICE().complete(dfcause)
#turn micefilled nmpy array into pandas df
mice = pd.DataFrame(micefilled, columns= dfcause.columns)

[MICE] Completing matrix with shape (3945, 4)
[MICE] Starting imputation round 1/110, elapsed time 0.001
[MICE] Starting imputation round 2/110, elapsed time 0.019
[MICE] Starting imputation round 3/110, elapsed time 0.029
[MICE] Starting imputation round 4/110, elapsed time 0.038
[MICE] Starting imputation round 5/110, elapsed time 0.047
[MICE] Starting imputation round 6/110, elapsed time 0.056
[MICE] Starting imputation round 7/110, elapsed time 0.064
[MICE] Starting imputation round 8/110, elapsed time 0.107
[MICE] Starting imputation round 9/110, elapsed time 0.112
[MICE] Starting imputation round 10/110, elapsed time 0.128
[MICE] Starting imputation round 11/110, elapsed time 0.142
[MICE] Starting imputation round 12/110, elapsed time 0.153
[MICE] Starting imputation round 13/110, elapsed time 0.168
[MICE] Starting imputation round 14/110, elapsed time 0.264
[MICE] Starting imputation round 15/110, elapsed time 0.303
[MICE] Starting imputation round 16/110, elapsed time 0.330
[MI

In [18]:
mice

Unnamed: 0,bc,gdp,healthspend,eduspend
0,4.900000,-8026.244605,2.983257,4.309097
1,41.966244,119.899037,3.420147,4.677297
2,44.172676,192.153528,0.550723,4.316451
3,10.300000,203.651041,0.607900,4.486272
4,44.870887,224.914712,0.969459,4.201294
5,13.600000,257.175795,0.801919,4.253354
6,18.600000,280.245644,1.062420,4.770771
7,47.093911,380.400955,0.522136,4.537792
8,22.800000,384.131681,1.385584,4.206358
9,41.861826,458.955782,2.586127,4.495530


The MICE algorithm seems to have interpolated mostly logical values, however there are a couple values for GPD that were interpolated to be negative numbers. As negative numbers do not make sense in this context, these values are replaced with their absolute value

In [19]:
cause = mice.abs()

In [20]:
cause.head()

Unnamed: 0,bc,gdp,healthspend,eduspend
0,4.9,8026.244605,2.983257,4.309097
1,41.966244,119.899037,3.420147,4.677297
2,44.172676,192.153528,0.550723,4.316451
3,10.3,203.651041,0.6079,4.486272
4,44.870887,224.914712,0.969459,4.201294


In [22]:
cause.gdp[0] = 119

## The MICE algorithm from Fancy Impute is interpolating negative numbers for GDP, so exploring other algorithms for imputing appears to be necessary

In [16]:
#use KNN algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
knnfilled = KNN(k=2).complete(dfcause)
#turn filled nmpy array into pandas df
dfknn = pd.DataFrame(knnfilled, columns= dfcause.columns)

NameError: name 'KNN' is not defined

In [None]:
dfknn.to_csv('knn.csv')

In [None]:
df

### KNN seems to have interpolated an excessively high number for GDP in the 

In [None]:
#use SoftImpute algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
softfilled = SoftImpute().complete(dfcause)
#turn filled nmpy array into pandas df
softdf = pd.DataFrame(softfilled, columns= dfcause.columns)

In [None]:
softdf

### Soft Impute seems to be interpolating excessively small numbers for all values

In [None]:
#use MatrixFactorization algorithm in Fancy Impute package to interpolate NaN values in "Causes" df
mffilled = MatrixFactorization().complete(dfcause)
#turn filled nmpy array into pandas df
mfdf = pd.DataFrame(mffilled, columns= dfcause.columns)

In [None]:
mfdf.head()

Matrix Factorization resulted in negative numbers for Health and Education Spending, which do not make sense.

In [None]:
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('RF', RandomForestClassifier(n_jobs = -1, n_estimators = 500)))

In [None]:
Y = cause.bc
X = cause[['gdp', 'healthspend', 'eduspend']]

In [None]:
#a function to evaluate each model
def run_models(IV,DV):
    results = []
    names = []

    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, random_state=11)

        cv_results = model_selection.cross_val_score(model, X=IV, y=DV, cv=kfold, scoring='accuracy')
        print "I created a model", name
        results.append(cv_results)
        print" I added the results to the list"
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
'''
    # boxplot algorithm comparison
    fig = pyplot.figure()
    fig.suptitle(title)
    ax = fig.add_subplot(111)
    pyplot.boxplot(results)
    ax.set_xticklabels(names)
    pyplot.ylim(0,1)
    pyplot.show()
'''  

In [None]:
run_models(X, Y)