In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing as pp
%pylab inline

cause = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/causes.csv')
effect = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/effects.csv')


# Supervised Learning Modules
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

from sklearn.linear_model import SGDRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


Populating the interactive namespace from numpy and matplotlib


# Looping through various models

In [2]:
models = []
models.append(('SGDR', SGDRegressor()))
models.append(('GaussianPR', GaussianProcessRegressor()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DTree', DecisionTreeRegressor()))
models.append(('GradientBR', GradientBoostingRegressor()))
models.append(('SVR', SVR()))
models.append(('RF', RandomForestRegressor(n_jobs = -1, n_estimators = 500)))

In [3]:
Y = cause.pop('bc')
X = cause

In [13]:
#a function to evaluate each model
def run_models(x,y):
    results = []
    names = []

    for name, model in models:
        kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=11)
        cv_results = model_selection.cross_val_score(model, x, y, cv=kfold, scoring='neg_mean_squared_error')
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
'''
    # boxplot algorithm comparison
    fig = pyplot.figure()
    fig.suptitle(title)
    ax = fig.add_subplot(111)
    pyplot.boxplot(results)
    ax.set_xticklabels(names)
    pyplot.ylim(0,1)
    pyplot.show()
'''

'\n    # boxplot algorithm comparison\n    fig = pyplot.figure()\n    fig.suptitle(title)\n    ax = fig.add_subplot(111)\n    pyplot.boxplot(results)\n    ax.set_xticklabels(names)\n    pyplot.ylim(0,1)\n    pyplot.show()\n'

In [14]:
run_models(X, Y)

SGDR: -71155615444173776472866578451398656.000000 (100488112409887734351412239781593088.000000)
GaussianPR: -2880.365325 (360.577292)
KNN: -380.370038 (67.390897)
DTree: -419.900412 (109.487188)
GradientBR: -214.404800 (49.169755)
SVR: -542.577257 (62.328021)
RF: -221.542367 (51.034570)


## Tuning the SVM

In [8]:
Cs = [0.001, 0.1, 1, 10, 100]
Es = [0.0001, 0.001, 0.1, 1, 10]

for i in Cs:
    for e in Es:
        results =[]
        kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=11)
        results = model_selection.cross_val_score(SVR(C=i, epsilon = e), X, Y, cv=kfold, scoring='neg_mean_squared_error')
        score = results.mean()
        print "C=", i, " and Epsilon =", e, "has score =",score



C= 0.001  and Epsilon = 0.0001 has score = -543.261142024
C= 0.001  and Epsilon = 0.001 has score = -543.258900696
C= 0.001  and Epsilon = 0.1 has score = -543.167182494
C= 0.001  and Epsilon = 1 has score = -543.123666591
C= 0.001  and Epsilon = 10 has score = -535.061387644
C= 0.1  and Epsilon = 0.0001 has score = -543.042260735
C= 0.1  and Epsilon = 0.001 has score = -543.042846983
C= 0.1  and Epsilon = 0.1 has score = -543.249956514
C= 0.1  and Epsilon = 1 has score = -543.009580546
C= 0.1  and Epsilon = 10 has score = -535.042783806
C= 1  and Epsilon = 0.0001 has score = -542.633125612
C= 1  and Epsilon = 0.001 has score = -542.632876452
C= 1  and Epsilon = 0.1 has score = -542.577256648
C= 1  and Epsilon = 1 has score = -541.769006908
C= 1  and Epsilon = 10 has score = -533.539445992
C= 10  and Epsilon = 0.0001 has score = -532.291140532
C= 10  and Epsilon = 0.001 has score = -532.290232137
C= 10  and Epsilon = 0.1 has score = -532.188390945
C= 10  and Epsilon = 1 has score = -53

## Tuning the Gradient Boosting Regressor

In [18]:
loss_opt= ['ls', 'lad', 'huber', 'quantile']
estimators = [10, 100, 500, 1000]
depth = [1,3,5,10,20,50,100]
leaves = [1, 2, 3, 4, 5, 10, 20, 50, 100]

for l in loss_opt:
    for e in estimators:
        for d in depth:
            for leaf in leaves:
                results =[]
                kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=11)
                results = model_selection.cross_val_score(GradientBoostingRegressor(loss=l, n_estimators = e, max_depth = d, min_samples_leaf = leaf), X, Y, cv=kfold, scoring='neg_mean_squared_error')
                score = results.mean()
                print "Loss Function:", l, "n_estimators =", e, "max_depth=", d, "min leaf samples=", leaf, "has score =",score


Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -392.628879161
Loss Function: ls n_estimators = 10 max_depth= 1 has score = -390.980468023
Loss Function: ls n_estimators = 10 max_depth= 3 has score = -332.126828264
Loss Function: ls n_estimators = 10 max_depth= 3 has score = -332.126828264
Loss Function: ls n_estimators = 10 max_depth= 3 has score = -332.815771566
Loss Function: ls n_estimators = 10 max_depth= 3 has score = -332.733597786
Loss Functio

Loss Function: ls n_estimators = 100 max_depth= 20 has score = -289.628836658
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -386.002375018
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -250.39028811
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -234.912715746
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -223.167574234
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -214.860477711
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -205.884858397
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -214.65960549
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -249.33981658
Loss Function: ls n_estimators = 100 max_depth= 50 has score = -289.628836658
Loss Function: ls n_estimators = 100 max_depth= 100 has score = -389.663793189
Loss Function: ls n_estimators = 100 max_depth= 100 has score = -250.524577093
Loss Function: ls n_estimators = 100 max_depth= 100 has score = -

Loss Function: ls n_estimators = 1000 max_depth= 5 has score = -209.548098498
Loss Function: ls n_estimators = 1000 max_depth= 5 has score = -225.407948414
Loss Function: ls n_estimators = 1000 max_depth= 5 has score = -274.934705053
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -249.667293707
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -241.228477003
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -225.273371073
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -217.933371895
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -208.632502153
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -205.242669796
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -206.797200495
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -225.174953346
Loss Function: ls n_estimators = 1000 max_depth= 10 has score = -274.934705053
Loss Function: ls n_estimators = 1000 max_depth= 20 has

Loss Function: lad n_estimators = 100 max_depth= 1 has score = -324.676808433
Loss Function: lad n_estimators = 100 max_depth= 1 has score = -324.783019683
Loss Function: lad n_estimators = 100 max_depth= 1 has score = -324.462460547
Loss Function: lad n_estimators = 100 max_depth= 1 has score = -327.580895981
Loss Function: lad n_estimators = 100 max_depth= 1 has score = -328.694905728
Loss Function: lad n_estimators = 100 max_depth= 1 has score = -353.995586755
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -256.876450334
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -250.973283056
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -251.267789085
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -246.541300245
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -254.384339594
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -253.702288626
Loss Function: lad n_estimators = 100 max_depth= 3 has score = -

Loss Function: lad n_estimators = 500 max_depth= 50 has score = -199.925360065
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -199.126995416
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -188.030571534
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -198.050188456
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -191.947726773
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -209.884967744
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -239.156813333
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -261.554255293
Loss Function: lad n_estimators = 500 max_depth= 50 has score = -292.889327384
Loss Function: lad n_estimators = 500 max_depth= 100 has score = -196.616947141
Loss Function: lad n_estimators = 500 max_depth= 100 has score = -190.174798349
Loss Function: lad n_estimators = 500 max_depth= 100 has score = -191.084430106
Loss Function: lad n_estimators = 500 max_depth= 

Loss Function: huber n_estimators = 10 max_depth= 5 has score = -311.052043325
Loss Function: huber n_estimators = 10 max_depth= 5 has score = -314.269627912
Loss Function: huber n_estimators = 10 max_depth= 5 has score = -356.214749647
Loss Function: huber n_estimators = 10 max_depth= 5 has score = -369.073900202
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -293.860888199
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -286.398537761
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -290.481579233
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -281.432153439
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -291.264437383
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -294.52144041
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -305.912828566
Loss Function: huber n_estimators = 10 max_depth= 10 has score = -356.36892181
Loss Function: huber n_estimators = 10 max_dep

Loss Function: huber n_estimators = 100 max_depth= 100 has score = -290.195389731
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -270.567568688
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -269.594394911
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -269.052113357
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -269.085608736
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -269.818834035
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -270.389655738
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -277.284196841
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -279.947536171
Loss Function: huber n_estimators = 500 max_depth= 1 has score = -300.293430921
Loss Function: huber n_estimators = 500 max_depth= 3 has score = -196.487154935
Loss Function: huber n_estimators = 500 max_depth= 3 has score = -189.637593344
Loss Function: huber n_estimators = 50

Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -236.326543175
Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -221.761458458
Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -215.937634889
Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -207.133548075
Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -208.496927644
Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -224.886453843
Loss Function: huber n_estimators = 1000 max_depth= 20 has score = -280.547479925
Loss Function: huber n_estimators = 1000 max_depth= 50 has score = -346.989079276
Loss Function: huber n_estimators = 1000 max_depth= 50 has score = -266.611212823
Loss Function: huber n_estimators = 1000 max_depth= 50 has score = -239.441411257
Loss Function: huber n_estimators = 1000 max_depth= 50 has score = -223.299665445
Loss Function: huber n_estimators = 1000 max_depth= 50 has score = -217.982921018
Loss Function: h

Loss Function: quantile n_estimators = 100 max_depth= 3 has score = -672.962366153
Loss Function: quantile n_estimators = 100 max_depth= 3 has score = -668.946032481
Loss Function: quantile n_estimators = 100 max_depth= 3 has score = -679.725147099
Loss Function: quantile n_estimators = 100 max_depth= 3 has score = -674.43224982
Loss Function: quantile n_estimators = 100 max_depth= 3 has score = -678.485904205
Loss Function: quantile n_estimators = 100 max_depth= 3 has score = -800.935789428
Loss Function: quantile n_estimators = 100 max_depth= 5 has score = -591.902133754
Loss Function: quantile n_estimators = 100 max_depth= 5 has score = -590.635229099
Loss Function: quantile n_estimators = 100 max_depth= 5 has score = -562.394814559
Loss Function: quantile n_estimators = 100 max_depth= 5 has score = -558.400723441
Loss Function: quantile n_estimators = 100 max_depth= 5 has score = -555.736754302
Loss Function: quantile n_estimators = 100 max_depth= 5 has score = -558.076251341
Loss 

KeyboardInterrupt: 

## Gaussian Process

In [20]:
alphas= [1e-100,1e-50, 1e-20, 1e-10, 1e-5, 1, 10, 100 ]

for a in alphas:
    results =[]
    kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=11)
    results = model_selection.cross_val_score(GaussianProcessRegressor(alpha = a), X, Y, cv=kfold, scoring='neg_mean_squared_error')
    score = results.mean()
    print "Alpha =",a , "has score =",score


Alpha = 1e-100 has score = -2880.36532539
Alpha = 1e-50 has score = -2880.36532539
Alpha = 1e-20 has score = -2880.36532539
Alpha = 1e-10 has score = -2880.36532539
Alpha = 1e-05 has score = -2880.36539287
Alpha = 1 has score = -2883.96792488
Alpha = 10 has score = -2887.25987022
Alpha = 100 has score = -2887.94906485


## Random Forest

In [21]:
estimators = [10, 100, 500, 1000]
depth = [1,3,5,10,20,50,100]
leaves = [1, 2, 3, 4, 5, 10, 20, 50, 100]

for e in estimators:
    for d in depth:
         for leaf in leaves:
            results =[]
            kfold = model_selection.KFold(n_splits=10, shuffle = True, random_state=11)
            results = model_selection.cross_val_score(RandomForestRegressor(n_estimators = e, max_depth = d, min_samples_leaf = leaf), X, Y, cv=kfold, scoring='neg_mean_squared_error')
            score = results.mean()
            print "n_estimators =", e, "max_depth=", d, "min leaf samples=", leaf, "has score =",score


n_estimators = 10 max_depth= 1 min leaf samples= 1 has score = -374.132925709
n_estimators = 10 max_depth= 1 min leaf samples= 2 has score = -374.168343604
n_estimators = 10 max_depth= 1 min leaf samples= 3 has score = -369.418467256
n_estimators = 10 max_depth= 1 min leaf samples= 4 has score = -380.305123745
n_estimators = 10 max_depth= 1 min leaf samples= 5 has score = -370.857172529
n_estimators = 10 max_depth= 1 min leaf samples= 10 has score = -374.402727122
n_estimators = 10 max_depth= 1 min leaf samples= 20 has score = -375.751760947
n_estimators = 10 max_depth= 1 min leaf samples= 50 has score = -367.484983291
n_estimators = 10 max_depth= 1 min leaf samples= 100 has score = -372.904719621
n_estimators = 10 max_depth= 3 min leaf samples= 1 has score = -300.030601786
n_estimators = 10 max_depth= 3 min leaf samples= 2 has score = -314.078554527
n_estimators = 10 max_depth= 3 min leaf samples= 3 has score = -304.924495315
n_estimators = 10 max_depth= 3 min leaf samples= 4 has scor

n_estimators = 100 max_depth= 20 min leaf samples= 10 has score = -271.836116313
n_estimators = 100 max_depth= 20 min leaf samples= 20 has score = -306.357778716
n_estimators = 100 max_depth= 20 min leaf samples= 50 has score = -340.619989449
n_estimators = 100 max_depth= 20 min leaf samples= 100 has score = -361.508517393
n_estimators = 100 max_depth= 50 min leaf samples= 1 has score = -225.665861694
n_estimators = 100 max_depth= 50 min leaf samples= 2 has score = -229.055745558
n_estimators = 100 max_depth= 50 min leaf samples= 3 has score = -230.516753014
n_estimators = 100 max_depth= 50 min leaf samples= 4 has score = -242.005673726
n_estimators = 100 max_depth= 50 min leaf samples= 5 has score = -249.881880232
n_estimators = 100 max_depth= 50 min leaf samples= 10 has score = -272.393593275
n_estimators = 100 max_depth= 50 min leaf samples= 20 has score = -308.83830967
n_estimators = 100 max_depth= 50 min leaf samples= 50 has score = -342.339775492
n_estimators = 100 max_depth= 50 

KeyboardInterrupt: 