In [22]:
import pandas as pd
import numpy as np

from sklearn import preprocessing as pp
%pylab inline

# Load the two project CSV files into DataFrames.
# NOTE(review): these are absolute paths on one specific machine; the notebook
# will not run elsewhere until they are pointed at a local copy of the data.
# NOTE(review): `effect` is not referenced again in the visible cells.
cause = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/causes.csv')
effect = pd.read_csv('/home/lara/Documents/Repository/Capstone-1_WorldBank_GenderData/effects.csv')


# Supervised Learning Modules
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

from sklearn.linear_model import SGDRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


Populating the interactive namespace from numpy and matplotlib


# Looping through various models

In [2]:
# Candidate regressors, each paired with the short label that run_models
# prints next to its cross-validation score. All use library defaults except
# the random forest, which is given 500 trees and uses every available core.
models = [
    ('SGDR', SGDRegressor()),
    ('GaussianPR', GaussianProcessRegressor()),
    ('KNN', KNeighborsRegressor()),
    ('DTree', DecisionTreeRegressor()),
    ('GradientBR', GradientBoostingRegressor()),
    ('SVR', SVR()),
    ('RF', RandomForestRegressor(n_jobs=-1, n_estimators=500)),
]

In [23]:
# Separate the target column 'bc' from the feature columns.
# This is done without mutating `cause`, so the cell can be re-run safely:
# DataFrame.pop would delete the column in place and a second run of the cell
# would then fail with KeyError. As a consequence `cause` itself keeps 'bc';
# use `Xcause` whenever only the features are wanted.
Y = cause['bc']
Xcause = cause.drop('bc', axis=1)

In [4]:
# A function to evaluate each model.
def run_models(x, y):
    """Cross-validate every estimator in the module-level ``models`` list.

    Parameters
    ----------
    x : feature data passed to ``cross_val_score``.
    y : target values passed to ``cross_val_score``.

    For each ``(name, model)`` pair, prints one line with the name, the mean
    and (in parentheses) the standard deviation of the 10-fold
    ``neg_mean_squared_error`` scores. Scores are negated MSE, so values
    closer to zero are better. Returns ``None``.
    """
    # The splitter does not depend on the model, so build it once. With
    # shuffle=True and a fixed integer random_state every model is scored on
    # the same folds.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=11)

    for name, model in models:
        cv_results = model_selection.cross_val_score(
            model, x, y, cv=kfold, scoring='neg_mean_squared_error')
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)

'\n    # boxplot algorithm comparison\n    fig = pyplot.figure()\n    fig.suptitle(title)\n    ax = fig.add_subplot(111)\n    pyplot.boxplot(results)\n    ax.set_xticklabels(names)\n    pyplot.ylim(0,1)\n    pyplot.show()\n'

In [5]:
# Baseline comparison: score every candidate model on the features/target split.
run_models(Xcause, Y)

SGDR: -55724353233858278438095828111327232.000000 (108062339573833758181850121699852288.000000)
GaussianPR: -2880.365325 (360.577292)
KNN: -380.370038 (67.390897)
DTree: -408.722496 (109.783612)
GradientBR: -214.962253 (49.798543)
SVR: -542.577257 (62.328021)
RF: -223.246727 (55.182924)


## Tuning the SVM

In [24]:
def run_svr(parameters, x, y):
    """Grid-search SVR hyperparameters and print the best result.

    Parameters
    ----------
    parameters : parameter grid, passed to ``GridSearchCV`` as its grid.
    x : feature data passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Uses 10-fold shuffled cross-validation (``random_state=11``) scored with
    ``neg_mean_squared_error``, then prints the best estimator followed by
    its best score. Returns ``None``.
    """
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=11)
    reg = GridSearchCV(SVR(), parameters, cv=kfold,
                       scoring='neg_mean_squared_error')
    reg.fit(x, y)

    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, and matches the print(msg) call used in run_models.
    print(reg.best_estimator_)
    print(reg.best_score_)

In [20]:
# Show the hyperparameter names a default SVR exposes (candidates for the grid).
SVR().get_params().keys()

['kernel',
 'C',
 'verbose',
 'degree',
 'epsilon',
 'shrinking',
 'max_iter',
 'tol',
 'cache_size',
 'coef0',
 'gamma']

In [25]:
# SVR grid: 5 values of C x 5 values of epsilon; other settings stay at defaults.
params = dict(
    C=[0.001, 0.1, 1, 10, 100],
    epsilon=[0.0001, 0.001, 0.1, 1, 10])
run_svr(params, Xcause, Y)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.0001, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
-516.892182296


## Tuning the Gradient Boosting Regressor

In [26]:
def run_gradi(parameters, x, y):
    """Grid-search GradientBoostingRegressor hyperparameters and print the best result.

    Parameters
    ----------
    parameters : parameter grid, passed to ``GridSearchCV`` as its grid.
    x : feature data passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Uses 10-fold shuffled cross-validation (``random_state=11``) scored with
    ``neg_mean_squared_error``, then prints the best estimator followed by
    its best score. Returns ``None``.
    """
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=11)
    reg = GridSearchCV(GradientBoostingRegressor(), parameters, cv=kfold,
                       scoring='neg_mean_squared_error')
    reg.fit(x, y)

    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, and matches the print(msg) call used in run_models.
    print(reg.best_estimator_)
    print(reg.best_score_)

In [27]:
# Show the hyperparameter names a default GradientBoostingRegressor exposes.
GradientBoostingRegressor().get_params().keys()

['presort',
 'loss',
 'verbose',
 'subsample',
 'max_leaf_nodes',
 'learning_rate',
 'warm_start',
 'min_samples_leaf',
 'n_estimators',
 'min_samples_split',
 'init',
 'min_weight_fraction_leaf',
 'criterion',
 'random_state',
 'min_impurity_split',
 'max_features',
 'alpha',
 'max_depth']

In [30]:
# Gradient boosting grid: 4 losses x 4 ensemble sizes x 7 depths x 9 leaf
# sizes = 1008 candidate settings.
params = dict(
    loss=['ls', 'lad', 'huber', 'quantile'],
    n_estimators=[10, 100, 500, 1000],
    max_depth=[1, 3, 5, 10, 20, 50, 100],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 50, 100])

In [31]:
# Run the gradient boosting grid search with the current `params` grid.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so no
# best estimator was obtained; run_gradi fits every grid candidate on each of
# its 10 folds, so a large grid makes this cell very slow.
run_gradi(params, Xcause, Y)

KeyboardInterrupt: 

## Gaussian Process

In [32]:
def run_gauss(parameters, x, y):
    """Grid-search GaussianProcessRegressor hyperparameters and print the best result.

    Parameters
    ----------
    parameters : parameter grid, passed to ``GridSearchCV`` as its grid.
    x : feature data passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Uses 10-fold shuffled cross-validation (``random_state=11``) scored with
    ``neg_mean_squared_error``, then prints the best estimator followed by
    its best score. Returns ``None``.
    """
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=11)
    reg = GridSearchCV(GaussianProcessRegressor(), parameters, cv=kfold,
                       scoring='neg_mean_squared_error')
    reg.fit(x, y)

    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, and matches the print(msg) call used in run_models.
    print(reg.best_estimator_)
    print(reg.best_score_)

In [34]:
# Show the hyperparameter names a default GaussianProcessRegressor exposes.
GaussianProcessRegressor().get_params().keys()

['kernel',
 'optimizer',
 'copy_X_train',
 'random_state',
 'n_restarts_optimizer',
 'alpha',
 'normalize_y']

In [35]:
# Gaussian process grid: only `alpha` is varied, from 1e-100 up to 100.
params = dict(alpha=[1e-100, 1e-50, 1e-20, 1e-10, 1e-5, 1, 10, 100])
run_gauss(params, Xcause, Y)

GaussianProcessRegressor(alpha=1e-100, copy_X_train=True, kernel=None,
             n_restarts_optimizer=0, normalize_y=False,
             optimizer='fmin_l_bfgs_b', random_state=None)
-2879.99136301


## Random Forest

In [None]:
def run_rf(parameters, x, y):
    """Grid-search RandomForestRegressor hyperparameters and print the best result.

    Parameters
    ----------
    parameters : parameter grid, passed to ``GridSearchCV`` as its grid.
    x : feature data passed to ``GridSearchCV.fit``.
    y : target values passed to ``GridSearchCV.fit``.

    Uses 10-fold shuffled cross-validation (``random_state=11``) scored with
    ``neg_mean_squared_error``, then prints the best estimator followed by
    its best score. Returns ``None``.
    """
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=11)
    reg = GridSearchCV(RandomForestRegressor(), parameters, cv=kfold,
                       scoring='neg_mean_squared_error')
    reg.fit(x, y)

    # Single-argument print(...) produces the same output on Python 2 and
    # Python 3, and matches the print(msg) call used in run_models.
    print(reg.best_estimator_)
    print(reg.best_score_)

In [None]:
# Show the hyperparameter names a default RandomForestRegressor exposes.
RandomForestRegressor().get_params().keys()

In [None]:
# Random forest grid: 4 ensemble sizes x 7 depths x 9 leaf sizes = 252
# candidate settings.
params = dict(
    n_estimators=[10, 100, 500, 1000],
    max_depth=[1, 3, 5, 10, 20, 50, 100],
    min_samples_leaf=[1, 2, 3, 4, 5, 10, 20, 50, 100])
run_rf(params, Xcause, Y)