# Machine Learning Code: Gradient Boosting Regression

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import scipy as sp
import glob as glob
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV

import os
import warnings

# NOTE: this silences every warning for the rest of the session, including the
# deprecation notices pandas and scikit-learn emit when an API is about to be
# removed. Narrow or drop the filter when upgrading either library.
warnings.filterwarnings('ignore')

# Location of the encoded feature table. Set the FEATURES_CSV environment
# variable to run the notebook on another machine; the default is the original
# author's local path. The code below reads the 'Folds' and 'Labels' columns and
# treats every column except the last three as a model input.
DATA_PATH = os.environ.get(
    "FEATURES_CSV",
    "/Users/jenniferpolson/Documents/School/2017-F/BE 223A/Final Project/features_encoded_processed.csv",
)
full_data = pd.read_csv(DATA_PATH)

In [7]:
def acc_bin(ytrue, ypred):
    '''Scoring metric for the GridSearchCV: percentage accuracy after thresholding.

    The continuous predictions are binarized with a strict `> 0.5` cut-off and
    compared element-wise with the true labels.

    Input:
      - ytrue: array-like of true 0/1 labels; any shape, flattened before use
               (so an (n, 1) column vector is accepted)
      - ypred: array-like of continuous predictions with the same number of
               elements as ytrue

    Output:
      - accuracy in percent (0-100), rounded to 3 decimal places

    Raises:
      - ValueError if the inputs are empty or differ in length
    '''
    y_true = np.asarray(ytrue).ravel()
    y_pred_b = (np.asarray(ypred).ravel() > 0.5).astype(int)

    if y_true.shape != y_pred_b.shape:
        raise ValueError("ytrue and ypred must contain the same number of elements")
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy on empty input")

    # Accuracy is simply the share of matching entries. Computing it directly
    # (rather than unpacking a confusion matrix into four cells) also works when
    # a cross-validation split contains only one class.
    acc = round(float(np.mean(y_true == y_pred_b)) * 100, 3)
    return acc


# Scorer lookup handed to GridSearchCV; 'Accuracy' wraps acc_bin via make_scorer.
scoring = dict(Accuracy=make_scorer(acc_bin))

In [8]:
def _grid_search_best_params (X, y, param_grid, **fixed_params):
    '''Grid-searches `param_grid` on a GradientBoostingRegressor and returns the best setting.

    Input:
      - X, y: training features and 1-D training labels
      - param_grid: dict of parameter name -> candidate values
      - fixed_params: keyword arguments held constant for the regressor

    Output:
      - dict mapping each searched parameter name to its best value
    '''
    # A single scorer (not the whole `scoring` dict) is passed on purpose: with
    # dict (multi-metric) scoring and refit=False, GridSearchCV does not set
    # best_params_. The removed `iid` argument is no longer passed either.
    search = GridSearchCV(estimator = GradientBoostingRegressor(**fixed_params),
                          param_grid = param_grid, scoring = scoring['Accuracy'],
                          n_jobs = 4, cv = 5, refit = False)
    search.fit(X, y)
    display(search.best_params_)
    return search.best_params_


def gradient_boost_reg_CV (fold, total):
    '''Tunes a Gradient Boosting Regressor on all folds but one and scores the held-out fold.

    Hyper-parameters are tuned greedily in five grid-search stages (each 5-fold
    cross-validated on the training rows): n_estimators, then max_depth together
    with min_samples_split, then min_samples_leaf, max_features and subsample.
    A final model is fitted with the selected values, used to predict the
    held-out rows, and its feature importances are summed per feature group,
    printed and plotted.

    Input:
      - fold: value of the 'Folds' column identifying the held-out (test) rows
      - total: the entire dataset; must contain 'Folds' and 'Labels' columns.
               Every column except the last three is used as a model input
               (assumes those three are non-feature columns such as the label
               and fold id -- verify against the CSV layout).

    Output:
      - test: the rows of `total` belonging to `fold` (rows with missing values
              dropped), with an additional 'predictions_probability' column
              holding the model output
    '''
    # Split on the fold id. Rows with missing values are dropped from both parts.
    # Selecting by `!= fold` keeps duplicated training rows, which a
    # concat + drop_duplicates(keep=False) construction would silently delete.
    test = total.loc[total['Folds'] == fold].dropna().copy()
    train = total.loc[total['Folds'] != fold].dropna()

    test_n = test.iloc[:, :-3]
    train_n = train.iloc[:, :-3]

    trainArr = train_n.to_numpy()
    trainRes = train['Labels'].to_numpy()   # 1-D target, as the estimator expects
    testArr = test_n.to_numpy()
    n_features = trainArr.shape[1]

    lr = 0.01
    base = dict(learning_rate=lr, random_state=10)

    # Stage 1: number of estimators
    param_test1 = {'n_estimators': np.arange(20,111,10).tolist()}
    n_estimators = _grid_search_best_params(
        trainArr, trainRes, param_test1,
        min_samples_split=500, min_samples_leaf=50, max_depth=8,
        max_features='sqrt', subsample=0.8, **base)['n_estimators']

    # Stage 2: tree depth and minimum samples required to split a node
    param_test2 = {'max_depth': list(range(1,16,1)), 'min_samples_split': list(range(100,1001,100))}
    best2 = _grid_search_best_params(
        trainArr, trainRes, param_test2,
        n_estimators=n_estimators, max_features='sqrt', subsample=0.8, **base)
    # Look the values up by name; positional access depends on dict ordering.
    max_depth = best2['max_depth']
    min_samples_split = best2['min_samples_split']

    # Stage 3: minimum samples per leaf
    param_test3 = {'min_samples_leaf': list(range(1,71,10))}
    min_samples_leaf = _grid_search_best_params(
        trainArr, trainRes, param_test3,
        n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split,
        max_features='sqrt', subsample=0.8, **base)['min_samples_leaf']

    # Stage 4: number of features considered per split. Candidates larger than
    # the number of input columns are invalid, so they are filtered out.
    candidates = [m for m in range(5,20,2) if m <= n_features] or [n_features]
    param_test4 = {'max_features': candidates}
    max_features = _grid_search_best_params(
        trainArr, trainRes, param_test4,
        n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf, subsample=0.8, **base)['max_features']

    # Stage 5: row subsampling fraction
    param_test5 = {'subsample': [0.6,0.7,0.75,0.8,0.85,0.9]}
    subsample = _grid_search_best_params(
        trainArr, trainRes, param_test5,
        n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf, max_features=max_features, **base)['subsample']

    # Final model. It uses the same learning rate the other settings were tuned
    # for: shrinking it (e.g. lr/100) without increasing n_estimators leaves the
    # ensemble barely moved from its initial constant prediction.
    model = GradientBoostingRegressor(learning_rate=lr, n_estimators=n_estimators, max_depth=max_depth,
                                      min_samples_split=min_samples_split, subsample=subsample,
                                      max_features=max_features, min_samples_leaf=min_samples_leaf,
                                      random_state=10)
    model.fit(trainArr, trainRes) # fit the data to the algorithm

    #predictions
    test['predictions_probability'] = model.predict(testArr)

    # Feature importances, keyed by the input column they belong to.
    importances = pd.Series(model.feature_importances_, index=test_n.columns.astype(str))

    # Sum the importances of all columns whose name contains the group name.
    feature_list = ['OrgCode', 'Timeofday', 'Modality', 'Age', 'Gender', 'Weekday']
    feature_sums = []
    print("Feature ranking:")
    for i in feature_list:
        addition = importances[importances.index.str.contains(i)].sum()
        print("%a: %f" % (i, addition))
        feature_sums.append(addition)

    #plot the feature importances
    plt.figure(figsize=(10,6))
    plt.title("Feature importances")
    plt.bar(np.arange(len(feature_list)), np.asarray(feature_sums),
            color="teal", align="center")
    plt.xticks(np.arange(len(feature_list)), feature_list, rotation = 60)
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.xlim([-1, len(feature_list)])
    plt.show()


    return test

In [None]:
# Run the tuning + prediction pipeline once for each of the five folds (0-4),
# in order, keeping one result per fold.
fold0, fold1, fold2, fold3, fold4 = [
    gradient_boost_reg_CV(fold_id, full_data) for fold_id in range(5)
]

In [9]:
from sklearn.metrics import confusion_matrix

def binary_metrics (data):
    '''Computes accuracy, recall, precision and F1 score (all in percent) for one fold.

    Input:
      - data: table (e.g. DataFrame) with a 'Labels' column of true 0/1 labels
              and either a 'Predictions' column of 0/1 predictions or a
              'predictions_probability' column of continuous scores. When
              'Predictions' is absent the scores are binarized with a strict
              `> 0.5` threshold.

    Output:
      - [accuracy, recall, precision, f1], each rounded to 3 decimal places.
        A metric whose denominator is zero (e.g. recall when there are no
        positive labels) is returned as NaN.

    Raises:
      - KeyError if 'Labels' is missing, or if neither prediction column exists
    '''
    y_true = np.asarray(data['Labels']).ravel()

    #binarize the variable
    if 'Predictions' in data:
        y_pred = np.asarray(data['Predictions']).ravel()
    else:
        y_pred = (np.asarray(data['predictions_probability']).ravel() > 0.5).astype(int)

    # Count the four confusion-matrix cells explicitly so that a fold containing
    # a single class still yields all four values.
    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    tn = int(np.sum((y_true == 0) & (y_pred == 0)))
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))

    def _pct (num, den):
        # Percentage rounded to 3 places; NaN when the ratio is undefined.
        return round(num / den * 100, 3) if den else float('nan')

    #Accuracy
    acc  = _pct(tp + tn, tp + tn + fp + fn)

    #Recall
    rec  = _pct(tp, tp + fn)

    #Precision
    prec = _pct(tp, tp + fp)

    #F1 Score
    f1   = _pct(2*tp, (2*tp) + fp + fn)

    return [acc, rec, prec, f1]

In [None]:
# Per-fold metric lists, in the order [accuracy, recall, precision, F1].
b0, b1, b2, b3, b4 = [
    binary_metrics(fold_result)
    for fold_result in (fold0, fold1, fold2, fold3, fold4)
]

# One row per fold, one column per metric.
binary = pd.DataFrame(
    [b0, b1, b2, b3, b4],
    columns=['Accuracy', 'Recall', 'Precision', 'F-1 Score'],
    index=['Fold 0', 'Fold 1', 'Fold 2', 'Fold 3', 'Fold 4'],
)
binary