## This final notebook will run best models of Logistic Regression, SVC, and Ensembles on same dataset to compare
## Dataset: cleaned without missing values from all 8 features (392 observations)
## Columns: 3 cases (all 8 features, 6 features LASSO, 5 features importances)
## cut off values: 0.3, 0.35, 0.4, 0.45

In [34]:
## import packages and modules
import boto3, botocore
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from itertools import product

## fetch file content from s3
s3 = boto3.resource('s3')
bucket = s3.Bucket('danhtran358-data-445-bucket')

bucket_object = bucket.Object('project_cleaned_data.csv')
## read file content to data-frame
diabetes_cleaned = pd.read_csv(bucket_object.get().get('Body'))
diabetes_cleaned

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,1
388,1,128,88,39,110,36.5,1.057,37,1
389,2,88,58,26,16,28.4,0.766,22,0
390,10,101,76,48,180,32.9,0.171,63,0


In [35]:
## function to write write data_frame to csv file object in S3 bucket
def write_data_to_s3(file_name, data_frame):
    ## file object in s3 bucket
    data_file = bucket.Object(file_name)
    
    ## add content from the lists of recall scores
    content = data_frame.to_csv(index=False)

    ## store as new csv file
    data_file.put(Body = content)
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_rf_data(file_name):
    try:
        ## file object in s3 bucket
        rf_data_file = bucket.Object(file_name)
        
        rf_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(rf_dictionary)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                    
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(rf_data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(rf_data_file.get().get('Body'))

    
## function to read AdaBoosting/Gradient Boosting data stored in s3 csv to dataframe
def read_ada_data(file_name):
    try:
        ## file object in s3 bucket
        boosting_data_file = bucket.Object(file_name)
        
        boosting_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(ada_dictionary)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                    
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(boosting_data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(boosting_data_file.get().get('Body'))

    
## function to read AdaBoosting/Gradient Boosting data stored in s3 csv to dataframe
def read_grad_data(file_name):
    try:
        ## file object in s3 bucket
        boosting_data_file = bucket.Object(file_name)
        
        boosting_data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            results = expand_grid(grad_dictionary)
            
            ## create columns for all types of cut-off values and scores
            for i in range(len(cut_off)):
                for j in range(len(score_to_evaluate)):
                    col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                    results[col] = 0.0
                    
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(boosting_data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(boosting_data_file.get().get('Body'))
    

## function to read Random Forest data stored in s3 csv to dataframe
def read_data_from_s3(file_name, X = None):
    try:
        ## file object in s3 bucket
        data_file = bucket.Object(file_name)
        
        data_file.load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            ## file does not exist yet, create new file
            if X is None:
                results = expand_grid(basic_dictionary)

                ## create columns for all types of cut-off values and scores
                for i in range(len(cut_off)):
                    for j in range(len(score_to_evaluate)):
                        col = str(cut_off[i]) + '_' + score_to_evaluate[j]
                        results[col] = 0.0
            
            else:
                ## empty dataframe with first row has 0 for total loops
                empty_list = list()
                results = pd.DataFrame(empty_list, columns = X.columns)
                results.at[0, 'total_loops'] = 0
                   
            ## write brand new and empty file to s3
            write_data_to_s3(file_name, results)
            
            ## return the dataframe from newly created file
            return pd.read_csv(data_file.get().get('Body'))
    else:
        ## return the dataframe already stored
        return pd.read_csv(data_file.get().get('Body'))

In [36]:
## Use dataframes to store parameters to build models and store total scores
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

basic_dictionary = {'input_layer': [5, 6, 8], 'total_loops' : [0]}

rf_dictionary = {'input_layer': [5, 6, 8], 'total_loops' : [0], 'n_tree': [500], 'depth': [3]}

ada_dictionary = {'input_layer': [5, 6, 8], 'total_loops' : [0], 'n_tree': [500], 'depth': [3], 'learning_rate': [0.001]}

grad_dictionary = {'input_layer': [5, 6, 8], 'total_loops' : [0], 'n_tree': [1000, 1500], 'depth': [3], 'learning_rate': [0.001]}

## lists of cut-off values and types of score to evaluate models
cut_off = [0.3, 0.35, 0.4, 0.45]
score_to_evaluate = ['precision', 'recall', 'f1']

In [37]:
## update the scores in result dataset after each ensemble model is built
def update_ensemble_result_scores(pred, Y_test, results, combo_number):
    
    for cut_off_id in range(len(cut_off)):
        
        ## classify labels
        current_cut_off = cut_off[cut_off_id]
        pred_labels = np.where(pred < current_cut_off, 0, 1)
        
        for score_id in range(len(score_to_evaluate)):
            
            ## updated the appropriate score
            current_score = score_to_evaluate[score_id]
            score_column = str(current_cut_off) + '_' + current_score
            if current_score == 'precision':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + precision_score(Y_test, pred_labels, zero_division = 0)
            
            elif current_score == 'recall':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + recall_score(Y_test, pred_labels)
                
            elif current_score == 'f1': 
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + f1_score(Y_test, pred_labels)

In [38]:
## update the scores in result dataset after each model is built
def update_svc_linear_result_scores(X_train, X_test, Y_train, Y_test, results, combo_number):
    
    ## Building the svc with kernel = 'linear'
    md_svc_linear = SVC(kernel = 'linear', probability = True).fit(X_train, Y_train)

    ## Making predictions on the test dataset
    pred = md_svc_linear.predict_proba(X_test)[:,1]
    
    for cut_off_id in range(len(cut_off)):
        
        ## classify labels
        current_cut_off = cut_off[cut_off_id]
        pred_labels = np.where(pred < current_cut_off, 0, 1)
        
        for score_id in range(len(score_to_evaluate)):
            
            ## updated the appropriate score
            current_score = score_to_evaluate[score_id]
            score_column = str(current_cut_off) + '_' + current_score
            
            if current_score == 'precision':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + precision_score(Y_test, pred_labels, zero_division = 0)
            
            elif current_score == 'recall':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + recall_score(Y_test, pred_labels)
                
            elif current_score == 'f1': 
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + f1_score(Y_test, pred_labels)

In [39]:
## update the scores in result dataset after each model is built
def update_logistic_result_scores(X_train, X_test, Y_train, Y_test, results, combo_number):

    ## build logistic models
    logit_md = LogisticRegression().fit(X_train, Y_train)

    ## predict the likelihood
    pred = logit_md.predict_proba(X_test)[:,1]
    
    for cut_off_id in range(len(cut_off)):
        
        ## classify labels
        current_cut_off = cut_off[cut_off_id]
        pred_labels = np.where(pred < current_cut_off, 0, 1)
        
        for score_id in range(len(score_to_evaluate)):
            
            ## updated the appropriate score
            current_score = score_to_evaluate[score_id]
            score_column = str(current_cut_off) + '_' + current_score
            
            if current_score == 'precision':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + precision_score(Y_test, pred_labels, zero_division = 0)
            
            elif current_score == 'recall':
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + recall_score(Y_test, pred_labels)
                
            elif current_score == 'f1': 
                results.at[combo_number, score_column] = results.at[combo_number, score_column] + f1_score(Y_test, pred_labels)

In [40]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome'])
Y = diabetes_cleaned['Outcome']

scaler = MinMaxScaler()

## read data stored in s3 file
rf_data_file_name = 'project_final_rf_data.csv'
rf_results = read_rf_data(rf_data_file_name)
ada_data_file_name = 'project_final_ada_data.csv'
ada_results = read_ada_data(ada_data_file_name)
grad_data_file_name = 'project_final_grad_data.csv'
grad_results = read_grad_data(grad_data_file_name)
svc_data_file_name = 'project_final_svc_result.csv'
svc_results = read_data_from_s3(svc_data_file_name)
logistic_data_file_name = 'project_final_logistic_result.csv'
logistic_results = read_data_from_s3(logistic_data_file_name)

## total_loops column keeps the number of loops already done, we only loop the rest until 100 times done
for loop_number in range(svc_results.at[0, 'total_loops'], 100):
    
    ## Dataset 1
    X_train_8_features, X_test_8_features, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)
    
    ## Dataset 2 inputs
    X_train_6_features = X_train_8_features.drop(columns = ['BloodPressure', 'Insulin'], axis = 1)
    X_test_6_features = X_test_8_features.drop(columns = ['BloodPressure', 'Insulin'], axis = 1)
    
    ## Dataset 4 inputs
    X_train_5_features = X_train_8_features.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'], axis = 1)
    X_test_5_features = X_test_8_features.drop(columns = ['Pregnancies', 'BloodPressure', 'SkinThickness'], axis = 1)
                
    ## scale input variables to 0-1 scale
    X_train_8_features = scaler.fit_transform(X_train_8_features)
    X_test_8_features = scaler.fit_transform(X_test_8_features)
    X_train_6_features = scaler.fit_transform(X_train_6_features)
    X_test_6_features = scaler.fit_transform(X_test_6_features)
    X_train_5_features = scaler.fit_transform(X_train_5_features)
    X_test_5_features = scaler.fit_transform(X_test_5_features)
    
    ## Build random forest for each parameter combination and store scores
    for combo_number in range(rf_results.shape[0]):
        parameters = rf_results.loc[combo_number]
            
        if parameters['input_layer'] == 5:
            X_train = X_train_5_features
            X_test = X_test_5_features
        elif parameters['input_layer'] == 6:
            X_train = X_train_6_features
            X_test = X_test_6_features
        elif parameters['input_layer'] == 8:
            X_train = X_train_8_features
            X_test = X_test_8_features
        
        ## Building model, predicting results, and update scores
        md_rf = RandomForestClassifier(max_depth = parameters['depth'],
                                       n_estimators = int(parameters['n_tree'])).fit(X_train, Y_train)
        pred = md_rf.predict_proba(X_test)[:, 1]
        update_ensemble_result_scores(pred, Y_test, rf_results, combo_number)
    
    ## Build AdaBoost model for each parameter combination and store scores
    for combo_number in range(ada_results.shape[0]):
        parameters = ada_results.loc[combo_number]
            
        if parameters['input_layer'] == 5:
            X_train = X_train_5_features
            X_test = X_test_5_features
        elif parameters['input_layer'] == 6:
            X_train = X_train_6_features
            X_test = X_test_6_features
        elif parameters['input_layer'] == 8:
            X_train = X_train_8_features
            X_test = X_test_8_features
        
        ## Building model, predicting results, and update scores
        md_ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = parameters['depth']),
                                    n_estimators = int(parameters['n_tree']),
                                    learning_rate = parameters['learning_rate']).fit(X_train, Y_train)
        pred = md_ada.predict_proba(X_test)[:, 1]
        update_ensemble_result_scores(pred, Y_test, ada_results, combo_number)
    
    ## Build Gradient Boosting model for each parameter combination and store scores
    for combo_number in range(grad_results.shape[0]):
        parameters = grad_results.loc[combo_number]
            
        if parameters['input_layer'] == 5:
            X_train = X_train_5_features
            X_test = X_test_5_features
        elif parameters['input_layer'] == 6:
            X_train = X_train_6_features
            X_test = X_test_6_features
        elif parameters['input_layer'] == 8:
            X_train = X_train_8_features
            X_test = X_test_8_features
        
        ## Building model, predicting results, and update scores
        md_grad = GradientBoostingClassifier(max_depth = parameters['depth'],
                                             n_estimators = int(parameters['n_tree']),
                                             learning_rate = parameters['learning_rate']).fit(X_train, Y_train)
        pred = md_grad.predict_proba(X_test)[:, 1]
        update_ensemble_result_scores(pred, Y_test, grad_results, combo_number)
    
    ## Build SVC model linear kernel for each parameter combination and store scores
    for combo_number in range(svc_results.shape[0]):
        parameters = svc_results.loc[combo_number]
            
        if parameters['input_layer'] == 5:
            X_train = X_train_5_features
            X_test = X_test_5_features
        elif parameters['input_layer'] == 6:
            X_train = X_train_6_features
            X_test = X_test_6_features
        elif parameters['input_layer'] == 8:
            X_train = X_train_8_features
            X_test = X_test_8_features
        
        update_svc_linear_result_scores(X_train, X_test, Y_train, Y_test, svc_results, combo_number)
    
    ## Build Logistic model for each parameter combination and store scores
    for combo_number in range(logistic_results.shape[0]):
        parameters = svc_results.loc[combo_number]
            
        if parameters['input_layer'] == 5:
            X_train = X_train_5_features
            X_test = X_test_5_features
        elif parameters['input_layer'] == 6:
            X_train = X_train_6_features
            X_test = X_test_6_features
        elif parameters['input_layer'] == 8:
            X_train = X_train_8_features
            X_test = X_test_8_features
        
        update_logistic_result_scores(X_train, X_test, Y_train, Y_test, logistic_results, combo_number)
        
    rf_results['total_loops'] = loop_number + 1
    ada_results['total_loops'] = loop_number + 1
    grad_results['total_loops'] = loop_number + 1
    svc_results['total_loops'] = loop_number + 1
    logistic_results['total_loops'] = loop_number + 1
    ## Writing data to s3
    write_data_to_s3(rf_data_file_name, rf_results)
    write_data_to_s3(ada_data_file_name, ada_results)
    write_data_to_s3(grad_data_file_name, grad_results)
    write_data_to_s3(svc_data_file_name, svc_results)
    write_data_to_s3(logistic_data_file_name, logistic_results)
    


In [41]:
rf_results.drop(columns = ['total_loops', 'n_tree', 'depth']) / rf_results.at[0, 'total_loops']

Unnamed: 0,input_layer,0.3_precision,0.3_recall,0.3_f1,0.35_precision,0.35_recall,0.35_f1,0.4_precision,0.4_recall,0.4_f1,0.45_precision,0.45_recall,0.45_f1
0,0.05,0.551962,0.882308,0.674351,0.585533,0.834231,0.682532,0.625248,0.775769,0.684227,0.667385,0.702308,0.673952
1,0.06,0.536276,0.875,0.660811,0.57806,0.803846,0.666925,0.617009,0.718846,0.656621,0.674562,0.642308,0.648847
2,0.08,0.534251,0.899615,0.666418,0.579299,0.847692,0.683827,0.61743,0.773077,0.68049,0.661862,0.686923,0.665886


In [42]:
ada_results.drop(columns = ['total_loops', 'n_tree', 'depth', 'learning_rate']) / ada_results.at[0, 'total_loops']

Unnamed: 0,input_layer,0.3_precision,0.3_recall,0.3_f1,0.35_precision,0.35_recall,0.35_f1,0.4_precision,0.4_recall,0.4_f1,0.45_precision,0.45_recall,0.45_f1
0,0.05,0.56352,0.793077,0.651727,0.581741,0.765,0.65321,0.59697,0.738077,0.651607,0.611314,0.698846,0.641427
1,0.06,0.567912,0.771538,0.647054,0.587797,0.736538,0.645981,0.614159,0.708462,0.649248,0.629946,0.666538,0.638531
2,0.08,0.55834,0.781154,0.645145,0.57938,0.753846,0.648847,0.597855,0.723077,0.647411,0.609192,0.690769,0.639121


In [43]:
grad_results.drop(columns = ['total_loops', 'depth', 'learning_rate']) / grad_results.at[0, 'total_loops']

Unnamed: 0,input_layer,n_tree,0.3_precision,0.3_recall,0.3_f1,0.35_precision,0.35_recall,0.35_f1,0.4_precision,0.4_recall,0.4_f1,0.45_precision,0.45_recall,0.45_f1
0,0.05,10.0,0.546654,0.836154,0.654574,0.57426,0.786154,0.655423,0.605411,0.728462,0.650858,0.639482,0.65,0.631907
1,0.05,15.0,0.552279,0.837308,0.658191,0.577964,0.799231,0.662292,0.604508,0.743462,0.65615,0.63417,0.680385,0.644289
2,0.06,10.0,0.542847,0.807692,0.642539,0.583173,0.749231,0.64734,0.611183,0.672308,0.631239,0.644667,0.6,0.611922
3,0.06,15.0,0.547419,0.808077,0.645635,0.584287,0.76,0.652661,0.609305,0.701538,0.643597,0.638915,0.629615,0.625036
4,0.08,10.0,0.540913,0.829231,0.64912,0.575755,0.781154,0.655551,0.602948,0.719615,0.646483,0.638898,0.639231,0.628492
5,0.08,15.0,0.546402,0.833077,0.654176,0.576617,0.792692,0.660384,0.60249,0.741154,0.655296,0.629999,0.671538,0.640154


In [44]:
svc_results.drop(columns = ['total_loops']) / svc_results.at[0, 'total_loops']

Unnamed: 0,input_layer,0.3_precision,0.3_recall,0.3_f1,0.35_precision,0.35_recall,0.35_f1,0.4_precision,0.4_recall,0.4_f1,0.45_precision,0.45_recall,0.45_f1
0,0.05,0.583477,0.831538,0.679307,0.617385,0.792692,0.686335,0.642519,0.741923,0.680002,0.668916,0.7,0.674752
1,0.06,0.583917,0.831538,0.679984,0.609948,0.785,0.67954,0.638038,0.734231,0.674693,0.658899,0.686923,0.663253
2,0.08,0.577355,0.829231,0.674927,0.606397,0.783077,0.676292,0.63165,0.736538,0.671987,0.652619,0.685769,0.659336


In [45]:
logistic_results.drop(columns = ['total_loops']) / logistic_results.at[0, 'total_loops']

Unnamed: 0,input_layer,0.3_precision,0.3_recall,0.3_f1,0.35_precision,0.35_recall,0.35_f1,0.4_precision,0.4_recall,0.4_f1,0.45_precision,0.45_recall,0.45_f1
0,0.05,0.572066,0.853077,0.678931,0.615346,0.794615,0.686044,0.652609,0.733077,0.681795,0.675995,0.672308,0.663959
1,0.06,0.567528,0.853077,0.676444,0.599019,0.79,0.673753,0.634217,0.726538,0.669358,0.665669,0.667692,0.657748
2,0.08,0.562934,0.850385,0.672439,0.593935,0.793077,0.672636,0.626703,0.726538,0.666165,0.656134,0.668077,0.653799


In [46]:
## Random forest 500 trees depth 5,   cleaned dataset 5 features, cut-off = 0.35,   precision = 0.585533,   recall = 0.834231,   f1 = 0.682532
## SVC model linear kernel,           cleaned dataset 5 features, cut-off = 0.35,   precision = 0.617385,   recall = 0.792692,   f1 = 0.686335
## Logistic model,                    cleaned dataset 5 features, cut-off = 0.35,   precision = 0.615346,   recall = 0.794615,   f1 = 0.686044

## Logistic model looks good

In [None]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome', 'Pregnancies', 'BloodPressure', 'SkinThickness'])
Y = diabetes_cleaned['Outcome']

scaler = MinMaxScaler()

cut_off = 0.35
    
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## scale input variables to 0-1 scale
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Building random forest and predict
md_rf = RandomForestClassifier(max_depth = 5, n_estimators = 500).fit(X_train, Y_train)
rf_pred = md_rf.predict_proba(X_test)[:, 1]

## Building the svc with kernel = 'linear' and predict
md_svc_linear = SVC(kernel = 'linear', probability = True).fit(X_train, Y_train)
svc_pred = md_svc_linear.predict_proba(X_test)[:,1]

## build logistic models and predict
logit_md = LogisticRegression().fit(X_train, Y_train)
logit_pred = logit_md.predict_proba(X_test)[:,1]

## Input variables
X_rf_stacked = pd.concat([pd.DataFrame(rf_pred),pd.DataFrame(svc_pred),pd.DataFrame(logit_pred)], axis = 1)

## Random forest model
md_rf_stacked = RandomForestClassifier(max_depth = 3, n_estimators = 500).fit(X_rf_stacked, Y_test)

## Extracting ensemble likelihood
pred_rf_stacked = md_rf_stacked.predict_proba(X_rf_stacked)[:,1]

## Classifying
pred_rf_stacked = np.where(pred_rf_stacked < cut_off, 0, 1)

print('Precision score:', precision_score(Y_test, pred_rf_stacked))
print('Recall score:   ', recall_score(Y_test, pred_rf_stacked))
print('F1 score:       ', f1_score(Y_test, pred_rf_stacked))

In [None]:
## Score is boosted after stacking
## There might be a data leak

In [None]:
## defining input and target variables
X = diabetes_cleaned.drop(columns = ['Outcome', 'Pregnancies', 'BloodPressure', 'SkinThickness'])
Y = diabetes_cleaned['Outcome']

scaler = MinMaxScaler()

cut_off = 0.35
    
## Split data into testing and training datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Split test datasets in 2, 1 to build stacked model, the other is to validate the model
X_test, X_stacked_vaidate, Y_test, Y_stacked_validate = train_test_split(X_test, Y_test, test_size = 0.5, stratify = Y_test)

## scale input variables to 0-1 scale
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Building random forest and predict
md_rf = RandomForestClassifier(max_depth = 5, n_estimators = 500).fit(X_train, Y_train)
rf_pred = md_rf.predict_proba(X_test)[:, 1]

## Building the svc with kernel = 'linear' and predict
md_svc_linear = SVC(kernel = 'linear', probability = True).fit(X_train, Y_train)
svc_pred = md_svc_linear.predict_proba(X_test)[:,1]

## build logistic models and predict
logit_md = LogisticRegression().fit(X_train, Y_train)
logit_pred = logit_md.predict_proba(X_test)[:,1]

## Input variables
X_rf_stacked = pd.concat([pd.DataFrame(rf_pred),pd.DataFrame(svc_pred),pd.DataFrame(logit_pred)], axis = 1)

## Random forest model
md_rf_stacked = RandomForestClassifier(max_depth = 3, n_estimators = 500).fit(X_rf_stacked, Y_test)

## Validating the stacked model
## Predicting on dataset to validate the stacked model
rf_pred = md_rf.predict_proba(X_stacked_vaidate)[:, 1]
svc_pred = md_svc_linear.predict_proba(X_stacked_vaidate)[:,1]
logit_pred = logit_md.predict_proba(X_stacked_vaidate)[:,1]

## Input variables
X_rf_stacked_validate = pd.concat([pd.DataFrame(rf_pred),pd.DataFrame(svc_pred),pd.DataFrame(logit_pred)], axis = 1)

## Extracting ensemble likelihood
pred_rf_stacked = md_rf_stacked.predict_proba(X_rf_stacked_validate)[:,1]

## Classifying
pred_rf_stacked = np.where(pred_rf_stacked < cut_off, 0, 1)

print('Precision score:', precision_score(Y_stacked_validate, pred_rf_stacked))
print('Recall score:   ', recall_score(Y_stacked_validate, pred_rf_stacked))
print('F1 score:       ', f1_score(Y_stacked_validate, pred_rf_stacked))