# Step 3 - Make Model Predictions on Test Data
1. Get best model for each cluster  
2. Predict binary and level traffic values
3. Write predictions to database and csv

In [None]:
import numpy as np
import pandas as pd
import psycopg2 as pg
import datetime as dt
from sklearn import preprocessing
from collections import OrderedDict
from collections import defaultdict
from pprint import pprint
import cPickle as pickle
import gc
import socket
import boto3
from boto.utils import get_instance_metadata
import ast
from Segments import Segments
from Times import Times
from Cluster import Cluster
from Utility import Utility
from AWS import AWS
import time
import datetime
import os
import shutil
import joblib
import string

# clustering
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sqlalchemy import create_engine

# modeling
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# wall-clock reference point; a later cell subtracts it from time.time() to
# report how long loading the data took
start_time = time.time()

### set inputs

In [None]:
# set environment
aws = None
s3_bucket_name = 'dse-cohort3-group3'
s3_dat_dir = 'PreprocessedWazeData'

# assume the pipeline args file is always present
sampling_args_file = '../conf/pipeline_args.txt'

# read inside a context manager so the file handle is closed even if the read
# fails; `fr` and `fa` stay bound afterwards, as they were before
with open(sampling_args_file, 'r') as fr:
    fa = fr.read()

# the text is parsed as a Python literal (no code is executed) and is then
# indexed by string keys, so it is expected to contain a dict literal
file_args = ast.literal_eval(fa)

# locations used by the rest of the notebook
save_dir = file_args['save_dir']
conn_str_file = file_args['conn_str_file']
sqlalchemy_conn_str_file = file_args['sqlalchemy_conn_str_file']

In [None]:
# read the connection string through a context manager so the file handle is
# closed right away instead of being left for the garbage collector
with open(conn_str_file, 'r') as conn_file:
    pg_conn_str = conn_file.read()

# open the database connection used by the rest of the notebook
conn = pg.connect(pg_conn_str)

In [None]:
# test scores are collected per modeling stage; every stage starts with its
# own empty dict that the prediction cells fill in
test_results_dict = dict((stage, {}) for stage in ('stage_1', 'stage_2'))

In [None]:
# build the project helper from the parsed pipeline arguments and hand it the
# open database connection
util = Utility(file_args)
util.conn = conn

In [None]:
# fetch the train and test sets through the project helper; the cells below
# use both as pandas DataFrames
train_data, test_data = util.get_modeling_data()

In [None]:
# report the seconds elapsed since start_time was recorded
print('--- getting data took {0:.1f} seconds ---'.format(time.time() - start_time))

# 1.  first stage - level_binary

In [None]:
# wall-clock reference point for timing the first-stage predictions
first_modeling_stage_start = time.time()

### stage 1 avg baseline

In [None]:
# prediction frames start as copies of the identifying columns plus every
# 'level*' target column; prediction columns are added to them later
targets = [col for col in train_data.columns if col.startswith('level')]
pred_frame_cols = ['date','time','date_idx','time_idx','segment_id','day_of_week','cluster'] + targets
train_preds = train_data[pred_frame_cols].copy()
test_preds = test_data[pred_frame_cols].copy()

In [None]:
# make test predictions - move to step 4
#
# Stage 1 average baseline: the pickled "model" is a table that is joined to
# the data on (time_idx, segment_id, day_of_week) and supplies the prediction
# column. Predictions are added to train_preds / test_preds and one test score
# per cluster is stored in test_results_dict['stage_1'].
if file_args['model_avg_baseline']:
    print('making predictions using avg baseline model...')
    print('target variable is {}'.format(file_args['target_first_stage']))
    pred_suffix = '_preds_avg_baseline'
    model_test_results = {}
    target_col = file_args['target_first_stage']
    pred_col = target_col + pred_suffix

    # load averages for making predictions
    fn = os.path.join(save_dir, 'stage1_model_avg_baseline.pkl')
    model = joblib.load(fn)

    # join predictions to train and test dataframes, then keep only the row
    # keys and the prediction column
    train_preds_avg = train_data.merge(model, how='left', on=['time_idx','segment_id','day_of_week'])
    test_preds_avg = test_data.merge(model, how='left', on=['time_idx','segment_id','day_of_week'])
    train_preds_avg = train_preds_avg[['date_idx','time_idx','segment_id',pred_col]].copy()
    test_preds_avg = test_preds_avg[['date_idx','time_idx','segment_id',pred_col]].copy()

    # fill null predictions with 0; assign the result back instead of calling
    # fillna(inplace=True) on a column selection, which is not guaranteed to
    # write through to the frame
    train_preds_avg[pred_col] = train_preds_avg[pred_col].fillna(value=0)
    test_preds_avg[pred_col] = test_preds_avg[pred_col].fillna(value=0)

    # add results to prediction dataframes
    train_preds = train_preds.merge(train_preds_avg, how='left', on=['time_idx','date_idx','segment_id'])
    test_preds = test_preds.merge(test_preds_avg, how='left', on=['time_idx','date_idx','segment_id'])

    # calculate scores for individual clusters; the `average` argument is the
    # configured metric name minus its first three characters
    # (presumably an 'f1_' prefix - confirm against the config file)
    for clust in test_data['cluster'].unique():
        in_cluster = test_preds['cluster']==clust
        tmp_preds = test_preds.loc[in_cluster,pred_col]
        tmp_actuals = test_preds.loc[in_cluster,target_col]
        model_test_results[clust] = f1_score(tmp_actuals, tmp_preds, average=file_args['scoring_metric'][3:])

    test_results_dict['stage_1']['model_avg_baseline'] = model_test_results

else:
    print('not making predictions using avg baseline model...')

### stage 1 non-baseline models

In [None]:
# unfitted estimators keyed by model name; later cells iterate over the keys
# to decide which saved models to load. Every estimator that takes a
# random_state gets the configured seed.
rng_seed = file_args['seed']
model_dict = {
    'random_forest': RandomForestClassifier(random_state=rng_seed),
    'knn': KNeighborsClassifier(),
    'extra_trees': ExtraTreesClassifier(random_state=rng_seed),
    'gradient_boosting': GradientBoostingClassifier(random_state=rng_seed),
    'logistic_regression': LogisticRegression(random_state=rng_seed),
    'gaussian_nb': GaussianNB()
}

In [None]:
def predict_stage1_model_on_full(model_key):
    """Predict the first-stage target with the model trained on the full data.

    Does nothing unless ``file_args['model_full_<model_key>']`` is truthy.
    Otherwise loads ``stage1_model_full_<model_key>.pkl`` from ``save_dir``,
    predicts on the train and test features, writes the predictions as a new
    column into the module-level ``train_preds`` / ``test_preds`` frames and
    stores one test score per cluster in ``test_results_dict['stage_1']``.

    Args:
        model_key: model name as used in the config flags and pickle names.

    Returns:
        None. All results are stored in module-level objects.
    """
    if not file_args['model_full_{}'.format(model_key)]:
        return

    target_col = file_args['target_first_stage']
    pred_col = target_col + '_preds_full_{}'.format(model_key)

    print('splitting features and targets...')
    # features are every column except date/time/cluster and the 'level*'
    # targets; the label arrays are not needed for prediction
    level_cols = [c for c in train_data.columns if c.startswith('level')]
    non_feature_cols = ['date','time','cluster'] + level_cols
    X_trn = train_data.drop(labels=non_feature_cols, axis=1)
    X_tst = test_data.drop(labels=non_feature_cols, axis=1)

    print('making predictions...')
    fn = os.path.join(save_dir, 'stage1_model_full_{}.pkl'.format(model_key))
    model = joblib.load(fn)
    trn_preds = model.predict(X_trn)
    tst_preds = model.predict(X_tst)

    # add results to prediction dataframes (in-place column assignment, so
    # the module-level frames are updated without being rebound)
    train_preds.loc[:,pred_col] = trn_preds
    test_preds.loc[:,pred_col] = tst_preds

    # calculate scores for individual clusters
    model_test_results = {}
    for clust in test_data['cluster'].unique():
        in_cluster = test_preds['cluster']==clust
        tmp_preds = test_preds.loc[in_cluster,pred_col]
        tmp_actuals = test_preds.loc[in_cluster,target_col]
        model_test_results[clust] = f1_score(tmp_actuals, tmp_preds, average=file_args['scoring_metric'][3:])

    test_results_dict['stage_1']['model_full_{}'.format(model_key)] = model_test_results

In [None]:
# Stage 1 predictions for every estimator in model_dict. For each model the
# config flags decide whether to predict with the model fitted on the full
# data, with the per-cluster models, or both. Predictions become new columns
# of train_preds / test_preds; per-cluster test scores go into
# test_results_dict['stage_1'].
for model_key in model_dict:
    # predict stage 1 using model on full data
    if file_args['model_full_{}'.format(model_key)]:
        print('predicting full data using {}'.format(model_key))
        pred_suffix = '_preds_full_{}'.format(model_key)
        target_col = file_args['target_first_stage']
        pred_col = target_col + pred_suffix
        model_test_results = {}

        # features are every column except date/time/cluster and the
        # 'level*' targets
        level_cols = [c for c in train_data.columns if c.startswith('level')]
        X_trn = train_data.drop(labels=['date','time','cluster']+level_cols, axis=1)
        X_tst = test_data.drop(labels=['date','time','cluster']+level_cols, axis=1)

        # load model and make predictions
        fn = os.path.join(save_dir, 'stage1_model_full_{}.pkl'.format(model_key))
        model = joblib.load(fn)
        trn_preds = model.predict(X_trn)
        tst_preds = model.predict(X_tst)

        # add results to prediction dataframes
        train_preds.loc[:,pred_col] = trn_preds
        test_preds.loc[:,pred_col] = tst_preds

        # calculate scores for individual clusters
        for clust in test_data['cluster'].unique():
            in_cluster = test_preds['cluster']==clust
            tmp_preds = test_preds.loc[in_cluster,pred_col]
            tmp_actuals = test_preds.loc[in_cluster,target_col]
            model_test_results[clust] = f1_score(tmp_actuals, tmp_preds, average=file_args['scoring_metric'][3:])

        # add results to results dict
        test_results_dict['stage_1']['model_full_{}'.format(model_key)] = model_test_results

    # predict stage 1 by cluster
    if file_args['model_clusters_{}'.format(model_key)]:
        print('predicting clusters using {}'.format(model_key))
        pred_suffix = '_preds_cluster_{}'.format(model_key)
        target_col = file_args['target_first_stage']
        pred_col = target_col + pred_suffix
        test_results_dict['stage_1']['model_clusters_{}'.format(model_key)] = {}

        # empty frames that accumulate the per-cluster predictions
        train_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])
        test_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])

        for clust in test_data['cluster'].unique():
            # subset data to cluster
            train_clust = train_data[train_data['cluster']==clust].copy()
            test_clust = test_data[test_data['cluster']==clust].copy()

            # calculate negative to positive ratio for each cluster
            trn_clust_ratio = util.get_neg_pos_ratio(train_clust)
            tst_clust_ratio = util.get_neg_pos_ratio(test_clust)

            # unskew individual clusters; compare the ratios computed just
            # above (the previous names trn_ratio / tst_ratio were never
            # defined and raised NameError whenever an unskew flag was set)
            if (file_args['unskew_train_clusters'] and trn_clust_ratio > file_args['unskew_ratio']):
                print('unskewing train data to negative positive ratio of {}...'.format(file_args['unskew_ratio']))
                train_clust = util.unskew_data(train_clust, file_args['unskew_ratio'])
            if (file_args['unskew_test'] and tst_clust_ratio > file_args['unskew_ratio']):
                print('unskewing test data to negative positive ratio of {}...'.format(file_args['unskew_ratio']))
                test_clust = util.unskew_data(test_clust, file_args['unskew_ratio'])

            # split features and targets (only the test labels are needed,
            # for scoring)
            level_cols = [col for col in train_data.columns if col.startswith('level')]
            X_trn = train_clust.drop(labels=['date','time','cluster']+level_cols, axis=1)
            X_tst = test_clust.drop(labels=['date','time','cluster']+level_cols, axis=1)
            Y_tst = test_clust.loc[:,target_col].values.ravel()

            # make predictions with the model saved for this cluster
            fn = os.path.join(save_dir, 'stage1_model_clusters_{}_cluster_{}.pkl'.format(model_key, clust))
            model = joblib.load(fn)
            trn_preds = model.predict(X_trn)
            tst_preds = model.predict(X_tst)

            # calculate score and add it to the results dict
            cluster_score = f1_score(Y_tst, tst_preds, average=file_args['scoring_metric'][3:])
            test_results_dict['stage_1']['model_clusters_{}'.format(model_key)][clust] = cluster_score

            # create cluster prediction dataframes
            train_clust_preds = train_clust[['date_idx','time_idx','segment_id']].copy()
            test_clust_preds = test_clust[['date_idx','time_idx','segment_id']].copy()
            train_clust_preds.loc[:,pred_col] = trn_preds
            test_clust_preds.loc[:,pred_col] = tst_preds

            # add cluster predictions to full ensemble predictions dataframe
            # (pd.concat is the supported equivalent of DataFrame.append)
            train_preds_ensemble = pd.concat([train_preds_ensemble, train_clust_preds])
            test_preds_ensemble = pd.concat([test_preds_ensemble, test_clust_preds])

        # add prediction columns to dataframes
        train_preds = train_preds.merge(train_preds_ensemble, how='left',
                                        on=['date_idx','time_idx','segment_id'])
        test_preds = test_preds.merge(test_preds_ensemble, how='left',
                                      on=['date_idx','time_idx','segment_id'])

In [None]:
def predict_stage1_model_on_clusters(model_key):
    """Predict the first-stage target with one saved model per cluster.

    Does nothing unless ``file_args['model_clusters_<model_key>']`` is truthy.
    Otherwise, for every cluster present in the test data, loads
    ``stage1_model_clusters_<model_key>_cluster_<cluster>.pkl`` from
    ``save_dir``, predicts on that cluster's train and test rows, stores the
    cluster's test score in ``test_results_dict['stage_1']`` and finally
    merges all predictions into the module-level ``train_preds`` /
    ``test_preds`` frames as a new column.

    Args:
        model_key: model name as used in the config flags and pickle names.

    Returns:
        None. All results are stored in module-level objects.
    """
    # the frames are rebound by the merges at the end; without this
    # declaration Python treats them as locals and the merge raises
    # UnboundLocalError
    global train_preds, test_preds

    if not file_args['model_clusters_{}'.format(model_key)]:
        return

    target_col = file_args['target_first_stage']
    pred_col = target_col + '_preds_cluster_{}'.format(model_key)
    results_key = 'model_clusters_{}'.format(model_key)
    test_results_dict['stage_1'][results_key] = {}

    # empty frames that accumulate the per-cluster predictions
    train_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])
    test_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])

    for clust in test_data['cluster'].unique():
        print('making predictions for cluster {}...'.format(clust))

        # subset data to cluster
        train_clust = train_data[train_data['cluster']==clust].copy()
        test_clust = test_data[test_data['cluster']==clust].copy()

        # calculate negative to positive ratio for each cluster
        trn_clust_ratio = util.get_neg_pos_ratio(train_clust)
        tst_clust_ratio = util.get_neg_pos_ratio(test_clust)

        # unskew individual clusters; compare the ratios computed just above
        # (the previous names trn_ratio / tst_ratio were never defined and
        # raised NameError whenever an unskew flag was set)
        if (file_args['unskew_train_clusters'] and trn_clust_ratio > file_args['unskew_ratio']):
            print('unskewing train data to negative positive ratio of {}...'.format(file_args['unskew_ratio']))
            train_clust = util.unskew_data(train_clust, file_args['unskew_ratio'])
        if (file_args['unskew_test'] and tst_clust_ratio > file_args['unskew_ratio']):
            print('unskewing test data to negative positive ratio of {}...'.format(file_args['unskew_ratio']))
            test_clust = util.unskew_data(test_clust, file_args['unskew_ratio'])

        # split features and targets (only the test labels are needed, for
        # scoring)
        level_cols = [col for col in train_data.columns if col.startswith('level')]
        X_trn = train_clust.drop(labels=['date','time','cluster']+level_cols, axis=1)
        X_tst = test_clust.drop(labels=['date','time','cluster']+level_cols, axis=1)
        Y_tst = test_clust.loc[:,target_col].values.ravel()

        # make predictions with the model saved for this cluster
        fn = os.path.join(save_dir, 'stage1_model_clusters_{}_cluster_{}.pkl'.format(model_key, clust))
        model = joblib.load(fn)
        trn_preds = model.predict(X_trn)
        tst_preds = model.predict(X_tst)

        # calculate score and add it to the results dict
        cluster_score = f1_score(Y_tst, tst_preds, average=file_args['scoring_metric'][3:])
        test_results_dict['stage_1'][results_key][clust] = cluster_score

        # create cluster prediction dataframes
        train_clust_preds = train_clust[['date_idx','time_idx','segment_id']].copy()
        test_clust_preds = test_clust[['date_idx','time_idx','segment_id']].copy()
        train_clust_preds.loc[:,pred_col] = trn_preds
        test_clust_preds.loc[:,pred_col] = tst_preds

        # pd.concat is the supported equivalent of DataFrame.append
        train_preds_ensemble = pd.concat([train_preds_ensemble, train_clust_preds])
        test_preds_ensemble = pd.concat([test_preds_ensemble, test_clust_preds])

    # add prediction columns to dataframes
    train_preds = train_preds.merge(train_preds_ensemble, how='left',
                                    on=['date_idx','time_idx','segment_id'])
    test_preds = test_preds.merge(test_preds_ensemble, how='left',
                                  on=['date_idx','time_idx','segment_id'])

### stage 1 best ensemble

In [None]:
# Stage 1 "best ensemble": for every cluster in the test data, ask the helper
# for the best stage 1 model of that cluster and predict with it. The result
# is one prediction column in train_preds / test_preds and one test score per
# cluster in test_results_dict['stage_1']['model_clusters_ensemble'].
if file_args['model_clusters_ensemble']:
    print('making predictions using best models on validation data for each cluster...')
    print('target variable is {}'.format(file_args['target_first_stage']))
    pred_suffix = '_preds_cluster_ensemble'
    target_col = file_args['target_first_stage']
    pred_col = target_col + pred_suffix
    test_results_dict['stage_1']['model_clusters_ensemble'] = {}

    # empty frames that accumulate the per-cluster predictions
    train_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])
    test_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])

    for clust in test_data['cluster'].unique():
        print('making predictions for cluster {}...'.format(clust))

        # subset data to cluster
        train_clust = train_data[train_data['cluster']==clust].copy()
        test_clust = test_data[test_data['cluster']==clust].copy()

        # calculate negative to positive ratio for each cluster
        trn_clust_ratio = util.get_neg_pos_ratio(train_clust)
        tst_clust_ratio = util.get_neg_pos_ratio(test_clust)

        # unskew individual clusters; compare the ratios computed just above
        # (the previous names trn_ratio / tst_ratio were never defined and
        # raised NameError whenever an unskew flag was set)
        if (file_args['unskew_train_clusters'] and trn_clust_ratio > file_args['unskew_ratio']):
            print('unskewing train data to negative positive ratio of {}...'.format(file_args['unskew_ratio']))
            train_clust = util.unskew_data(train_clust, file_args['unskew_ratio'])
        if (file_args['unskew_test'] and tst_clust_ratio > file_args['unskew_ratio']):
            print('unskewing test data to negative positive ratio of {}...'.format(file_args['unskew_ratio']))
            test_clust = util.unskew_data(test_clust, file_args['unskew_ratio'])

        # split features and targets (only the test labels are needed, for
        # scoring)
        level_cols = [col for col in train_data.columns if col.startswith('level')]
        X_trn = train_clust.drop(labels=['date','time','cluster']+level_cols, axis=1)
        X_tst = test_clust.drop(labels=['date','time','cluster']+level_cols, axis=1)
        Y_tst = test_clust.loc[:,target_col].values.ravel()

        # get best model and make predictions
        model, model_type = util.get_best_model(1, clust)
        print('best model for cluster {} is {}'.format(clust, model_type))

        if model_type == 'model_avg_baseline':
            # the baseline "model" is a table: join it to the cluster rows and
            # read the prediction column, with missing matches predicted as 0
            baseline_col = target_col+'_preds_avg_baseline'
            train_avg = train_clust.merge(model, how='left', on=['time_idx','segment_id','day_of_week'])
            test_avg = test_clust.merge(model, how='left', on=['time_idx','segment_id','day_of_week'])

            # fillna on the selected column returns a new Series, so take the
            # values from it directly rather than relying on a chained
            # inplace fill
            trn_preds = train_avg[baseline_col].fillna(value=0).values
            tst_preds = test_avg[baseline_col].fillna(value=0).values
        else:
            trn_preds = model.predict(X_trn)
            tst_preds = model.predict(X_tst)

        # calculate score and add to results
        cluster_score = f1_score(Y_tst, tst_preds, average=file_args['scoring_metric'][3:])
        test_results_dict['stage_1']['model_clusters_ensemble'][clust] = cluster_score

        # create cluster prediction dataframes
        train_clust_preds = train_clust[['date_idx','time_idx','segment_id']].copy()
        test_clust_preds = test_clust[['date_idx','time_idx','segment_id']].copy()
        train_clust_preds.loc[:,pred_col] = trn_preds
        test_clust_preds.loc[:,pred_col] = tst_preds

        # pd.concat is the supported equivalent of DataFrame.append
        train_preds_ensemble = pd.concat([train_preds_ensemble, train_clust_preds])
        test_preds_ensemble = pd.concat([test_preds_ensemble, test_clust_preds])

    # add prediction columns to dataframes
    train_preds = train_preds.merge(train_preds_ensemble, how='left',
                                    on=['date_idx','time_idx','segment_id'])
    test_preds = test_preds.merge(test_preds_ensemble, how='left',
                                  on=['date_idx','time_idx','segment_id'])

In [None]:
# report the seconds elapsed since first_modeling_stage_start was recorded
print('--- first stage predictions took {0:.1f} seconds ---'.format(time.time() - first_modeling_stage_start))

# 2. second stage - multiclass level

In [None]:
# wall-clock reference point for timing the second-stage predictions
second_modeling_stage_start = time.time()

In [None]:
# stage 2 only looks at positive rows: training rows whose level_binary is 1 ...
train_data_pos = train_data[train_data['level_binary'] == 1]

# ... and test rows that the stage 1 cluster ensemble predicted as 1
test_data_pos_preds = test_preds.loc[test_preds['level_binary_preds_cluster_ensemble'] == 1,
                                    ['date_idx','time_idx','segment_id']]

# pull the full test rows for those keys
test_data_pos = test_data_pos_preds.merge(test_data, how='left', on=['date_idx','time_idx','segment_id'])

# the merge puts the three key columns first, so restore test_data's column
# order; feature matrices built from test_data_pos then line up column by
# column with those built from the training data
# (assumes train_data and test_data share one column order, as the stage 1
# cells already rely on - confirm)
test_data_pos = test_data_pos[test_data.columns]

### stage 2 avg baseline

In [None]:
# run model to create test predictions
#
# Stage 2 average baseline: the pickled "model" is a table joined to the
# positive rows on (time_idx, segment_id, day_of_week) that supplies the
# prediction column. Rows without a prediction get 0.
if file_args['model_avg_baseline']:
    print('making predictions using avg baseline model...')
    print('target variable is {}'.format(file_args['target_second_stage']))
    pred_suffix = '_preds_avg_baseline'
    model_test_results = {}
    target_col = file_args['target_second_stage']
    pred_col = target_col + pred_suffix

    # load averages for making predictions
    fn = os.path.join(save_dir, 'stage2_model_avg_baseline.pkl')
    model = joblib.load(fn)

    # join predictions to train and test dataframes, then keep only the row
    # keys and the prediction column
    train_preds_avg = train_data_pos.merge(model, how='left', on=['time_idx','segment_id','day_of_week'])
    test_preds_avg = test_data_pos.merge(model, how='left', on=['time_idx','segment_id','day_of_week'])
    train_preds_avg = train_preds_avg[['date_idx','time_idx','segment_id',pred_col]]
    test_preds_avg = test_preds_avg[['date_idx','time_idx','segment_id',pred_col]]

    # add results to prediction dataframes
    print('adding average based predictions to results...')
    train_preds = train_preds.merge(train_preds_avg, how='left', on=['time_idx','date_idx','segment_id'])
    test_preds = test_preds.merge(test_preds_avg, how='left', on=['time_idx','date_idx','segment_id'])

    # fill null predictions with 0; assign the result back instead of calling
    # fillna(inplace=True) on a column selection, which is not guaranteed to
    # write through to the frame
    print('filling null predictions to 0...')
    train_preds[pred_col] = train_preds[pred_col].fillna(value=0)
    test_preds[pred_col] = test_preds[pred_col].fillna(value=0)

    # calculate scores for individual clusters; the `average` argument is the
    # configured metric name minus its first three characters
    # (presumably an 'f1_' prefix - confirm against the config file)
    for clust in test_data_pos['cluster'].unique():
        in_cluster = test_preds['cluster']==clust
        tmp_preds = test_preds.loc[in_cluster,pred_col]
        tmp_actuals = test_preds.loc[in_cluster,target_col]
        model_test_results[clust] = f1_score(tmp_actuals, tmp_preds, average=file_args['scoring_metric'][3:])

    test_results_dict['stage_2']['model_avg_baseline'] = model_test_results

### stage 2 non-baseline models

In [None]:
# Stage 2 predictions for every estimator in model_dict. Models predict only
# on the positive rows (train_data_pos / test_data_pos); every other row gets
# a prediction of 0. Predictions become new columns of train_preds /
# test_preds; per-cluster test scores go into test_results_dict['stage_2'].
for model_key in model_dict:
    # predict stage 2 full data
    if file_args['model_full_{}'.format(model_key)]:
        print('predicting full {}'.format(model_key))
        pred_suffix = '_preds_full_{}'.format(model_key)
        target_col = file_args['target_second_stage']
        pred_col = target_col + pred_suffix
        model_test_results = {}

        # features are every column except date/time/cluster and the
        # 'level*' targets
        level_cols = [c for c in train_data_pos.columns if c.startswith('level')]
        X_trn = train_data_pos.drop(labels=['date','time','cluster']+level_cols, axis=1)
        X_tst = test_data_pos.drop(labels=['date','time','cluster']+level_cols, axis=1)
        # test_data_pos is built by a merge, so its column order can differ
        # from the training frame; align the test features to the train order
        X_tst = X_tst[X_trn.columns]

        # load model and make predictions
        fn = os.path.join(save_dir, 'stage2_model_full_{}.pkl'.format(model_key))
        model = joblib.load(fn)
        trn_preds = model.predict(X_trn)
        tst_preds = model.predict(X_tst)

        # create prediction dataframes keyed by row identifiers
        train_preds_pos = train_data_pos[['date_idx','time_idx','segment_id']].copy()
        test_preds_pos = test_data_pos[['date_idx','time_idx','segment_id']].copy()
        train_preds_pos.loc[:,pred_col] = trn_preds
        test_preds_pos.loc[:,pred_col] = tst_preds

        # join to predictions and fill rows without a prediction with 0;
        # assign the result back instead of a chained inplace fillna, which
        # is not guaranteed to write through to the frame
        train_preds = train_preds.merge(train_preds_pos, how='left', on=['segment_id','date_idx','time_idx'])
        test_preds = test_preds.merge(test_preds_pos, how='left', on=['segment_id','date_idx','time_idx'])
        train_preds[pred_col] = train_preds[pred_col].fillna(value=0)
        test_preds[pred_col] = test_preds[pred_col].fillna(value=0)

        # calculate scores for individual clusters
        for clust in test_data['cluster'].unique():
            in_cluster = test_preds['cluster']==clust
            tmp_preds = test_preds.loc[in_cluster,pred_col]
            tmp_actuals = test_preds.loc[in_cluster,target_col]
            model_test_results[clust] = f1_score(tmp_actuals, tmp_preds, average=file_args['scoring_metric'][3:])

        test_results_dict['stage_2']['model_full_{}'.format(model_key)] = model_test_results

    # predict stage 2 clusters
    if file_args['model_clusters_{}'.format(model_key)]:
        print('predicting cluster {}'.format(model_key))
        pred_suffix = '_preds_cluster_{}'.format(model_key)
        target_col = file_args['target_second_stage']
        pred_col = target_col + pred_suffix
        test_results_dict['stage_2']['model_clusters_{}'.format(model_key)] = {}

        # empty frames that accumulate the per-cluster predictions
        train_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])
        test_preds_ensemble = pd.DataFrame(columns=['time_idx','date_idx','segment_id',pred_col])

        model_test_results = {}

        for clust in test_data_pos['cluster'].unique():
            # subset data to cluster
            train_clust = train_data[train_data['cluster']==clust].copy()
            test_clust = test_data[test_data['cluster']==clust].copy()

            # calculate negative to positive ratio for each cluster
            # NOTE(review): neither ratio is used in stage 2; the calls are
            # kept in case the helper has side effects - confirm and remove
            trn_clust_ratio = util.get_neg_pos_ratio(train_clust)
            tst_clust_ratio = util.get_neg_pos_ratio(test_clust)

            # subset to positive data
            train_clust_pos = train_data_pos[train_data_pos['cluster']==clust].copy()
            test_clust_pos = test_data_pos[test_data_pos['cluster']==clust].copy()

            # split off the features; align the test columns to the train
            # order because test_data_pos comes out of a merge
            level_cols = [col for col in train_data_pos.columns if col.startswith('level')]
            X_trn = train_clust_pos.drop(labels=['date','time','cluster']+level_cols, axis=1)
            X_tst = test_clust_pos.drop(labels=['date','time','cluster']+level_cols, axis=1)
            X_tst = X_tst[X_trn.columns]

            # make predictions with the model saved for this cluster
            fn = os.path.join(save_dir, 'stage2_model_clusters_{}_cluster_{}.pkl'.format(model_key, clust))
            model = joblib.load(fn)
            trn_preds = model.predict(X_trn)
            tst_preds = model.predict(X_tst)

            # create prediction dataframes: keys of the positive rows, and
            # keys plus actual target for every row of the cluster
            train_preds_pos = train_clust_pos[['date_idx','time_idx','segment_id']].copy()
            test_preds_pos = test_clust_pos[['date_idx','time_idx','segment_id']].copy()
            train_clust_preds = train_clust[['date_idx','time_idx','segment_id',target_col]].copy()
            test_clust_preds = test_clust[['date_idx','time_idx','segment_id',target_col]].copy()

            # add positive predictions
            train_preds_pos.loc[:,pred_col] = trn_preds
            test_preds_pos.loc[:,pred_col] = tst_preds

            # join to predictions and fill rows without a prediction with 0
            train_clust_preds = train_clust_preds.merge(train_preds_pos, how='left', on=['segment_id','date_idx','time_idx'])
            test_clust_preds = test_clust_preds.merge(test_preds_pos, how='left', on=['segment_id','date_idx','time_idx'])
            train_clust_preds[pred_col] = train_clust_preds[pred_col].fillna(value=0)
            test_clust_preds[pred_col] = test_clust_preds[pred_col].fillna(value=0)

            # calculate scores for individual cluster
            tmp_preds = test_clust_preds[pred_col].values
            tmp_actuals = test_clust_preds[target_col].values
            model_test_results[clust] = f1_score(tmp_actuals, tmp_preds, average=file_args['scoring_metric'][3:])

            # drop target column so only keys and predictions are merged back
            train_clust_preds = train_clust_preds.drop(labels=target_col, axis=1)
            test_clust_preds = test_clust_preds.drop(labels=target_col, axis=1)

            # add results to ensemble dataframe
            # (pd.concat is the supported equivalent of DataFrame.append)
            train_preds_ensemble = pd.concat([train_preds_ensemble, train_clust_preds])
            test_preds_ensemble = pd.concat([test_preds_ensemble, test_clust_preds])

        # add prediction columns to dataframes
        train_preds = train_preds.merge(train_preds_ensemble, how='left',
                                        on=['date_idx','time_idx','segment_id'])
        test_preds = test_preds.merge(test_preds_ensemble, how='left',
                                      on=['date_idx','time_idx','segment_id'])
        # add results to dict
        test_results_dict['stage_2']['model_clusters_{}'.format(model_key)] = model_test_results

### stage 2 clusters with best ensemble models

In [None]:
if file_args['model_clusters_ensemble']:
    # Stage 2, per-cluster "best model" ensemble.
    # For every cluster present in test_data_pos: fetch the best stage-2 model
    # for that cluster, predict the second-stage target on the cluster's rows
    # from train_data_pos / test_data_pos, give every other row of the cluster
    # a prediction of 0, score the cluster on the test rows, and finally attach
    # the predictions to train_preds / test_preds as a new column.
    print('making predictions using best models on validation data for each cluster...')
    print('target variable is {}'.format(file_args['target_second_stage']))
    pred_suffix = '_preds_cluster_ensemble'
    test_results_dict['stage_2']['model_clusters_ensemble'] = {}
    model_test_results = {}

    target_col = file_args['target_second_stage']
    pred_col = target_col + pred_suffix
    baseline_col = target_col + '_preds_avg_baseline'
    key_cols = ['date_idx', 'time_idx', 'segment_id']

    # columns removed before handing features to a fitted model; they do not
    # depend on the cluster, so they are computed once instead of per iteration
    level_cols = [col for col in train_data.columns if col.startswith('level')]
    non_feature_cols = ['date', 'time', 'cluster'] + level_cols

    # empty frames are only used as-is when the loop below finds no cluster
    train_preds_ensemble = pd.DataFrame(columns=['time_idx', 'date_idx', 'segment_id', pred_col])
    test_preds_ensemble = pd.DataFrame(columns=['time_idx', 'date_idx', 'segment_id', pred_col])

    # per-cluster results are collected and concatenated once after the loop
    # (repeated DataFrame.append re-copies everything gathered so far)
    train_frames = []
    test_frames = []

    for clust in test_data_pos['cluster'].unique():
        print('modeling cluster {}...'.format(clust))

        # stage-2 rows of this cluster (boolean indexing already returns a new frame)
        train_clust_pos = train_data_pos[train_data_pos['cluster'] == clust]
        test_clust_pos = test_data_pos[test_data_pos['cluster'] == clust]

        model, model_type = util.get_best_model(2, clust)
        print('best model for cluster {} is {}'.format(clust, model_type))

        if model_type == 'model_avg_baseline':
            # for the baseline, `model` is merged like a lookup table keyed by
            # (time_idx, segment_id, day_of_week); keys missing from it predict 0
            train_clust_pos_preds = train_clust_pos.merge(
                model, how='left', on=['time_idx', 'segment_id', 'day_of_week'])
            test_clust_pos_preds = test_clust_pos.merge(
                model, how='left', on=['time_idx', 'segment_id', 'day_of_week'])
            trn_preds = train_clust_pos_preds[baseline_col].fillna(value=0).values
            tst_preds = test_clust_pos_preds[baseline_col].fillna(value=0).values
        else:
            # feature matrices are only needed for fitted estimators
            X_trn = train_clust_pos.drop(labels=non_feature_cols, axis=1)
            X_tst = test_clust_pos.drop(labels=non_feature_cols, axis=1)
            trn_preds = model.predict(X_trn)
            tst_preds = model.predict(X_tst)

        # predictions for the stage-2 rows, keyed for the join below
        train_preds_pos = train_clust_pos[key_cols].copy()
        test_preds_pos = test_clust_pos[key_cols].copy()
        train_preds_pos[pred_col] = trn_preds
        test_preds_pos[pred_col] = tst_preds

        # every row of the cluster (keys + actual target)
        train_clust_preds = train_data.loc[train_data['cluster'] == clust, key_cols + [target_col]].copy()
        test_clust_preds = test_data.loc[test_data['cluster'] == clust, key_cols + [target_col]].copy()

        # join the predictions; rows without a stage-2 prediction get 0.
        # Assigning the filled column back avoids the chained in-place fillna,
        # which may operate on a temporary copy.
        train_clust_preds = train_clust_preds.merge(
            train_preds_pos, how='left', on=['segment_id', 'date_idx', 'time_idx'])
        test_clust_preds = test_clust_preds.merge(
            test_preds_pos, how='left', on=['segment_id', 'date_idx', 'time_idx'])
        train_clust_preds[pred_col] = train_clust_preds[pred_col].fillna(value=0)
        test_clust_preds[pred_col] = test_clust_preds[pred_col].fillna(value=0)

        # score this cluster on all of its test rows; the averaging mode is
        # scoring_metric minus its first three characters
        # (presumably an 'f1_' prefix, e.g. 'f1_weighted' -> 'weighted' -- confirm)
        model_test_results[clust] = f1_score(
            test_clust_preds[target_col].values,
            test_clust_preds[pred_col].values,
            average=file_args['scoring_metric'][3:])

        # keep only keys + prediction for the ensemble frames
        train_frames.append(train_clust_preds.drop(labels=target_col, axis=1))
        test_frames.append(test_clust_preds.drop(labels=target_col, axis=1))

    if train_frames:
        train_preds_ensemble = pd.concat(train_frames)
        test_preds_ensemble = pd.concat(test_frames)

    # add prediction columns to dataframes
    # NOTE(review): the loop only visits clusters that occur in test_data_pos,
    # so rows of any other cluster have no match in this left merge and keep
    # NaN in the prediction column -- confirm whether they should be 0.
    train_preds = train_preds.merge(train_preds_ensemble, how='left',
                                    on=['date_idx', 'time_idx', 'segment_id'])
    test_preds = test_preds.merge(test_preds_ensemble, how='left',
                                  on=['date_idx', 'time_idx', 'segment_id'])
    # add results to dict
    test_results_dict['stage_2']['model_clusters_ensemble'] = model_test_results

In [None]:
# report wall-clock time elapsed since second_modeling_stage_start was recorded
second_stage_elapsed = time.time() - second_modeling_stage_start
print('--- second stage predictions took {0:.1f} seconds ---'.format(second_stage_elapsed))

# 3. evaluate results

In [None]:
# wall-clock start of the evaluation section; read again by the
# '--- evaluating results took ...' timing print further down
eval_results_start = time.time()

In [None]:
# add stage 1 cluster counts
# number of rows in test_data per cluster label, keyed by that label
stage_1_labels = test_data['cluster']
clust_count_dict = {
    label: int((stage_1_labels == label).sum())
    for label in stage_1_labels.unique()
}

test_results_dict['stage_1']['cluster_counts'] = clust_count_dict

In [None]:
# add stage 2 cluster counts
# number of rows in test_data_pos per cluster label, keyed by that label
stage_2_labels = test_data_pos['cluster']
clust_count_dict = {
    label: int((stage_2_labels == label).sum())
    for label in stage_2_labels.unique()
}

test_results_dict['stage_2']['cluster_counts'] = clust_count_dict

In [None]:
# save test_results_dict
# serialize the nested results dict with joblib to
# <file_args['save_dir']>/test_results_dict.pkl
fn = os.path.join(file_args['save_dir'], 'test_results_dict.pkl')
joblib.dump(test_results_dict, fn)

In [None]:
# plot the stage 1 entries of test_results_dict for the configured scoring metric
# (plot contents are determined by the project's Utility.metrics_plot_model helper)
util.metrics_plot_model(test_results_dict, stage='stage_1', score_metric=file_args['scoring_metric'], 
                        sort=True, title_prefix='test')

In [None]:
# plot the stage 2 entries of test_results_dict for the configured scoring metric
# (plot contents are determined by the project's Utility.metrics_plot_model helper)
util.metrics_plot_model(test_results_dict, stage='stage_2', score_metric=file_args['scoring_metric'], 
                        sort=True, title_prefix='test')

In [None]:
# report wall-clock time elapsed since eval_results_start was recorded
eval_results_elapsed = time.time() - eval_results_start
print('--- evaluating results took {0:.1f} seconds ---'.format(eval_results_elapsed))

In [None]:
# report wall-clock time elapsed since start_time was recorded at the top of the notebook
step_3_elapsed = time.time() - start_time
print('--- step 3 took {0:.1f} seconds ---'.format(step_3_elapsed))

# 4. write predictions to csv and database

In [None]:
# get info from database
# Read the SQLAlchemy connection string from the file whose path is stored in
# `sqlalchemy_conn_str_file` and build the engine used by the to_sql calls.
# The path variable is deliberately NOT overwritten with the file contents, so
# this cell can be re-run; the context manager closes the file handle, and
# strip() drops a trailing newline that would otherwise end up in the URL.
with open(sqlalchemy_conn_str_file, 'r') as conn_str_fh:
    sqlalchemy_conn_str = conn_str_fh.read().strip()
engine = create_engine(sqlalchemy_conn_str)

In [None]:
# write train predictions to the 'train_predictions' table through `engine`;
# if_exists='replace' overwrites an existing table, rows are written 1000 at a time
train_preds.to_sql('train_predictions', con=engine, if_exists='replace', chunksize=1000)

In [None]:
# write test predictions to the 'test_predictions' table through `engine`;
# if_exists='replace' overwrites an existing table, rows are written 1000 at a time
test_preds.to_sql('test_predictions', con=engine, if_exists='replace', chunksize=1000)

In [None]:
# write train predictions to <save_dir>/train_predictions.csv without the index column
# NOTE(review): this uses the bare `save_dir` name, while test_results_dict.pkl is
# written under file_args['save_dir'] -- confirm both refer to the same directory
fn = os.path.join(save_dir, 'train_predictions.csv')
train_preds.to_csv(fn, index=False, chunksize=1000)

In [None]:
# write test predictions to <save_dir>/test_predictions.csv without the index column
# NOTE(review): this uses the bare `save_dir` name, while test_results_dict.pkl is
# written under file_args['save_dir'] -- confirm both refer to the same directory
fn = os.path.join(save_dir, 'test_predictions.csv')
test_preds.to_csv(fn, index=False, chunksize=1000)