In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import requests as r
from math import sin, cos, sqrt, atan2, radians
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SequentialFeatureSelector, RFECV
from scipy import stats

# Seed handed to train_test_split, KFold and the seeded estimators in the cells below.
random_state=42
# NOTE(review): num_samples and itr are not referenced in the visible cells — confirm they are still needed.
num_samples = 75000
itr = 50

In [None]:
# Read the dataset CSV directly from its public S3 URL (needs network access on every fresh run).
url = 'https://envera-consulting-public-assets.s3.us-west-1.amazonaws.com/ssie-637-final-dataset.csv'
df = pd.read_csv(url)

In [None]:
# (rows, columns) of the raw dataset as loaded.
df.shape

In [None]:
# Continuous columns that get the IQR outlier filter: the seven delta_*_min
# intervals followed by the distance.
sub_list2 = ['delta_{}_min'.format(step) for step in range(1, 8)] + ['distance_km']

# Full modelling subset: the two categorical columns plus every continuous column.
sub_list = ['hour', 'Call Priority'] + sub_list2

In [None]:
# Keep only the modelling columns named in sub_list.
new_df2 = df[sub_list]

In [None]:
# Peek at the first two rows of the column subset.
new_df2.head(2)

In [None]:
# Tukey-fence outlier filter: drop every row in which any continuous column
# (sub_list2) lies more than 1.5 * IQR below Q1 or above Q3.
#
# The quartiles are always computed from the unfiltered subset df[sub_list],
# not from new_df2 itself, so re-running this cell gives the same result
# instead of tightening the fences (and shrinking the data) on every run.
_unfiltered = df[sub_list]
_continuous = _unfiltered[sub_list2]

Q1 = _continuous.quantile(0.25)
Q3 = _continuous.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# True for rows with no value outside the fences. NaN compares False on both
# sides, so rows with missing values are kept, exactly as before.
condition = ~((_continuous < lower_fence) | (_continuous > upper_fence)).any(axis=1)

# .copy() detaches the result from df so later column assignments cannot
# raise SettingWithCopyWarning.
new_df2 = _unfiltered[condition].copy()

In [None]:
# (rows, columns) remaining after the IQR outlier filter.
new_df2.shape

In [None]:
# Summary statistics of the filtered frame.
new_df2.describe()

In [None]:
# Pull each modelling column out as a plain Python list. The histogram cell
# further down reuses several of these names, so all of them stay defined.
delta_1_min_list = new_df2['delta_1_min'].values.tolist()
delta_2_min_list = new_df2['delta_2_min'].values.tolist()
delta_3_min_list = new_df2['delta_3_min'].values.tolist()
delta_4_min_list = new_df2['delta_4_min'].values.tolist()
delta_5_min_list = new_df2['delta_5_min'].values.tolist()
delta_6_min_list = new_df2['delta_6_min'].values.tolist()
delta_7_min_list = new_df2['delta_7_min'].values.tolist()
hour_list = new_df2['hour'].values.tolist()
priority_list = new_df2['Call Priority'].values.tolist()
distance_list = new_df2['distance_km'].values.tolist()

# Grid layout and tick styling shared by all eight box plots.
num_rows = 2
num_col = 4
rotation = 0
fontsize = 13

# One entry per panel, in subplot order:
# (values, x tick label = source column name, optional (bottom, top) y-limits).
boxplot_panels = [
    (delta_1_min_list, 'delta_1_min', None),
    (delta_2_min_list, 'delta_2_min', None),
    (delta_3_min_list, 'delta_3_min', None),
    (delta_4_min_list, 'delta_4_min', None),
    (delta_5_min_list, 'delta_5_min', (-0.9, 140)),
    (delta_6_min_list, 'delta_6_min', None),
    (delta_7_min_list, 'delta_7_min', (-0.1, 5)),
    (distance_list, 'distance_km', (0, 20)),
]

plt.figure(figsize=(10, 10))
plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.2)

for panel_idx, (panel_values, panel_label, panel_ylim) in enumerate(boxplot_panels, start=1):
    plt.subplot(num_rows, num_col, panel_idx)
    plt.boxplot([panel_values])
    plt.xticks([1], [panel_label], fontsize=fontsize, rotation=rotation)
    if panel_ylim is not None:
        # plt.ylim only takes (bottom, top). The third positional value that
        # used to be passed here was never a tick step, and recent Matplotlib
        # releases reject extra positional arguments outright.
        plt.ylim(*panel_ylim)
    # Style the y tick labels after the limits are final so every tick is covered.
    plt.yticks(fontsize=fontsize)

plt.show()

In [None]:
# One row of three density histograms: delta_6_min (panel A), call hour
# (panel B) and response distance (panel C).
plt.figure(figsize=(15, 4))
plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9,
                    wspace=0.4, hspace=0.5)

# Per-panel settings. xtick_locs=None keeps Matplotlib's automatic tick
# positions; ylim=None leaves the y-range automatic.
hist_panels = [
    {'values': delta_6_min_list, 'bins': 500,
     'xlabel': 'Response Time (min)',
     'xtick_locs': None, 'ylim': (0, 0.225),
     'title': 'A: Histogram of the \n Response Time (min)'},
    {'values': hour_list, 'bins': 24,
     'xlabel': 'Call Hour',
     'xtick_locs': np.arange(0, 24, step=2), 'ylim': None,
     'title': 'B: Histogram of Hour that \n the Call was Received \n (System Load)'},
    {'values': distance_list, 'bins': 50,
     'xlabel': 'Response Distance (km)',
     'xtick_locs': np.arange(0, 20, step=2), 'ylim': (0, 0.2),
     'title': 'C: Histogram of the Response \n Distance (km)'},
]

for slot, panel in enumerate(hist_panels, start=1):
    plt.subplot(1, 3, slot)
    plt.hist(panel['values'], bins=panel['bins'], density=True, alpha=0.75)
    plt.xlabel(panel['xlabel'], fontsize=14)
    plt.ylabel('Count Frequency (unitless)', fontsize=14)
    if panel['xtick_locs'] is None:
        plt.xticks(fontsize=13)
    else:
        plt.xticks(panel['xtick_locs'], fontsize=13)
    plt.yticks(fontsize=13)
    if panel['ylim'] is not None:
        plt.ylim(*panel['ylim'])
    plt.title(panel['title'])

plt.show()

In [None]:
# Column subsets for the nine modelling datasets.
# Shared tail of the "wide" subsets: all seven deltas, then the distance.
_wide_tail = ['delta_{}_min'.format(step) for step in range(1, 8)] + ['distance_km']

# Wide subsets (every delta column present).
col_sub = ['Call Priority', 'hour'] + _wide_tail
col_sub2 = ['hour'] + _wide_tail
col_sub3 = ['Call Priority'] + _wide_tail

# Distance + delta_6_min with one or both categorical columns.
col_sub4 = ['hour', 'distance_km', 'delta_6_min']
col_sub5 = ['Call Priority', 'distance_km', 'delta_6_min']
col_sub6 = ['hour', 'Call Priority', 'distance_km', 'delta_6_min']

# delta_6_min with categorical columns only.
col_sub7 = ['Call Priority', 'delta_6_min']
col_sub8 = ['hour', 'delta_6_min']
col_sub9 = ['Call Priority', 'hour', 'delta_6_min']

In [None]:
# Input frame for 'dataset_1': priority, hour, all seven deltas and distance.
final_df = new_df2[col_sub]
final_df.describe()

In [None]:
# Input frame for 'dataset_2': the col_sub columns without 'Call Priority'.
final_df2 = new_df2[col_sub2]

In [None]:
# Preview the first rows of final_df2.
final_df2.head()

In [None]:
# Input frame for 'dataset_3': the col_sub columns without 'hour'.
final_df3 = new_df2[col_sub3]
final_df3.head()

In [None]:
# hour + distance_km + delta_6_min; paired with col_transform_6 as 'dataset_6' in data_dict.
final_df4 = new_df2[col_sub4]
final_df4.head()

In [None]:
# Call Priority + distance_km + delta_6_min; paired with col_transform_4 as 'dataset_4' in data_dict.
final_df5 = new_df2[col_sub5]
final_df5.head()

In [None]:
# hour + Call Priority + distance_km + delta_6_min; paired with col_transform_5 as 'dataset_5' in data_dict.
final_df6 = new_df2[col_sub6]
final_df6.head()

In [None]:
# Call Priority + delta_6_min only; used as 'dataset_7' in data_dict.
final_df7 = new_df2[col_sub7]
final_df7.head()

In [None]:
# hour + delta_6_min only; used as 'dataset_8' in data_dict.
final_df8 = new_df2[col_sub8]
final_df8.head()

In [None]:
# Call Priority + hour + delta_6_min; used as 'dataset_9' in data_dict.
final_df9 = new_df2[col_sub9]
final_df9.head()

In [None]:
# Transformer prototypes shared by every column transformer below.
one = OneHotEncoder()
std = StandardScaler()

# Numeric column groups handed to the scaler. 'delta_6_min' is listed last in
# each group; the dataset-building cell slices the last output column as y.
_numeric_all = ['distance_km', 'delta_1_min', 'delta_2_min', 'delta_3_min',
                'delta_4_min', 'delta_5_min', 'delta_7_min', 'delta_6_min']
_numeric_distance = ['distance_km', 'delta_6_min']
_numeric_target = ['delta_6_min']


def _make_transformer(categorical_cols, numeric_cols):
    """One-hot encode categorical_cols, standard-scale numeric_cols, pass other columns through."""
    return make_column_transformer(
        (one, list(categorical_cols)),
        (std, list(numeric_cols)),
        remainder='passthrough')


# Wide datasets: every delta column plus distance.
col_transform_1 = _make_transformer(['Call Priority', 'hour'], _numeric_all)   # final_df
col_transform_2 = _make_transformer(['hour'], _numeric_all)                    # final_df2
col_transform_3 = _make_transformer(['Call Priority'], _numeric_all)           # final_df3

# Distance + delta_6_min datasets.
col_transform_5 = _make_transformer(['hour', 'Call Priority'], _numeric_distance)  # final_df6
col_transform_4 = _make_transformer(['Call Priority'], _numeric_distance)          # final_df5
col_transform_6 = _make_transformer(['hour'], _numeric_distance)                   # final_df4

# delta_6_min with categorical columns only.
col_transform_9 = _make_transformer(['hour', 'Call Priority'], _numeric_target)  # final_df9
col_transform_7 = _make_transformer(['Call Priority'], _numeric_target)          # final_df7
col_transform_8 = _make_transformer(['hour'], _numeric_target)                   # final_df8

In [None]:
# (input frame, column transformer) pairs in dataset order; the position in
# this list (1-based) becomes the 'dataset_N' name.
_frame_transformer_pairs = [
    (final_df, col_transform_1),
    (final_df2, col_transform_2),
    (final_df3, col_transform_3),
    (final_df5, col_transform_4),
    (final_df6, col_transform_5),
    (final_df4, col_transform_6),
    (final_df7, col_transform_7),
    (final_df8, col_transform_8),
    (final_df9, col_transform_9),
]

# One {'col_transform': [spec]} wrapper per dataset, as the dataset-building cell expects.
data_dict = [
    {'col_transform': [{'data': frame,
                        'transformer': transformer,
                        'dataset': 'dataset_{}'.format(position)}]}
    for position, (frame, transformer) in enumerate(_frame_transformer_pairs, start=1)
]

In [None]:
# Fit every column transformer on its frame and split the result into a
# feature matrix X and a target column y.
#
# Each transformer lists 'delta_6_min' as its final scaled column, so the
# target is always the LAST output column. A ColumnTransformer returns a
# sparse matrix or a dense ndarray depending on how sparse the stacked
# output is, so both cases are handled explicitly. The previous bare
# `except` fallback for dense output took the FIRST column (a one-hot
# indicator) as y and left the real target inside X.
dataset = []
except_list = []  # indices of datasets whose transformer output was already dense
for idx, each_trans in enumerate(data_dict):
    spec = each_trans['col_transform'][0]
    trans = spec['transformer']
    data = spec['data']
    name_str = spec['dataset']

    data_transformed = trans.fit_transform(data)
    if hasattr(data_transformed, 'toarray'):
        # Sparse output: densify so it can be sliced like an ndarray.
        dense_array = data_transformed.toarray()
    else:
        except_list.append(idx)
        dense_array = np.asarray(data_transformed)

    X = dense_array[:, :-1]   # every column except the target
    y = dense_array[:, -1:]   # target kept 2-D, shape (n_rows, 1)

    elm_dict = {
        'dataset': name_str,
        'data': [{'X': X, 'y': y}],
        # Number of feature columns (this used to store the row count via len(X)).
        'num_features': X.shape[1]
    }
    dataset.append(elm_dict)

In [None]:
def train_func(dataset=None, model=None, test_size=None, random_state=None):
    """Fit ``model`` on each prepared dataset and tabulate hold-out and 10-fold CV scores.

    For every entry of ``dataset`` the function makes one train/test split,
    fits ``model`` on the training part, scores the predictions on the test
    part, then runs a shuffled 10-fold cross-validation on the full X/y with
    the estimator's default scorer. Scores are printed and collected.

    Parameters
    ----------
    dataset : list of dict
        Entries shaped like ``{'dataset': name, 'data': [{'X': X, 'y': y}]}``.
    model : estimator
        Object with ``fit``/``predict`` usable by ``cross_val_score``. It is
        fitted in place on the training split.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int or None
        Forwarded to ``train_test_split`` and ``KFold``.

    Returns
    -------
    pandas.DataFrame
        Nine rows per dataset with columns ``model``, ``dataset``,
        ``testsize``, ``metric_set``, ``metric`` and ``value``. When
        ``dataset`` is ``None`` or empty, an empty frame with those columns
        is returned instead of failing inside ``pd.concat``.
    """
    result_columns = ['model', 'dataset', 'testsize', 'metric_set', 'metric', 'value']
    if not dataset:
        return pd.DataFrame(columns=result_columns)

    metric_sets = ['Non-CV'] * 5 + ['10 Fold-CV'] * 4
    metric_names = ['R2_Score', 'MSE_Score', 'MAE_Score', 'RMSE_Score', 'RMSLE_Score',
                    'Mean_Score', 'Min_Score', 'Max_Score', 'cv_scores']
    n_rows = len(metric_names)

    df_list = []

    for each in dataset:
        features = each['data'][0]['X']
        target = each['data'][0]['y']

        X_train, X_test, y_train, y_test = train_test_split(features,
                                                            target,
                                                            test_size=test_size,
                                                            random_state=random_state)
        reg = model.fit(X_train, y_train)
        y_pred_time = reg.predict(X_test)

        r2 = r2_score(y_test, y_pred_time)
        mse = mean_squared_error(y_test, y_pred_time)
        mae = mean_absolute_error(y_test, y_pred_time)
        rmse = np.sqrt(mse)
        # NOTE(review): this is log(RMSE), not a root-mean-squared-log-error.
        # The 'RMSLE_Score' label is kept so existing outputs stay comparable.
        rmsle = np.log(rmse)

        print('')
        print('-----------------')
        print('Dataset: {}, Model: {}, R2_Score: {:.5f}, MSE_Score: {:.5f}, MAE_Score: {:.5f}, RMSE_Score: {:.5f}'.format(each['dataset'],
                                                            model, r2, mse, mae, rmse))

        cv = KFold(n_splits=10,
                   shuffle=True,
                   random_state=random_state)

        # scoring=None means the estimator's own default score method is used.
        cv_scores = cross_val_score(estimator=model,
                                    X=features,
                                    y=target,
                                    scoring=None,
                                    cv=cv)
        print('')
        print('******* 10-Fold Cross Validation Scores *******')
        print('Dataset: {}, Model: {}, Mean_Score (Default): {:.5f}, Min_Score: {:.5f} Max_Score: {:.5f}, List of CV Scores: {}'.format(each['dataset'],
                                                                                                     model,
                                                                                                     np.mean(cv_scores),
                                                                                                     cv_scores.min(),
                                                                                                     cv_scores.max(),
                                                                                                     cv_scores))

        # One row per metric; the final 'value' cell holds the whole array of fold scores.
        df = pd.DataFrame({'model': [model] * n_rows,
                           'dataset': [each['dataset']] * n_rows,
                           'testsize': [test_size] * n_rows,
                           'metric_set': metric_sets,
                           'metric': metric_names,
                           'value': [r2, mse, mae, rmse, rmsle, np.mean(cv_scores), cv_scores.min(), cv_scores.max(), cv_scores]})

        df_list.append(df)

    return pd.concat(df_list)
        

In [None]:
# Run 1: ordinary least squares, 30% of rows held out for testing.
model = LinearRegression()
run1 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 2: ordinary least squares, 50% of rows held out for testing.
model = LinearRegression()
run2 = train_func(dataset=dataset, model=model, test_size=0.5, random_state=random_state)

In [None]:
# Run 3: ordinary least squares, 70% of rows held out for testing.
model = LinearRegression()
run3 = train_func(dataset=dataset, model=model, test_size=0.7, random_state=random_state)

In [None]:
# Run 4: ordinary least squares, 80% of rows held out for testing.
model = LinearRegression()
run4 = train_func(dataset=dataset, model=model, test_size=0.8, random_state=random_state)

In [None]:
# Run 5: ordinary least squares, 90% of rows held out for testing.
model = LinearRegression()
run5 = train_func(dataset=dataset, model=model, test_size=0.9, random_state=random_state)

In [None]:
# Run 6: Lasso with alpha=0.025, 30% test split.
model = linear_model.Lasso(alpha=0.025, random_state=random_state)
run6 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 13: Lasso with alpha=0.050, 30% test split.
model = linear_model.Lasso(alpha=0.050, random_state=random_state)
run13 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 14: Lasso with alpha=0.075, 30% test split.
model = linear_model.Lasso(alpha=0.075, random_state=random_state)
run14 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 15: Lasso with alpha=0.1, 30% test split.
model = linear_model.Lasso(alpha=0.1, random_state=random_state)
run15 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 16: Lasso with alpha=0.5, 30% test split.
model = linear_model.Lasso(alpha=0.5, random_state=random_state)
run16 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 17: Lasso with alpha=0.75, 30% test split.
model = linear_model.Lasso(alpha=0.75, random_state=random_state)
run17 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 18: Lasso with alpha=0.35, 30% test split.
model = linear_model.Lasso(alpha=0.35, random_state=random_state)
run18 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 7: Lasso with alpha=0 (no L1 penalty), 30% test split.
# NOTE(review): scikit-learn's Lasso documentation advises against alpha=0 for
# numerical reasons and recommends LinearRegression instead — confirm this run is intended.
model = linear_model.Lasso(alpha=0, random_state=random_state)
run7 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 8: Lasso with alpha=1, 30% test split.
model = linear_model.Lasso(alpha=1, random_state=random_state)
run8 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 9: Ridge with alpha=8.0, 30% test split.
model = Ridge(alpha=8.0, random_state=random_state)
run9 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 10: Ridge with alpha=1, 30% test split.
model = Ridge(alpha=1, random_state=random_state)
run10 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 11: Ridge with alpha=5, 30% test split.
model = Ridge(alpha=5, random_state=random_state)
run11 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Run 12: ElasticNet with its default hyper-parameters (only the seed is set), 30% test split.
model = ElasticNet(random_state=random_state)
run12 = train_func(dataset=dataset, model=model, test_size=0.3, random_state=random_state)

In [None]:
# Stack the per-run score tables into one long frame. Each run's original row
# index is kept, so index values repeat across runs and datasets.
df_all_runs = pd.concat([run1, run2, run3, run4, run5, run6, 
                         run7, run8, run9, run10, run11, run12, 
                         run13, run14, run15, run16, run17, run18])

In [None]:
# (rows, columns) of the combined results table.
df_all_runs.shape

In [None]:
# Write the combined results to the working directory; the repeated row index is dropped.
df_all_runs.to_csv('modeling-output.csv', index=False)