In [26]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_curve, auc

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [27]:
df = pd.read_csv('../datafiles/CO_2014_2015.csv')
df.shape

(139616, 45)

In [28]:
cols = ['Age', 'CustomerLifetimeSpend',
       #'Customer1YearSpend', 
       'Customer3YearSpend', 'ThreeYearVisitCount',
       'SkiLoyaltyIndexRating', 'FrontRangeMarket',
       'Spend_Winter_2010', 'Spend_Winter_2011', 'Spend_Winter_2012',
       'Spend_Winter_2013', 'Spend_Winter_2014', #'Spend_Winter_2015',
       #'Household1YearSpend',
       'Household3YearSpend', 'HouseholdLifetimeSpend',
       'NumberOfHouseholdMembers', 'NumberOfAdults',
       'NumberOfChildren', 'Pass_Winter_2010', 'Pass_Winter_2011',
       'Pass_Winter_2012', 'Pass_Winter_2013', 'Pass_Winter_2014',
       'Early_Purchase_2014_2015', 'Regular_Purchase_2014_2015',
       'Late_Purchase_2014_2015', 'Super_Late_Purchase_2014_2015', 'January_15', 'February_15', 'March_15',
       'April_15', 'November_14', 'December_14', 'Total_Days_14_15', 'Total_Days_13_14', 
       'Ski_Resort1_2014_2015',
       'Ski_Resort2_2014_2015', 'Ski_Resort3_2014_2015', 'Ski_Resort4_2014_2015', 'Ski_Resort5_2014_2015']

#### Assigning X and y

In [29]:
X = df[cols]
y = df['Pass_Winter_2015']

### Baseline for Previous Pass Customers purchasing a 2015/2016 Pass

In [30]:
y.mean()

0.4682128122851249

#### Train, Test, Split of X and y

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =42)

In [32]:
y_train.mean()

0.46732943693177476

In [33]:
y_test.mean()

0.47086293834517534

#### Scaling the data for some of the models

In [34]:
ss = StandardScaler()
# Fit Scaler on X_train data from TTS
ss.fit(X_train)
# Transform X_Train data from TTS
X_train_scaled = ss.transform(X_train)
#Transform X_test from TTS
X_test_scaled = ss.transform(X_test)

## Pipeline of Classification Models based on Non-Scaled data

#### Removed KNN and SVC because they took a long time and did not score well

In [None]:
#('knn_class', KNeighborsClassifier()),
#('svc', SVC())

In [35]:
%%time
estimators = [
              ('Logistic Regression', LogisticRegression()),
              ('Decision Tree Classifier', DecisionTreeClassifier()),  
              ('Bagging Classifier', BaggingClassifier()),
              ('Random Forest Classifier', RandomForestClassifier()),
              ('AdaBoost Classifier', AdaBoostClassifier()),
              ('XGBoost Classifier', XGBClassifier()),
              ('Neural Net Classifier', MLPClassifier(alpha=1))
             ]
for estimator in estimators:
    pipe = Pipeline([estimator])
    print(estimator[0])
    print('Training Score: {}'.format(cross_val_score(pipe, X_train, y_train).mean()))
    pipe.fit(X_train, y_train)
    y_hat = pipe.predict(X_test)
    print('Testing Score: {}'.format(pipe.score(X_test, y_test)), '\n')
    print('F1 Score: {}'.format(f1_score(y_test, y_hat)), '\n')

Logistic Regression
Training Score: 0.7341565629431587
Testing Score: 0.7310050424020169 

F1 Score: 0.6818043176195479 

Decision Tree Classifier
Training Score: 0.7996600676350268
Testing Score: 0.806268622507449 

F1 Score: 0.7939042974702835 

Bagging Classifier
Training Score: 0.8418042093437199
Testing Score: 0.8472381388952556 

F1 Score: 0.8355336212214682 

Random Forest Classifier
Training Score: 0.8401234234458088
Testing Score: 0.8453758881503552 

F1 Score: 0.8331633126217194 

AdaBoost Classifier
Training Score: 0.7648980222472964
Testing Score: 0.7616032546413019 

F1 Score: 0.747350842568696 

XGBoost Classifier


  if diff:
  if diff:
  if diff:


Training Score: 0.8126766801662845


  if diff:
  if diff:


Testing Score: 0.81199862479945 

F1 Score: 0.8061333018199007 

Neural Net Classifier
Training Score: 0.6380349519888542
Testing Score: 0.7426369470547788 

F1 Score: 0.7571571463328919 

CPU times: user 3min 4s, sys: 7.1 s, total: 3min 11s
Wall time: 3min 16s


## Pipeline of Classification Models based on Scaled data

In [36]:
%%time
estimators = [
              ('Logistic Regression', LogisticRegression()),
              ('Decision Tree Classifier', DecisionTreeClassifier()),  
              ('Bagging Classifier', BaggingClassifier()),
              ('Random Forest Classifier', RandomForestClassifier()),
              ('AdaBoost Classifier', AdaBoostClassifier()),
              ('Neural Net Classifier', MLPClassifier(alpha=1)),
              ('XGBoost Classifier', XGBClassifier())              
             ]

for estimator in estimators:
    pipe = Pipeline([estimator])
    print(estimator[0])
    print('Training Score: {}'.format(cross_val_score(pipe, X_train_scaled, y_train).mean()))
    pipe.fit(X_train_scaled, y_train)
    y_hat = pipe.predict(X_test)
    print('Testing Score: {}'.format(pipe.score(X_test_scaled, y_test)), '\n')
    print('F1 Score: {}'.format(f1_score(y_test, y_hat)), '\n')

Logistic Regression
Training Score: 0.7336504162971677
Testing Score: 0.731692642677057 

F1 Score: 0.698146435961784 

Decision Tree Classifier
Training Score: 0.7982084659590307
Testing Score: 0.8067270226908091 

F1 Score: 0.4351331360946746 

Bagging Classifier
Training Score: 0.8424440516657965
Testing Score: 0.8478111391244556 

F1 Score: 0.5244267918396822 

Random Forest Classifier
Training Score: 0.8410497948843393
Testing Score: 0.8470662388264956 

F1 Score: 0.6307729352275464 

AdaBoost Classifier
Training Score: 0.7648980222472964
Testing Score: 0.7616032546413019 

F1 Score: 0.6633970295655393 

Neural Net Classifier
Training Score: 0.8081022630817288
Testing Score: 0.8046642218656888 

F1 Score: 0.5192059283431912 

XGBoost Classifier


  if diff:
  if diff:
  if diff:


Training Score: 0.8126766801662845


ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37'] ['Age', 'CustomerLifetimeSpend', 'Customer3YearSpend', 'ThreeYearVisitCount', 'SkiLoyaltyIndexRating', 'FrontRangeMarket', 'Spend_Winter_2010', 'Spend_Winter_2011', 'Spend_Winter_2012', 'Spend_Winter_2013', 'Spend_Winter_2014', 'Household3YearSpend', 'HouseholdLifetimeSpend', 'NumberOfHouseholdMembers', 'NumberOfAdults', 'NumberOfChildren', 'Pass_Winter_2010', 'Pass_Winter_2011', 'Pass_Winter_2012', 'Pass_Winter_2013', 'Pass_Winter_2014', 'Early_Purchase_2014_2015', 'Regular_Purchase_2014_2015', 'Late_Purchase_2014_2015', 'Super_Late_Purchase_2014_2015', 'January_15', 'February_15', 'March_15', 'April_15', 'November_14', 'December_14', 'Total_Days_14_15', 'Total_Days_13_14', 'Ski_Resort1_2014_2015', 'Ski_Resort2_2014_2015', 'Ski_Resort3_2014_2015', 'Ski_Resort4_2014_2015', 'Ski_Resort5_2014_2015']
expected f8, f7, f21, f16, f4, f6, f26, f28, f5, f3, f12, f14, f15, f37, f13, f20, f36, f25, f34, f23, f22, f17, f11, f24, f31, f35, f1, f9, f10, f30, f2, f0, f32, f29, f27, f33, f19, f18 in input data
training data did not have the following fields: SkiLoyaltyIndexRating, Total_Days_14_15, Spend_Winter_2010, Ski_Resort3_2014_2015, April_15, February_15, Age, Regular_Purchase_2014_2015, Pass_Winter_2012, Customer3YearSpend, Pass_Winter_2014, Pass_Winter_2010, FrontRangeMarket, Ski_Resort2_2014_2015, ThreeYearVisitCount, Ski_Resort5_2014_2015, Household3YearSpend, Late_Purchase_2014_2015, Total_Days_13_14, NumberOfHouseholdMembers, Spend_Winter_2014, Pass_Winter_2013, Ski_Resort4_2014_2015, November_14, Ski_Resort1_2014_2015, NumberOfChildren, Spend_Winter_2012, December_14, Spend_Winter_2011, Spend_Winter_2013, NumberOfAdults, March_15, Super_Late_Purchase_2014_2015, January_15, CustomerLifetimeSpend, Pass_Winter_2011, HouseholdLifetimeSpend, Early_Purchase_2014_2015