In [1]:
from sklearn import model_selection
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from pandas.api.types import CategoricalDtype 
from sklearn.model_selection import cross_val_score
from mlxtend.classifier import StackingCVClassifier
import warnings
import pickle
import pandas as pd
from pandas.api.types import CategoricalDtype 
warnings.simplefilter('ignore')
RANDOM_SEED = 42

# Import your models first

In [2]:
# Logistic Regression
f = open('logreg.pckl', 'rb')
logreg = pickle.load(f)
f.close()

In [3]:
# Decision Tree
f = open('decisiontree.pckl', 'rb')
decisiontree = pickle.load(f)
f.close()

In [4]:
# Random Forest
f = open('randomforest.pckl', 'rb')
randomforest = pickle.load(f)
f.close()

In [5]:
# SVM
f = open('svm.pckl', 'rb')
svm = pickle.load(f)
f.close()

In [6]:
# Inspect models
print(logreg)
print(decisiontree)
print(randomforest)
print(svm)

LogisticRegression(random_state=42)
DecisionTreeClassifier(criterion='entropy', max_depth=4, min_samples_leaf=20,
                       min_samples_split=50, presort=None, random_state=40)
RandomForestClassifier(criterion='entropy', min_samples_split=12,
                       n_estimators=200)
SVC(gamma='auto', kernel='linear')


## Reintroduce X and y

In [7]:
df = pd.read_csv("./Output/data_clean&processed_addedsentiment.csv")
categorical_cols = ['country', 'staff_pick', 'usd_type', 'parent_category', 'category_name', 'location_name']
num_cols = ['backers_count', 'fx_rate', 'goal', 'pledged', 'num_faq', 'n_comments', 'duration']
text_cols = ['blurb', 'name', 'story', 'faq', 'comments']
# Split dataset
X = df.drop('state', axis=1)
y = df.state

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=2019)

# One Hot Encoding with training set only
OHE_df = pd.DataFrame()

for i in categorical_cols:
    OHE_df[i] = X_train[i].astype(CategoricalDtype(list(set(X[i]))))
    
    OHE_df = pd.concat([OHE_df, pd.get_dummies(X[i], prefix=i)],axis=1)

OHE_df = OHE_df.drop(categorical_cols, axis= 1)

X_no_cat = X.drop(categorical_cols, axis= 1)
OHE_df = pd.concat([X_no_cat, OHE_df],axis=1)

# For Decision Tree
OHE_df = OHE_df.drop(OHE_df.loc[:, OHE_df.columns.str.contains('_log')].columns.tolist(), axis=1)
OHE_df = OHE_df.drop(text_cols, axis=1)

In [8]:
X = OHE_df.fx_rate.values.reshape(-1,1)
y = df.state

## Drop Text Columns

In [9]:
cols_to_drop = ['profile', 'category', 'created_at', 'location', 'current_currency', 'deadline', 'id', 
                'launched_at', 'slug', 'source_url', 'state_changed_at', 'urls', 'static_usd_rate', 
                'usd_pledged', 'converted_pledged_amount', 'spotlight', 'name_afinn', 'blurb_afinn','comments_afinn']

data_new = df.drop(cols_to_drop, axis=1)

text_to_drop = ['blurb','name','story','faq','comments','blurb_name','faq_comments']
data_new = data_new.drop(text_to_drop , axis=1)

In [10]:
categorical_cols = ["country","staff_pick","usd_type","parent_category","num_faq_bool","category_name"]

# Split dataset
X = data_new.drop(['state','backers_count','pledged','pledged_log','duration','n_comments','num_faq','goal','location_name'], axis=1)
y = data_new.state

# set stratify = y to maintain ratio of successful:failed for train and test
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=2019, stratify = y)

In [11]:
OHE_df = pd.DataFrame()

for i in categorical_cols:
    OHE_df[i] = X_train[i].astype(CategoricalDtype(list(set(X[i]))))
    
    OHE_df = pd.concat([OHE_df, pd.get_dummies(X[i], prefix=i)],axis=1)
    
OHE_df = OHE_df.drop(categorical_cols, axis= 1)

X_no_cat = X.drop(categorical_cols, axis= 1)
OHE_df = pd.concat([X_no_cat, OHE_df],axis=1)

#OHE_train, OHE_test, y_train, y_test = model_selection.train_test_split(OHE_df, y, test_size=0.2, random_state=2019)

In [12]:
### Dont need run this

#models = [('lr',logreg),('svm',svm),('decisiontree',decisiontree),('randomforest',randomforest)]
#stacking = StackingClassifier(estimators=models)

# Create stacking model

In [13]:
sclf = StackingCVClassifier(classifiers=[logreg, decisiontree, randomforest, svm], 
                          meta_classifier=svm)

print('5-fold cross validation:\n')

for clf, label in zip([logreg, decisiontree, randomforest, sclf], 
                      ['Logistic Regression',
                       'Decision Tree', 
                        'Random Forest',
                        'Stacked Model']):

    sclf_scores = model_selection.cross_val_score(clf, OHE_df, y,
                                              cv=5, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (sclf_scores.mean(), sclf_scores.std(), label))

5-fold cross validation:

Accuracy: nan (+/- nan) [Logistic Regression]
Accuracy: 0.9107 (+/- 0.0156) [Decision Tree]
Accuracy: 0.9381 (+/- 0.0196) [Random Forest]
Accuracy: nan (+/- nan) [Stacked Model]
