In [17]:
from sklearn import model_selection
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from pandas.api.types import CategoricalDtype 
from sklearn.model_selection import cross_val_score
from mlxtend.classifier import StackingCVClassifier
import warnings
import pickle
import pandas as pd
from pandas.api.types import CategoricalDtype 
# Notebook-wide configuration.
# Silence every warning category for the rest of the session so library
# deprecation chatter does not clutter the cell outputs.
warnings.simplefilter(action='ignore')

# Seed shared by the stochastic steps of this notebook.
RANDOM_SEED = 42

# Import your models first

In [18]:
# Logistic Regression
# Restore the previously fitted estimator from `logreg.pckl`.
# The context manager closes the file handle even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('logreg.pckl', 'rb') as f:
    logreg = pickle.load(f)

In [19]:
# Decision Tree
# Restore the previously fitted estimator from `decisiontree.pckl`.
# The context manager closes the file handle even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('decisiontree.pckl', 'rb') as f:
    decisiontree = pickle.load(f)

In [20]:
# Random Forest
# Restore the previously fitted estimator from `randomforest.pckl`.
# The context manager closes the file handle even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('randomforest.pckl', 'rb') as f:
    randomforest = pickle.load(f)

In [21]:
# SVM
# Restore the previously fitted estimator from `svm.pckl`.
# The context manager closes the file handle even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by a trusted pipeline.
with open('svm.pckl', 'rb') as f:
    svm = pickle.load(f)

In [22]:
# Inspect models
# Show the repr (and therefore the hyper-parameters) of each loaded
# estimator, one per line.
print(logreg, decisiontree, randomforest, svm, sep='\n')

LogisticRegression(C=0.01, intercept_scaling=1.0, penalty='l1', random_state=42,
                   solver='liblinear')
DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=10,
                       min_samples_split=5, presort=None, random_state=40)
RandomForestClassifier(random_state=42)
SVC(gamma='auto', kernel='linear')


In [23]:
# Load the cleaned, pre-processed dataset (with the added sentiment columns)
# that the rest of the notebook works from.
df = pd.read_csv(filepath_or_buffer="./Output/data_clean&processed_addedsentiment.csv")

## Drop Text Columns

In [24]:
# Metadata, identifier, timestamp and AFINN score columns that are removed
# before modelling.
cols_to_drop = [
    'profile', 'category', 'created_at', 'location', 'current_currency',
    'deadline', 'id', 'launched_at', 'slug', 'source_url',
    'state_changed_at', 'urls', 'static_usd_rate', 'usd_pledged',
    'converted_pledged_amount', 'spotlight', 'name_afinn', 'blurb_afinn',
    'comments_afinn',
]

# Free-text columns, removed as well.
text_to_drop = ['blurb', 'name', 'story', 'faq', 'comments', 'blurb_name', 'faq_comments']

# Apply both removals in one chain; `df` itself is left untouched.
data_new = df.drop(columns=cols_to_drop).drop(columns=text_to_drop)

In [25]:
# Features that get one-hot encoded further down.
categorical_cols = [
    "country", "staff_pick", "usd_type",
    "parent_category", "num_faq_bool", "category_name",
]

# Split dataset
# The target is `state`; it and the columns listed below are excluded from
# the feature matrix.
y = data_new["state"]
X = data_new.drop(
    columns=['state', 'backers_count', 'pledged', 'pledged_log', 'duration',
             'n_comments', 'num_faq', 'goal', 'location_name']
)

# Stratify on y so train and test keep the same successful:failed ratio.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2019,
    stratify=y,
)

In [26]:
# One-hot encode every categorical feature over the full feature frame X.
# Each column is encoded exactly once and the pieces are joined in a single
# concat, so no intermediate frame is grown (and re-aligned) inside a loop
# and no train-only helper column has to be created and dropped again.
dummy_frames = [pd.get_dummies(X[col], prefix=col) for col in categorical_cols]

# Non-categorical features are carried over unchanged.
X_no_cat = X.drop(categorical_cols, axis=1)

# Final design matrix: untouched features first, then the indicator columns
# in the order given by `categorical_cols`. All pieces share X's index, so
# rows stay aligned with y.
OHE_df = pd.concat([X_no_cat] + dummy_frames, axis=1)

#OHE_train, OHE_test, y_train, y_test = model_selection.train_test_split(OHE_df, y, test_size=0.2, random_state=2019)

# Create stacking model

In [27]:
# Stack the three base estimators under the SVM meta-classifier.
# Seeding the stacker pins its internal cross-validation splitter, so the
# stacked score is repeatable across kernel restarts.
sclf = StackingCVClassifier(classifiers=[decisiontree, logreg, randomforest],
                            meta_classifier=svm,
                            random_state=RANDOM_SEED)

# Number of outer folds used to score every model below.
N_FOLDS = 5

print('%d-fold cross validation:\n' % N_FOLDS)

# (estimator, display label) pairs, scored in this order.
models_to_score = [
    (decisiontree, 'Decision Tree'),
    (logreg, 'Logistic Regression'),
    (randomforest, 'Random Forest'),
    (sclf, 'Stacked Model'),
]

# NOTE(review): scoring runs on the full OHE_df / y, which includes the rows
# held out in X_test above, so these numbers are not independent of that
# test split. Confirm this is intended.
for clf, label in models_to_score:
    sclf_scores = model_selection.cross_val_score(clf, OHE_df, y,
                                                  cv=N_FOLDS, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (sclf_scores.mean(), sclf_scores.std(), label))

5-fold cross validation:

Accuracy: 0.9561 (+/- 0.0093) [Decision Tree]
Accuracy: 0.9389 (+/- 0.0087) [Logistic Regression]
Accuracy: 0.9205 (+/- 0.0285) [Random Forest]
Accuracy: 0.9588 (+/- 0.0074) [Stacked Model]
