In [6]:
import pandas as pd
import numpy as np
import json 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree, model_selection, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Read the cleaned/processed dataset and discard the 'Unnamed: 0' column
# (presumably an index that was written out with the CSV - confirm).
df = pd.read_csv("./Output/data_clean&processed.csv").drop(columns='Unnamed: 0')
print(df.shape)

(985, 47)


In [8]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,current_currency,deadline,fx_rate,goal,...,backers_count_log,goal_log,pledged_log,duration_log,n_comments_log,num_faq_bool,avg_fund_per_backer,comments_afinn,blurb_name,faq_comments
0,1,With your help we will create this device that...,"{'id': 331, 'name': '3D Printing', 'slug': 'te...",1,ES,2015-08-18 21:01,USD,2016-07-09 20:11,1.212886,15000,...,-1.390561,0.689521,-1.915118,-0.012145,-0.710711,0,1.0,-1.0,With your help we will create this device that...,[' ']No comments yet.
1,2,We at Ormiston Primary are looking at starting...,"{'id': 309, 'name': 'Farms', 'slug': 'food/far...",9,NZ,2015-08-11 18:04,USD,2015-09-11 15:55,0.723585,5000,...,-1.16855,0.107232,-1.25713,-0.012145,-0.710711,0,7.5,-1.0,We at Ormiston Primary are looking at starting...,[' ']No comments yet.
2,0,Self-taught aspiring metalsmith Looking for he...,"{'id': 54, 'name': 'Mixed Media', 'slug': 'art...",0,US,2015-04-28 21:14,USD,2015-05-28 21:14,1.0,10000,...,-1.77009,0.474607,-2.134448,-0.099817,-0.710711,0,0.0,-1.0,Self-taught aspiring metalsmith Looking for he...,[' ']No comments yet.
3,0,So many women believe they are past their prim...,"{'id': 278, 'name': 'People', 'slug': 'photogr...",0,US,2014-07-07 1:30,USD,2014-10-26 0:00,1.0,2000,...,-1.77009,-0.378322,-2.134448,-0.012145,-0.710711,0,0.0,-1.0,So many women believe they are past their prim...,[' ']No comments yet.
4,10,The Horror Zine's Jeani Rector brings us anoth...,"{'id': 324, 'name': 'Anthologies', 'slug': 'pu...",340,US,2014-11-04 16:30,USD,2014-12-09 9:20,1.0,2500,...,-0.457135,-0.260089,-0.289093,-0.012145,-0.23295,0,34.0,2.0,The Horror Zine's Jeani Rector brings us anoth...,[' ']Paula Limbaugh\nover 6 years ago\nSo so...


In [9]:
# Show every column name in the loaded frame.
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'current_currency', 'deadline', 'fx_rate',
       'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'profile',
       'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
       'duration', 'parent_category', 'category_name', 'location_name',
       'month_launched', 'year_launched', 'backers_count_log', 'goal_log',
       'pledged_log', 'duration_log', 'n_comments_log', 'num_faq_bool',
       'avg_fund_per_backer', 'comments_afinn', 'blurb_name', 'faq_comments'],
      dtype='object')

In [135]:
# Feature groups, by how each column has to be pre-processed.

# Columns to be one-hot encoded.
categorical_cols = [
    'country',
    'staff_pick',
    'usd_type',
    'parent_category',
    'category_name',
]

# Numeric columns.
# NOTE(review): 'pledged' and 'backers_count' look like campaign outcomes; if
# they are only known once the campaign has finished, using them to predict
# 'state' would leak the target - confirm before feeding them to a model.
num_cols = [
    'backers_count',
    'fx_rate',
    'goal',
    'pledged',
    'num_faq',
    'n_comments',
    'duration',
    'month_launched',
    'year_launched',
]

# Free-text columns.
text_cols = [
    'blurb',
    'name',
    'story',
    'faq',
    'comments',
]

In [138]:
# Separate the target column ('state') from the features.
y = df['state']
X = df.drop(columns='state')
# y.mean()  # recorded value: 0.6370449678800857

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2019,
    stratify=y,
)

In [139]:
# One Hot Encoding
from pandas.api.types import CategoricalDtype

# Build the encoded frame from the full feature frame X in a single concat.
#
# For every categorical column the result holds, in this order:
#   1. the column itself, cast to a categorical dtype whose categories are all
#      values seen in X (sorted, so the category order is the same on every
#      run - iterating a set() of strings is not);
#   2. one indicator column per value, named '<column>_<value>'.
#
# Both pieces are taken from X. Previously the categorical copy came from
# X_train while the indicators came from X, which left NaN in the categorical
# copy for every non-training row and made the row order depend on how the
# two different indexes were merged.
#
# NOTE(review): the categorical copies are not numeric and must be dropped
# before this frame is passed to an estimator. They are kept here only so the
# column layout stays the same for any later cell that refers to them.
ohe_parts = []
for col in categorical_cols:
    levels = sorted(X[col].dropna().unique())
    ohe_parts.append(X[col].astype(CategoricalDtype(levels)))
    ohe_parts.append(pd.get_dummies(X[col], prefix=col))

OHE_df = pd.concat(ohe_parts, axis=1)

# Every piece shares X's index, so rows line up one-to-one with X.
assert OHE_df.index.equals(X.index)

In [140]:
# List the names of all columns produced by the encoding step.
list(OHE_df.columns)

['country',
 'country_AT',
 'country_AU',
 'country_BE',
 'country_CA',
 'country_CH',
 'country_DE',
 'country_DK',
 'country_ES',
 'country_FR',
 'country_GB',
 'country_HK',
 'country_IE',
 'country_IT',
 'country_JP',
 'country_LU',
 'country_MX',
 'country_NL',
 'country_NO',
 'country_NZ',
 'country_SE',
 'country_SG',
 'country_US',
 'staff_pick',
 'staff_pick_False',
 'staff_pick_True',
 'usd_type',
 'usd_type_domestic',
 'usd_type_international',
 'parent_category',
 'parent_category_Art',
 'parent_category_Comics',
 'parent_category_Crafts',
 'parent_category_Dance',
 'parent_category_Fashion',
 'parent_category_Film & Video',
 'parent_category_Food',
 'parent_category_Games',
 'parent_category_Journalism',
 'parent_category_Music',
 'parent_category_No Parent Category',
 'parent_category_Photography',
 'parent_category_Publishing',
 'parent_category_Technology',
 'parent_category_Theater',
 'category_name',
 'category_name_3D Printing',
 'category_name_Academic',
 'category_

# GridSearch

In [148]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate a forest with library defaults and list the names of the
# hyper-parameters it exposes.
classifier = RandomForestClassifier()
classifier.get_params(deep=True).keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [149]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched exhaustively: 2 * 3 * 4 * 4 = 96 candidates.
# NOTE(review): an earlier comment said this grid was derived from a random
# search; that search is not part of this section of the notebook.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 3],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 500]
}

# Base model. random_state is fixed so that repeated runs of the search score
# each candidate identically and therefore select the same best_params_
# (4222 is the seed value the final model cell already uses).
rf = RandomForestClassifier(random_state=4222)

# 3-fold cross-validated grid search, using all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Fit the grid search on the training data and show the winning combination.
grid_search.fit(OHE_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   12.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed:  1.2min finished


{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_split': 2,
 'n_estimators': 100}

# Random Forest

In [150]:
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG through the existing `np` alias. The previous
# `from numpy import *` was only needed to reach `random.seed`, and it also
# replaced built-ins such as sum, min, max, abs, round, any and all with the
# NumPy versions for the rest of the notebook.
# The classifier below is created without random_state, so scikit-learn draws
# its randomness from this global RNG.
np.random.seed(4222)

# Settings selected by the grid search:
#   criterion='entropy', n_estimators=100
#   max_depth=None and min_samples_split=2 are the library defaults, so they
#   are not passed explicitly.
classifier = RandomForestClassifier(criterion="entropy", n_estimators=100)

In [151]:
# Print the shapes of the train/test features and targets, one per line.
for part in (OHE_train, y_train, OHE_test, y_test):
    print(part.shape)

(735, 122)
(735,)
(184, 122)
(184,)


In [152]:
# Train the forest on the encoded training features.
classifier.fit(X=OHE_train, y=y_train)

RandomForestClassifier(criterion='entropy')

# Model Evaluation

In [153]:
# Evaluate Model
# Hard class predictions feed the confusion matrix, accuracy and MSE; the
# second predict_proba column is used as the score for ROC AUC.
y_predict_class = classifier.predict(OHE_test)
second_class_proba = classifier.predict_proba(OHE_test)[:, 1]

report_template = """
The evaluation report is:
Confusion Matrix:
{}
Accuracy: {}
MSE: {}
AUC:{}
"""

report = report_template.format(
    confusion_matrix(y_test, y_predict_class),
    accuracy_score(y_test, y_predict_class),
    mean_squared_error(y_test, y_predict_class),
    metrics.roc_auc_score(y_test, second_class_proba),
)
print(report)


The evaluation report is:
Confusion Matrix:
[[ 64   9]
 [  3 108]]
Accuracy: 0.9347826086956522
MSE: 0.06521739130434782
AUC:0.9808095766999877

