In [29]:
import pandas as pd
import numpy as np
import json 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas.api.types import CategoricalDtype
from sklearn import tree, model_selection, metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder


In [30]:
# Load the cleaned, sentiment-augmented dataset and report its dimensions.
DATA_PATH = "./Output/data_clean&processed_addedsentiment.csv"
df = pd.read_csv(DATA_PATH)
print(df.shape)

(985, 52)


In [31]:
# Preview the first rows of the loaded frame as a sanity check.
df.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,current_currency,deadline,fx_rate,goal,...,num_faq_bool,avg_fund_per_backer,comments_afinn,blurb_name,faq_comments,story_afinn,faq_comments_afinn,blurb_name_afinn,blurb_afinn,name_afinn
0,1,With your help we will create this device that...,"{'id': 331, 'name': '3D Printing', 'slug': 'te...",1,ES,2015-08-18 21:01,USD,2016-07-09 20:11,1.212886,15000,...,0,1.0,-1.0,With your help we will create this device that...,[' ']No comments yet.,-10.0,-1.0,4.0,2.0,2.0
1,2,We at Ormiston Primary are looking at starting...,"{'id': 309, 'name': 'Farms', 'slug': 'food/far...",9,NZ,2015-08-11 18:04,USD,2015-09-11 15:55,0.723585,5000,...,0,7.5,-1.0,We at Ormiston Primary are looking at starting...,[' ']No comments yet.,15.0,-1.0,1.0,1.0,0.0
2,0,Self-taught aspiring metalsmith Looking for he...,"{'id': 54, 'name': 'Mixed Media', 'slug': 'art...",0,US,2015-04-28 21:14,USD,2015-05-28 21:14,1.0,10000,...,0,0.0,-1.0,Self-taught aspiring metalsmith Looking for he...,[' ']No comments yet.,25.0,-1.0,4.0,2.0,2.0
3,0,So many women believe they are past their prim...,"{'id': 278, 'name': 'People', 'slug': 'photogr...",0,US,2014-07-07 1:30,USD,2014-10-26 0:00,1.0,2000,...,0,0.0,-1.0,So many women believe they are past their prim...,[' ']No comments yet.,14.0,-1.0,3.0,0.0,3.0
4,10,The Horror Zine's Jeani Rector brings us anoth...,"{'id': 324, 'name': 'Anthologies', 'slug': 'pu...",340,US,2014-11-04 16:30,USD,2014-12-09 9:20,1.0,2500,...,0,34.0,2.0,The Horror Zine's Jeani Rector brings us anoth...,[' ']Paula Limbaugh\nover 6 years ago\nSo so...,2.0,2.0,4.0,4.0,0.0


In [32]:
# Show every column name available in the loaded frame.
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'current_currency', 'deadline', 'fx_rate',
       'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'profile',
       'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
       'duration', 'parent_category', 'category_name', 'location_name',
       'month_launched', 'year_launched', 'backers_count_log', 'goal_log',
       'pledged_log', 'duration_log', 'n_comments_log', 'num_faq_bool',
       'avg_fund_per_backer', 'comments_afinn', 'blurb_name', 'faq_comments',
       'story_afinn', 'faq_comments_afinn', 'blurb_name_afinn', 'blurb_afinn',
       'name_afinn'],
      dtype='object')

In [33]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985 entries, 0 to 984
Data columns (total 52 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   backers_count             985 non-null    int64  
 1   blurb                     985 non-null    object 
 2   category                  985 non-null    object 
 3   converted_pledged_amount  985 non-null    int64  
 4   country                   985 non-null    object 
 5   created_at                985 non-null    object 
 6   current_currency          985 non-null    object 
 7   deadline                  985 non-null    object 
 8   fx_rate                   985 non-null    float64
 9   goal                      985 non-null    int64  
 10  id                        985 non-null    int64  
 11  launched_at               985 non-null    object 
 12  location                  985 non-null    object 
 13  name                      985 non-null    object 
 14  pledged   

In [34]:
# --- Columns that feed the model -------------------------------------------

# Categorical features that get one-hot encoded.
categorical_cols = [
    'country',
    'staff_pick',
    'usd_type',
    'parent_category',
    'category_name',
    'location_name',
]

# Numeric features passed through unchanged.
# NOTE(review): 'backers_count' and 'pledged' look like campaign outcomes and
# may leak the target ('state') - confirm they are known at prediction time.
cols_to_keep = [
    'backers_count',
    'fx_rate',
    'goal',
    'pledged',
    'n_comments',
    'duration',
    'month_launched',
    'year_launched',
    'avg_fund_per_backer',
    'num_faq_bool',
]

# AFINN sentiment scores computed on the text fields.
sentiment_cols = [
    'comments_afinn',
    'faq_comments_afinn',
    'blurb_name_afinn',
    'blurb_afinn',
    'name_afinn',
    'story_afinn',
]

# Prediction target.
target_col = ['state']

# --- Columns removed before modelling --------------------------------------

# Raw free-text fields.
text_cols = [
    'blurb',
    'name',
    'story',
    'faq',
    'comments',
    'blurb_name',
    'faq_comments',
]

# Log-transformed duplicates of numeric features.
log_cols = [
    'backers_count_log',
    'goal_log',
    'pledged_log',
    'duration_log',
    'n_comments_log',
]

# Identifiers, timestamps, URLs and other fields not used as features.
other_cols = [
    'profile',
    'category',
    'created_at',
    'location',
    'current_currency',
    'deadline',
    'id',
    'launched_at',
    'slug',
    'source_url',
    'state_changed_at',
    'urls',
    'static_usd_rate',
    'usd_pledged',
    'converted_pledged_amount',
    'spotlight',
    'num_faq',
]

In [35]:
# Gather every column excluded from modelling, then remove them in one call.
cols_to_drop = [*text_cols, *log_cols, *other_cols]
df = df.drop(columns=cols_to_drop)
print(df.shape)

(985, 23)


In [36]:
# Separate the features from the target ('state'), then hold out 20% for testing.
# (Earlier check left by the author: y.mean() was 0.6370449678800857.)
X = df.drop(columns='state')
y = df['state']

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2019,
)

In [37]:
# One-hot encode every categorical column.
#
# The dummy columns are derived from the full feature frame `X` (training and
# test rows together), so both splits end up with the same column layout.
# The raw categorical columns are kept alongside their dummies here; they are
# removed when the final feature frame is assembled.
#
# All pieces are concatenated once (instead of growing the frame inside the
# loop) and every piece shares X's index, so the row order of OHE_df is
# exactly the row order of X.
ohe_parts = [X[categorical_cols]]
ohe_parts.extend(pd.get_dummies(X[col], prefix=col) for col in categorical_cols)
OHE_df = pd.concat(ohe_parts, axis=1)

In [38]:
# Keep only the dummy columns, then attach them to the non-categorical features.
dummies_only = OHE_df.drop(columns=categorical_cols)
X_no_cat = X.drop(columns=categorical_cols)
OHE_df = pd.concat([X_no_cat, dummies_only], axis=1)

# For Decision Tree/Random Forest: discard any column whose name contains '_log'.
log_mask = OHE_df.columns.str.contains('_log')
OHE_df = OHE_df.drop(columns=OHE_df.columns[log_mask].tolist())

In [39]:
# Split the one-hot encoded frame with the same test size and seed as the
# earlier split of X.
OHE_train, OHE_test, y_train, y_test = model_selection.train_test_split(
    OHE_df,
    y,
    test_size=0.2,
    random_state=2019,
)

In [40]:
# Compare the number of feature columns before and after one-hot encoding.
for label, frame in (('Before OHE, n features = ', X_train),
                     ('After OHE, n features = ', OHE_train)):
    print(label, frame.shape[1])

Before OHE, n features =  22
After OHE, n features =  612


# GridSearch

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Build a default forest and list the names of the hyper-parameters it exposes.
classifier = RandomForestClassifier()
classifier.get_params(deep=True).keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [42]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to evaluate exhaustively
# (2 * 3 * 4 * 4 = 96 combinations).
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 3],
    'min_samples_split': [2, 8, 10, 12],
    'n_estimators': [100, 200, 300, 500]
}
# Base model. It is seeded so that the cross-validation scores, and therefore
# the selected best_params_, can be reproduced when the cell is re-run.
rf = RandomForestClassifier(random_state=4222)
# 3-fold cross-validated grid search, parallelised over all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)

# Fit on the training split only, then display the winning combination.
grid_search.fit(OHE_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed:  2.4min finished


{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_split': 10,
 'n_estimators': 200}

# Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global RNG through the `np` alias. A `from numpy import *`
# would shadow builtins such as `sum`, `any` and `all` for the rest of the
# notebook, so it is deliberately not used.
SEED = 4222
np.random.seed(SEED)

# Final model, configured with the combination selected by the grid search:
#   criterion='entropy', max_depth=None (default),
#   min_samples_split=10, n_estimators=200
# random_state is set on the estimator itself so that fitting gives the same
# forest no matter which cells ran in between.
classifier = RandomForestClassifier(criterion="entropy", min_samples_split=10,
                                    n_estimators=200, random_state=SEED)

In [44]:
# Print the dimensions of the train/test features and targets.
for part in (OHE_train, y_train, OHE_test, y_test):
    print(part.shape)

(788, 612)
(788,)
(197, 612)
(197,)


In [45]:
# Train the forest on the one-hot encoded training split.
classifier.fit(OHE_train, y_train)

RandomForestClassifier(criterion='entropy', min_samples_split=10,
                       n_estimators=200)

# Model Evaluation

In [46]:
# Score the fitted classifier on the held-out test split.
y_predict_class = classifier.predict(OHE_test)

# Compute the metrics first so the report template stays readable.
conf_mat = confusion_matrix(y_test, y_predict_class)
acc = accuracy_score(y_test, y_predict_class)

report = """
The evaluation report is:
Confusion Matrix:
{}
Accuracy: {}
""".format(conf_mat, acc)
print(report)


The evaluation report is:
Confusion Matrix:
[[ 59  11]
 [  2 125]]
Accuracy: 0.934010152284264



# Feature Importance

In [47]:
# Map each feature's column position to its importance score.
importance = classifier.feature_importances_
important_features_dict = dict(enumerate(importance))

# Column positions ordered from most to least important; keep the ten best.
ranked = sorted(important_features_dict,
                key=lambda idx: important_features_dict[idx],
                reverse=True)
top_idx = ranked[:10]

# Look up the names and scores for those positions.
top_features_names = OHE_train.columns[top_idx]
top_features_scores = importance[top_idx]

print('Top 3 most important features: \n')

for rank in (1, 2, 3):
    print('%s: %s, Score: %.3f ' % (rank,
                                    top_features_names[rank - 1],
                                    top_features_scores[rank - 1]))


Top 3 most important features: 

1: backers_count, Score: 0.177 
2: pledged, Score: 0.148 
3: avg_fund_per_backer, Score: 0.105 
