In [1]:
# setting environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# supress warnings
import warnings
warnings.filterwarnings("ignore")

In this project we will continue working on the Kickstarter data we cleaned up and evaluate with different algorithm such as decision tree and random forest

In [2]:
# read in data
KS_clean = pd.read_csv('data/KS_clean.csv')
KS_clean.head()

Unnamed: 0,backers_count,disable_communication,goal,is_starrable,staff_pick,state,name_Action,name_Animals,name_Audio,name_Children's Books,...,color_2577151,color_51627,color_58341,color_6526716,location_AU,location_CA,location_DE,location_GB,location_Others,location_US
0,170,0,25000.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,10,0,500.0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,3500.0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,20000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,62,0,5000.0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Before fitting our model, we will split the training and testing set

In [3]:
# splitting training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(KS_clean.drop('state', axis=1), KS_clean['state'], test_size=0.3,
                                                   random_state=1009)

### Decision Tree

In [4]:
# simple approach, set up random parameters to see how decision tree works
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dst = DecisionTreeClassifier(criterion='entropy', max_features=5, max_depth=5, random_state=1009)
dst.fit(X_train, y_train)

prediction = dst.predict(X_test)
# compare the real data with prediction
print(accuracy_score(prediction, y_test))

0.8540209790209791


From above, we can see by randomly select our parameters, we get a score of 85.4% accuracy, which is pretty good, let's see how it performs in other error metrics

In [5]:
# calculate true positive rate and false positive rate
tp = (prediction == 1) & (y_test == 1)
tn = (prediction == 0) & (y_test == 0)
fp = (prediction == 1) & (y_test == 0)
fn = (prediction == 0) & (y_test == 1)
print('True positive rate: {}'.format(np.sum(tp)/(np.sum(tp)+np.sum(fn))))
print('False positive rate: {}'.format(np.sum(fp)/(np.sum(fp)+np.sum(tn))))

True positive rate: 0.8811544991511036
False positive rate: 0.17477477477477477


We can see that our first approach of decision tree algorithms performs pretty well, and it is better than Naive Bayes we did previously. Our next step is to find out the best tree model. there are many parameters to tune to find the best combination, such as `criterion`, `splitter`, `max_depth`, `min_samples_split`, `min_samples_leaf`, `max_features`..etc. We will use GridSearch to find the best.

In [6]:
from sklearn.model_selection import GridSearchCV
# setting hyperparameters
hyperparameters = {
    'criterion': ['entropy', 'gini'],
    'max_depth': range(3,11),
    'max_features': ['log2', 'sqrt'],
    'min_samples_leaf': range(1,11),
    'min_samples_split': [3, 5],
    }

dst_2 = DecisionTreeClassifier(random_state=1009)
grid = GridSearchCV(dst_2, param_grid=hyperparameters, cv=10)

# use training set to find the best parameter
grid.fit(X_train, y_train)

print(grid.best_params_)

{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5}


In [9]:
print(grid.best_score_)

0.9002624671916011


In [17]:
# use the best parameters above to test our model
dst_3 = DecisionTreeClassifier(**grid.best_params_, random_state=1009)
dst_3.fit(X_train, y_train)

prediction = dst_3.predict(X_test)
# compare the real data with prediction
print(accuracy_score(prediction, y_test))

0.8907342657342657


In [18]:
# calculate true positive rate and false positive rate
tp = (prediction == 1) & (y_test == 1)
tn = (prediction == 0) & (y_test == 0)
fp = (prediction == 1) & (y_test == 0)
fn = (prediction == 0) & (y_test == 1)
print('True positive rate: {}'.format(np.sum(tp)/(np.sum(tp)+np.sum(fn))))
print('False positive rate: {}'.format(np.sum(fp)/(np.sum(fp)+np.sum(tn))))

True positive rate: 0.9151103565365025
False positive rate: 0.13513513513513514


In [19]:
# randomize the order of raw data
KS_clean = KS_clean.sample(len(KS_clean))

In [27]:
# double confirm whether the best parameter is truly better than the first tree
from sklearn.model_selection import cross_val_score
print('Simple tree:', cross_val_score(dst, KS_clean.drop('state', axis=1), KS_clean['state'], cv=10).mean())
print('Better tree:', cross_val_score(dst_3, KS_clean.drop('state', axis=1), KS_clean['state'], cv=10).mean())

Simple tree: 0.8745817448969333
Best tree: 0.8974061204253573


We can see that there are some improvements by tuning our model, the next step is to use cross validation to see if there's any overfitting problem

In [30]:
# let's try to fit with the whole dataset using Gridsearch
dst_4 = DecisionTreeClassifier(random_state=1009)
grid = GridSearchCV(dst_4, param_grid=hyperparameters, cv=10)

# use training set to find the best parameter
grid.fit(KS_clean.drop(['state'], axis=1), KS_clean['state'])

print(grid.best_params_)

{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5}


In [31]:
dst_5 = DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='log2', min_samples_leaf=1,
                               min_samples_split=5, random_state=1009)
dst_5.fit(X_train, y_train)

prediction = dst_5.predict(X_test)
# compare the real data with prediction
print(accuracy_score(prediction, y_test))

0.8784965034965035


In [32]:
# calculate true positive rate and false positive rate
tp = (prediction == 1) & (y_test == 1)
tn = (prediction == 0) & (y_test == 0)
fp = (prediction == 1) & (y_test == 0)
fn = (prediction == 0) & (y_test == 1)
print('True positive rate: {}'.format(np.sum(tp)/(np.sum(tp)+np.sum(fn))))
print('False positive rate: {}'.format(np.sum(fp)/(np.sum(fp)+np.sum(tn))))

True positive rate: 0.9303904923599321
False positive rate: 0.17657657657657658


In [33]:
cross_val_score(dst_5, KS_clean.drop(['state'], axis=1), KS_clean['state'], cv=10)

array([0.92408377, 0.91361257, 0.92388451, 0.89238845, 0.9160105 ,
       0.90551181, 0.86351706, 0.90551181, 0.8687664 , 0.91578947])

From testing different approaches previously, we will use the dst_3 parameters as our best model for decision tree, next we will compare them with random forest

### Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier

# we will use the simples model without specifying any parameter
rf = RandomForestClassifier(random_state=1009)
rf.fit(X_train, y_train)

prediction = rf.predict(X_test)
# compare the real data with prediction
print(accuracy_score(prediction, y_test))

0.9222027972027972


In [33]:
# calculate true positive rate and false positive rate
tp = (prediction == 1) & (y_test == 1)
tn = (prediction == 0) & (y_test == 0)
fp = (prediction == 1) & (y_test == 0)
fn = (prediction == 0) & (y_test == 1)
print('True positive rate: {}'.format(np.sum(tp)/(np.sum(tp)+np.sum(fn))))
print('False positive rate: {}'.format(np.sum(fp)/(np.sum(fp)+np.sum(tn))))

True positive rate: 0.9320882852292021
False positive rate: 0.08828828828828829


In [36]:
cross_val_score(rf, KS_clean.drop(['state'], axis=1), KS_clean['state'], cv=10)

array([0.94240838, 0.92408377, 0.94225722, 0.93700787, 0.93175853,
       0.95013123, 0.92913386, 0.94750656, 0.92913386, 0.92894737])

We can see that even without tuning any parameters in random forest algorithm, it still performs better than our best decision tree model in every aspect, since random forest takes the average of many decision trees, even a tree in the forest falls, it can still outperform a single decision tree.

Lastly, we will compare the time it takes to find a best decision tree parameters v.s. the time it takes to use a simplest random tree algorithm

In [37]:
%%time
hyperparameters = {
    'criterion': ['entropy', 'gini'],
    'max_depth': range(3,11),
    'max_features': ['log2', 'sqrt'],
    'min_samples_leaf': range(1,11),
    'min_samples_split': [3, 5],
    }

dst_2 = DecisionTreeClassifier(random_state=1009)
grid = GridSearchCV(dst_2, param_grid=hyperparameters, cv=10)

# use training set to find the best parameter
grid.fit(X_train, y_train)

print(grid.best_params_)

dst_3 = DecisionTreeClassifier(criterion='gini', max_depth=10, max_features='sqrt', min_samples_leaf=1,
                               min_samples_split=5, random_state=1009)
dst_3.fit(X_train, y_train)

prediction = dst_3.predict(X_test)

# compare the real data with prediction
print(accuracy_score(prediction, y_test))

{'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5}
0.8907342657342657
CPU times: user 31.3 s, sys: 128 ms, total: 31.4 s
Wall time: 31.5 s


In [42]:
%%time
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

prediction = rf.predict(X_test)
# compare the real data with prediction
print(accuracy_score(prediction, y_test))

0.9335664335664335
CPU times: user 36.3 ms, sys: 1.28 ms, total: 37.6 ms
Wall time: 36.2 ms


We can see the complexity between two models are pretty huge, almost around 1000 folds! We then jump into the conclusion that even the simplest random forest model outperforms the decision tree algorithm.