# Kickstarter Example (reworked)

## Imports

Here we import the necessary modules.

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Reading the data

Load the data from the `.csv` into a `DataFrame`.

In [2]:
PATH_ROOT = os.path.join('input', 'kickstarter-projects')
FLOC = os.path.join(PATH_ROOT, 'ks-projects-201801.csv')

In [3]:
df = pd.read_csv(FLOC, index_col='ID')
df.head()

Unnamed: 0_level_0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


## Building a model

Let's build a `RandomForestClassifier` model!

### Preprocessing

Here we perform preprocessing steps which entails creating the feature space - a clean, processed version of the raw `df`.

In [4]:
# create a constant list of desired columns
COLUMNS = ['category', 'main_category', 
           'currency', 'goal', 'pledged', 
           'state', 'backers', 'country', 
           'usd pledged', 'usd_pledged_real', 
           'usd_goal_real']
# only select the desired columns, drop NaN and missing values
feature_space = df[COLUMNS].dropna()
feature_space.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 374864 entries, 1000002330 to 999988282
Data columns (total 11 columns):
category            374864 non-null object
main_category       374864 non-null object
currency            374864 non-null object
goal                374864 non-null float64
pledged             374864 non-null float64
state               374864 non-null object
backers             374864 non-null int64
country             374864 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    374864 non-null float64
usd_goal_real       374864 non-null float64
dtypes: float64(5), int64(1), object(5)
memory usage: 34.3+ MB


In [5]:
# features
X = feature_space.drop('state', axis=1)
# labels
y = feature_space['state']

In [6]:
# define some preprocessor objects...
label_encoder = LabelEncoder()

# ...and apply fit_transform to our features
for col in X.select_dtypes(include='object').columns.values:
    X[col] = label_encoder.fit_transform(X[col])

X.head()

Unnamed: 0_level_0,category,main_category,currency,goal,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1000002330,108,12,5,1000.0,0.0,0,9,0.0,0.0,1533.95
1000003930,93,6,13,30000.0,2421.0,15,21,100.0,2421.0,30000.0
1000004038,93,6,13,45000.0,220.0,3,21,220.0,220.0,45000.0
1000007540,90,10,13,5000.0,1.0,1,21,1.0,1.0,5000.0
1000011046,55,6,13,19500.0,1283.0,14,21,1283.0,1283.0,19500.0


In [7]:
# Create the (stratified) train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

### Training

Here we instantiate and train the model on the test data.

In [8]:
%%time
# Let's just use the default parameters (10 estimators)
model = RandomForestClassifier()
model.fit(X_train, y_train)



Wall time: 14.8 s


### Testing

How well does the model perform on the test data (unseen during training)?

In [9]:
model.score(X_test, y_test)

0.8608775449229588

Not bad... 86% accuracy. Can we optimize our model parameters for the chosen feature space?

### Model grid (`GridSearchCV`)

Let's make use of ScikitLearn's grid searching capabilities. **Warning**: Grid searching is a computationally expensive action - it will take a few minutes (at least) to run the full grid search.

In [10]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix

In [11]:
%%time
# 5-fold cross validation... 
# (https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation)
skf = StratifiedKFold(n_splits=10)

# here we define the parameter grid we wish to test.
# let's just vary a couple for the time being.
params = {'max_depth': [10, 20, 30], 'n_estimators': [10, 20, 30]}

# choose the type of model to use in grid searching
model = RandomForestClassifier()

# establish and fit the grid
grid = GridSearchCV(estimator=model, 
                    param_grid=params, 
                    cv=skf, 
                    refit=True,
                    return_train_score=True,
                    scoring=make_scorer(accuracy_score))
grid.fit(X_train, y_train)

Wall time: 1h 10min 7s


In [12]:
# print out the optimal parameters found by the grid search
print('Best parameters found: {}'.format(grid.best_params_))

Best parameters found: {'max_depth': 10, 'n_estimators': 20}


In [13]:
accuracy_score(y_test, grid.predict(X_test))

0.8812475991292842

We were able to increase our test-set accuracy by 2% by doing a small grid search... The grid can become more granular, and maybe some better results will be found. Let's look at a confusion matrix for our predictions.

In [14]:
# show confusion matrix
predicted_base = 'Predicted {}'
actually_base = 'Actually {}'
label_vals = [v for v in y.value_counts().index]
cm = confusion_matrix(y_test, grid.predict(X_test))
cm_frame = pd.DataFrame(cm, columns=[predicted_base.format(v) for v in label_vals],
                            index=[actually_base.format(v) for v in label_vals])
cm_frame.head()

Unnamed: 0,Predicted failed,Predicted successful,Predicted canceled,Predicted live,Predicted suspended
Actually failed,7,9429,0,253,0
Actually successful,5,49119,0,279,0
Actually canceled,0,570,0,130,0
Actually live,0,2,0,33461,0
Actually suspended,0,368,0,93,0


Here the problem of class imbalance becomes especially apparent - our model **doesn't predict *any* canceled or suspended projects**... This is most likely because these classes are severly under-represented in the data.

## Digging deeper

Let's reload the original dataset and perform more intense preprocessing. No more sections here, everything in a single block. 

Define some methods for performing common functionlity.

In [99]:
def csv_to_df(floc):
    '''Extracts the contents of the raw .csv file into a DataFrame. '''
    return pd.read_csv(floc, index_col='ID', parse_dates=['launched', 'deadline'])

def select_columns(df, cols=['category', 'main_category', 
                             'currency', 'state', 'backers', 
                             'country', 'usd_pledged_real', 
                             'usd_goal_real', 'launched', 'deadline']):
    '''Selects a subset of columns from a DataFrame.
    
    If a column list is not supplied, a default list of columns
    is created.
    
    Parameters
    ----------
        df              : the DataFrame from which to select the specified columns
        cols (optional) : the columns to select
    
    Returns
    -------
        res : the resulting DataFrame after subselection of supplied columns
    '''
    # only select the desired columns, drop NaN and missing values
    res = df[cols].dropna()
    res.info()
    return res

def split_features_and_labels(df):
    '''Splits a DataFrame into features and labels
    
    '''
    X = df.drop('state', axis=1)
    # labels
    y = df['state']

    return X, y


def label_encode(X):
    '''Performs label encoding on a DataFrame
    
    '''
    label_encoder = LabelEncoder()
    
    for col in X.select_dtypes(include='object').columns.values:
        X[col] = label_encoder.fit_transform(X[col])
    
    print('\n Categorical labels encoded.')
    return X

def impute_backers(X):
    '''Performs a variation of mean-imputation on the backers column
    
    '''
    ok = X.loc[(X['usd_pledged_real'] > 0) & (X['backers'] > 0)]
    ok_idx = ok.index
    mean = np.mean(ok['usd_pledged_real'] / ok['backers'])

    def _impute(row):
        row['backers'] = np.ceil(row['usd_pledged_real'] / mean).astype('int')
        return row

    X[~X.index.isin(ok_idx.values)] = \
        X[~X.index.isin(ok_idx.values)].apply(_impute, axis=1)
    
    def calc_avg_pledge(row):
        if row['backers'] > 0 and row['usd_pledged_real'] > 0:
            return row['usd_pledged_real'] / row['backers']
        return 0

    X['average_pledge'] = X.apply(calc_avg_pledge, axis=1)
    print('\n backers column imputed.')
    return X

def calculate_time_delta(X):
    '''Adds a column to the supplied dataframe representing the time delta
    between the launch date and the deadline of the project.
    
    '''
    X['launch_deadline_delta'] = (X['deadline'] - X['launched']).dt.days
    print('\n launch_deadline_delta column added.')
    return X

def train(X, y, n_estimators=25, max_depth=20, kfolds=None, show_cm=True):
    print('\n Training model...')
    _X = X.copy(deep=True)
    _X = _X.drop(['launched', 'deadline'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(_X, y, stratify=y)
    model = None
    if kfolds:
        skv = StratifiedKFold(n_splits=10)

        params = {'max_depth':[max_dept], 'n_estimators':[n_estimators]}

        model = GridSearchCV(estimator=RandomForestClassifier(),
                            param_grid=params,
                            cv=skf, 
                            refit=True,
                            return_train_score=True,
                            scoring=make_scorer(accuracy_score))
        model.fit(X_train, y_train)
        train_preds = model.predict(X_train)
        test_preds = model.predict(X_test)
        print('Model accuracy:\nTrain set:{train}\nTest set: {test}\n\n'
              .format(train=accuracy_score(y_train, train_preds),
                      test=accuracy_score(y_test, test_preds)))
    else:
        X_train, X_validate, y_train, y_validate = \
            train_test_split(X_train, y_train, stratify=y_train)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
        model.fit(X_train, y_train)
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_validate)
        test_preds = model.predict(X_test)
    
        print('Model accuracy:\nTrain set:{train}\nValidation set: {validate}\n'
              'Test set: {test}\n\n'
              .format(train=accuracy_score(y_train, train_preds),
                      validate=accuracy_score(y_validate, val_preds),
                      test=accuracy_score(y_test, test_preds)))
    
    if show_cm:
        predicted_base = 'Predicted {}'
        actually_base = 'Actually {}'
        label_vals = [v for v in y.value_counts().index]
        cm = confusion_matrix(y_test, test_preds)
        cm_frame = pd.DataFrame(cm, columns=[predicted_base.format(v) for v in label_vals],
                                    index=[actually_base.format(v) for v in label_vals])
        print(cm_frame.head())

In [96]:
def extraction():
    df = csv_to_df(FLOC)
    df = csv_to_df(FLOC)
    df = select_columns(df)
    X, y = split_features_and_labels(df)
    return X, y

def preprocessing(X):
    X = label_encode(X)
    X = impute_backers(X)
    X = calculate_time_delta(X)
    return X

In [100]:
# Main logic
X, y = extraction()
X = preprocessing(X)
train(X, y)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 378661 entries, 1000002330 to 999988282
Data columns (total 10 columns):
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
launched            378661 non-null datetime64[ns]
deadline            378661 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(1), object(5)
memory usage: 31.8+ MB

 Categorical labels encoded.

 backers column imputed.

 launch_deadline_delta column added.

 Training model...
Model accuracy:
Train set:0.9252662021822006
Validation set: 0.8798997168974211
Test set: 0.8801681701983817


                     Predicted failed  Predicted successful  \
Actually failed                   252                  9173 