In [1]:
import pandas as pd
import numpy as np
import datetime

from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the Kickstarter projects CSV into a DataFrame and preview the first rows.
data = pd.read_csv('kickstarter.csv')
data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [3]:
# Number of rows in the freshly loaded dataset (no filtering applied yet).
len(data)

378661

In [4]:
# Distinct project outcomes recorded in the `state` column
outcomes = data['state'].unique().tolist()
print('Outcome: ', outcomes)

Outcome:  ['failed', 'canceled', 'successful', 'live', 'undefined', 'suspended']


In [5]:
# Positional row numbers of the projects whose state is still 'live'
is_live = data['state'] == 'live'
live = np.flatnonzero(is_live.values)

print('The indeces: ', live)

The indeces:  [    40    183    458 ... 378429 378496 378512]


In [6]:
# `live` holds positional row numbers (np.where output), hence .iloc rather than .loc.
data.iloc[live]

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
40,1000149007,Unschooling To University Book Project,Nonfiction,Publishing,CAD,2018-01-19,3000.0,2017-11-20 18:15:14,592.00,live,24,CA,185.65,472.88,2396.36
183,1000761521,KILOS.,Thrillers,Film & Video,USD,2018-02-06,5000.0,2017-12-23 20:34:31,310.00,live,4,US,200.00,310.00,5000.00
458,1002183790,The Man in the Field,Horror,Film & Video,GBP,2018-01-08,850.0,2017-12-21 11:19:18,413.00,live,11,GB,13.39,563.32,1159.38
537,1002599057,Dear Atlantas,Film & Video,Film & Video,USD,2018-02-16,2000.0,2017-12-18 02:48:27,5.00,live,1,US,5.00,5.00,2000.00
544,1002629894,Art Calendar 2018,Illustration,Art,MXN,2018-01-06,1000.0,2017-12-21 05:34:12,0.00,live,0,MX,0.00,0.00,52.32
627,100301082,"Feliz año nuevo, mamá.",Theater,Theater,MXN,2018-02-04,49000.0,2017-12-06 03:36:06,1000.00,live,2,MX,53.52,52.32,2563.83
641,1003104071,One GuitarMan Band,Music,Music,EUR,2018-01-17,2000.0,2017-12-18 00:02:46,0.00,live,0,IT,0.00,0.00,2427.39
752,1003665581,Durango Falls TV Show,Film & Video,Film & Video,USD,2018-01-20,10000.0,2017-11-21 10:56:11,1.00,live,1,US,0.00,1.00,10000.00
783,1003819770,GrandLo Café,Drinks,Food,USD,2018-01-19,25000.0,2017-11-20 19:54:51,9097.00,live,38,US,3438.00,9097.00,25000.00
842,100411349,E Coin Mining and Rig-Building Workshop,Hardware,Technology,USD,2018-01-06,3100.0,2017-11-22 20:28:28,225.00,live,9,US,75.00,225.00,3100.00


In [7]:
# Per-column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [8]:
# Drop the live projects: keep every row whose state is anything other than 'live'
not_live = data['state'] != 'live'
data = data.loc[not_live]

len(data)

375862

In [9]:
# With the live projects removed, no matching row positions should remain
is_live = data['state'] == 'live'
live = np.flatnonzero(is_live.values)

print('The indeces: ', live)

The indeces:  []


In [10]:
# `live` contains positional row numbers (np.where output), so it must be used
# with .iloc. Once rows have been filtered out, the index labels no longer line
# up with row positions, and .loc would look the numbers up as labels instead.
data.iloc[live]

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real


In [11]:
# Binary target column `outcome`:
# 1 when the project state is 'successful', 0 for every other state
is_successful = data['state'].eq('successful')
data = data.assign(outcome=is_successful.astype(int))

data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0


In [12]:
# Check how pandas currently stores the `launched` column.
data.launched.dtype

dtype('O')

In [13]:
# Peek at the first `launched` values to see their textual layout.
data[['launched']].head()

Unnamed: 0,launched
0,2015-08-11 12:12:28
1,2017-09-02 04:43:57
2,2013-01-12 00:20:50
3,2012-03-17 03:24:11
4,2015-07-04 08:35:03


In [14]:
# Converting the launched col into datetime.
# An explicit format replaces `infer_datetime_format=True`, which is deprecated
# since pandas 2.0 and only produces a warning there. `assign` is used so the
# column is replaced by name rather than through attribute access.
data = data.assign(
    launched=pd.to_datetime(data['launched'], format='%Y-%m-%d %H:%M:%S')
)

data['launched'].dtype

dtype('<M8[ns]')

In [15]:
# Confirm that `launched` now displays as parsed timestamps.
data.launched.head()

0   2015-08-11 12:12:28
1   2017-09-02 04:43:57
2   2013-01-12 00:20:50
3   2012-03-17 03:24:11
4   2015-07-04 08:35:03
Name: launched, dtype: datetime64[ns]

In [16]:
# Split the parsed launch timestamp into calendar components and
# store each one (hour, day, month, year) as its own column.
launched_parts = data['launched'].dt
data = data.assign(
    hour=launched_parts.hour,
    day=launched_parts.day,
    month=launched_parts.month,
    year=launched_parts.year,
)

data.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,8,4,7,2015


In [17]:
# Categorical columns that get label-encoded into integer codes further down.
cats = ['category', 'currency', 'country']

In [18]:
# One shared LabelEncoder instance; it is refit on each column it is applied to.
encoder = LabelEncoder()

In [19]:
# Label-encode each categorical column independently. The shared encoder is
# refit per column, so afterwards it only holds the mapping of the last column.
enc_cats = pd.DataFrame(
    {col: encoder.fit_transform(data[col]) for col in cats},
    index=data.index,
    columns=cats,
)

In [20]:
# Preview the integer codes produced for each categorical column.
enc_cats.head()

Unnamed: 0,category,currency,country
0,108,5,9
1,93,13,22
2,93,13,22
3,90,13,22
4,55,13,22


In [21]:
# Assemble the modelling frame: goal, launch-date parts and the `outcome` label
# taken from `data`, followed by the label-encoded categorical columns.
base_cols = ['goal', 'hour', 'day', 'month', 'year', 'outcome']
X = data[base_cols].join(enc_cats)
X.head()

Unnamed: 0,goal,hour,day,month,year,outcome,category,currency,country
0,1000.0,12,11,8,2015,0,108,5,9
1,30000.0,4,2,9,2017,0,93,13,22
2,45000.0,0,12,1,2013,0,93,13,22
3,5000.0,3,17,3,2012,0,90,13,22
4,19500.0,8,4,7,2015,0,55,13,22


In [22]:
# Creating training, validation and test sets

In [23]:
# (rows, columns) of the modelling frame, label column included.
X.shape

(375862, 9)

In [24]:
# Share of rows held out for validation; the same row count is reused for the test split
valid_fraction = 0.1
valid_size = int(X.shape[0] * valid_fraction)

valid_size

37586

In [25]:
# Re-check the frame size right before slicing it into train/validation/test.
X.shape

(375862, 9)

In [26]:
# Training rows: everything before the last two hold-out chunks
holdout_start = -2 * valid_size
train = X.iloc[:holdout_start]

train.shape

(300690, 9)

In [27]:
# Validation rows: the second-to-last chunk of `valid_size` rows
valid = X.iloc[-2 * valid_size:-valid_size]
valid.shape

(37586, 9)

In [28]:
# Test rows: the final `valid_size` rows
test = X.iloc[-valid_size:]
test.shape

(37586, 9)

In [29]:
# Now train a model
import lightgbm as lgb
# Using a LightGBM model: a tree-based model that typically provides the
# best performance, even compared to XGBoost. It's also relatively fast to train.

In [30]:
# Model inputs: every column of the training frame except the `outcome` label.
feature_cols = train.columns.drop('outcome')
feature_cols

Index(['goal', 'hour', 'day', 'month', 'year', 'category', 'currency',
       'country'],
      dtype='object')

In [31]:
# Wrap the training and validation splits as LightGBM datasets, using the
# feature columns as inputs and `outcome` as the label.
dtrain, dvalid = (
    lgb.Dataset(split[feature_cols], label=split['outcome'])
    for split in (train, valid)
)

In [32]:
# Binary classifier with up to 64 leaves per tree, evaluated with AUC.
param = {'num_leaves': 64, 'objective': 'binary', 'metric': 'auc'}
num_round = 1000

# The `early_stopping_rounds=` keyword was removed from lgb.train() in
# LightGBM 4.0. The callback form stops training once the validation AUC has
# not improved for 5 consecutive rounds. log_evaluation keeps the per-round
# metric printout (requires LightGBM >= 3.3).
bst = lgb.train(
    param,
    dtrain,
    num_round,
    valid_sets=[dvalid],
    callbacks=[
        lgb.early_stopping(stopping_rounds=5),
        lgb.log_evaluation(period=1),
    ],
)

[LightGBM] [Info] Number of positive: 107340, number of negative: 193350
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 528
[LightGBM] [Info] Number of data points in the train set: 300690, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
[1]	valid_0's auc: 0.694192
Training until validation scores don't improve for 5 rounds
[2]	valid_0's auc: 0.697026
[3]	valid_0's auc: 0.70002
[4]	valid_0's auc: 0.701645
[5]	valid_0's auc: 0.70601
[6]	valid_0's auc: 0.707926
[7]	valid_0's auc: 0.70945
[8]	valid_0's auc: 0.710437
[9]	valid_0's auc: 0.712047
[10]	valid_0's auc: 0.713417
[11]	valid_0's auc: 0.714648
[12]	valid_0's auc: 0.715791
[13]	valid_0's auc: 0.717431
[14]	valid_0's auc: 0.718216
[15]	valid_0's auc: 0.719381
[16]	valid_0's auc: 0.720884
[17]	valid_0's auc: 0.721617
[18]

In [33]:
# Make predictions and evaluate the model
from sklearn import metrics

In [34]:
# Predict on the held-out test features with the trained booster.
predictions = bst.predict(test[feature_cols])

In [35]:
# Area under the ROC curve for the test-split predictions
score = metrics.roc_auc_score(y_true=test['outcome'], y_score=predictions)

print(f'Test AUC score: {score}')

Test AUC score: 0.7453069666284735
