In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

%matplotlib inline

In [25]:
# Load the CSV dataset
df = pd.read_csv('../inputs/kickstarter_projects.csv',
                 parse_dates=['deadline', 'launched'])

In [30]:
# Drop live projects
df = df.query('state != "live"')

# Add outcome column, "successful" == 1, others are 0
df = df.assign(outcome=(df['state'] == 'successful').astype(int))
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome,hour,day,month,year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.0,failed,0,GB,0.0,0.0,1533.95,0,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.0,failed,15,US,100.0,2421.0,30000.0,0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.0,failed,3,US,220.0,220.0,45000.0,0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:00,1.0,failed,1,US,1.0,1.0,5000.0,0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:00,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0,8,4,7,2015


In [31]:
# Feature Engineering
df = df.assign(hour=df.launched.dt.hour,
               day=df.launched.dt.day,
               month=df.launched.dt.month,
               year=df.launched.dt.year)

cat_features = ['category', 'currency', 'country']
encoder = LabelEncoder()

In [32]:
# Apply the label encoder to each column
encoded = df[cat_features].apply(encoder.fit_transform)
data = df[['goal', 'hour', 'day', 'month', 'year', 'outcome']].join(encoded)

valid_fraction = 0.1
valid_size = int(len(data) * valid_fraction)

train = data[:-2 * valid_size]
valid = data[-2 * valid_size:-valid_size]
test = data[-valid_size:]

In [33]:
# Define the hyperparameter grid for grid search
param_grid = {
    'num_leaves': [32, 64, 128],
    'objective': ['binary'],
    'metric': ['auc'],
}

In [34]:
# Create the LightGBM model
lgb_model = lgb.LGBMClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(lgb_model, param_grid, cv=3, scoring='roc_auc')

# Fit the GridSearchCV on the training data
grid_search.fit(train.drop('outcome', axis=1), train['outcome'])

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

[LightGBM] [Info] Number of positive: 71560, number of negative: 128900
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 528
[LightGBM] [Info] Number of data points in the train set: 200460, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
[LightGBM] [Info] Number of positive: 71560, number of negative: 128900
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 527
[LightGBM] [Info] Number of data points in the train set: 200460, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
[LightGBM] [Info] Number of positive: 71560, number of negative: 128900
You can set `force_r

## Training a LightGBM model
We will use a LightGBM model. This is a tree-based model that typically provides the best performance, even compared to XGBoost. It's also relatively fast to train.

In [19]:
!pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.0.0-py3-none-win_amd64.whl (1.3 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.0.0


In [36]:
# Create the final LightGBM model with the best hyperparameters
final_lgb_model = lgb.LGBMClassifier(**best_params)

# Fit the final model on the training data
final_lgb_model.fit(train.drop('outcome', axis=1), train['outcome'])

# Make predictions on the test data using the final model
y_pred_lgb = final_lgb_model.predict(test.drop('outcome', axis=1))

# Evaluate the final model
print("Classification Report:")
print(classification_report(test['outcome'], y_pred_lgb))

print("Confusion Matrix:")
print(confusion_matrix(test['outcome'], y_pred_lgb))

[LightGBM] [Info] Number of positive: 107340, number of negative: 193350
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 527
[LightGBM] [Info] Number of data points in the train set: 300690, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.356979 -> initscore=-0.588501
[LightGBM] [Info] Start training from score -0.588501
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.85      0.79     24272
           1       0.62      0.43      0.51     13314

    accuracy                           0.70     37586
   macro avg       0.67      0.64      0.65     37586
weighted avg       0.69      0.70      0.69     37586

Confusion Matrix:
[[20684  3588]
 [ 7563  5751]]


In [37]:
# Get feature importances from the final model
importances = final_lgb_model.feature_importances_
print("Feature importances:")
for col, imp in zip(train.drop('outcome', axis=1).columns, importances):
    print(f"{col}: {imp}")

Feature importances:
goal: 2329
hour: 1371
day: 1383
month: 1067
year: 1171
category: 4318
currency: 536
country: 525


In [39]:
from scipy.stats import ttest_ind

# Separate successful and unsuccessful projects
successful_projects = train[train['outcome'] == 1]
unsuccessful_projects = train[train['outcome'] == 0]

# Calculate statistical significance for each numerical feature using t-test
print("Statistical significance of numerical features:")
for col in train.drop('outcome', axis=1).columns:
    if col not in cat_features:
        t_stat, p_value = ttest_ind(successful_projects[col], unsuccessful_projects[col], equal_var=False)
        print(f"{col}: t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")

Statistical significance of numerical features:
goal: t-statistic = -17.8227, p-value = 0.0000
hour: t-statistic = 5.3334, p-value = 0.0000
day: t-statistic = -8.8394, p-value = 0.0000
month: t-statistic = -5.9599, p-value = 0.0000
year: t-statistic = -45.6717, p-value = 0.0000


In [38]:
# Get AUC score on the test data
ypred_prob = final_lgb_model.predict_proba(test.drop('outcome', axis=1))[:, 1]
score = roc_auc_score(test['outcome'], ypred_prob)
print(f"Test AUC score: {score}")

Test AUC score: 0.7470003673875241
