In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

In [2]:
# Load the CSV dataset
df = pd.read_csv('../inputs/kickstarter_projects.csv', 
                 parse_dates=['deadline', 'launched'])

In [3]:
# Drop live projects
df = df.query('state != "live"')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375862 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                375862 non-null  int64         
 1   name              375858 non-null  object        
 2   category          375862 non-null  object        
 3   main_category     375862 non-null  object        
 4   currency          375862 non-null  object        
 5   deadline          375862 non-null  datetime64[ns]
 6   goal              375862 non-null  float64       
 7   launched          375862 non-null  datetime64[ns]
 8   pledged           375862 non-null  float64       
 9   state             375862 non-null  object        
 10  backers           375862 non-null  int64         
 11  country           375862 non-null  object        
 12  usd pledged       372066 non-null  float64       
 13  usd_pledged_real  375862 non-null  float64       
 14  usd_

In [4]:
# Remove corrupted data: when state = "successful" and backers = 0
df = df[~((df['state'] == 'successful') & (df['backers'] == 0))]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375757 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                375757 non-null  int64         
 1   name              375753 non-null  object        
 2   category          375757 non-null  object        
 3   main_category     375757 non-null  object        
 4   currency          375757 non-null  object        
 5   deadline          375757 non-null  datetime64[ns]
 6   goal              375757 non-null  float64       
 7   launched          375757 non-null  datetime64[ns]
 8   pledged           375757 non-null  float64       
 9   state             375757 non-null  object        
 10  backers           375757 non-null  int64         
 11  country           375757 non-null  object        
 12  usd pledged       372066 non-null  float64       
 13  usd_pledged_real  375757 non-null  float64       
 14  usd_

In [5]:
# Drop all rows that contain any null values
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372062 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                372062 non-null  int64         
 1   name              372062 non-null  object        
 2   category          372062 non-null  object        
 3   main_category     372062 non-null  object        
 4   currency          372062 non-null  object        
 5   deadline          372062 non-null  datetime64[ns]
 6   goal              372062 non-null  float64       
 7   launched          372062 non-null  datetime64[ns]
 8   pledged           372062 non-null  float64       
 9   state             372062 non-null  object        
 10  backers           372062 non-null  int64         
 11  country           372062 non-null  object        
 12  usd pledged       372062 non-null  float64       
 13  usd_pledged_real  372062 non-null  float64       
 14  usd_

In [6]:
# Deal with Dependent Variable State
df['target'] = df['state'].apply(lambda x: 0 if x == 'successful' else 1)
print(df['target'].value_counts())

1    238211
0    133851
Name: target, dtype: int64


In [7]:
# Feature Engineering
df['launched'] = pd.to_datetime(df['launched'])
df['deadline'] = pd.to_datetime(df['deadline'])
df['duration'] = (df['deadline'] - df['launched']).dt.days

In [8]:
# Extract year-month and convert to string
df['launched_year_month'] = df['launched'].dt.to_period('M').astype(str)
df['deadline_year_month'] = df['deadline'].dt.to_period('M').astype(str)

In [9]:
# Convert Datetime columns to categorical columns
date_time_cols = ['launched_year_month', 'deadline_year_month']
df[date_time_cols] = df[date_time_cols].astype('category')

In [10]:
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,target,duration,launched_year_month,deadline_year_month
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.0,failed,0,GB,0.0,0.0,1533.95,1,58,2015-08,2015-10
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.0,failed,15,US,100.0,2421.0,30000.0,1,59,2017-09,2017-11
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.0,failed,3,US,220.0,220.0,45000.0,1,44,2013-01,2013-02
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:00,1.0,failed,1,US,1.0,1.0,5000.0,1,29,2012-03,2012-04
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:00,1283.0,canceled,14,US,1283.0,1283.0,19500.0,1,55,2015-07,2015-08


In [11]:
cat_features = ['category', 'currency', 
                'country', 'launched_year_month',
               'deadline_year_month']
encoder = LabelEncoder()

encoded = df[cat_features].apply(encoder.fit_transform)
encoded.head()

Unnamed: 0,category,currency,country,launched_year_month,deadline_year_month
0,108,5,9,76,77
1,93,13,21,101,102
2,93,13,21,45,45
3,90,13,21,35,35
4,55,13,21,75,75


In [12]:
df = df[['backers','usd_pledged_real', 'usd_goal_real',
        'target']].join(encoded)
df.head()

Unnamed: 0,backers,usd_pledged_real,usd_goal_real,target,category,currency,country,launched_year_month,deadline_year_month
0,0,0.0,1533.95,1,108,5,9,76,77
1,15,2421.0,30000.0,1,93,13,21,101,102
2,3,220.0,45000.0,1,93,13,21,45,45
3,1,1.0,5000.0,1,90,13,21,35,35
4,14,1283.0,19500.0,1,55,13,21,75,75


In [36]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Separate the numerical columns to be standardized
num_cols = ['usd_goal_real', 'usd_pledged_real', 'backers', 'category', 
            'currency', 'country','launched_year_month', 'deadline_year_month']

# Standardize the numerical columns
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Apply log1p transformation to each numerical feature
for col in num_cols:
    df[col] = df[col].apply(lambda x: np.log1p(x) if x >= -1 else x)

In [123]:
# Define target and features
X = df.drop(['target', 'usd_goal_real','deadline_year_month',
            'currency', 'country', 'category'
              ], axis=1)
y = df['target']

In [124]:
X.head()

Unnamed: 0,backers,usd_pledged_real,launched_year_month
0,-0.480368,-0.411115,0.483698
1,-0.34787,-0.207718,0.784736
2,-0.452223,-0.390559,-1.902492
3,-0.470886,-0.41102,-1.127948
4,-0.356106,-0.297477,0.464482


## Training a LightGBM model
We will use a LightGBM model. This is a tree-based model that typically provides the best performance, even compared to XGBoost. It's also relatively fast to train.

In [14]:
!pip install lightgbm



In [125]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Define the hyperparameter grid for grid search
param_grid = {
    'num_leaves': [16,32,64],
    'objective': ['binary'],
    'metric': ['auc'],
}

In [126]:
# Create the LightGBM model
lgb_model = lgb.LGBMClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(lgb_model, param_grid, cv=3, scoring='roc_auc')

# Fit the GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# Create the final LightGBM model with the best hyperparameters
final_lgb_model = lgb.LGBMClassifier(**best_params)

# Fit the final model on the training data
final_lgb_model.fit(X_train, y_train)

# Make predictions on the test data using the final model
y_pred_lgb = final_lgb_model.predict(X_test)

# Evaluate the final model
print("Classification Report:")
print(classification_report(y_test, y_pred_lgb))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 127161, number of negative: 71271
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 198432, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.640829 -> initscore=0.578964
[LightGBM] [Info] Start training from score 0.578964
[LightGBM] [Info] Number of positive: 127162, number of negative: 71271
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 198433, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.640831 -> initscore=0.578972
[LightGBM] [Info] Start training from score 0.578972
[LightGBM] [Info] Number of positive: 127161, number of negative: 71272
You can set `force_row_w

In [127]:
# Get feature importances from the final model
importances = final_lgb_model.feature_importances_
print("Feature importances:")
for col, imp in zip(X_train.columns, importances):
    print(f"{col}: {imp}")

Feature importances:
backers: 1635
usd_pledged_real: 2594
launched_year_month: 2071


In [128]:
# Get AUC score on the test data
ypred_prob = final_lgb_model.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_test, ypred_prob)
print(f"Test AUC score: {score}")

Test AUC score: 0.9238159928127115
