In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('./kickstarter-projects/kickstarter-projects.csv')
df = df[ (df['state'] == 'successful') | (df['state'] == 'failed') ]
df = df[ df['backers'] > 0 ]

df['country'] = df['country'].replace({ 'N,0"': np.nan })
df = df.dropna()

# Get project duration in days
df['launched'] = pd.to_datetime( df['launched'], format='%Y-%m-%d %H:%M:%S' )
df['deadline'] = pd.to_datetime( df['deadline'], format='%Y-%m-%d' )
df['duration'] = df['deadline'] - df['launched']
df['duration'] = df['duration'].dt.days

# Plus one for zeros because each project has to last at least a day
df['duration'] = [ i + 1 if i == 0 else i for i in df['duration'] ]

# One hot encode the variables
df['state'] = df['state'].map({ 'successful': 1, 'failed': 0 })
df = df.dropna()

df = pd.get_dummies(df, columns=['category', 'main_category', 'currency', 'country'],
                    drop_first=True, dtype=np.int64)

df.columns = [ i.lower().replace(' ', '') for i in df.columns ]

# Calculate project performance metrics
df['pledged_per_backer'] = df['usd_pledged_real'] / df['backers']
df['goal_per_backer']    = df['usd_goal_real']    / df['backers']
df['pledged_per_day']    = df['usd_pledged_real'] / df['duration']
df['goal_per_day']       = df['usd_goal_real']    / df['duration']

# Scale numeric variables
for i in ['pledged_per_backer', 'goal_per_backer', 'pledged_per_day', 'goal_per_day']:
    df[i] = ( df[i] - np.mean(df[i]) ) / np.std(df[i])

# Create variables for regression
y = df['state']
x = df.drop(columns=['id', 'name', 'state', 'goal', 'backers', 'duration', 'pledged',
                     'usd_pledged_real', 'usd_goal_real', 'deadline', 'launched',
                     'usdpledged'])

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = LogisticRegressionCV(Cs=np.arange(0.2, 2, 0.2), max_iter=np.inf).fit(x_train, y_train)

KeyboardInterrupt: 

In [22]:
from sklearn.metrics import classification_report

y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     31775
           1       0.91      0.84      0.87     26790

    accuracy                           0.89     58565
   macro avg       0.89      0.89      0.89     58565
weighted avg       0.89      0.89      0.89     58565

