In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
# Load the CSV dataset
df = pd.read_csv('../inputs/kickstarter_projects.csv')

In [3]:
# Convert to Datetime and Split Date Parts
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])

# Extract Year, Month, Day, and Hour Fields from Deadline
df['year_deadline'] = df['deadline'].dt.year
df['month_deadline'] = df['deadline'].dt.month


# Extract Year, Month, Day, and Hour Fields from 
df['year_launched'] = df['launched'].dt.year
df['month_launched'] = df['launched'].dt.month

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                378661 non-null  int64         
 1   name              378657 non-null  object        
 2   category          378661 non-null  object        
 3   main_category     378661 non-null  object        
 4   currency          378661 non-null  object        
 5   deadline          378661 non-null  datetime64[ns]
 6   goal              378661 non-null  float64       
 7   launched          378661 non-null  datetime64[ns]
 8   pledged           378661 non-null  float64       
 9   state             378661 non-null  object        
 10  backers           378661 non-null  int64         
 11  country           378661 non-null  object        
 12  usd pledged       374864 non-null  float64       
 13  usd_pledged_real  378661 non-null  float64       
 14  usd_

In [4]:
# Combine Year and Month Columns into a new Categorical Column 'Year_Month_deadline'
df['year_month_deadline'] = df['year_deadline'].astype(str) + '-' + df['month_deadline'].astype(str)

# Convert 'Year_Month_deadline' column to Categorical data type
df['year_month_deadline'] = pd.Categorical(df['year_month_deadline'])

# Combine Year and Month Columns into a new Categorical Column 'Year_Month_launched'
df['year_month_launched'] = df['year_launched'].astype(str) + '-' + df['month_launched'].astype(str)

# Convert 'Year_Month_launched' column to Categorical data type
df['year_month_launched'] = pd.Categorical(df['year_month_launched'])


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID                   378661 non-null  int64         
 1   name                 378657 non-null  object        
 2   category             378661 non-null  object        
 3   main_category        378661 non-null  object        
 4   currency             378661 non-null  object        
 5   deadline             378661 non-null  datetime64[ns]
 6   goal                 378661 non-null  float64       
 7   launched             378661 non-null  datetime64[ns]
 8   pledged              378661 non-null  float64       
 9   state                378661 non-null  object        
 10  backers              378661 non-null  int64         
 11  country              378661 non-null  object        
 12  usd pledged          374864 non-null  float64       
 13  usd_pledged_re

In [5]:
# Deal with Dependent Variable State
df['target'] = df['state'].apply(lambda x: 1 if x == 'successful' else 0)
df['target'].value_counts()

0    244705
1    133956
Name: target, dtype: int64

In [6]:
# Set the Display to Show all Columns
pd.set_option('display.max_columns', None)

In [7]:
def remove_outliers_2std(dataframe, column_name):
    mean_val = dataframe[column_name].mean()
    std_val = dataframe[column_name].std()
    lower_bound = mean_val - 2 * std_val
    upper_bound = mean_val + 2 * std_val
    
    # Filter out data beyond 2 standard deviations from the mean
    dataframe_filtered = dataframe[(dataframe[column_name] >= lower_bound) & (dataframe[column_name] <= upper_bound)]
    
    return dataframe_filtered

# Applying the function to the 'df' DataFrame and the 'backers' column
df = remove_outliers_2std(df, 'backers')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375896 entries, 0 to 378660
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID                   375896 non-null  int64         
 1   name                 375892 non-null  object        
 2   category             375896 non-null  object        
 3   main_category        375896 non-null  object        
 4   currency             375896 non-null  object        
 5   deadline             375896 non-null  datetime64[ns]
 6   goal                 375896 non-null  float64       
 7   launched             375896 non-null  datetime64[ns]
 8   pledged              375896 non-null  float64       
 9   state                375896 non-null  object        
 10  backers              375896 non-null  int64         
 11  country              375896 non-null  object        
 12  usd pledged          372099 non-null  float64       
 13  usd_pledged_re

In [8]:
# Remove corrupted data: when state = "successful" and backers = 0
df = df[~((df['state'] == 'successful') & (df['backers'] == 0))]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375791 entries, 0 to 378660
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID                   375791 non-null  int64         
 1   name                 375787 non-null  object        
 2   category             375791 non-null  object        
 3   main_category        375791 non-null  object        
 4   currency             375791 non-null  object        
 5   deadline             375791 non-null  datetime64[ns]
 6   goal                 375791 non-null  float64       
 7   launched             375791 non-null  datetime64[ns]
 8   pledged              375791 non-null  float64       
 9   state                375791 non-null  object        
 10  backers              375791 non-null  int64         
 11  country              375791 non-null  object        
 12  usd pledged          372099 non-null  float64       
 13  usd_pledged_re

In [9]:
# Drop NA
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372095 entries, 0 to 378660
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID                   372095 non-null  int64         
 1   name                 372095 non-null  object        
 2   category             372095 non-null  object        
 3   main_category        372095 non-null  object        
 4   currency             372095 non-null  object        
 5   deadline             372095 non-null  datetime64[ns]
 6   goal                 372095 non-null  float64       
 7   launched             372095 non-null  datetime64[ns]
 8   pledged              372095 non-null  float64       
 9   state                372095 non-null  object        
 10  backers              372095 non-null  int64         
 11  country              372095 non-null  object        
 12  usd pledged          372095 non-null  float64       
 13  usd_pledged_re

Dropped NAs because there were few and wouldn't affect the data. 

In [10]:
# Create copy of df 
df2 = df.copy()

# Perform one-hot encoding on categorical features
#df_encoded = pd.get_dummies(df2, columns=['category', 'currency'])

#df_encoded.head()

Removed country from the ML analysis because it lacked variance. 

In [11]:
df2.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,year_deadline,month_deadline,year_launched,month_launched,year_month_deadline,year_month_launched,target
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:00,0.0,failed,0,GB,0.0,0.0,1533.95,2015,10,2015,8,2015-10,2015-8,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:00,2421.0,failed,15,US,100.0,2421.0,30000.0,2017,11,2017,9,2017-11,2017-9,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:00,220.0,failed,3,US,220.0,220.0,45000.0,2013,2,2013,1,2013-2,2013-1,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:00,1.0,failed,1,US,1.0,1.0,5000.0,2012,4,2012,3,2012-4,2012-3,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:00,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015,8,2015,7,2015-8,2015-7,0


In [12]:
df2['main_category'].value_counts().sort_values(ascending=False)

Film & Video    62523
Music           49470
Publishing      39304
Games           34200
Technology      32048
Design          29395
Art             28124
Food            24557
Fashion         22727
Theater         10908
Photography     10770
Comics          10748
Crafts           8806
Journalism       4749
Dance            3766
Name: main_category, dtype: int64

In [13]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df2['category'].value_counts().sort_values(ascending=False)

Product Design        21740
Documentary           16085
Tabletop Games        13605
Music                 13320
Shorts                12354
Food                  11462
Video Games           11419
Film & Video           9205
Fiction                9166
Fashion                8514
Nonfiction             8307
Art                    8241
Apparel                7145
Theater                7053
Technology             6813
Rock                   6749
Children's Books       6745
Apps                   6339
Photography            5747
Webseries              5743
Indie Rock             5650
Publishing             5512
Narrative Film         5171
Web                    5148
Comics                 4959
Crafts                 4661
Country & Folk         4450
Design                 4120
Hip-Hop                3907
Hardware               3551
Pop                    3348
Painting               3293
Games                  3207
Illustration           3169
Accessories            3148
Public Art          

In [14]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 372095 entries, 0 to 378660
Data columns (total 22 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID                   372095 non-null  int64         
 1   name                 372095 non-null  object        
 2   category             372095 non-null  object        
 3   main_category        372095 non-null  object        
 4   currency             372095 non-null  object        
 5   deadline             372095 non-null  datetime64[ns]
 6   goal                 372095 non-null  float64       
 7   launched             372095 non-null  datetime64[ns]
 8   pledged              372095 non-null  float64       
 9   state                372095 non-null  object        
 10  backers              372095 non-null  int64         
 11  country              372095 non-null  object        
 12  usd pledged          372095 non-null  float64       
 13  usd_pledged_re

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import compute_class_weight

# Perform one-hot encoding on categorical features
df_encoded = pd.get_dummies(df2, columns=['main_category', 
                                          'currency', 
                                          'year_month_deadline', 
                                          'year_month_launched',
                                          #'country'
                                         ])

In [16]:
# Prepare the feature matrix X and the target variable y
X = df_encoded.drop(columns=['target', 'state', 'deadline', 'launched','category', 
                             'name','ID','country', 'usd pledged', 'pledged', 'year_deadline', 
                             'month_deadline', 'year_launched', 'month_launched', 'goal'])

y = df_encoded['target']  # Assign the target variable

df_encoded['target'].value_counts()

0    240916
1    131179
Name: target, dtype: int64

In [18]:
from sklearn.preprocessing import StandardScaler

# Normalize 'backers', 'usd_pledged_real', and 'usd_goal_real' using StandardScaler
scaler = StandardScaler()
X[['backers','usd_pledged_real', 'usd_goal_real']] = scaler.fit_transform(X[['backers','usd_pledged_real', 'usd_goal_real']])

X.head(15)

Unnamed: 0,backers,usd_pledged_real,usd_goal_real,main_category_Art,main_category_Comics,main_category_Crafts,main_category_Dance,main_category_Design,main_category_Fashion,main_category_Film & Video,main_category_Food,main_category_Games,main_category_Journalism,main_category_Music,main_category_Photography,main_category_Publishing,main_category_Technology,main_category_Theater,currency_AUD,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD,year_month_deadline_2009-10,year_month_deadline_2009-11,year_month_deadline_2009-12,year_month_deadline_2009-5,year_month_deadline_2009-6,year_month_deadline_2009-7,year_month_deadline_2009-8,year_month_deadline_2009-9,year_month_deadline_2010-1,year_month_deadline_2010-10,year_month_deadline_2010-11,year_month_deadline_2010-12,year_month_deadline_2010-2,year_month_deadline_2010-3,year_month_deadline_2010-4,year_month_deadline_2010-5,year_month_deadline_2010-6,year_month_deadline_2010-7,year_month_deadline_2010-8,year_month_deadline_2010-9,year_month_deadline_2011-1,year_month_deadline_2011-10,year_month_deadline_2011-11,year_month_deadline_2011-12,year_month_deadline_2011-2,year_month_deadline_2011-3,year_month_deadline_2011-4,year_month_deadline_2011-5,year_month_deadline_2011-6,year_month_deadline_2011-7,year_month_deadline_2011-8,year_month_deadline_2011-9,year_month_deadline_2012-1,year_month_deadline_2012-10,year_month_deadline_2012-11,year_month_deadline_2012-12,year_month_deadline_2012-2,year_month_deadline_2012-3,year_month_deadline_2012-4,year_month_deadline_2012-5,year_month_deadline_2012-6,year_month_deadline_2012-7,year_month_deadline_2012-8,year_month_deadline_2012-9,year_month_deadline_2013-1,year_month_deadline_2013-10,year_month_deadline_2013-11,year_month_deadline_2013-12,year_month_deadline_2013-2,year_month_deadline_2013-3,year_month_deadline_2013-4,year_month_deadline_2013-5,year_month_deadline_2013-6,year_month_deadline_2013-7,year_month_deadline_2013-8,year_month_deadline_2013-9,year_month_deadline_2014-1,year_month_deadline_2014-10,year_month_deadline_2014-11,year_month_deadline_2014-12,year_month_deadline_2014-2,year_month_deadline_2014-3,year_month_deadline_2014-4,year_month_deadline_2014-5,year_month_deadline_2014-6,year_month_deadline_2014-7,year_month_deadline_2014-8,year_month_deadline_2014-9,year_month_deadline_2015-1,year_month_deadline_2015-10,year_month_deadline_2015-11,year_month_deadline_2015-12,year_month_deadline_2015-2,year_month_deadline_2015-3,year_month_deadline_2015-4,year_month_deadline_2015-5,year_month_deadline_2015-6,year_month_deadline_2015-7,year_month_deadline_2015-8,year_month_deadline_2015-9,year_month_deadline_2016-1,year_month_deadline_2016-10,year_month_deadline_2016-11,year_month_deadline_2016-12,year_month_deadline_2016-2,year_month_deadline_2016-3,year_month_deadline_2016-4,year_month_deadline_2016-5,year_month_deadline_2016-6,year_month_deadline_2016-7,year_month_deadline_2016-8,year_month_deadline_2016-9,year_month_deadline_2017-1,year_month_deadline_2017-10,year_month_deadline_2017-11,year_month_deadline_2017-12,year_month_deadline_2017-2,year_month_deadline_2017-3,year_month_deadline_2017-4,year_month_deadline_2017-5,year_month_deadline_2017-6,year_month_deadline_2017-7,year_month_deadline_2017-8,year_month_deadline_2017-9,year_month_deadline_2018-1,year_month_deadline_2018-2,year_month_deadline_2018-3,year_month_launched_2009-10,year_month_launched_2009-11,year_month_launched_2009-12,year_month_launched_2009-4,year_month_launched_2009-5,year_month_launched_2009-6,year_month_launched_2009-7,year_month_launched_2009-8,year_month_launched_2009-9,year_month_launched_2010-1,year_month_launched_2010-10,year_month_launched_2010-11,year_month_launched_2010-12,year_month_launched_2010-2,year_month_launched_2010-3,year_month_launched_2010-4,year_month_launched_2010-5,year_month_launched_2010-6,year_month_launched_2010-7,year_month_launched_2010-8,year_month_launched_2010-9,year_month_launched_2011-1,year_month_launched_2011-10,year_month_launched_2011-11,year_month_launched_2011-12,year_month_launched_2011-2,year_month_launched_2011-3,year_month_launched_2011-4,year_month_launched_2011-5,year_month_launched_2011-6,year_month_launched_2011-7,year_month_launched_2011-8,year_month_launched_2011-9,year_month_launched_2012-1,year_month_launched_2012-10,year_month_launched_2012-11,year_month_launched_2012-12,year_month_launched_2012-2,year_month_launched_2012-3,year_month_launched_2012-4,year_month_launched_2012-5,year_month_launched_2012-6,year_month_launched_2012-7,year_month_launched_2012-8,year_month_launched_2012-9,year_month_launched_2013-1,year_month_launched_2013-10,year_month_launched_2013-11,year_month_launched_2013-12,year_month_launched_2013-2,year_month_launched_2013-3,year_month_launched_2013-4,year_month_launched_2013-5,year_month_launched_2013-6,year_month_launched_2013-7,year_month_launched_2013-8,year_month_launched_2013-9,year_month_launched_2014-1,year_month_launched_2014-10,year_month_launched_2014-11,year_month_launched_2014-12,year_month_launched_2014-2,year_month_launched_2014-3,year_month_launched_2014-4,year_month_launched_2014-5,year_month_launched_2014-6,year_month_launched_2014-7,year_month_launched_2014-8,year_month_launched_2014-9,year_month_launched_2015-1,year_month_launched_2015-10,year_month_launched_2015-11,year_month_launched_2015-12,year_month_launched_2015-2,year_month_launched_2015-3,year_month_launched_2015-4,year_month_launched_2015-5,year_month_launched_2015-6,year_month_launched_2015-7,year_month_launched_2015-8,year_month_launched_2015-9,year_month_launched_2016-1,year_month_launched_2016-10,year_month_launched_2016-11,year_month_launched_2016-12,year_month_launched_2016-2,year_month_launched_2016-3,year_month_launched_2016-4,year_month_launched_2016-5,year_month_launched_2016-6,year_month_launched_2016-7,year_month_launched_2016-8,year_month_launched_2016-9,year_month_launched_2017-1,year_month_launched_2017-10,year_month_launched_2017-11,year_month_launched_2017-12,year_month_launched_2017-2,year_month_launched_2017-3,year_month_launched_2017-4,year_month_launched_2017-5,year_month_launched_2017-6,year_month_launched_2017-7,year_month_launched_2017-8,year_month_launched_2017-9,year_month_launched_2018-1,year_month_launched_2070-1
0,-0.390168,-0.258313,-0.037827,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,-0.302298,-0.149789,-0.013327,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,-0.372594,-0.248451,-0.000417,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,-0.38431,-0.258268,-0.034844,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,-0.308156,-0.200801,-0.022364,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0.922023,2.089438,0.003886,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,-0.29644,-0.204297,-0.038287,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,-0.155849,-0.238006,-0.017631,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,-0.050405,0.110738,0.068436,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,-0.138275,0.021426,0.016796,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import statsmodels.api as sm
import numpy as np

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Function to perform logistic regression and calculate p-values
def logistic_regression_with_pvalues(X_train, y_train, X_test, y_test):
    # Perform Logistic Regression with class weights and regularization
    logreg = LogisticRegression(class_weight='balanced', penalty='l2', C=1.0, max_iter=100000, random_state=10)
    logreg.fit(X_train, y_train)

    # Get feature importances (coefficients) from the logistic regression model
    feature_importance = logreg.coef_[0]

    # Print feature importances (you can also plot them if you prefer)
    print("Feature Importances:")
    for feature, importance in zip(X_train.columns, feature_importance):
        print(f"{feature}: {importance}")

    # Make predictions on the test set
    y_pred = logreg.predict(X_test)

    # Compute classification metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("\nClassification Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"AUC: {auc:.2f}")
    print(f"F1 Score: {f1:.2f}")

    # Fit logistic regression using statsmodels to get p-values
    X_train_with_constant = sm.add_constant(X_train)
    logit_model = sm.Logit(y_train, X_train_with_constant)
    result = logit_model.fit()

    # Filter features with p-values < 0.05
    significant_features = result.pvalues[1:].index[result.pvalues[1:] < 0.05]
    print("\nSignificant Features (p-value < 0.05):")
    print(significant_features)

    # Add epsilon to avoid divide by zero in log function
    epsilon = 1e-8
    result.pvalues = np.maximum(result.pvalues, epsilon)

# Assuming you already have X_train, X_test, y_train, y_test from the previous code
logistic_regression_with_pvalues(X_train, y_train, X_test, y_test)

Feature Importances:
backers: 3.821338782997042
usd_pledged_real: 4.157584325448635
usd_goal_real: -149.2388015589726
main_category_Art: 0.3956117057690875
main_category_Comics: 0.24931314931241202
main_category_Crafts: -0.19371538629703855
main_category_Dance: 1.0551454650025538
main_category_Design: -0.6238287649185625
main_category_Fashion: -0.41202027062670943
main_category_Film & Video: 0.28328889516521966
main_category_Food: -0.31287187031872693
main_category_Games: -0.8132371191048826
main_category_Journalism: -0.29822101287281544
main_category_Music: 0.456317213923315
main_category_Photography: -0.07146943023114137
main_category_Publishing: -0.12161339853310858
main_category_Technology: -0.8961563929818792
main_category_Theater: 1.011307363139659
currency_AUD: -0.06201991428617002
currency_CAD: 0.01445812057447102
currency_CHF: -0.09083585100905302
currency_DKK: 0.2617051359559568
currency_EUR: -0.188364554570937
currency_GBP: 0.2334693303702215
currency_HKD: -0.010014374537070


Classification Metrics:
Accuracy: 0.92
Precision: 0.87
Recall: 0.89
AUC: 0.91
F1 Score: 0.88


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


         Current function value: inf
         Iterations: 35


LinAlgError: Singular matrix

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import numpy as np

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Function to perform decision tree classification and calculate metrics
def decision_tree_classification(X_train, y_train, X_test, y_test):
    # Initialize the Decision Tree Classifier with class weights
    class_weights = {0: 1.0, 1: 10.0}  # Adjust the weights according to your dataset
    dt_classifier = DecisionTreeClassifier(class_weight=class_weights, random_state=10)

    # Train the Decision Tree Classifier
    dt_classifier.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = dt_classifier.predict(X_test)

    # Compute classification metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("\nDecision Tree Classification Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"AUC: {auc:.2f}")
    print(f"F1 Score: {f1:.2f}")

# Assuming you already have X_train, X_test, y_train, y_test from the previous code
decision_tree_classification(X_train, y_train, X_test, y_test)



Decision Tree Classification Metrics:
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
AUC: 0.99
F1 Score: 0.99


# DO NOT TOUCH BELOW

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import statsmodels.api as sm

def logistic_regression_with_pvalues(X_train, y_train, X_test, y_test):
    # Calculate class weights
    #class_counts = y_train.value_counts()
    #class_weight_0 = class_counts[1] / (class_counts[0] + class_counts[1])
    #class_weight_1 = class_counts[0] / (class_counts[0] + class_counts[1])

    # Perform Logistic Regression with class weights
    logreg = LogisticRegression(class_weight='balanced')
    logreg.fit(X_train, y_train)

    # Get feature importances (coefficients) from the logistic regression model
    feature_importance = logreg.coef_[0]

    # Print feature importances (you can also plot them if you prefer)
    print("Feature Importances:")
    for feature, importance in zip(X_train.columns, feature_importance):
        print(f"{feature}: {importance}")

    # Make predictions on the test set
    y_pred = logreg.predict(X_test)

    # Compute classification metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("\nClassification Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"AUC: {auc:.2f}")
    print(f"F1 Score: {f1:.2f}")

    # Fit logistic regression using statsmodels to get p-values
    X_train_with_constant = sm.add_constant(X_train)
    logit_model = sm.Logit(y_train, X_train_with_constant)
    result = logit_model.fit()

    # Filter features with p-values < 0.05
    significant_features = result.pvalues[1:].index[result.pvalues[1:] < 0.05]
    print("\nSignificant Features (p-value < 0.05):")
    print(significant_features)

# Assuming you already have X_train, X_test, y_train, y_test from the previous code
logistic_regression_with_pvalues(X_train, y_train, X_test, y_test)
