#### Importing Libraries/Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

#### Reading File

In [2]:
df2016=pd.read_csv('ks-projects-201612.csv')
df2018=pd.read_csv('ks-projects-201801.csv')

#### Data Cleaning

In [3]:
df2016.columns

Index(['ID ', 'name ', 'category ', 'main_category ', 'currency ', 'deadline ',
       'goal ', 'launched ', 'pledged ', 'state ', 'backers ', 'country ',
       'usd pledged ', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15',
       'Unnamed: 16'],
      dtype='object')

In [4]:
df2016=df2016.loc[df2016['currency '].isin(['GBP', 'USD', 'CAD', 'NOK', 'AUD', 'EUR', 'MXN', 'SEK', 'NZD',
       'CHF', 'DKK', 'HKD'])]

In [5]:
df2016=df2016[df2016['country ']!='N,"0']

In [6]:
df2016.drop(['Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15',
       'Unnamed: 16'],inplace=True,axis=1)

In [7]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
Index: 319209 entries, 0 to 323749
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ID              319209 non-null  int64 
 1   name            319205 non-null  object
 2   category        319209 non-null  object
 3   main_category   319209 non-null  object
 4   currency        319209 non-null  object
 5   deadline        319209 non-null  object
 6   goal            319209 non-null  object
 7   launched        319209 non-null  object
 8   pledged         319209 non-null  object
 9   state           319209 non-null  object
 10  backers         319209 non-null  object
 11  country         319209 non-null  object
 12  usd pledged     319209 non-null  object
dtypes: int64(1), object(12)
memory usage: 34.1+ MB


In [8]:
df2016.columns = df2016.columns.str.strip()

In [9]:
df2016['name']=df2016['name'].astype(str)
df2016['category']=df2016['category'].astype(str)
df2016['main_category']=df2016['main_category'].astype(str)
df2016['currency']=df2016['currency'].astype(str)
df2016['state']=df2016['state'].astype(str)
df2016['goal']=pd.to_numeric(df2016['goal'])
df2016['backers']=pd.to_numeric(df2016['backers'])
df2016['country']=df2016['country'].astype(str)
df2016['usd pledged']=pd.to_numeric(df2016['usd pledged'])
df2016['pledged']=pd.to_numeric(df2016['pledged'])

In [10]:
df2016['deadline']=pd.to_datetime(df2016['deadline'],format="%d-%m-%Y %H:%M")
df2016['deadline_date']=df2016['deadline'].dt.date

df2016['launched']=pd.to_datetime(df2016['launched'],format="%d-%m-%Y %H:%M")
df2016['launched_date']=df2016['launched'].dt.date

In [11]:
df2016.drop(['deadline','launched','pledged','ID','name'],axis=1,inplace=True)

##### Data Distribution

In [12]:
main_category_counts = df2016['main_category'].value_counts()
category_counts = df2016['category'].value_counts()
state_counts = df2016['state'].value_counts()

In [13]:
categorical_columns=[]
numeric_columns=[]
time_columns=[]

for column in df2016.columns.tolist():
    if df2016[column].dtype=='object':
        categorical_columns.append(column)
    else:
        numeric_columns.append(column)

In [14]:
time_columns.append(categorical_columns.pop(-1))
time_columns.append(categorical_columns.pop(-1))



In [15]:
print(categorical_columns,numeric_columns,time_columns)

['category', 'main_category', 'currency', 'state', 'country'] ['goal', 'backers', 'usd pledged'] ['launched_date', 'deadline_date']


In [16]:
df2016.info()

<class 'pandas.core.frame.DataFrame'>
Index: 319209 entries, 0 to 323749
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   category       319209 non-null  object 
 1   main_category  319209 non-null  object 
 2   currency       319209 non-null  object 
 3   goal           319209 non-null  float64
 4   state          319209 non-null  object 
 5   backers        319209 non-null  int64  
 6   country        319209 non-null  object 
 7   usd pledged    319209 non-null  float64
 8   deadline_date  319209 non-null  object 
 9   launched_date  319209 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 26.8+ MB


In [17]:
df2016.head()

Unnamed: 0,category,main_category,currency,goal,state,backers,country,usd pledged,deadline_date,launched_date
0,Poetry,Publishing,GBP,1000.0,failed,0,GB,0.0,2015-10-09,2015-08-11
1,Narrative Film,Film & Video,USD,45000.0,failed,3,US,220.0,2013-02-26,2013-01-12
2,Music,Music,USD,5000.0,failed,1,US,1.0,2012-04-16,2012-03-17
3,Film & Video,Film & Video,USD,19500.0,canceled,14,US,1283.0,2015-08-29,2015-07-04
4,Restaurants,Food,USD,50000.0,successful,224,US,52375.0,2016-04-01,2016-02-26


##### Label Encoding for Categorical Columns

In [18]:
labelencoder=LabelEncoder()


for column in categorical_columns:
    df2016[column]=labelencoder.fit_transform(df2016[column])

##### Find Correlations

In [19]:
df2016_cor=df2016[['category', 'main_category', 'currency', 'goal', 'state', 'backers','country', 'usd pledged']]

In [20]:
df2016_cor.corr()

Unnamed: 0,category,main_category,currency,goal,state,backers,country,usd pledged
category,1.0,0.203767,-0.008892,0.002917,0.015053,0.038119,-0.009573,0.02593
main_category,0.203767,1.0,-0.015602,0.004012,-0.020386,0.000569,-0.014601,0.005622
currency,-0.008892,-0.015602,1.0,-0.006009,0.073745,0.011655,0.985688,0.011786
goal,0.002917,0.004012,-0.006009,1.0,-0.022909,0.004921,-0.004876,0.006278
state,0.015053,-0.020386,0.073745,-0.022909,1.0,0.112615,0.070069,0.095593
backers,0.038119,0.000569,0.011655,0.004921,0.112615,1.0,0.010997,0.714973
country,-0.009573,-0.014601,0.985688,-0.004876,0.070069,0.010997,1.0,0.011194
usd pledged,0.02593,0.005622,0.011786,0.006278,0.095593,0.714973,0.011194,1.0


In [21]:
# Since Country and Currency are highly correlated, we can remove one of the columns

df2016.drop('currency',axis=1,inplace=True)

In [38]:
df2016.head()

Unnamed: 0,category,main_category,goal,state,backers,country,usd pledged,deadline_date,launched_date
0,107,12,1000.0,1,0,9,0.0,2015-10-09,2015-08-11
1,92,6,45000.0,1,3,19,220.0,2013-02-26,2013-01-12
2,89,10,5000.0,1,1,19,1.0,2012-04-16,2012-03-17
3,55,6,19500.0,0,14,19,1283.0,2015-08-29,2015-07-04
4,122,7,50000.0,3,224,19,52375.0,2016-04-01,2016-02-26


In [25]:
Predictor_Vars=df2016[['category', 'main_category', 'goal', 'backers','country', 'usd pledged']]
Response_Var=df2016['state']

# Splitting the dataset into Training and Test set
X_train,X_test,y_train,y_test=train_test_split(Predictor_Vars,Response_Var,train_size=0.8)

##### Initializing the models


In [26]:
dt_model=DecisionTreeClassifier()
rf_model=RandomForestClassifier()
gb_model=GradientBoostingClassifier()
ab_model=AdaBoostClassifier()

##### Training the models

In [27]:
# Decision Tree 
dt_model.fit(X_train,y_train)

In [28]:
# Random Forest
rf_model.fit(X_train,y_train)

In [29]:
# Gradient Boost
gb_model.fit(X_train,y_train)

In [30]:
# Ada boost
ab_model.fit(X_train,y_train)

##### Testing the model with test data set

In [31]:
dt_pred=dt_model.predict(X_test)
rf_pred=rf_model.predict(X_test)
gb_pred=gb_model.predict(X_test)
ab_pred=ab_model.predict(X_test)

##### Accuracy Score of the models


In [32]:
print("Decision Tree=",accuracy_score(y_test,dt_pred))
print("Random Forest=",accuracy_score(y_test,rf_pred))
print("Gradient Boost=",accuracy_score(y_test,gb_pred))
print("Adaboost=",accuracy_score(y_test,ab_pred))

Decision Tree= 0.792675668055512
Random Forest= 0.8543591992732057
Gradient Boost= 0.8578365339431722
Adaboost= 0.8212462015601015


##### Confusion Matrix

In [47]:
models=["DecisionTreeClassifier","RandomForestClassifier","GradientBoostingClassifier","AdaBoostClassifier"]
predictions=[dt_pred,rf_pred,gb_pred,ab_pred]


for i in range(len(models)):
    print(models[i])
    print(confusion_matrix(y_test,predictions[i]))
    print("---------------------------------")

DecisionTreeClassifier
[[ 1174  4998    93   184    38]
 [ 4839 27507   518   491   142]
 [   95   505    89   135    10]
 [  221   437   167 21832    52]
 [   50   204     4    53     4]]
---------------------------------
RandomForestClassifier
[[  446  5822    20   184    15]
 [ 1213 31792    64   404    24]
 [   26   609    37   161     1]
 [   23   409     6 22268     3]
 [   19   234     1    60     1]]
---------------------------------
GradientBoostingClassifier
[[    7  6210    10   259     1]
 [    9 32537    19   929     3]
 [    0   644    56   134     0]
 [    2   521    19 22165     2]
 [    0   251     0    63     1]]
---------------------------------
AdaBoostClassifier
[[    1  6007     7   469     3]
 [    6 31117    50  2313    11]
 [    0   661    20   153     0]
 [    3  1373    40 21291     2]
 [    0   240     5    69     1]]
---------------------------------


##### Parameter Tuning

In [48]:
from sklearn.model_selection import RandomizedSearchCV

##### Multiple parameter options for each model

In [49]:
dt_params={
    "criterion":["gini", "entropy", "log_loss"],
    "splitter":["best","random"],
    "max_depth":[5,10,15,20,50,100],
    
}

In [51]:
rf_params={
    "criterion":["gini","entropy","log_loss"],
    "max_depth":[5,10,15,20,50,100,None],
    "max_features":["sqrt", "log2", None]
}

In [52]:
gb_params={
    "loss":["log_loss","exponential"],
    "learning_rate":[0.1,0.5,1,2,10,100],
    "n_estimators":[10,20,50,100],
    "criterion":['friedman_mse','squared_error']
}

In [53]:
ab_params={
    "learning_rate":[0.1,0.5,1,2,10,100],
    "n_estimators":[10,20,50,100]    
}

##### Passing the parameters to the model functions

In [57]:
models=["DecisionTreeClassifier","RandomForestClassifier","GradientBoostingClassifier","AdaBoostClassifier"]
model_initializations=[dt_model,rf_model,gb_model,ab_model]
model_params=[dt_params,rf_params,gb_params,ab_params]

In [58]:
for i in range(len(model_initializations)):
    random_search=RandomizedSearchCV(
        estimator=model_initializations[i],
        param_distributions=model_params[i]

    )
    random_search.fit(X_train,y_train)
    print(models[i])
    print(random_search.best_params_)
    print('\n\n---------------------------\n\n')


DecisionTreeClassifier
{'splitter': 'best', 'max_depth': 5, 'criterion': 'entropy'}


---------------------------


RandomForestClassifier
{'max_features': None, 'max_depth': 10, 'criterion': 'log_loss'}


---------------------------


GradientBoostingClassifier
{'n_estimators': 100, 'loss': 'log_loss', 'learning_rate': 1, 'criterion': 'friedman_mse'}


---------------------------


AdaBoostClassifier
{'n_estimators': 100, 'learning_rate': 0.5}


---------------------------


