Importing all the libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import plotly.tools as tls
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import warnings
from collections import Counter

Reading the data from the csv file and exploring the dataset.

In [2]:
# Load the project records from disk and report the table's dimensions.
df = pd.read_csv('data.csv')
n_rows, n_cols = df.shape
print('The dataset contains {} rows and {} columns'.format(n_rows, n_cols))
print()

# Preview the first five records.
df.head()

The dataset contains 378661 rows and 15 columns



Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


Let us see if the dataset contains any missing values

In [3]:
# Checking for missing values: per column, the number of nulls and the
# fraction of rows that are null, with the most incomplete columns first.
null_mask = df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

Unnamed: 0,Total,Percent
usd pledged,3797,0.010027
name,4,1.1e-05
usd_goal_real,0,0.0
usd_pledged_real,0,0.0
country,0,0.0
backers,0,0.0
state,0,0.0
pledged,0,0.0
launched,0,0.0
goal,0,0.0


The columns ```usd pledged``` and ```name``` have a few missing values, but we can ignore both of them: the name of a project is unlikely to play much of a part in predicting its state, and instead of ```usd pledged``` we can use the ```usd_pledged_real``` column, which has no missing values.

Now let us see how many unique values each column in the dataset has.

In [4]:
# Count the distinct values in every column.
df.nunique()

ID                  378661
name                375764
category               159
main_category           15
currency                14
deadline              3164
goal                  8353
launched            378089
pledged              62130
state                    6
backers               3963
country                 23
usd pledged          95455
usd_pledged_real    106065
usd_goal_real        50339
dtype: int64

## Exploratory Data Analysis

Let us see the distribution of the final state of the projects, in percent.

In [5]:
# Percentage of all projects that ended in each state, rounded to 2 decimals.
# (The name is historical: it covers every state, not only 'successful'.)
success_percent = round(df["state"].value_counts() / len(df["state"]) * 100, 2)

print("Final States in %: ")
print(success_percent)

# One pie slice per state.
labels = list(success_percent.index)
values = list(success_percent.values)

# NOTE(review): only a single colour is supplied for six slices; the other
# slices presumably fall back to plotly's default palette - confirm intent.
data = [go.Pie(labels=labels, values=values, marker=dict(colors=['red']))]

# Title spelling corrected ("Distribuition" -> "Distribution").
layout = go.Layout(title='Distribution of States', legend=dict(orientation="h"))

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Final States in %: 
failed        52.22
successful    35.38
canceled      10.24
undefined      0.94
live           0.74
suspended      0.49
Name: state, dtype: float64


As we can see, most projects on Kickstarter have failed. Let us see which category has the most failed projects and which category has the most successful ones.

In [6]:
# Main category of every project whose final state is 'failed'.
failed_categories = df.loc[df['state'] == 'failed', 'main_category']

# Each main category's share of all failed projects, in percent (2 decimals).
failed_counts = failed_categories.value_counts()
failed_percentage = round(failed_counts / len(failed_categories) * 100, 2)

layout = go.Layout(title='Failed Projects Percentage vs Main Category',
                   xaxis={'title': 'Category'},
                   yaxis={'title': 'Percentage'})
data = [go.Bar(x=failed_percentage.index, y=failed_percentage.values)]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [7]:
# Main category of every project whose final state is 'successful'.
success_categories = df['main_category'][df['state']=='successful']

# Each main category's share of all successful projects, in percent.
# The denominator must be the number of successful projects; dividing by the
# number of failed projects (as a copy of the failed-projects cell would do)
# yields values that do not sum to 100.
success_percentage = round(success_categories.value_counts()/len(success_categories) * 100, 2)

data = [go.Bar(x = success_percentage.index,
             y = success_percentage.values)]

layout = go.Layout(title = 'Successful Projects Percentage vs Main Category',
                  xaxis = dict(title = 'Category'),
                  yaxis = dict(title = 'Percentage'))

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Now let us compare the goal amount with the pledged amount for failed and successful projects.

In [8]:
# Only rows with index labels 0..SCATTER_LAST_LABEL (inclusive, because .loc
# slices by label) are plotted - presumably to keep the scatter plot light.
SCATTER_LAST_LABEL = 10000
scatter_rows = df.loc[:SCATTER_LAST_LABEL, ['goal', 'pledged', 'state']]

# Build the masks on the subset itself so the boolean index is aligned with
# the rows being filtered (no chained indexing with a full-frame mask).
is_successful = scatter_rows['state'] == 'successful'
is_failed = scatter_rows['state'] == 'failed'

success_goal = scatter_rows.loc[is_successful, 'goal']
success_pledged = scatter_rows.loc[is_successful, 'pledged']
fail_goal = scatter_rows.loc[is_failed, 'goal']
fail_pledged = scatter_rows.loc[is_failed, 'pledged']

data = [go.Scatter(
    x = success_pledged,
    y = success_goal,
    mode = 'markers',
    name = 'successful'),
       go.Scatter(
    x = fail_pledged,
    y = fail_goal,
    mode = 'markers',
    name = 'failed')]

layout = go.Layout(title = 'Goal Amount vs Pledged Amount',
                  xaxis = dict(title = 'Pledged Amount'),
                  yaxis = dict(title = 'Goal Amount'))

fig = go.Figure(data=data, layout=layout)
iplot(fig)

From the plot above, we can say that projects for which the goal amount is more than the pledged amount have failed.
Let us compare the average number of backers for projects in different states.

In [9]:
def get_avg_backers(df):
    """Return the distinct project states and the mean backer count of each.

    Parameters
    ----------
    df : DataFrame with a 'state' column and a numeric 'backers' column.

    Returns
    -------
    (states, Avg) : the unique values of ``df['state']`` in order of first
        appearance, and a list holding the mean of 'backers' for each of
        those states, in the same order.
    """
    states = df['state'].unique()
    Avg = [np.mean(df.loc[df['state'] == label, 'backers']) for label in states]
    return states, Avg

# S: the distinct project states; B: the mean number of backers per state.
S, B = get_avg_backers(df)

data = [go.Bar(x = S,
              y = B)]

# The title now describes this chart; it previously carried the title of the
# successful-projects-by-category chart.
layout = go.Layout(title = 'Average Number of Backers per State',
                  xaxis = dict(title = 'State'),
                  yaxis = dict(title = 'Average Backers'))

fig = go.Figure(data=data, layout=layout)
iplot(fig)

We can observe that the higher the number of backers, the higher the chance that the project is a success.

Let us see if the country from which the project was launched has any impact on the state of the project.

In [10]:
def get_country_stats(df):
    """Return each distinct country and its percentage of successful projects.

    Parameters
    ----------
    df : DataFrame with 'country' and 'state' columns.

    Returns
    -------
    (countries, Avg) : the unique values of ``df['country']`` in order of
        first appearance, and a list with, for each country, the share of its
        rows whose state is 'successful', in percent rounded to 2 decimals.
    """
    countries = df['country'].unique()
    is_successful = df['state'] == 'successful'
    Avg = []
    for code in countries:
        from_country = df['country'] == code
        wins = (from_country & is_successful).sum()
        Avg.append(round(wins / from_country.sum() * 100, 2))
    return countries, Avg

# C: the distinct countries; A: success percentage of each country.
C, A = get_country_stats(df)

layout = go.Layout(title='Successful Projects Percentage per country',
                   xaxis={'title': 'country'},
                   yaxis={'title': 'Success Percentage'})
data = [go.Bar(x=C, y=A)]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

Thus the success percent varies significantly for different countries. Now let us see the yearly trend in state of a project.

In [11]:
#Converting the end date of the project to datetime object
def get_year(date_str):
    """Return the calendar year of a date given as a 'YYYY-MM-DD' string.

    Parameters
    ----------
    date_str : str in the exact format '%Y-%m-%d', e.g. '2015-10-09'.

    Returns
    -------
    int : the year component of the parsed date.

    Raises
    ------
    ValueError : if ``date_str`` does not match '%Y-%m-%d'.
    """
    # Use the standard-library datetime class: the `pd.datetime` alias was
    # deprecated in pandas 1.0 and removed in pandas 2.0.
    from datetime import datetime

    return datetime.strptime(date_str, '%Y-%m-%d').year

# Derive the deadline's calendar year for every project.
df['deadline_year'] = df['deadline'].apply(get_year)

In [12]:
# Preview the first rows again to check the new deadline_year column.
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,deadline_year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015


In [13]:
def get_yearly_stats(df):
    """Compute, per deadline year, the percentage of projects in each state.

    Parameters
    ----------
    df : DataFrame with 'deadline_year' and 'state' columns.

    Returns
    -------
    tuple ``(years, success, failure, live, cancelled, undefined, suspended)``
        ``years`` is the sorted list of distinct deadline years. Each of the
        other six entries is a list aligned with ``years`` giving the share of
        that year's projects whose state is respectively 'successful',
        'failed', 'live', 'canceled', 'undefined' or 'suspended', in percent
        rounded to 2 decimals.
    """
    years = sorted(df['deadline_year'].unique())

    # State labels in the order their lists appear in the returned tuple.
    state_order = ['successful', 'failed', 'live', 'canceled', 'undefined', 'suspended']
    percentages = {state: [] for state in state_order}

    for year in years:
        # The year mask and its row count are shared by all six states.
        in_year = df['deadline_year'] == year
        year_total = in_year.sum()
        for state in state_order:
            hits = (in_year & (df['state'] == state)).sum()
            percentages[state].append(round(hits / year_total * 100, 2))

    return (years,) + tuple(percentages[state] for state in state_order)

# Y: years; the remaining names hold the per-year percentage of projects that
# were successful, failed, live, canceled, undefined and suspended.
Y, SC, F, L, C, U, SP = get_yearly_stats(df)

# One bar trace per state, all sharing the same x axis of years.
series_by_state = [(SC, 'Successful'),
                   (F, 'Failed'),
                   (L, 'Live'),
                   (C, 'Canceled'),
                   (U, 'Undefined'),
                   (SP, 'Suspended')]
data = [go.Bar(x=Y, y=percentages, name=label)
        for percentages, label in series_by_state]

layout = go.Layout(title='State Percentage per Year',
                   xaxis={'title': 'Year'},
                   yaxis={'title': 'Percentage'})

fig = go.Figure(data=data, layout=layout)
iplot(fig)

As we can see, most of the projects in the current year have their state as ```live```.


## Feature Engineering

From the EDA, I decided to take ```category```, ```main_category```, ```deadline_year```, and ```country``` as the categorical variables and ```backers``` as the numerical variable for predicting the state of a project. I will also introduce one more categorical variable, named ```success_probability```, which is true when the pledged amount (in USD) reaches the goal amount or falls short of it by less than 5 USD, i.e. when goal − pledged < 5.


In [14]:
def success_probability(goal, pledged, tolerance=5):
    """Flag projects whose pledged amount reaches, or nearly reaches, the goal.

    Parameters
    ----------
    goal : scalar or array of goal amounts.
    pledged : scalar or array of pledged amounts; subtracted from ``goal``
        element-wise.
    tolerance : shortfall threshold, in the same unit as the amounts. A
        project is flagged when ``goal - pledged`` is strictly below it.
        The default of 5 keeps the previously hard-coded margin, so a project
        missing its goal by less than 5 is still flagged.

    Returns
    -------
    The result of ``(goal - pledged) < tolerance``: a boolean array for array
    inputs, a single boolean for scalars.
    """
    shortfall = goal - pledged
    return shortfall < tolerance

# Add the flag as a new column, computed from the USD goal and pledged amounts.
df['success_probability'] = success_probability(
    goal=df['usd_goal_real'].values,
    pledged=df['usd_pledged_real'].values,
)

Now let us divide the data into train set and test set

In [15]:
# Positional hold-out split: the first 75% of the rows (in their current
# order, without shuffling) become the training set, the remainder the test set.
split_ratio = 0.75
split = int(len(df) * split_ratio)
train, test = df.iloc[:split], df.iloc[split:]

Let us convert the categorical variables into one-hot vectors.

In [16]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Columns that are label-encoded and then one-hot encoded.
categorical_columns = ['country', 'category', 'main_category', 'success_probability']

#Getting the required categorical features
entire_corpus = df.loc[:, categorical_columns].values
train_features = train.loc[:, categorical_columns].values
test_features = test.loc[:, categorical_columns].values

#Encoding each categorical column one at a time
for i in range(train_features.shape[1]):
    # Fit on the full dataset so that every label occurring in either split
    # has an integer code before the splits are transformed.
    label_encoder = LabelEncoder()
    label_encoder.fit(entire_corpus[:, i])
    train_features[:, i] = label_encoder.transform(train_features[:, i])
    test_features[:, i] = label_encoder.transform(test_features[:, i])

#Adding the deadline_year column
train_features = np.concatenate((train_features, np.reshape(train['deadline_year'].values, (-1, 1))), axis=1)
test_features = np.concatenate((test_features, np.reshape(test['deadline_year'].values, (-1, 1))), axis=1)

#Converting all the encoded columns to one hot vectors
# OneHotEncoder encodes every input column by default; the former
# `categorical_features` argument was removed in scikit-learn 0.22 and now
# raises a TypeError. The encoder is fitted on the training split only, so
# handle_unknown='ignore' maps a value seen only in the test split to an
# all-zero group instead of aborting the transform.
onehotencoder = OneHotEncoder(handle_unknown='ignore')
train_features = onehotencoder.fit_transform(train_features).toarray()
test_features = onehotencoder.transform(test_features).toarray()

#Adding the numerical column to one hot vectors
train_features = np.concatenate((train_features, np.reshape(train['backers'].values, (-1, 1))), axis=1)
test_features = np.concatenate((test_features, np.reshape(test['backers'].values, (-1, 1))), axis=1)

#Encoding the target labels
# A fresh encoder is created for the target instead of refitting whichever
# encoder the feature loop happened to leave behind.
label_encoder = LabelEncoder()
label_encoder.fit(df['state'].values)
y_train = label_encoder.transform(train['state'].values)
y_test = label_encoder.transform(test['state'].values)


## Evaluation Metrics: Accuracy


In [17]:
from sklearn.metrics import accuracy_score

## Prediction Models

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB

### Logistic Regression

In [19]:
# Fit a logistic-regression model with default settings on the training
# features, then score its predictions on the held-out split.
logistic_classifier = LogisticRegression().fit(train_features, y_train)
y_pred = logistic_classifier.predict(test_features)

logistic_accuracy = accuracy_score(y_test, y_pred)
print('Logistic Regression Accuracy : {}'.format(logistic_accuracy))

Logistic Regression Accuracy : 0.8908161325079754


### Naive Bayes

In [20]:
# Fit a Gaussian naive Bayes model on the training features, then score its
# predictions on the held-out split.
nb_classifier = GaussianNB().fit(train_features, y_train)
y_pred = nb_classifier.predict(test_features)

nb_accuracy = accuracy_score(y_test, y_pred)
print('Naive Bayes Classifier Accuracy : {}'.format(nb_accuracy))

Naive Bayes Classifier Accuracy : 0.67156106733146


### Random Forest Classifier

In [24]:
# A random forest is stochastic (bootstrap samples and feature sub-sampling);
# fixing random_state makes the reported accuracy reproducible across runs.
rf_classifier = RandomForestClassifier(n_estimators = 20, random_state = 42)
rf_classifier.fit(train_features, y_train)
y_pred = rf_classifier.predict(test_features)

print('Random Forest Classifier Accuracy : {}'.format(accuracy_score(y_test, y_pred)))

Random Forest Classifier Accuracy : 0.8714321931844591


### XGBoost Classifier

In [22]:
# Fit an XGBoost classifier with default settings on the training features,
# then score its predictions on the held-out split.
xgb_classifier = XGBClassifier()
xgb_classifier.fit(train_features, y_train)
y_pred = xgb_classifier.predict(test_features)

xgb_accuracy = accuracy_score(y_test, y_pred)
print('XGBoost Classifier Accuracy : {}'.format(xgb_accuracy))

XGBoost Classifier Accuracy : 0.8909851477827309



The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



### LightGBM Classifier

In [23]:
# Fit a LightGBM classifier with default settings on the training features,
# then score its predictions on the held-out split.
lgb_classifier = LGBMClassifier()
lgb_classifier.fit(train_features, y_train)
y_pred = lgb_classifier.predict(test_features)

lgb_accuracy = accuracy_score(y_test, y_pred)
print('LightGBM Classifier Accuracy : {}'.format(lgb_accuracy))

LightGBM Classifier Accuracy : 0.8912069803308474



The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.



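We can see that the LightGBM Classifier has achieved the best accuracy (0.8912), although only narrowly ahead of XGBoost (0.8910) and Logistic Regression (0.8908). We will therefore choose this model for classification.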