In [None]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 67)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale = 1)
%matplotlib inline
plt.style.use('ggplot')

np.random.seed(7)
from prettytable import PrettyTable
import warnings
warnings.filterwarnings('ignore')

from IPython.core.pylabtools import figsize

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE

from keras.models import Sequential
from keras import layers
from keras.layers import LSTM,Dropout
from keras.layers import Dense,Conv1D,MaxPooling1D
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the Kickstarter campaigns CSV, using its first column as the DataFrame index.
kickstarter = pd.read_csv('../input/kickstarter-campaigns-dataset/kickstarter_data_full.csv', index_col=0)

In [None]:
# The dataset has 20632 entries and 67 features (as reported by the author).
# Only a cell's last expression is displayed automatically, so the shape is
# printed explicitly before info() writes its per-column dtype/non-null report.
print(kickstarter.shape)
kickstarter.info()

In [None]:
# Heatmap of the null mask: per the author's reading, 'is_backing' and 'profile'
# are the columns that contain missing values.
sns.heatmap(kickstarter.isnull())

In [None]:
# 'friends', 'is_starred', 'is_backing' and 'permissions' look unusable.
# The null tally below is computed but not displayed, because it is not the
# last expression of the cell.
kickstarter['friends'].isnull().value_counts()

# Remove the four suspicious columns first ...
cols_to_drop = ['friends', 'is_starred', 'is_backing', 'permissions']
kickstarter.drop(columns=cols_to_drop, inplace=True)

# ... and then 'profile' as well.
kickstarter.drop(columns='profile', inplace=True)

In [None]:
# There are a lot of unnecessary features; collect them and drop them in one go.

# Identifier / media / URL style columns plus pre-computed helper columns.
misc_cols = [
    'id', 'photo', 'slug',
    'currency_symbol', 'currency_trailing_code',
    'creator', 'location', 'urls', 'source_url',
    'name_len', 'blurb_len',
    'create_to_launch', 'launch_to_deadline', 'launch_to_state_change',
    'USorGB', 'TOPCOUNTRY', 'LaunchedTuesday', 'DeadlineWeekend',
]

# The month/day/yr/hr breakdown of each of the four timestamp columns.
date_part_cols = [
    stamp + '_' + part
    for stamp in ('deadline', 'state_changed_at', 'created_at', 'launched_at')
    for part in ('month', 'day', 'yr', 'hr')
]

second_col_drop = misc_cols + date_part_cols

kickstarter.drop(columns=second_col_drop, inplace=True)

In [None]:
# After the column drops the author reports 28 columns remaining (down from 67).
kickstarter.shape

In [None]:
# Multiplying by 1 converts the flag columns (bool, per the author's note)
# into 0 for false and 1 for true.
for flag_col in ('disable_communication', 'staff_pick', 'spotlight'):
    kickstarter[flag_col] = kickstarter[flag_col] * 1

In [None]:
# Bar chart of how many campaigns ended in each state.
figsize(10, 5)
# Pass the Series as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=kickstarter['state']);
plt.xlabel('Campaign States');
plt.ylabel('Count');

In [None]:
# Bar chart of the binary outcome column (0 = failed, 1 = succeeded).
figsize(2, 5)
# Pass the Series as `x=`: recent seaborn releases accept only `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=kickstarter['SuccessfulBool']);
plt.xlabel('Campaign Only Success or Failure');
plt.ylabel('Count');

In [None]:
# Share of campaigns whose SuccessfulBool is 1, as a percentage rounded to 2 decimals.
successful_count = kickstarter['SuccessfulBool'].value_counts()[1]
successful_pct = np.round(successful_count / len(kickstarter) * 100, decimals=2)
print("Only " + str(successful_pct) + "% of campaigns were successful.")

In [None]:
# Summary statistics, transposed so each feature is a row; the general statistics
# show that each feature covers a very different range.
kickstarter.describe().transpose()

In [None]:
# Quick look at the joint distributions of a handful of columns,
# with kernel-density estimates on the diagonal.
pair_cols = ['goal', 'pledged', 'staff_pick', 'backers_count', 'spotlight', 'SuccessfulBool']
sns.pairplot(kickstarter[pair_cols], diag_kind='kde')

In [None]:
# The goal variable has a huge spread: sort ascending and show the last five
# (i.e. largest) values.
kickstarter['goal'].sort_values().tail()

In [None]:
# The pledged amount is more reasonable, because it represents real money that
# people decided to give; show its five largest values.
kickstarter['pledged'].sort_values().tail()

In [None]:
# Keep only the campaigns whose goal lies strictly between the 25th and 75th
# percentiles of 'goal'.
goal_stats = kickstarter['goal'].describe()
first_quartile = goal_stats['25%']
third_quartile = goal_stats['75%']
iqr = third_quartile - first_quartile  # computed for reference; not used by the filter

goal_above_q1 = kickstarter['goal'] > first_quartile
goal_below_q3 = kickstarter['goal'] < third_quartile
kickstarter_goal_iqr = kickstarter[goal_above_q1 & goal_below_q3]

In [None]:
# Summary statistics of the goal-trimmed frame, one feature per row.
kickstarter_goal_iqr.describe().transpose()

In [None]:
# Normalised 10-bin histogram of the goal values that survived the quartile trim.
figsize(5, 5)
plt.hist(kickstarter_goal_iqr['goal'], 10,
         density=True,  # boolean flag: normalise the bars so they integrate to 1
         color='red',
         edgecolor='black',
         alpha=0.7)

plt.xlabel('Campaign Goal USD')
plt.ylabel('Campaign Goal Distribution')
plt.show()

In [None]:
# Trim create_to_launch_days, pledged and backers_count to the open interval
# between their 25th and 75th percentiles.
#
# NOTE(review): every pass below filters the *full* `kickstarter` frame and
# overwrites `kickstarter_iqr_trimmed`, so the goal trim and the first two
# column trims are discarded -- only the final 'backers_count' filter is
# reflected in the result. Chaining the filters looks like the intent, but it
# would change the row count that later cells rely on; confirm before changing.

kickstarter_iqr_trimmed = kickstarter_goal_iqr

for trim_col in ('create_to_launch_days', 'pledged', 'backers_count'):
    trim_stats = kickstarter[trim_col].describe()
    first_quartile = trim_stats['25%']
    third_quartile = trim_stats['75%']

    iqr = third_quartile - first_quartile  # kept for reference; not used by the filter

    inside_quartiles = (kickstarter[trim_col] > first_quartile) & (kickstarter[trim_col] < third_quartile)
    kickstarter_iqr_trimmed = kickstarter[inside_quartiles]

In [None]:
# Row count of the trimmed frame. The author reports 9308 instances remaining
# after the reduction to inter-quartile values.
len(kickstarter_iqr_trimmed)

In [None]:
# Correlation of every numeric variable with SuccessfulBool, which is a binary
# value where 0=failed and 1=succeeded, sorted ascending.
# The frame still holds text columns (e.g. 'state', 'category'); recent pandas
# releases raise on those instead of silently skipping them, so restrict the
# frame to numeric/bool columns before calling corr().
kickstarter_iqr_trimmed.select_dtypes(include=['number', 'bool']).corr()['SuccessfulBool'].sort_values()

In [None]:
# Per the correlations above, nothing is strongly correlated with the outcome
# except spotlight, backers_count, pledged and staff_pick, and of those only
# backers_count and spotlight are really significant.
# Count the trimmed rows where spotlight == 1.

len(kickstarter_iqr_trimmed[kickstarter_iqr_trimmed['spotlight'] == 1])

In [None]:
# Count the trimmed rows where SuccessfulBool == 1. Taken together with the
# spotlight variable's correlation to SuccessfulBool, the author concludes that
# all spotlighted campaigns were successful -- at least in this dataset, and
# bearing in mind that it has been reduced to inter-quartile values only.

len(kickstarter_iqr_trimmed[kickstarter_iqr_trimmed['SuccessfulBool'] == 1])

In [None]:
# Pool together the strongly correlated features for feature selection.
# Take an explicit copy: a later cell adds columns to reduced_x_features, and
# assigning onto a slice of kickstarter_iqr_trimmed is ambiguous (it is what
# triggers pandas' chained-assignment warning).
reduced_x_features = kickstarter_iqr_trimmed[['launch_to_deadline_days', 'staff_pick', 'pledged', 'backers_count', 'spotlight', 'goal']].copy()
# Target: the binary success flag, kept as a one-column DataFrame.
reduced_y = kickstarter_iqr_trimmed[['SuccessfulBool']]

In [None]:
# # we are going to pool together these strongly correlated features for feature selection
# reduced_x_features = kickstarter_iqr_trimmed[['launch_to_deadline_days', 'staff_pick', 'SuccessfulBool', 'backers_count', 'spotlight', 'goal']]
# reduced_y = kickstarter_iqr_trimmed[['pledged']]

In [None]:
# Because of the original format of the variables, take the sqrt and log
# transformations of every numeric column and check their correlation with the
# target as well, to account for non-linear relationships.

# Copy so that the new columns are added to an independent frame rather than to
# a slice of kickstarter_iqr_trimmed.
numeric_subset = kickstarter_iqr_trimmed.select_dtypes('number').copy()

# Snapshot the column names first: the loop adds columns, and only the original
# ones should be transformed.
for col in list(numeric_subset.columns):
    if col == 'SuccessfulBool':
        continue  # the target itself is not transformed
    numeric_subset['sqrt_' + col] = np.sqrt(numeric_subset[col])
    numeric_subset['log_' + col] = np.log(numeric_subset[col])

# One-hot encode the campaign category and put it next to the numeric features.
categorical_subset = kickstarter_iqr_trimmed['category']

categorical_subset = pd.get_dummies(categorical_subset)
features = pd.concat([numeric_subset, categorical_subset], axis = 1)
features = features.dropna(subset = ['SuccessfulBool'])

# Correlation of every feature with the target; undefined (NaN) correlations
# are dropped before sorting ascending.
correlations = features.corr()['SuccessfulBool'].dropna().sort_values()
correlations.head()

In [None]:
# The previous step showed that goal gets a boost in correlation when you take
# its log, and log_pledged shows a significant boost as well, so both log
# columns are copied from `features` into reduced_x_features.
for log_col in ('log_goal', 'log_pledged'):
    reduced_x_features[log_col] = features[log_col]
#reduced_x_features.drop('pledged', axis=1, inplace=True)

In [None]:
# Display the selected feature frame.
reduced_x_features

In [None]:
# Display the target frame.
reduced_y

In [None]:
# Feature engineering is the process of creating new features from existing
# ones: transforming goal and pledged into log_goal and log_pledged gave a
# stronger correlation than their original forms, so those new features were
# added to reduced_x_features.
# The heatmap below shows the pairwise correlations of the trimmed frame. The
# frame still holds text columns, which recent pandas releases refuse to
# correlate, so only numeric/bool columns are passed to corr().
figsize(20,7)
sns.heatmap(kickstarter_iqr_trimmed.select_dtypes(include=['number', 'bool']).corr(), annot=True, annot_kws={"size": 10}, cmap="Purples")


In [None]:
# Turn every row into a space-separated "document" of its feature values, and
# collect the matching target value. Note that raw 'pledged' is left out; only
# its log is included.
text_columns = ['launch_to_deadline_days', 'staff_pick', 'backers_count',
                'spotlight', 'goal', 'log_goal', 'log_pledged']

kickstarter_X = []
kickstarter_y = []
for row_label in reduced_x_features.index:
    # Values are looked up column by column so each keeps its own dtype when
    # converted to text.
    row_text = " ".join(str(reduced_x_features[col_name][row_label]) for col_name in text_columns)
    kickstarter_X.append(row_text)
    kickstarter_y.append(reduced_y['SuccessfulBool'][row_label])

In [None]:
# kickstarter_X = []
# kickstarter_y = []
# for i, j in reduced_x_features.iterrows():
#     tmp = str(reduced_x_features['launch_to_deadline_days'][i]) + " " + \
#         str(reduced_x_features['staff_pick'][i]) + " " + \
#         str(reduced_x_features['backers_count'][i]) + " " + \
#         str(reduced_x_features['spotlight'][i]) + " " + \
#         str(reduced_x_features['goal'][i]) + " " + \
#         str(reduced_x_features['log_goal'][i]) + " " + \
#         str(reduced_x_features['log_pledged'][i])
#     kickstarter_X.append(tmp)
#     kickstarter_y.append(reduced_y['pledged'][i])

In [None]:
# kickstarter_X

In [None]:
# kickstarter_y

In [None]:
# Sanity check: the document list and the target list should have the same length.
print(len(kickstarter_X), len(kickstarter_y))

In [None]:
# Hyper-parameters shared with the model cells below.
max_words = 2000     # size of the hashing space used by one_hot
max_length = 30      # padded sequence length (also the Embedding input_length)
vector_length = 16   # embedding dimension

# Hash every token of each document into an integer index.
encoded_docs = [one_hot(d, max_words) for d in kickstarter_X]
# Pad to max_length instead of a hard-coded 7, so the data width agrees with
# the input_length=max_length the models are built with.
# NOTE(review): one_hot's default filters appear to split on '.', so a float
# such as "5000.0" likely yields two tokens and a 7-slot window would truncate
# rows -- confirm against the Keras text preprocessing docs.
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# 80/20 split; the target is reshaped to an (n, 1) integer column.
X_train, X_test, y_train, y_test = train_test_split(padded_docs, np.array(kickstarter_y)[:, None].astype(int), test_size=0.20, random_state=1234)

In [None]:
# Label, then display, the training feature matrix.
print("X_train")
X_train

In [None]:
# Label, then display, the test feature matrix.
print("X_test")
X_test

In [None]:
# Label, then display, the training targets.
print("y_train")
y_train

In [None]:
# Label, then display, the test targets.
print("y_test")
y_test

#------------- CNN -------------#

In [None]:
# Build the CNN classifier: embedding -> 1D convolution -> max pooling -> dense head.
model = Sequential()
# input_length is read from the padded data so the model always matches the
# width of X_train.
model.add(layers.Embedding(max_words+1, vector_length, input_length=X_train.shape[1]))
model.add(Conv1D(filters=32, kernel_size=7, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# Collapse the (steps, filters) feature map into one vector per sample.
# Without this the dense head is applied per time step and the output is 3-D,
# which does not line up with the (n, 1) targets.
model.add(layers.Flatten())

# First fully connected layer and Dropout layer
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.2))

# Second fully connected layer and Dropout layer
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))

# Third fully connected layer and Dropout layer
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

# Fourth fully connected layer and Dropout layer
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.2))

# Output layer: a single unit, because the target is one binary value.
# The sigmoid keeps the output in [0, 1], which is what binary_crossentropy
# expects when it is not told the inputs are logits.
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself; wrapping it in print() only adds "None".
model.summary()
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['mse'])

history = model.fit(X_train, y_train,
          epochs=50,
          verbose=1,
          validation_data=(X_test, y_test),
          batch_size=256)

# [loss, mse] of the network on the held-out split.
scores = model.evaluate(X_test, y_test,
                        verbose=1,
                        batch_size = 256)

# Random-forest baseline on the same encoded inputs.
# The criterion argument is omitted: squared error is the default, and its old
# spelling 'mse' is rejected by recent scikit-learn releases.
dt = RandomForestRegressor(n_jobs=-1, n_estimators=10, max_depth=6, min_samples_leaf=1, random_state=3)
# The forest expects a 1-D target; y_train is an (n, 1) column.
dt.fit(X_train, y_train.ravel())
y_predicted = dt.predict(X_test)
# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy. The variable name is kept for compatibility.
accuracy = dt.score(X_test,y_test)
MSE_score = MSE(y_test,y_predicted)

# print the final results
print("Random forest training R^2:",(dt.score(X_train,y_train)))
print("Random forest testing R^2:",accuracy)
print("Mean Squared Error",MSE_score.mean())

In [None]:
# Side-by-side view of encoded input, actual label and predicted value for a
# few hand-picked positions of the test split.
table = PrettyTable(border=True, header=True, padding_width=1)
table.field_names = ['X', 'y (actual)', 'Predicted']

sample_positions = (15, 25, 40, 47, 85, 110, 202, 1848, 1857)
for pos in sample_positions:
    table.add_row([X_test[pos], y_test[pos], y_predicted[pos]])

print(table)

#------------- LSTM -------------#


In [None]:
# Build the LSTM classifier: embedding -> four stacked LSTM layers -> sigmoid output.
model = Sequential()
# input_length is read from the padded data so the model always matches the
# width of X_train.
model.add(layers.Embedding(max_words+1, vector_length, input_length=X_train.shape[1]))

# First LSTM layer and Dropout layer (its input shape comes from the Embedding)
model.add(LSTM(units = 128, return_sequences = True))
model.add(Dropout(0.2))

# Second LSTM layer and Dropout layer
model.add(LSTM(units = 64, return_sequences = True))
model.add(Dropout(0.2))

# Third LSTM layer and Dropout layer
model.add(LSTM(units = 32, return_sequences = True))
model.add(Dropout(0.2))

# Fourth LSTM layer and Dropout layer; returns only the final state
model.add(LSTM(units = 16))
model.add(Dropout(0.2))

# Output layer: a single unit, because the target is one binary value.
# The sigmoid keeps the output in [0, 1], which is what binary_crossentropy
# expects when it is not told the inputs are logits.
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself; wrapping it in print() only adds "None".
model.summary()
# Track accuracy as well as mse: the evaluation below reports "Accuracy", and
# with only 'mse' configured the second value returned by evaluate() was the
# mean squared error.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'mse'])

history = model.fit(X_train, y_train,
          epochs=50,
          verbose=1,
          validation_data=(X_test, y_test),
          batch_size=256)

# evaluate() returns [loss, accuracy, mse] in the order given to compile().
loss, accuracy, _ = model.evaluate(X_train, y_train, verbose=1)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy, _ = model.evaluate(X_test, y_test, verbose=1)
print("Testing Accuracy:  {:.4f}".format(accuracy))

scores = model.evaluate(X_test, y_test,
                        verbose=1,
                        batch_size = 256)

# Random-forest baseline on the same encoded inputs.
# The criterion argument is omitted: squared error is the default, and its old
# spelling 'mse' is rejected by recent scikit-learn releases.
dt = RandomForestRegressor(n_jobs=-1, n_estimators=10,max_depth=6, min_samples_leaf=1, random_state=3)
# The forest expects a 1-D target; y_train is an (n, 1) column.
dt.fit(X_train, y_train.ravel())
y_predicted = dt.predict(X_test)
# For a regressor, score() is the coefficient of determination (R^2), not a
# classification accuracy. The variable name is kept for compatibility.
accuracy = dt.score(X_test,y_test)
MSE_score = MSE(y_test,y_predicted)

# print the final results
print("Random forest training R^2:",dt.score(X_train,y_train))
print("Random forest testing R^2:",accuracy)
print("Mean Squared Error",MSE_score.mean())

In [None]:
# Side-by-side view of encoded input, actual label and predicted value for a
# few hand-picked positions of the test split.
table = PrettyTable(border=True, header=True, padding_width=1)
table.field_names = ['X', 'y (actual)', 'Predicted']

sample_positions = (15, 25, 40, 47, 85, 110, 202, 1848, 1857)
for pos in sample_positions:
    table.add_row([X_test[pos], y_test[pos], y_predicted[pos]])

print(table)

In [None]:
# table = PrettyTable(border=True, header=True, padding_width=1)
# table.field_names = ['Model', 'Accuracy']
# table.add_row(['CNN', '--%'])
# table.add_row(['LSTM', "--%"])

# print(table)