In [None]:
#Backup of current work just in case of loss of power

In [None]:
import glob
import pandas as pd
import numpy as np
import swifter
from ast import literal_eval
from pandas_profiling import ProfileReport
import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.models import save_model, load_model, model_from_json
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, TextVectorization, Input, concatenate, StringLookup, BatchNormalization, Flatten, Lambda
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad
from tensorflow.keras.constraints import max_norm

In [None]:
# Data from: https://webrobots.io/kickstarter-datasets/ (the source used by the current model).
# Read every CSV file of the dump into a single frame.
# - Matching on "*.csv" skips any stray non-CSV file in the directory.
# - Sorting makes the row order reproducible: glob returns paths in arbitrary order.
files = sorted(glob.glob("data/Kickstarter_2021-01-14T03_20_05_328Z/*.csv"))
df = pd.concat(map(pd.read_csv, files), ignore_index=True)

In [None]:
# Alternative source: https://www.kaggle.com/sripaadsrinivasan/kickstarter-campaigns-dataset
# Note: this cell rebinds `df` and `columns`.
df = pd.read_csv('data/kickstarter_data_full.csv', low_memory=False)
df = df.drop(columns=['Unnamed: 0'])
# Keep only the columns the model consumes.
columns = ['blurb', 'category', 'country', 'currency', 'goal', 'name', 'state']
df = df.loc[:, columns].copy()

In [None]:
# Exploratory profile of the current frame, rendered inline in the notebook.
# minimal=True is passed to ProfileReport to request its reduced report.
profile = ProfileReport(df=df, minimal=True)
profile.to_notebook_iframe()

In [None]:
# Drop the columns that are not used as model features.
columns = [
    'friends', 'is_backing', 'is_starred', 'permissions', 'is_starrable',
    'urls', 'creator', 'photo', 'profile', 'source_url', 'backers_count',
    'converted_pledged_amount', 'created_at', 'location', 'deadline',
    'currency_symbol', 'disable_communication', 'id', 'launched_at',
    'pledged', 'slug', 'spotlight', 'staff_pick', 'state_changed_at',
    'currency_trailing_code', 'fx_rate', 'current_currency', 'usd_pledged',
    'static_usd_rate', 'country_displayable_name', 'usd_type',
]
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(columns=columns, errors='ignore')

In [None]:
def _category_name(raw):
    """Parse a stringified category dictionary and return its 'name' entry."""
    return literal_eval(raw)['name']


# `category` holds stringified dictionaries; keep only each one's 'name'.
# swifter is used as a faster drop-in for pandas' .apply().
df['category'] = df['category'].swifter.apply(_category_name)

In [None]:
# Persist the cleaned frame so the cleaning steps do not have to be repeated.
# Columns are put in a fixed order before writing.
df = df.loc[:, ['name', 'blurb', 'goal', 'category', 'country', 'currency', 'state']]
df.to_csv('data/cleaned_dataset.csv', index=False)

In [None]:
# Reload the cleaned data and discard every row that has a missing value.
df = pd.read_csv('data/cleaned_dataset.csv')
df = df.dropna()

In [None]:
# Keep only campaigns whose outcome is 'successful' or 'failed' so the target is binary.
df_binary = df.loc[df['state'].isin(['successful', 'failed'])]

In [None]:
# Sequence length constant; 40 matches the output_sequence_length passed to the
# TextVectorization layers in this notebook.
seq_len = 40
# Categorical feature columns: whatever is left after removing the text columns,
# the target and the numeric goal. Used to build the StringLookup vocabulary.
cat_feat = df_binary.drop(columns=['blurb', 'name', 'state', 'goal'])
# Encode the target as integers: successful -> 1, failed -> 0.
# An explicit map + astype avoids relying on Series.replace silently downcasting
# an object column to int, and fails loudly if an unexpected state slips through.
y = df_binary['state'].map({'successful': 1, 'failed': 0}).astype('int64')

In [None]:
# Build the vocabulary: every distinct value found in any categorical column.
# The values are sorted so the vocabulary order (and therefore the position of each
# token in the multi-hot encoding) is the same on every run; iterating a set of
# strings directly gives an order that can change between kernel restarts.
vocab = sorted({value for col in cat_feat.columns for value in cat_feat[col]})

In [None]:
# One TextVectorization layer per free-text column ('blurb' and 'name'), so that
# tokenization happens inside the model instead of in a separate preprocessing step.
text_vect_obj, text_vect_obj2 = (
    TextVectorization(max_tokens=20000, output_sequence_length=40, pad_to_max_tokens=True)
    for _ in range(2)
)
# adapt() learns each layer's vocabulary directly from the column it will encode.
text_vect_obj.adapt(df_binary['blurb'].astype(str))
text_vect_obj2.adapt(df_binary['name'])

In [None]:
# ---- Inputs -----------------------------------------------------------------
# Two raw-text inputs, one 3-column categorical input and one numeric input.
nlp_blurb = Input(shape=(1,), dtype=tf.string)
nlp_name = Input(shape=(1,), dtype=tf.string)
nlp_cols = Input(shape=(3,), dtype=tf.string)
meta_input = Input(shape=(1,), dtype=tf.float32)

# ---- Text branch: blurb (vectorize -> embed -> LSTM) --------------------------
text_vect = text_vect_obj(nlp_blurb)
embed = Embedding(input_dim=20000, output_dim=50)(text_vect)
nlp_blurb_out = LSTM(units=500)(embed)

# ---- Text branch: name (vectorize -> embed -> LSTM) ---------------------------
text_vect2 = text_vect_obj2(nlp_name)
embed_2 = Embedding(input_dim=20000, output_dim=50)(text_vect2)
nlp_name_out = LSTM(units=500)(embed_2)

# ---- Categorical branch: multi-hot encode the categorical columns -------------
cat_encoding = StringLookup(vocabulary=vocab, output_mode='multi_hot')(nlp_cols)

# ---- Numeric branch: normalize 'goal' inside the model ------------------------
norm = BatchNormalization()(meta_input)

# Join the two text branches, the multi-hot encoding and the normalized goal.
num_cat = concatenate([nlp_blurb_out, nlp_name_out, cat_encoding, norm])

# ---- Dense head ---------------------------------------------------------------
# Each hidden layer uses max_norm constraints on kernel and bias and is followed
# by Dropout for regularization.
x = num_cat
for width in (1000, 750, 500, 250):
    x = Dense(
        width,
        activation='relu',
        kernel_constraint=max_norm(),
        bias_constraint=max_norm(),
    )(x)
    x = Dropout(0.3)(x)
x = Dense(1, activation='sigmoid')(x)

# Every Input defined above must be listed here; the order of this list is the
# order in which data has to be supplied to fit()/predict().
model = Model(inputs=[nlp_blurb, nlp_name, meta_input, nlp_cols], outputs=x)
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Features are passed as a list with one entry per model input (not as a single frame),
# in the same order as the model's inputs: blurb, name, goal, categorical columns.
train_inputs = [
    df_binary['blurb'].astype('string'),
    df_binary['name'].astype('string'),
    df_binary['goal'],
    df_binary[['category', 'country', 'currency']].astype('string'),
]
# Stop once validation accuracy has not improved for 3 epochs and keep the best weights.
early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
# 20% of the data is held out for validation via validation_split.
model.fit(train_inputs, y, epochs=50, validation_split=0.2, callbacks=[early_stop])

In [None]:
# Write the trained model to the 'model' directory in TensorFlow format (~80MB on disk).
model.save('model', save_format='tf')

In [None]:
# Visualize the architecture of the model built above.
# plot_model is imported from tensorflow.keras (the public path) rather than from the
# standalone keras package's private vis_utils module, so it comes from the same Keras
# implementation that built `model`.
from tensorflow.keras.utils import plot_model
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Fresh TextVectorization layers for the hyperparameter search: one for 'blurb',
# one for 'name', each adapted on the column it will encode.
text_vect_obj, text_vect_obj2 = (
    TextVectorization(max_tokens=20000, output_sequence_length=40, pad_to_max_tokens=True)
    for _ in range(2)
)
text_vect_obj.adapt(df_binary['blurb'])
text_vect_obj2.adapt(df_binary['name'])

In [None]:
def build_model(opt='adam', lr=0.0001, nodes=1000):
    """Build and compile the single-input classifier used by the hyperparameter search.

    The model takes ONE string tensor with six columns, in this order:
    name, blurb, goal, category, country, currency. The numeric `goal` column is
    therefore expected as text and is parsed back to float32 inside the model
    (`tf.cast` does not convert between string and numeric dtypes, so
    `tf.strings.to_number` is used instead).

    Relies on notebook-level objects: `text_vect_obj` / `text_vect_obj2` (adapted
    TextVectorization layers for blurb / name) and `vocab` (categorical vocabulary).

    Args:
        opt: Optimizer name: 'adam', 'rmsprop' or 'adagrad'.
        lr: Learning rate handed to the optimizer.
        nodes: Width of the first hidden Dense layer. The following layers use
            75%, 50% and 25% of this value, truncated to whole units.

    Returns:
        A compiled `Model` with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy.

    Raises:
        ValueError: If `opt` is not one of the supported optimizer names.
    """
    optimizers = {'adam': Adam, 'rmsprop': RMSprop, 'adagrad': Adagrad}
    # Fail fast instead of silently returning an uncompiled model.
    if opt not in optimizers:
        raise ValueError(f"Unknown optimizer {opt!r}; expected one of {sorted(optimizers)}")

    # Six feature columns are fed as one string tensor and split apart here.
    big_in = Input(shape=(6,), dtype=tf.string)
    nlp_name = Lambda(lambda x: x[:, 0:1])(big_in)
    nlp_blurb = Lambda(lambda x: x[:, 1:2])(big_in)
    meta_input = Lambda(lambda x: tf.strings.to_number(x[:, 2:3], out_type=tf.float32))(big_in)
    nlp_cols = Lambda(lambda x: x[:, 3:])(big_in)

    # Text branches: vectorize -> embed -> LSTM. Each embedding is sized from the
    # vocabulary of its OWN vectorizer so every token id has a row.
    text_vect = text_vect_obj(nlp_blurb)
    embed = Embedding(len(text_vect_obj.get_vocabulary()), 50)(text_vect)
    nlp_blurb_out = LSTM(500)(embed)
    text_vect2 = text_vect_obj2(nlp_name)
    embed_2 = Embedding(len(text_vect_obj2.get_vocabulary()), 50)(text_vect2)
    nlp_name_out = LSTM(500)(embed_2)

    # Categorical columns are multi-hot encoded; goal is normalized in-model.
    cat_encoding = StringLookup(output_mode='multi_hot', vocabulary=vocab)(nlp_cols)
    norm = BatchNormalization()(meta_input)
    num_cat = concatenate([nlp_blurb_out, nlp_name_out, cat_encoding, norm])

    # Dense head: four shrinking hidden layers, each followed by Dropout.
    x = num_cat
    for fraction in (1.0, 0.75, 0.5, 0.25):
        # Dense expects a whole number of units, so the scaled width is truncated.
        x = Dense(
            int(nodes * fraction),
            activation='relu',
            kernel_constraint=max_norm(),
            bias_constraint=max_norm(),
        )(x)
        x = Dropout(0.3)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=big_in, outputs=x)

    # Same loss and metric for every optimizer choice.
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizers[opt](learning_rate=lr),
        metrics=['accuracy'],
    )
    return model

In [None]:
# Hyperparameter space: optimizer, learning rate and width of the first dense layer.
param_grid = dict(
    opt=['adam', 'rmsprop', 'adagrad'],
    lr=[0.01, 0.001, 0.0001, 0.00001],
    nodes=[500, 1000, 2000, 4000],
)
# Same training setup as the hand-built model: up to 50 epochs, 20% validation split,
# early stopping on validation accuracy with the best weights restored.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
wrapper = KerasClassifier(
    build_fn=build_model,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping],
)
# 3-fold cross-validation; n_iter=48 equals the number of combinations in param_grid (3 * 4 * 4).
search = RandomizedSearchCV(
    estimator=wrapper,
    param_distributions=param_grid,
    n_iter=48,
    cv=3,
)

In [None]:
# The model built by build_model takes one array and splits it back into its per-feature
# pieces, so all six columns are passed together. Every column is cast to str first:
# a frame mixing the float `goal` column with string columns becomes an object array of
# mixed Python types, which cannot be converted to a single tensor.
search.fit(df_binary[['name', 'blurb', 'goal', 'category', 'country', 'currency']].astype(str), y)

In [None]:
# Display the hyperparameter combination that the search selected as best.
search.best_params_