#### Setup

In [None]:
!pip install -U tensorflow keras

In [None]:
!pip install -U talos

In [None]:
!pip install -U fasttext

In [None]:
# Import general Python libraries
import pandas as pd
import numpy as np
import random
import sklearn
import seaborn as sns
import os
import io
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Fix the seeds of the Python and NumPy random generators so runs are repeatable
seed_value = 0
random.seed(seed_value)
np.random.seed(seed_value)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting
# it here presumably only affects processes launched afterwards — confirm if needed
os.environ['PYTHONHASHSEED'] = str(seed_value)

In [None]:
# Import sklearn-specific modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
# Import tensorflow-specific modules
import tensorflow as tf
# Seed TensorFlow's global random generator with the same seed used for Python/NumPy
tf.random.set_seed(seed_value)
# Record the library versions this run was executed with
print("Tensorflow Version: {}".format(tf.__version__))
print("Keras Version: {}".format(tf.keras.__version__))

Tensorflow Version: 2.4.1
Keras Version: 2.4.0


In [None]:
# Import keras-specific modules
# Everything is imported from `tensorflow.keras` so that all Keras objects come from the
# same package (mixing it with the standalone `keras` package can pull in a different version)
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dropout, BatchNormalization, LayerNormalization, GaussianNoise, Activation
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPool1D, AvgPool1D, GlobalMaxPool1D, GlobalAvgPool1D, RNN, GRU, LSTM, SeparableConv1D, SimpleRNN, Bidirectional, LocallyConnected1D, LeakyReLU, Input
from tensorflow.keras.optimizers import Adadelta, RMSprop, Adam, Adamax, Nadam
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.initializers import GlorotNormal, GlorotUniform, LecunNormal, LecunUniform, HeNormal, HeUniform, Constant
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Import talos-specific modules
import talos

In [None]:
# Set pandas options
pd.set_option("display.max_columns", None)

In [None]:
# Mount Google Drive
# Colab-only step: the Drive contents become available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Set up TPU configurations
# Connect to the TPU advertised by the Colab runtime and build the distribution
# strategy under which all models below are created
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)
tpu_devices = tf.config.list_logical_devices('TPU')
print("All devices: ", tpu_devices)

INFO:tensorflow:Initializing the TPU system: grpc://10.52.181.106:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.52.181.106:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU')]


#### Import Dataset:

In [None]:
# Import Kickstarter Dataset
# The first CSV column is used as the index
kickstarter_df = pd.read_csv("04_Final Datasets/Kickstarter_Text.csv", index_col=0)
# Sanity check: the number of unique index values should equal the number of rows
n_unique_ids = len(kickstarter_df.index.unique())
print(kickstarter_df.shape)
print(n_unique_ids)
kickstarter_df.head(1)

(246891, 7)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...


In [None]:
# Import Word Embeddings
import fasttext.util
# Fetch the pre-trained English vectors unless the file is already present
fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the binary model; `ft` is used below to look up one vector per vocabulary word
ft = fasttext.load_model('cc.en.300.bin')
# Dimensionality of the loaded word vectors
print(ft.get_dimension())

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
300




#### a) Baseline

This model provides the baseline that is used as the starting point for the hyperparameter search. The goal is to gradually improve the model's performance from here.

**Hyperparameters:**
- Preprocessing: Only Story, Removed Stopwords, Max_Features = No Restrictions, Max_Len = 500, Embedding_Dim = 300, Fasttext Embeddings
- CNN: 2 Conv Layers (64 Filters, Window Size 7, ReLU, padding=same), MaxPooling (Size 5), GlobalMaxPooling, Dropout before Dense Classifier (Rate = 0.5), 1 Hidden Dense Layer (32 Units, ReLU), Nadam, Batch Size 512
- RNN: 1 LSTM Layer (32 Units), RMSProp, Batch Size 512


##### Data Preparation:

In [None]:
# Extract Target Variable From Dataset
# `y` is the label array, `text` the campaign story used as the only model input
text = kickstarter_df["story"]
y = kickstarter_df["campaign_successful"].to_numpy()

for obj_type, obj_size in ((type(y), y.shape), (type(text), len(text))):
    print(obj_type)
    print(obj_size)

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove periods and stopwords from the story text
# NOTE: only "." is stripped here; all other punctuation is left in place
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# Literal (non-regex) replacement: this behaves the same on every pandas version,
# whereas the pattern r"\." only worked while `regex` defaulted to True
text = text.str.replace(".", "", regex=False)
# Drop every whitespace-separated token that is a stopword and re-join the rest
text = text.progress_apply(lambda x: " ".join(token for token in x.split() if token not in stopwords))
text = text.to_numpy()

100%|██████████| 246891/246891 [00:23<00:00, 10588.88it/s]


In [None]:
# Split dataset into training, subtraining, validation, and test set
# Sizes: 70% subtraining, 15% validation, remainder test; "train" = subtraining + validation
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7)
val_size = round(n_samples * 0.15)
test_size = n_samples - val_size - train_size

# First split off the test set, then divide the remainder into subtraining/validation;
# both splits are stratified on the label and use the same seed
text_train, text_test, y_train, y_test = train_test_split(
    text, y,
    train_size=(train_size + val_size), test_size=test_size,
    shuffle=True, stratify=y, random_state=seed_value,
)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
    text_train, y_train,
    train_size=train_size, test_size=val_size,
    shuffle=True, stratify=y_train, random_state=seed_value,
)

print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
# Define parameters for text processing
# Number of rows of the embedding table; token indices must be smaller than this
# NOTE(review): hard-coded — presumably the tokenizer's vocabulary size + 1, confirm
max_features = 471366
# Every sequence is padded/truncated to this many tokens
max_len = 500
# Dimensionality of the word vectors
embedding_dim = 300

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The vocabulary is learned from the sub-training texts only and then applied to both splits
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_subtrain)
X_subtrain = tokenizer.texts_to_sequences(text_subtrain)
X_val = tokenizer.texts_to_sequences(text_val)
word_index = tokenizer.word_index
# `tokenizer.num_words` is None because no limit was passed to Tokenizer(), so report
# the configured embedding-table size instead (as the later tokenization cells do)
print("Included Token: {}".format(max_features))

Included Token: 471366


In [None]:
# Determine number of words for max_len
# One row per sub-training text holding its token count; the summary statistics
# show how many texts a given max_len would truncate
num_words = pd.DataFrame([len(x) for x in X_subtrain]).rename(columns={0: "words"})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,243.835115
std,248.457559
min,0.0
25%,90.0
50%,167.0
75%,310.0
max,3555.0


In [None]:
# Pad and Truncate sequences
# Shorter texts get zeros in front ("pre"), longer texts lose their tail ("post")
X_subtrain, X_val = (
    pad_sequences(sequences, maxlen=max_len, padding="pre", truncating="post")
    for sequences in (X_subtrain, X_val)
)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 500)
Shape of X_val: (37034, 500)


In [None]:
# Prepare the Embedding Matrix
# Row i holds the fastText vector of the word with tokenizer index i; rows that are
# never filled (and words whose vector is all zeros) stay zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        # Index does not fit into the embedding table; skip the word
        continue
    embedding_vector = ft.get_word_vector(word)
    # The lookup hands back an array for every word, so a missing embedding has to be
    # recognised as an all-zero (or empty) vector rather than as None
    if embedding_vector is not None and np.any(embedding_vector):
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count rows that are entirely zero (a row-sum test could miscount vectors whose
# components cancel out)
print('Number of Null Word Embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (471366, 300)
Number of Null Word Embeddings: 152


##### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the (uncompiled) baseline CNN on top of frozen pre-trained embeddings.

  Reads `max_features`, `embedding_dim`, `max_len` and `embedding_matrix` from the
  notebook's global scope.

  Returns:
    A Keras `Sequential` model ending in a single sigmoid unit.
  """
  return Sequential([
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is built and compiled inside the strategy scope so that the TPU strategy
# manages its variables
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer=Nadam(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 300)          141409800 
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 500, 64)           134464    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 64)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 100, 64)           28736     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)               

In [None]:
# Train the model
# Stop once validation accuracy has not improved for 5 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy
train_metrics = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_metrics[1]))
val_metrics = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_metrics[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Training Accuracy: 0.818
Validation Accuracy: 0.756


##### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the (uncompiled) baseline RNN: frozen embeddings, one LSTM, sigmoid output.

  Reads `max_features`, `embedding_dim`, `max_len` and `embedding_matrix` from the
  notebook's global scope.

  Returns:
    A Keras `Sequential` model ending in a single sigmoid unit.
  """
  return Sequential([
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is built and compiled inside the strategy scope so that the TPU strategy
# manages its variables
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer=RMSprop(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 300)          141409800 
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                42624     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 141,452,457
Trainable params: 42,657
Non-trainable params: 141,409,800
_________________________________________________________________


In [None]:
# Train the model
# Stop once validation accuracy has not improved for 5 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy
train_metrics = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_metrics[1]))
val_metrics = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_metrics[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Training Accuracy: 0.769
Validation Accuracy: 0.758


#### b) Test Different Preprocessing Strategies

##### Check How Different Paddings Affect Model Performance:

###### Data Preparation:

In [None]:
# Define parameters for text processing
# Number of rows of the embedding table; token indices must be smaller than this
# NOTE(review): hard-coded — presumably the tokenizer's vocabulary size + 1, confirm
max_features = 471366
# Every sequence is padded/truncated to this many tokens
max_len = 500
# Dimensionality of the word vectors
embedding_dim = 300

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The vocabulary is learned from the sub-training texts only and then applied to both splits
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_subtrain)
word_index = tokenizer.word_index
X_subtrain, X_val = (
    tokenizer.texts_to_sequences(texts) for texts in (text_subtrain, text_val)
)
print("Included Token: {}".format(max_features))

Included Token: 471366


In [None]:
# Determine number of words for max_len
# One row per sub-training text holding its token count; the summary statistics
# show how many texts a given max_len would truncate
num_words = pd.DataFrame([len(x) for x in X_subtrain]).rename(columns={0: "words"})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,243.835115
std,248.457559
min,0.0
25%,90.0
50%,167.0
75%,310.0
max,3555.0


In [None]:
# Pad and Truncate sequences
# Variant under test: zeros are appended after short texts ("post"), long texts lose
# their beginning ("pre")
X_subtrain, X_val = (
    pad_sequences(sequences, maxlen=max_len, padding="post", truncating="pre")
    for sequences in (X_subtrain, X_val)
)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 500)
Shape of X_val: (37034, 500)


In [None]:
# Prepare the Embedding Matrix
# Row i holds the fastText vector of the word with tokenizer index i; rows that are
# never filled (and words whose vector is all zeros) stay zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        # Index does not fit into the embedding table; skip the word
        continue
    embedding_vector = ft.get_word_vector(word)
    # The lookup hands back an array for every word, so a missing embedding has to be
    # recognised as an all-zero (or empty) vector rather than as None
    if embedding_vector is not None and np.any(embedding_vector):
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count rows that are entirely zero (a row-sum test could miscount vectors whose
# components cancel out)
print('Number of Null Word Embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (471366, 300)
Number of Null Word Embeddings: 152


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the (uncompiled) baseline CNN on top of frozen pre-trained embeddings.

  Reads `max_features`, `embedding_dim`, `max_len` and `embedding_matrix` from the
  notebook's global scope.

  Returns:
    A Keras `Sequential` model ending in a single sigmoid unit.
  """
  return Sequential([
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is built and compiled inside the strategy scope so that the TPU strategy
# manages its variables
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer=Nadam(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 500, 300)          141409800 
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 500, 64)           134464    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 100, 64)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 100, 64)           28736     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)               

In [None]:
# Train the model
# Stop once validation accuracy has not improved for 5 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy
train_metrics = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_metrics[1]))
val_metrics = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_metrics[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Training Accuracy: 0.805
Validation Accuracy: 0.757


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the (uncompiled) baseline RNN: frozen embeddings, one LSTM, sigmoid output.

  Reads `max_features`, `embedding_dim`, `max_len` and `embedding_matrix` from the
  notebook's global scope.

  Returns:
    A Keras `Sequential` model ending in a single sigmoid unit.
  """
  return Sequential([
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is built and compiled inside the strategy scope so that the TPU strategy
# manages its variables
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer=RMSprop(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 500, 300)          141409800 
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                42624     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 33        
Total params: 141,452,457
Trainable params: 42,657
Non-trainable params: 141,409,800
_________________________________________________________________


In [None]:
# Train the model
# Stop once validation accuracy has not improved for 5 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy
train_metrics = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_metrics[1]))
val_metrics = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_metrics[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.670
Validation Accuracy: 0.670


###### Result:

Padding / Truncating (training accuracy vs. validation accuracy):
- Pre/Pre: CNN = 80.6% vs. 75.7%; RNN = 76.4% vs. 75.6%
- Post/Post: CNN = 80.6% vs. 75.5%; RNN = 68.1% vs. 67.8%
- Pre/Post: CNN = 81.8% vs. 75.6%; RNN = 76.9% vs. 75.8%
- Post/Pre: CNN = 80.5% vs. 75.7%; RNN = 67% vs. 67%

Conclusion:
- The RNN was not able to learn when padding="post" (probably because the LSTM reads each sequence from start to end, so for short texts the last steps it processes before producing its output are long runs of zero padding)
- CNN was relatively unaffected, as it just searches for local cues
- Padding="pre" & Truncating="post" provided the best results

##### Check how different embedding dimensions affect model performance:

###### Data Preparation:

In [None]:
# Reduce embedding dimension
# reduce_model changes `ft` itself, so the original 300-dimensional vectors are no
# longer available afterwards unless the model is loaded again.
target_dim = 200
print("Before: {}".format(ft.get_dimension()))
if ft.get_dimension() > target_dim:
    # Guard keeps the cell safe to re-run: an already reduced model is left untouched
    fasttext.util.reduce_model(ft, target_dim)
print("After: {}".format(ft.get_dimension()))

Before: 300
After: 200


In [None]:
# Define parameters for text processing
# Number of rows of the embedding table; token indices must be smaller than this
# NOTE(review): hard-coded — presumably the tokenizer's vocabulary size + 1, confirm
max_features = 471366
# Every sequence is padded/truncated to this many tokens
max_len = 500
# Dimensionality of the word vectors (matches the reduced fastText model)
embedding_dim = 200

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The vocabulary is learned from the sub-training texts only and then applied to both splits
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_subtrain)
word_index = tokenizer.word_index
X_subtrain, X_val = (
    tokenizer.texts_to_sequences(texts) for texts in (text_subtrain, text_val)
)
print("Included Token: {}".format(max_features))

Included Token: 471366


In [None]:
# Determine number of words for max_len
# One row per sub-training text holding its token count; the summary statistics
# show how many texts a given max_len would truncate
num_words = pd.DataFrame([len(x) for x in X_subtrain]).rename(columns={0: "words"})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,243.835115
std,248.457559
min,0.0
25%,90.0
50%,167.0
75%,310.0
max,3555.0


In [None]:
# Pad and Truncate sequences
# Shorter texts get zeros in front ("pre"), longer texts lose their tail ("post")
X_subtrain, X_val = (
    pad_sequences(sequences, maxlen=max_len, padding="pre", truncating="post")
    for sequences in (X_subtrain, X_val)
)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 500)
Shape of X_val: (37034, 500)


In [None]:
# Prepare the Embedding Matrix
# Row i holds the fastText vector of the word with tokenizer index i; rows that are
# never filled (and words whose vector is all zeros) stay zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        # Index does not fit into the embedding table; skip the word
        continue
    embedding_vector = ft.get_word_vector(word)
    # The lookup hands back an array for every word, so a missing embedding has to be
    # recognised as an all-zero (or empty) vector rather than as None
    if embedding_vector is not None and np.any(embedding_vector):
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count rows that are entirely zero (a row-sum test could miscount vectors whose
# components cancel out)
print('Number of Null Word Embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (471366, 200)
Number of Null Word Embeddings: 152


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the (uncompiled) baseline CNN on top of frozen pre-trained embeddings.

  Reads `max_features`, `embedding_dim`, `max_len` and `embedding_matrix` from the
  notebook's global scope.

  Returns:
    A Keras `Sequential` model ending in a single sigmoid unit.
  """
  return Sequential([
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is built and compiled inside the strategy scope so that the TPU strategy
# manages its variables
# NOTE(review): the baseline CNN was compiled with Nadam, this one uses RMSprop —
# confirm the optimizer change is intended for this comparison
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer=RMSprop(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 500, 200)          94273200  
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 500, 64)           89664     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 100, 64)           0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 100, 64)           28736     
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 64)                0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 32)              

In [None]:
# Train the model
# Stop once validation accuracy has not improved for 7 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy
train_metrics = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_metrics[1]))
val_metrics = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_metrics[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Training Accuracy: 0.774
Validation Accuracy: 0.749


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the (uncompiled) baseline RNN: frozen embeddings, one LSTM, sigmoid output.

  Reads `max_features`, `embedding_dim`, `max_len` and `embedding_matrix` from the
  notebook's global scope.

  Returns:
    A Keras `Sequential` model ending in a single sigmoid unit.
  """
  return Sequential([
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is built and compiled inside the strategy scope so that the TPU strategy
# manages its variables
with strategy.scope():
    model = create_model()
    model.compile(
        optimizer=RMSprop(),
        loss="binary_crossentropy",
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 500, 200)          94273200  
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                29824     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 33        
Total params: 94,303,057
Trainable params: 29,857
Non-trainable params: 94,273,200
_________________________________________________________________


In [None]:
# Train the model
# Stop once validation accuracy has not improved for 7 epochs and roll back to the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy
train_metrics = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_metrics[1]))
val_metrics = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_metrics[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Training Accuracy: 0.777
Validation Accuracy: 0.762


###### Results:

- 300 dimensions: CNN = 79.5% vs. 75.7%; RNN = 77.2% vs. 75.8%
- 200 dimensions: CNN = 77.4% vs. 74.9% ; RNN = 77.7% vs. 76.2%
- 100 dimensions: CNN = 78.3% vs. 74.8%; RNN = 72.7% vs. 72.6%
- 50 dimensions: CNN = 77.7% vs. 74.5%; RNN = 73.8% vs. 73.6%
- 200 dimensions worked especially well with RNNs; 300 dimensions worked better with CNNs (i.e. stick with 200 dimensions for now)

##### Check if self-trained word embeddings help:

###### Data Preparation:

In [None]:
# Define parameters for text processing
# Vocabulary limit: also the number of rows of the (trainable) embedding table
max_features = 100000
# Every sequence is padded/truncated to this many tokens
max_len = 500
# Dimensionality of the self-trained word vectors
embedding_dim = 50

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The vocabulary limit is taken from `max_features` so the tokenizer and the embedding
# layer cannot drift apart; the vocabulary is learned from the sub-training texts only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(text_subtrain)
X_subtrain = tokenizer.texts_to_sequences(text_subtrain)
X_val = tokenizer.texts_to_sequences(text_val)
# NOTE: word_index still contains every word seen; the limit is applied when
# converting texts to sequences
word_index = tokenizer.word_index
print("Included Token: {}".format(tokenizer.num_words))

Included Token: 100000


In [None]:
# Determine number of words for max_len
# One row per sub-training text holding its token count; the summary statistics
# show how many texts a given max_len would truncate
num_words = pd.DataFrame([len(x) for x in X_subtrain]).rename(columns={0: "words"})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,240.113306
std,243.629157
min,0.0
25%,88.0
50%,165.0
75%,305.0
max,2810.0


In [None]:
# Pad and Truncate sequences
# Shorter texts get zeros in front ("pre"), longer texts lose their tail ("post")
X_subtrain, X_val = (
    pad_sequences(sequences, maxlen=max_len, padding="pre", truncating="post")
    for sequences in (X_subtrain, X_val)
)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 500)
Shape of X_val: (37034, 500)


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier with embeddings learned from scratch.

  Layer stack: Embedding -> Conv1D(64, 7) -> MaxPool1D(5) -> Dropout(0.5)
  -> Conv1D(64, 7) -> GlobalMaxPool1D -> Dropout(0.5) -> Dense(32, relu)
  -> Dense(1, sigmoid).
  Reads the notebook-level settings max_features, embedding_dim and max_len.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  layers = [
      # No pre-trained weights are passed, so the embedding is trained from
      # its default initialisation.
      Embedding(max_features, embedding_dim, input_length=max_len),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Dropout(0.5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(layers)

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_21 (Embedding)     (None, 500, 50)           5000000   
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 500, 64)           22464     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 100, 64)           0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 100, 64)           0         
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 100, 64)           28736     
_________________________________________________________________
global_max_pooling1d_11 (Glo (None, 64)                0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)              

In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Training Accuracy: 0.820
Validation Accuracy: 0.760


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled LSTM text classifier with embeddings learned from scratch.

  Layer stack: Embedding -> LSTM(32) -> Dense(1, sigmoid).
  Reads the notebook-level settings max_features, embedding_dim and max_len.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  return Sequential([
      # No pre-trained weights are passed, so the embedding is trained from
      # its default initialisation.
      Embedding(max_features, embedding_dim, input_length=max_len),
      LSTM(32),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 500, 50)           5000000   
_________________________________________________________________
lstm_11 (LSTM)               (None, 32)                10624     
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 33        
Total params: 5,010,657
Trainable params: 5,010,657
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
 15/338 [>.............................] - ETA: 14s - loss: 0.4022 - binary_accuracy: 0.8285

###### Results:

- 100,000 tokens; 100 dimensions: CNN = 78.6% vs. 75.9%; RNN = 81.8% vs. 75.7%
- 50,000 tokens; 100 dimensions: CNN = 78.8% vs. 75.8%; RNN = 78% vs. 75.7%
- 100,000 tokens; 50 dimensions: CNN = 82% vs. 76%; RNN = 79.7% vs. 75.9%
- using own word embeddings leads to strong overfitting, which also cannot be mitigated by using dropout

##### Initialize Embeddings with Pre-Trained Weights and Fine-Tune Them:

###### Data Preparation:

In [None]:
import fasttext, fasttext.util
ft = fasttext.load_model('cc.en.100.bin')
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 100




In [None]:
# Define parameters for text processing 
max_features = 100000
max_len = 500
embedding_dim = 100

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# Fit the vocabulary (capped at max_features) on the sub-training texts only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(text_subtrain)
word_index = tokenizer.word_index
# Encode both splits with the same fitted tokenizer
X_subtrain, X_val = (
    tokenizer.texts_to_sequences(texts) for texts in (text_subtrain, text_val)
)
print("Included Token: {}".format(tokenizer.num_words))

Included Token: 100000


In [None]:
# Determine number of words for max_len
# One row per sub-training document holding the length of its token sequence;
# the summary statistics below guide the choice of max_len.
num_words = pd.DataFrame({"words": [len(seq) for seq in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,240.113306
std,243.629157
min,0.0
25%,88.0
50%,165.0
75%,305.0
max,2810.0


In [None]:
# Pad and Truncate sequences
X_subtrain = pad_sequences(X_subtrain, maxlen=max_len, padding="pre", truncating="post")
X_val = pad_sequences(X_val, maxlen=max_len, padding="pre", truncating="post")

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 500)
Shape of X_val: (37034, 500)


In [None]:
# Prepare the Embedding Matrix
# Row i receives the fastText vector of the word whose tokenizer index is i;
# rows that are never written stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond max_features have no row in the matrix
    if i >= max_features:
        continue
    embedding_vector = ft.get_word_vector(word)
    if embedding_vector is None or len(embedding_vector) == 0:
        words_not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count the rows whose components sum to exactly zero
print('Number of Null Word Embeddings: %d' % (embedding_matrix.sum(axis=1) == 0).sum())

Preparing Embedding Matrix...
Shape of Embedding Matrix: (100000, 100)
Number of Null Word Embeddings: 15


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier with fine-tunable pre-trained embeddings.

  Layer stack: Embedding -> Conv1D(64, 7) -> MaxPool1D(5) -> Conv1D(64, 7)
  -> GlobalMaxPool1D -> Dropout(0.5) -> Dense(32, relu) -> Dense(1, sigmoid).
  Reads the notebook-level max_features, embedding_dim, max_len and
  embedding_matrix.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  layers = [
      # Initialised from embedding_matrix and left trainable, so the vectors
      # are updated during training.
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(layers)

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 100)          10000000  
_________________________________________________________________
conv1d (Conv1D)              (None, 500, 64)           44864     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           28736     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2

In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Training Accuracy: 0.831
Validation Accuracy: 0.764


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled LSTM text classifier with fine-tunable pre-trained embeddings.

  Layer stack: Embedding -> LSTM(32) -> Dense(1, sigmoid).
  Reads the notebook-level max_features, embedding_dim, max_len and
  embedding_matrix.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  return Sequential([
      # Initialised from embedding_matrix and left trainable, so the vectors
      # are updated during training.
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True),
      LSTM(32),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          10000000  
_________________________________________________________________
lstm (LSTM)                  (None, 32)                17024     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 10,017,057
Trainable params: 10,017,057
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Training Accuracy: 0.818
Validation Accuracy: 0.755


###### Results:

- using pre-trained word embeddings and keeping them fixed leads to way less overfitting with comparable results
- i.e. keep pre-trained embeddings

##### Check how stopwords affect model performance:

###### Data Preparation:

In [None]:
# Extract Target Variable From Dataset
y = kickstarter_df["campaign_successful"].to_numpy()
text = kickstarter_df["story"]

print(type(y))
print(y.shape)
print(type(text))
print(len(text))

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Split dataset into training, subtraining, validation, and test set
train_size = round(kickstarter_df.shape[0]*0.7*1)
val_size = round(kickstarter_df.shape[0]*0.15*1)
test_size = round(kickstarter_df.shape[0]*1) - val_size - train_size

text_train, text_test, y_train, y_test = train_test_split(text, y, train_size=(train_size+val_size),test_size=test_size, shuffle=True, stratify=y, random_state=seed_value)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(text_train, y_train, train_size=train_size, test_size=val_size, shuffle=True, stratify=y_train, random_state=seed_value)

print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
import fasttext, fasttext.util
ft = fasttext.load_model('cc.en.200.bin')
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 200




In [None]:
# Define parameters for text processing 
max_features = 431789
max_len = 700
embedding_dim = 200

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# Cap the vocabulary at max_features. Keras word indices start at 1, and both
# the embedding matrix and the Embedding layer are sized max_features, so only
# indices below max_features have a row to look up. Without the cap the
# sequences could contain indices the Embedding layer cannot address.
tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is fitted on the sub-training texts only; validation texts are
# merely transformed with it.
tokenizer.fit_on_texts(text_subtrain)
X_subtrain = tokenizer.texts_to_sequences(text_subtrain)
X_val = tokenizer.texts_to_sequences(text_val)
word_index = tokenizer.word_index
# Report the size of the fitted vocabulary (word_index is not truncated by num_words)
print("Included Token: {}".format(len(word_index)))

Included Token: None


In [None]:
# Determine number of words for max_len
# One row per sub-training document holding the length of its token sequence;
# the summary statistics below guide the choice of max_len.
num_words = pd.DataFrame({"words": [len(seq) for seq in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,506.934945
std,499.431631
min,0.0
25%,194.0
50%,355.0
75%,646.0
max,6766.0


In [None]:
# Pad and Truncate sequences
X_subtrain = pad_sequences(X_subtrain, maxlen=max_len, padding="pre", truncating="post")
X_val = pad_sequences(X_val, maxlen=max_len, padding="pre", truncating="post")

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 700)
Shape of X_val: (37034, 700)


In [None]:
# Prepare the Embedding Matrix
# Row i receives the fastText vector of the word whose tokenizer index is i;
# rows that are never written stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond max_features have no row in the matrix
    if i >= max_features:
        continue
    embedding_vector = ft.get_word_vector(word)
    if embedding_vector is None or len(embedding_vector) == 0:
        words_not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count the rows whose components sum to exactly zero
print('Number of Null Word Embeddings: %d' % (embedding_matrix.sum(axis=1) == 0).sum())

Preparing Embedding Matrix...
Shape of Embedding Matrix: (431789, 200)
Number of Null Word Embeddings: 157


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier on frozen pre-trained embeddings.

  Layer stack: Embedding -> Conv1D(64, 7) -> MaxPool1D(5) -> Conv1D(64, 7)
  -> GlobalMaxPool1D -> Dropout(0.5) -> Dense(32, relu) -> Dense(1, sigmoid).
  Reads the notebook-level max_features, embedding_dim, max_len and
  embedding_matrix.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  layers = [
      # Initialised from embedding_matrix and frozen (trainable=False), so the
      # vectors are not updated during training.
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(layers)

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 700, 200)          86357800  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 700, 64)           89664     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 140, 64)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 140, 64)           28736     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)               

In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Training Accuracy: 0.782
Validation Accuracy: 0.744


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled LSTM text classifier on frozen pre-trained embeddings.

  Layer stack: Embedding -> LSTM(32) -> Dense(1, sigmoid).
  Reads the notebook-level max_features, embedding_dim, max_len and
  embedding_matrix.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  return Sequential([
      # Initialised from embedding_matrix and frozen (trainable=False), so the
      # vectors are not updated during training.
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 700, 200)          86357800  
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                29824     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 33        
Total params: 86,387,657
Trainable params: 29,857
Non-trainable params: 86,357,800
_________________________________________________________________


In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Training Accuracy: 0.729
Validation Accuracy: 0.727


###### Results:

- Stopwords Not Removed: CNN = 78.2% vs. 74.4%; RNN = 72.9% vs. 72.7%
- i.e. NNs struggled when stopwords were not removed

##### Check if adding additional text features can increase model performance:

###### Data Preparation:

In [None]:
# Extract Target Variable From Dataset
y = kickstarter_df["campaign_successful"].to_numpy()
text = kickstarter_df["title"] + " " + kickstarter_df["blurb"] + " " + kickstarter_df["story"] + " " + kickstarter_df["risks"] + " " + kickstarter_df["creator_bio"] + " " + kickstarter_df["reward_description"]

print(type(y))
print(y.shape)
print(type(text))
print(len(text))

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove stopwords and periods
# sklearn's English stop-word list plus the stray token "s"
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# Strip periods with an explicit literal match. The default of `regex` changed
# across pandas versions, so relying on it with the pattern r"\." can silently
# leave every period in place.
text = text.str.replace(".", "", regex=False)
# Whitespace-tokenize each document and drop stop words.
# NOTE(review): the membership test is case-sensitive and sklearn's list is
# lowercase; presumably the text was lowercased upstream, confirm.
text = text.progress_apply(lambda x: " ".join([token for token in x.split() if token not in stopwords]))
# Downstream splitting works on a plain array rather than a pandas Series
text = text.to_numpy()

100%|██████████| 246891/246891 [00:42<00:00, 5815.11it/s]


In [None]:
# Split dataset into training, subtraining, validation, and test set
train_size = round(kickstarter_df.shape[0]*0.7*1)
val_size = round(kickstarter_df.shape[0]*0.15*1)
test_size = round(kickstarter_df.shape[0]*1) - val_size - train_size

text_train, text_test, y_train, y_test = train_test_split(text, y, train_size=(train_size+val_size),test_size=test_size, shuffle=True, stratify=y, random_state=seed_value)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(text_train, y_train, train_size=train_size, test_size=val_size, shuffle=True, stratify=y_train, random_state=seed_value)

print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
import fasttext, fasttext.util
ft = fasttext.load_model('cc.en.200.bin')
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 200




In [None]:
# Define parameters for text processing 
max_features = 631377
max_len = 500
embedding_dim = 200

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# Cap the vocabulary at max_features. Keras word indices start at 1, and both
# the embedding matrix and the Embedding layer are sized max_features, so only
# indices below max_features have a row to look up. Without the cap the
# rarest word's index equals max_features and falls outside the Embedding layer.
tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is fitted on the sub-training texts only; validation texts are
# merely transformed with it.
tokenizer.fit_on_texts(text_subtrain)
X_subtrain = tokenizer.texts_to_sequences(text_subtrain)
X_val = tokenizer.texts_to_sequences(text_val)
word_index = tokenizer.word_index
# Report the size of the fitted vocabulary (word_index is not truncated by num_words)
print("Included Token: {}".format(len(word_index)))

Included Token: 631377


In [None]:
# Determine number of words for max_len
# One row per sub-training document holding the length of its token sequence;
# the summary statistics below guide the choice of max_len.
num_words = pd.DataFrame({"words": [len(seq) for seq in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,509.529984
std,409.239714
min,14.0
25%,241.0
50%,399.0
75%,649.0
max,11553.0


In [None]:
# Pad and Truncate sequences
X_subtrain = pad_sequences(X_subtrain, maxlen=max_len, padding="pre", truncating="post")
X_val = pad_sequences(X_val, maxlen=max_len, padding="pre", truncating="post")

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 500)
Shape of X_val: (37034, 500)


In [None]:
# Prepare the Embedding Matrix
# Row i receives the fastText vector of the word whose tokenizer index is i;
# rows that are never written stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond max_features have no row in the matrix
    if i >= max_features:
        continue
    embedding_vector = ft.get_word_vector(word)
    if embedding_vector is None or len(embedding_vector) == 0:
        words_not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count the rows whose components sum to exactly zero
print('Number of Null Word Embeddings: %d' % (embedding_matrix.sum(axis=1) == 0).sum())

Preparing Embedding Matrix...
Shape of Embedding Matrix: (631377, 200)
Number of Null Word Embeddings: 218


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier on frozen pre-trained embeddings.

  Layer stack: Embedding -> Conv1D(64, 7) -> MaxPool1D(5) -> Conv1D(64, 7)
  -> GlobalMaxPool1D -> Dropout(0.5) -> Dense(32, relu) -> Dense(1, sigmoid).
  Reads the notebook-level max_features, embedding_dim, max_len and
  embedding_matrix.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  layers = [
      # Initialised from embedding_matrix and frozen (trainable=False), so the
      # vectors are not updated during training.
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(layers)

In [None]:
# Compile the model
with strategy.scope():
  model = create_model()
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 200)          126275400 
_________________________________________________________________
conv1d (Conv1D)              (None, 500, 64)           89664     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           28736     
_________________________________________________________________
global_max_pooling1d (Global (None, 64)                0         
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                2

In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Training Accuracy: 0.829
Validation Accuracy: 0.765


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled LSTM text classifier on frozen pre-trained embeddings.

  Layer stack: Embedding -> LSTM(32) -> Dense(1, sigmoid).
  Reads the notebook-level max_features, embedding_dim, max_len and
  embedding_matrix.

  Returns:
    A Sequential model; compiling is left to the caller.
  """
  return Sequential([
      # Initialised from embedding_matrix and frozen (trainable=False), so the
      # vectors are not updated during training.
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      # Single sigmoid unit: binary classification output
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
with strategy.scope():
    model = create_model()
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 200)          126275400 
_________________________________________________________________
lstm (LSTM)                  (None, 32)                29824     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 126,305,257
Trainable params: 29,857
Non-trainable params: 126,275,400
_________________________________________________________________


In [None]:
# Train the model
model.fit(X_subtrain, y_subtrain, validation_data=(X_val, y_val), batch_size=512, epochs=100, verbose=1, shuffle=True, callbacks=[tf.keras.callbacks.EarlyStopping(monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True)])
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Training Accuracy: 0.782
Validation Accuracy: 0.771


###### Results:

- Title + Blurb + Story: CNN = 78.1% vs. 75.5%; RNN = 76.6% vs. 75.9%
- Title + Blurb + Story + Risks: CNN = 78.7% vs. 75.7%; RNN = 77.7% vs. 76.5%
- Title + Blurb + Story + Risks + Creator_Bio: CNN = 79.1% vs. 76.3%; RNN = 78% vs. 76.8%
- Title + Blurb + Story + Risks + Creator_Bio + Reward_Description: CNN = 82.9% vs. 76.5%; RNN = 78.2% vs. 77.1%
- i.e. using all text attributes helped to increase predictive performance

##### Fine-Tune Max_Len:

###### Data Preparation:

In [None]:
# Extract Target Variable From Dataset
y = kickstarter_df["campaign_successful"].to_numpy()
text = kickstarter_df["title"] + " " + kickstarter_df["blurb"] + " " + kickstarter_df["story"] + " " + kickstarter_df["risks"] + " " + kickstarter_df["creator_bio"] + " " + kickstarter_df["reward_description"]

print(type(y))
print(y.shape)
print(type(text))
print(len(text))

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove stopwords and periods
# sklearn's English stop-word list plus the stray token "s"
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# Strip periods with an explicit literal match. The default of `regex` changed
# across pandas versions, so relying on it with the pattern r"\." can silently
# leave every period in place.
text = text.str.replace(".", "", regex=False)
# Whitespace-tokenize each document and drop stop words.
# NOTE(review): the membership test is case-sensitive and sklearn's list is
# lowercase; presumably the text was lowercased upstream, confirm.
text = text.progress_apply(lambda x: " ".join([token for token in x.split() if token not in stopwords]))
# Downstream splitting works on a plain array rather than a pandas Series
text = text.to_numpy()

100%|██████████| 246891/246891 [00:42<00:00, 5780.91it/s]


In [None]:
# Split dataset into training, subtraining, validation, and test set
train_size = round(kickstarter_df.shape[0]*0.7*1)
val_size = round(kickstarter_df.shape[0]*0.15*1)
test_size = round(kickstarter_df.shape[0]*1) - val_size - train_size

text_train, text_test, y_train, y_test = train_test_split(text, y, train_size=(train_size+val_size),test_size=test_size, shuffle=True, stratify=y, random_state=seed_value)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(text_train, y_train, train_size=train_size, test_size=val_size, shuffle=True, stratify=y_train, random_state=seed_value)

print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
import fasttext, fasttext.util
ft = fasttext.load_model('cc.en.200.bin')
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 200




In [None]:
# Define parameters for text processing 
max_features = 631377
max_len = 300
embedding_dim = 200

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# Cap the vocabulary at max_features. Keras word indices start at 1, and both
# the embedding matrix and the Embedding layer are sized max_features, so only
# indices below max_features have a row to look up. Without the cap the
# rarest word's index equals max_features and falls outside the Embedding layer.
tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is fitted on the sub-training texts only; validation texts are
# merely transformed with it.
tokenizer.fit_on_texts(text_subtrain)
X_subtrain = tokenizer.texts_to_sequences(text_subtrain)
X_val = tokenizer.texts_to_sequences(text_val)
word_index = tokenizer.word_index
# Report the size of the fitted vocabulary (word_index is not truncated by num_words)
print("Included Token: {}".format(len(word_index)))

Included Token: 631377


In [None]:
# Determine number of words for max_len
# One row per sub-training document holding the length of its token sequence;
# the summary statistics below guide the choice of max_len.
num_words = pd.DataFrame({"words": [len(seq) for seq in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,509.529984
std,409.239714
min,14.0
25%,241.0
50%,399.0
75%,649.0
max,11553.0


In [None]:
# Pad and Truncate sequences
X_subtrain = pad_sequences(X_subtrain, maxlen=max_len, padding="pre", truncating="post")
X_val = pad_sequences(X_val, maxlen=max_len, padding="pre", truncating="post")

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 300)
Shape of X_val: (37034, 300)


In [None]:
# Prepare the Embedding Matrix
# Row i receives the fastText vector of the word whose tokenizer index is i;
# rows that are never written stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond max_features have no row in the matrix
    if i >= max_features:
        continue
    embedding_vector = ft.get_word_vector(word)
    if embedding_vector is None or len(embedding_vector) == 0:
        words_not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count the rows whose components sum to exactly zero
print('Number of Null Word Embeddings: %d' % (embedding_matrix.sum(axis=1) == 0).sum())

Preparing Embedding Matrix...
Shape of Embedding Matrix: (631377, 200)
Number of Null Word Embeddings: 218


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier.

  Frozen pretrained embeddings feed two Conv1D layers (64 filters, kernel size 7)
  separated by max pooling; global max pooling, dropout and a 32-unit dense layer
  lead to a single sigmoid output.

  Reads the notebook-level `max_features`, `embedding_dim`, `embedding_matrix`
  and `max_len`.

  Returns:
    The uncompiled `Sequential` model.
  """
  stack = [
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(stack)

In [None]:
# Compile the model
# The model is created and compiled inside the strategy scope.
# NOTE(review): `strategy` is not defined in this part of the notebook — presumably a
# tf.distribute strategy created earlier; verify before running this cell on a fresh kernel.
with strategy.scope():
  model = create_model()
  # RMSprop with default settings; accuracy is tracked as "binary_accuracy".
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
# Prints every layer with its output shape and parameter count.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 300, 200)          126275400 
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 300, 64)           89664     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 60, 64)            0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 60, 64)            28736     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)               

In [None]:
# Train the model
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored before the final evaluation.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy.
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Training Accuracy: 0.809
Validation Accuracy: 0.758


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled LSTM text classifier.

  Frozen pretrained embeddings feed a single 32-unit LSTM followed by a
  sigmoid output unit. Reads the notebook-level `max_features`,
  `embedding_dim`, `embedding_matrix` and `max_len`.

  Returns:
    The uncompiled `Sequential` model.
  """
  stack = [
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(stack)

In [None]:
# Compile the model
# The model is created and compiled inside the strategy scope.
# NOTE(review): `strategy` is not defined in this part of the notebook — presumably a
# tf.distribute strategy created earlier; verify before running this cell on a fresh kernel.
with strategy.scope():
    model = create_model()
    # RMSprop with default settings; accuracy is tracked as "binary_accuracy".
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
# Prints every layer with its output shape and parameter count.
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 300, 200)          126275400 
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                29824     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 33        
Total params: 126,305,257
Trainable params: 29,857
Non-trainable params: 126,275,400
_________________________________________________________________


In [None]:
# Train the model
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored before the final evaluation.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy.
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Training Accuracy: 0.736
Validation Accuracy: 0.734


###### Results:

- max_len=300: CNN = 80.9% vs. 75.8%; RNN = 73.6% vs. 73.4% (training vs. validation accuracy)
- max_len=500: CNN = 82.9% vs. 76.5%; RNN = 78.2% vs. 77.1%
- max_len=700: CNN = 82.8% vs. 76.2%; RNN = 79% vs. 77.4%
- max_len=1000: CNN = 81% vs. 76.8%; RNN = 78.9% vs. 77.5%
- max_len=3000: CNN = 80.2% vs. 76.6%; RNN = takes too long to train
- i.e. max_len between 700 and 1000 provides the best trade-off between training time and accuracy

##### Fine-Tune Max_Features:

###### Data Preparation:

In [None]:
# Extract Target Variable From Dataset
y = kickstarter_df["campaign_successful"].to_numpy()

# Join all free-text fields of a campaign into one document, separated by single spaces.
text_columns = ["title", "blurb", "story", "risks", "creator_bio", "reward_description"]
text = kickstarter_df[text_columns[0]]
for column in text_columns[1:]:
    text = text + " " + kickstarter_df[column]

print(type(y))
print(y.shape)
print(type(text))
print(len(text))

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove stopwords and punctuation
# Only periods are stripped here; stop words are sklearn's English list plus the stray "s".
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a literal string
# by default, which would turn this into a no-op search for the two characters "\.".
text = text.str.replace(r"\.", "", regex=True)
# NOTE(review): matching is case-sensitive and the stop-word list is lowercase, so
# capitalised stop words (e.g. "The") survive this filter — confirm whether that is intended.
text = text.progress_apply(lambda doc: " ".join(token for token in doc.split() if token not in stopwords))
text = text.to_numpy()

100%|██████████| 246891/246891 [00:42<00:00, 5753.81it/s]


In [None]:
# Split dataset into training, subtraining, validation, and test set
# 70% subtraining, 15% validation, remainder test; both splits are stratified on the target.
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7)
val_size = round(n_samples * 0.15)
test_size = n_samples - val_size - train_size

split_options = dict(shuffle=True, random_state=seed_value)
# First split off the test set, then divide the remainder into subtraining and validation.
text_train, text_test, y_train, y_test = train_test_split(
    text, y, train_size=(train_size + val_size), test_size=test_size, stratify=y, **split_options
)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
    text_train, y_train, train_size=train_size, test_size=val_size, stratify=y_train, **split_options
)

print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
# Load the pretrained fastText model used to fill the embedding matrix below.
# NOTE(review): 'cc.en.200.bin' is resolved relative to the working directory and is
# presumably a 200-dimensional reduction created earlier — confirm the file exists.
import fasttext, fasttext.util
ft = fasttext.load_model('cc.en.200.bin')
# Sanity check: the reported dimension must equal `embedding_dim` used for the matrix.
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 200




In [None]:
# Define parameters for text processing
max_features = 50000  # vocabulary cap: passed to Tokenizer(num_words=...) and used as Embedding input_dim
max_len = 700         # number of tokens per document after padding/truncation
embedding_dim = 200   # width of the embedding vectors

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The vocabulary is learned from the subtraining texts only and capped at max_features.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(text_subtrain)
X_subtrain, X_val = (tokenizer.texts_to_sequences(corpus) for corpus in (text_subtrain, text_val))
word_index = tokenizer.word_index
print("Included Token: {}".format(tokenizer.num_words))

Included Token: 50000


In [None]:
# Determine number of words for max_len
# One row per subtraining document holding its token count; describe() summarises the distribution.
num_words = pd.DataFrame({"words": [len(sequence) for sequence in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,495.489689
std,396.125344
min,14.0
25%,235.0
50%,388.0
75%,631.0
max,11407.0


In [None]:
# Pad and Truncate sequences
# Short documents are padded at the front; long documents are cut at the end,
# so every row ends up with exactly max_len entries.
pad_options = dict(maxlen=max_len, padding="pre", truncating="post")
X_subtrain = pad_sequences(X_subtrain, **pad_options)
X_val = pad_sequences(X_val, **pad_options)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 700)
Shape of X_val: (37034, 700)


In [None]:
# Prepare the Embedding Matrix
# Row i holds the fastText vector of the token with tokenizer index i. Row 0 (padding)
# and every token without a usable vector stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        # `word_index` lists every token seen during fitting; indices beyond the
        # embedding table have no row to fill.
        continue
    embedding_vector = ft.get_word_vector(word)
    # fastText is expected to return an array for any string, so a None/length check
    # never fails; a missing embedding shows up as an all-zero vector instead.
    if embedding_vector is not None and np.any(embedding_vector):
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count rows that are entirely zero (a row that merely sums to zero is not null).
print('Number of Null Word Embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (50000, 200)
Number of Null Word Embeddings: 5


###### Convolutional Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier.

  Frozen pretrained embeddings feed two Conv1D layers (64 filters, kernel size 7)
  separated by max pooling; global max pooling, dropout and a 32-unit dense layer
  lead to a single sigmoid output.

  Reads the notebook-level `max_features`, `embedding_dim`, `embedding_matrix`
  and `max_len`.

  Returns:
    The uncompiled `Sequential` model.
  """
  stack = [
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      Conv1D(64, 7, activation='relu', padding='same'),
      MaxPool1D(5),
      Conv1D(64, 7, activation='relu', padding='same'),
      GlobalMaxPool1D(),
      Dropout(0.5),
      Dense(32, activation='relu'),
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(stack)

In [None]:
# Compile the model
# The model is created and compiled inside the strategy scope.
# NOTE(review): `strategy` is not defined in this part of the notebook — presumably a
# tf.distribute strategy created earlier; verify before running this cell on a fresh kernel.
with strategy.scope():
  model = create_model()
  # RMSprop with default settings; accuracy is tracked as "binary_accuracy".
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
# Prints every layer with its output shape and parameter count.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 700, 200)          10000000  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 700, 64)           89664     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 140, 64)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 140, 64)           28736     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)               

In [None]:
# Train the model
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored before the final evaluation.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy.
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Training Accuracy: 0.784
Validation Accuracy: 0.761


###### Recurrent Neural Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled LSTM text classifier.

  Frozen pretrained embeddings feed a single 32-unit LSTM followed by a
  sigmoid output unit. Reads the notebook-level `max_features`,
  `embedding_dim`, `embedding_matrix` and `max_len`.

  Returns:
    The uncompiled `Sequential` model.
  """
  stack = [
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      LSTM(32),
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(stack)

In [None]:
# Compile the model
# The model is created and compiled inside the strategy scope.
# NOTE(review): `strategy` is not defined in this part of the notebook — presumably a
# tf.distribute strategy created earlier; verify before running this cell on a fresh kernel.
with strategy.scope():
    model = create_model()
    # RMSprop with default settings; accuracy is tracked as "binary_accuracy".
    model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
# Prints every layer with its output shape and parameter count.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 700, 200)          10000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                29824     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 10,029,857
Trainable params: 29,857
Non-trainable params: 10,000,000
_________________________________________________________________


In [None]:
# Train the model
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored before the final evaluation.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=7, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy.
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Training Accuracy: 0.770
Validation Accuracy: 0.764


###### Results:

- max_features = None: CNN = 81% vs. 76.8%; RNN = 78.9% vs. 77.5%
- max_features = 200,000: CNN = 83.1% vs. 76.2%; RNN = 78.2% vs. 77.1%
- max_features = 100,000: CNN = 79.5% vs. 76.4%; RNN = 73.2% vs. 73.1%
- max_features = 50,000: CNN = 78.4% vs. 76.1%; RNN = 77% vs. 76.4%
- i.e. including all tokens helps to increase model performance (but also leads to more overfitting, which needs to be addressed)

#### c) Fine-Tune CNN Model

##### Data Preparation:

In [None]:
# Extract Target Variable From Dataset
y = kickstarter_df["campaign_successful"].to_numpy()

# Join all free-text fields of a campaign into one document, separated by single spaces.
text_columns = ["title", "blurb", "story", "risks", "creator_bio", "reward_description"]
text = kickstarter_df[text_columns[0]]
for column in text_columns[1:]:
    text = text + " " + kickstarter_df[column]

print(type(y))
print(y.shape)
print(type(text))
print(len(text))

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove stopwords and punctuation
# Only periods are stripped here; stop words are sklearn's English list plus the stray "s".
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a literal string
# by default, which would turn this into a no-op search for the two characters "\.".
text = text.str.replace(r"\.", "", regex=True)
# NOTE(review): matching is case-sensitive and the stop-word list is lowercase, so
# capitalised stop words (e.g. "The") survive this filter — confirm whether that is intended.
text = text.progress_apply(lambda doc: " ".join(token for token in doc.split() if token not in stopwords))
text = text.to_numpy()

100%|██████████| 246891/246891 [00:43<00:00, 5674.08it/s]


In [None]:
# Split dataset into training, subtraining, validation, and test set
# 70% subtraining, 15% validation, remainder test; both splits are stratified on the target.
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7)
val_size = round(n_samples * 0.15)
test_size = n_samples - val_size - train_size

split_options = dict(shuffle=True, random_state=seed_value)
# First split off the test set, then divide the remainder into subtraining and validation.
text_train, text_test, y_train, y_test = train_test_split(
    text, y, train_size=(train_size + val_size), test_size=test_size, stratify=y, **split_options
)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
    text_train, y_train, train_size=train_size, test_size=val_size, stratify=y_train, **split_options
)

print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
# Load the pretrained fastText model used to fill the embedding matrix below.
# NOTE(review): 'cc.en.200.bin' is resolved relative to the working directory and is
# presumably a 200-dimensional reduction created earlier — confirm the file exists.
import fasttext, fasttext.util
ft = fasttext.load_model('cc.en.200.bin')
# Sanity check: the reported dimension must equal `embedding_dim` used for the matrix.
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 200




In [None]:
# Define parameters for text processing
# The tokenizer in this section is fit without `num_words`, so the sequences contain
# every index from 1 to 631377 (index 0 is reserved for padding). The Embedding layer's
# `input_dim` must be the largest index + 1; with 631377 the highest-indexed token is
# out of range and is also skipped when the embedding matrix is filled.
max_features = 631377 + 1  # vocabulary size + 1 for the padding index
max_len = 1000             # number of tokens per document after padding/truncation
embedding_dim = 200        # width of the embedding vectors

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The vocabulary is learned from the subtraining texts only and then applied to both splits.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_subtrain)
X_subtrain, X_val = (tokenizer.texts_to_sequences(corpus) for corpus in (text_subtrain, text_val))
word_index = tokenizer.word_index
print("Included Token: {}".format(len(word_index)))

Included Token: 631377


In [None]:
# Determine number of words for max_len
# One row per subtraining document holding its token count; describe() summarises the distribution.
num_words = pd.DataFrame({"words": [len(sequence) for sequence in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,509.529984
std,409.239714
min,14.0
25%,241.0
50%,399.0
75%,649.0
max,11553.0


In [None]:
# Pad and Truncate sequences
# Short documents are padded at the front; long documents are cut at the end,
# so every row ends up with exactly max_len entries.
pad_options = dict(maxlen=max_len, padding="pre", truncating="post")
X_subtrain = pad_sequences(X_subtrain, **pad_options)
X_val = pad_sequences(X_val, **pad_options)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 1000)
Shape of X_val: (37034, 1000)


In [None]:
# Prepare the Embedding Matrix
# Row i holds the fastText vector of the token with tokenizer index i. Row 0 (padding)
# and every token without a usable vector stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        # Index lies outside the embedding table, so there is no row to fill.
        continue
    embedding_vector = ft.get_word_vector(word)
    # fastText is expected to return an array for any string, so a None/length check
    # never fails; a missing embedding shows up as an all-zero vector instead.
    if embedding_vector is not None and np.any(embedding_vector):
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Count rows that are entirely zero (a row that merely sums to zero is not null).
print('Number of Null Word Embeddings: %d' % np.sum(~embedding_matrix.any(axis=1)))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (631377, 200)
Number of Null Word Embeddings: 218


##### Fine-Tune Network Architecture:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled four-stage CNN text classifier.

  Frozen pretrained embeddings feed four Conv1D (64 filters, kernel size 5,
  'same' padding) + average-pooling stages. The flattened result passes
  through a 32-unit dense layer to a single sigmoid output.

  Reads the notebook-level `max_features`, `embedding_dim`, `embedding_matrix`
  and `max_len`.

  Returns:
    The uncompiled `Sequential` model.
  """
  model = Sequential()
  model.add(Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Four identical feature-extraction stages with pool size 2 (the AvgPool1D default).
  for _ in range(4):
    model.add(Conv1D(64, 5, activation='relu', padding='same'))
    model.add(AvgPool1D(2))
  model.add(Flatten())
  for head_layer in (Dense(32, activation='relu'), Dense(1, activation='sigmoid')):
    model.add(head_layer)
  return model

In [None]:
# Compile the model
# The model is created and compiled inside the strategy scope.
# NOTE(review): `strategy` is not defined in this part of the notebook — presumably a
# tf.distribute strategy created earlier; verify before running this cell on a fresh kernel.
with strategy.scope():
  model = create_model()
  # RMSprop with default settings; accuracy is tracked as "binary_accuracy".
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
# Prints every layer with its output shape and parameter count.
model.summary()

Model: "sequential_70"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_69 (Embedding)     (None, 1000, 200)         126275400 
_________________________________________________________________
conv1d_221 (Conv1D)          (None, 1000, 64)          64064     
_________________________________________________________________
average_pooling1d_52 (Averag (None, 500, 64)           0         
_________________________________________________________________
conv1d_222 (Conv1D)          (None, 500, 64)           20544     
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 500, 64)           0         
_________________________________________________________________
average_pooling1d_53 (Averag (None, 250, 64)           0         
_________________________________________________________________
conv1d_223 (Conv1D)          (None, 250, 64)         

In [None]:
# Train the model
# Training stops once validation accuracy has not improved for 5 epochs;
# the weights of the best epoch are restored before the final evaluation.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy.
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Training Accuracy: 0.787
Validation Accuracy: 0.760


Result:
- 1 CNN Layer, 1 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 2: 77.9% vs. 74.6%
- 1 CNN Layer, 1 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 5: 79.5% vs. 75%
- 1 CNN Layer, 1 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 10: 82% vs. 75.4%
- 1 CNN Layer, 1 Dense Layers, 32 Filters, Kernel Size 7, Pooling Size 10: 77.3% vs. 75.3%
- 1 CNN Layer, 1 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10: 78.7% vs. 75.5%
-> i.e. the fewer parameters before the output layer, the less overfitting
- 1 CNN Layer, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons 32: 79.6% vs. 76.1%
- 1 CNN Layer, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons 64: 76.9% vs. 75.7%
- 1 CNN Layer, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons 16: 78.3% vs. 75.9%
- 1 CNN Layer, 3 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons [64,16]: 78.8% vs. 75.9%
- 1 CNN Layer, 3 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons [32,8]: 78.1% vs. 75.8%
-> i.e. adding an additional hidden Dense layer helps to improve performance (stick first with 1 hidden layer)
- 2 CNN Layers, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons 32: 78.2% vs. 76.2% 
- 2 CNN Layers, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons 16: 79.1% vs. 76%
- 2 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 10, Dense Neurons 32: 79.1% vs. 76.5%
- 2 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 5, Dense Neurons 32: 79.4% vs. 76.5%
- 2 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 79.7% vs. 76.7%
- 2 CNN Layers, 2 Dense Layers, 32 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 78.7% vs. 76.6%
-> i.e. adding an additional CNN layer helps
- 3 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 82.7% vs. 76.6%
- 3 CNN Layers, 2 Dense Layers, 32 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 79.8% vs. 76.9%
- 3 CNN Layers, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 77.5% vs. 76.5%
- 4 CNN Layers, 2 Dense Layers, 32 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 80% vs. 77%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 79% vs. 77.1%
- 4 CNN Layers, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 78.6% vs. 77%
- 5 CNN Layers, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 78.7% vs. 76.7%
- 5 CNN Layers, 2 Dense Layers, 32 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 78.9% vs. 77%
- 5 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 80.1% vs. 76.9%
- 6 CNN Layers, 2 Dense Layers, 32 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 80.5% vs. 76.8%
- 6 CNN Layers, 2 Dense Layers, 16 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 78.9% vs. 76.6%
-> i.e. 4 CNN layers worked the best; 32-64 filters worked the best; pooling of 2 worked the best
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 3, Pooling Size 2, Dense Neurons 32: 77.5% vs. 76.6%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32: 80.6% vs. 77.4%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 7, Pooling Size 2, Dense Neurons 32: 79% vs. 77.1%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 10, Pooling Size 2, Dense Neurons 32: 78.5% vs. 76.7%
-> i.e. kernel size of 5 performs the best
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32: 80.6% vs. 77.4%
- 4 CNN Layers, 2 Dense Layers, 32 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32: 77.2% vs. 76.6%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 64: 78% vs. 76.6%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 16: 80.9% vs. 77.2%
- 4 CNN Layers, 3 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons [64,16]: 83.5% vs. 76.5%
- 4 CNN Layers, 3 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons [32,32]: 80.2% vs. 77.4%
- 4 CNN Layers, 3 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons [64,64]: 78.2% vs. 76.9%
- 4 CNN Layers, 3 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons [16,16]: 80.8% vs. 77.2%
-> i.e. 2 Dense Layers with 32 hidden units provided best results
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, padding=same: 80.6% vs. 77.4%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, padding=valid: 79.3% vs. 77.2%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, padding=causal: 80.3% vs. 77.2%
-> i.e. same padding is a little bit better
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=max, flatten: 80.6% vs. 77.4%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten: 80.1% vs. 77.5%
- 4 CNN Layers, 1 Dense Layer, 64 Filters, Kernel Size 5, Pooling Size 2, pooling=avg, global: 80.9% vs. 77.1%
- 4 CNN Layers, 1 Dense Layer, 64 Filters, Kernel Size 5, Pooling Size 2, pooling=max, global: 79.5% vs. 76.5%
-> i.e. average pooling with flatten provided the best results
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, relu: 80.4% vs. 77.4%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, elu: 76.5% vs. 75.1%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, selu: 75.3% vs. 73.5%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, gelu: 76.5% vs. 75.6%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, swish: 79% vs. 75.9%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, tanh: 78% vs. 74.5%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, sigmoid: 74.8% vs. 74.6%
- 4 CNN Layers, 2 Dense Layers, 64 Filters, Kernel Size 5, Pooling Size 2, Dense Neurons 32, pooling=avg, flatten, leaky relu: 78.7% vs. 76%
-> i.e. relu activation was the best

##### Add Regularization To The Network:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled, regularized four-stage CNN text classifier.

  Every Conv1D + average-pooling stage and the hidden dense layer are followed
  by the same regularization trio: Gaussian noise (stddev 0.1), layer
  normalization and dropout (rate 0.5, seeded with `seed_value`).

  Reads the notebook-level `max_features`, `embedding_dim`, `embedding_matrix`,
  `max_len` and `seed_value`.

  Returns:
    The uncompiled `Sequential` model.
  """
  model = Sequential()

  def add_regularization():
    # Noise, then normalization, then dropout — the order matters for layer naming and behavior.
    model.add(GaussianNoise(stddev=0.1))
    model.add(LayerNormalization())
    model.add(Dropout(rate=0.5, seed=seed_value))

  model.add(Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
  # Four identical convolutional stages with pool size 2 (the AvgPool1D default).
  for _ in range(4):
    model.add(Conv1D(64, 5, activation='relu', padding='same'))
    model.add(AvgPool1D(2))
    add_regularization()
  model.add(Flatten())
  model.add(Dense(32, activation='relu'))
  add_regularization()
  model.add(Dense(1, activation='sigmoid'))
  return model

In [None]:
# Compile the model
# The model is created and compiled inside the strategy scope.
# NOTE(review): `strategy` is not defined in this part of the notebook — presumably a
# tf.distribute strategy created earlier; verify before running this cell on a fresh kernel.
with strategy.scope():
  model = create_model()
  # RMSprop with default settings; accuracy is tracked as "binary_accuracy".
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model
# Prints every layer with its output shape and parameter count.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 200)         126275400 
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1000, 64)          64064     
_________________________________________________________________
average_pooling1d_4 (Average (None, 500, 64)           0         
_________________________________________________________________
gaussian_noise_1 (GaussianNo (None, 500, 64)           0         
_________________________________________________________________
layer_normalization_5 (Layer (None, 500, 64)           128       
_________________________________________________________________
dropout_5 (Dropout)          (None, 500, 64)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 500, 64)          

In [None]:
# Train the model
# Training stops once validation accuracy has not improved for 10 epochs;
# the weights of the best epoch are restored before the final evaluation.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=10, mode="max", restore_best_weights=True
)
model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1, shuffle=True,
    callbacks=[early_stopping],
)
# evaluate() returns [loss, binary_accuracy]; index 1 is the accuracy.
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Training Accuracy: 0.725
Validation Accuracy: 0.725


Result:
- None: 80.4% vs. 77.4%
- Dropout: 80.5% vs. 77.8% (Input=0.2, Layers=0.5)
- Batch Normalization: led to extreme overfitting; 81.4% vs. 77.6% (with dropout)
- Layer Normalization: 81.7% vs. 78.2% (first normalization, then dropout, no input dropout)
- Gaussian Noise: 79.9% vs. 78.2% (incl. normalization + dropout)
- L2: 72.5% vs. 72.5%
-> i.e. a combination of Gaussian Noise + Layer Normalization + Dropout provided the best results

##### Fine-Tune Optimization Algorithm:

In [None]:
# Define the model
def create_model():
  """Assemble the regularized 1D-CNN classifier on top of the frozen pre-trained embeddings.

  Layout: Embedding -> 4 x [Conv1D(64, 5) -> AvgPool1D(2) -> GaussianNoise ->
  LayerNormalization -> Dropout] -> Flatten -> Dense(32) followed by the same
  noise/normalization/dropout stack -> Dense(1, sigmoid).

  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value
  from the notebook scope.

  Returns:
    The uncompiled Sequential model.
  """
  def add_regularization(network):
    # Noise, then normalization, then dropout: applied after every conv stage and after the dense layer
    network.add(GaussianNoise(stddev=0.1))
    network.add(LayerNormalization())
    network.add(Dropout(rate=0.5, seed=seed_value))

  cnn = Sequential()
  # Embedding weights are taken from embedding_matrix and kept fixed (trainable=False)
  cnn.add(Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
  for _ in range(4):
    cnn.add(Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=GlorotUniform(seed=seed_value)))
    cnn.add(AvgPool1D(2))  # pool size 2 (the Keras default) halves the sequence length
    add_regularization(cnn)
  cnn.add(Flatten())
  cnn.add(Dense(32, activation='relu', kernel_initializer=GlorotUniform(seed=seed_value)))
  add_regularization(cnn)
  cnn.add(Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform(seed=seed_value)))
  return cnn

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(),
      metrics=["binary_accuracy"],
  )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 1000, 200)         126275400 
_________________________________________________________________
conv1d_52 (Conv1D)           (None, 1000, 64)          64064     
_________________________________________________________________
average_pooling1d_52 (Averag (None, 500, 64)           0         
_________________________________________________________________
gaussian_noise_61 (GaussianN (None, 500, 64)           0         
_________________________________________________________________
layer_normalization_65 (Laye (None, 500, 64)           128       
_________________________________________________________________
dropout_65 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
conv1d_53 (Conv1D)           (None, 500, 64)         

In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 10 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=10,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Training Accuracy: 0.790
Validation Accuracy: 0.777


Results:
- RMSprop: 79.9% vs. 78.2%
- RMSprop_centered: 79.1% vs. 78.1%
- Adam: 80.1% vs. 78.2%
- Adam_amsgrad: not supported with TPU
- Adamax: 79.5% vs. 78.2% (but took extremely long)
- Nadam: 79.8% vs. 78.3%
-> i.e. Nadam provided the best results
- GlorotUniform: 79.8% vs. 78.3%
- GlorotNormal: 80.3% vs. 78.2%
- HeUniform: 80.1% vs. 78.3%
- HeNormal: 80.9% vs. 78.2%
- LecunUniform: 80.8% vs. 78.3%
- LecunNormal: 80% vs. 78.1%
-> i.e. GlorotUniform provided the best results
- But: differences were only marginal between all optimizers and initializers

##### Try different types of CNN layers:

In [None]:
# Define the model
def create_model():
  """Assemble the regularized 1D-CNN classifier on top of the frozen pre-trained embeddings.

  Layout: Embedding -> 4 x [Conv1D(64, 5) -> AvgPool1D(2) -> GaussianNoise ->
  LayerNormalization -> Dropout] -> Flatten -> Dense(32) followed by the same
  noise/normalization/dropout stack -> Dense(1, sigmoid).

  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value
  from the notebook scope.

  NOTE(review): the recorded summary below this section lists LocallyConnected1D
  layers, while this definition uses Conv1D - the output appears to stem from a
  different variant of this cell; confirm.

  Returns:
    The uncompiled Sequential model.
  """
  def add_regularization(network):
    # Noise, then normalization, then dropout: applied after every conv stage and after the dense layer
    network.add(GaussianNoise(stddev=0.1))
    network.add(LayerNormalization())
    network.add(Dropout(rate=0.5, seed=seed_value))

  cnn = Sequential()
  # Embedding weights are taken from embedding_matrix and kept fixed (trainable=False)
  cnn.add(Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
  for _ in range(4):
    cnn.add(Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=GlorotUniform(seed=seed_value)))
    cnn.add(AvgPool1D(2))  # pool size 2 is also the Keras default used by the original AvgPool1D()
    add_regularization(cnn)
  cnn.add(Flatten())
  cnn.add(Dense(32, activation='relu', kernel_initializer=GlorotUniform(seed=seed_value)))
  add_regularization(cnn)
  cnn.add(Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform(seed=seed_value)))
  return cnn

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(),
      metrics=["binary_accuracy"],
  )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1000, 200)         126275400 
_________________________________________________________________
locally_connected1d_1 (Local (None, 996, 64)           63807744  
_________________________________________________________________
average_pooling1d_60 (Averag (None, 498, 64)           0         
_________________________________________________________________
gaussian_noise_71 (GaussianN (None, 498, 64)           0         
_________________________________________________________________
layer_normalization_75 (Laye (None, 498, 64)           128       
_________________________________________________________________
dropout_75 (Dropout)         (None, 498, 64)           0         
_________________________________________________________________
locally_connected1d_2 (Local (None, 494, 64)         

In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 10 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=10,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Results:
- CNN: 79.8% vs. 78.3%
- Separable CNN: 80% vs. 78.1%
- Locally-Connected CNN: overfits quite badly

#### d) Fine-Tune RNN Model

##### Data Preparation:

In [None]:
# Extract the target column and build one text per campaign by joining
# all free-text columns with single spaces
text_columns = ["title", "blurb", "story", "risks", "creator_bio", "reward_description"]
y = kickstarter_df["campaign_successful"].to_numpy()
text = kickstarter_df[text_columns[0]]
for column in text_columns[1:]:
  text = text + " " + kickstarter_df[column]

# Sanity check: container types and number of samples of target and text
for value in (type(y), y.shape, type(text), len(text)):
  print(value)

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove stopwords and periods
# (only "." is stripped here; any other punctuation is left in the text)
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# Remove periods as a literal string. The original call passed the regex r"\." without
# stating `regex=`; from pandas 2.0 on the default is regex=False, so the pattern would
# be searched literally (backslash + dot) and no period would ever be removed.
text = text.str.replace(".", "", regex=False)
# NOTE(review): the stopword lookup is case-sensitive because tokens are not lower-cased
# before the membership test, so capitalized stopwords are kept - confirm this is intended.
text = text.progress_apply(lambda x: " ".join([token for token in x.split() if token not in stopwords]))
# Downstream cells (train_test_split, Tokenizer) receive a plain NumPy array of strings
text = text.to_numpy()

100%|██████████| 246891/246891 [00:43<00:00, 5664.08it/s]


In [None]:
# Split dataset into training, subtraining, validation, and test set
# (70 % sub-training, 15 % validation, remaining rows test)
n_rows = kickstarter_df.shape[0]
train_size = round(n_rows*0.7)
val_size = round(n_rows*0.15)
test_size = n_rows - val_size - train_size

# Step 1: carve off the test set; "train" = sub-training + validation rows
text_train, text_test, y_train, y_test = train_test_split(
    text,
    y,
    train_size=(train_size+val_size),
    test_size=test_size,
    shuffle=True,
    stratify=y,
    random_state=seed_value,
)
# Step 2: split the remaining rows into sub-training and validation; both steps are stratified on the target
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
    text_train,
    y_train,
    train_size=train_size,
    test_size=val_size,
    shuffle=True,
    stratify=y_train,
    random_state=seed_value,
)

# Report the size of every split (the X_* labels refer to the raw text arrays at this point)
print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
# Load the pre-trained fastText model from the local binary file and report its vector size.
# NOTE(review): the recorded output below shows 200, while the file name and the
# embedding_dim set in the next cell say 300 - the saved output appears to come
# from an earlier run with a different model dimension; confirm.
import fasttext
import fasttext.util
ft = fasttext.load_model('cc.en.300.bin')
print("Embedding Dimension: {}".format(ft.get_dimension()))

Embedding Dimension: 200




In [None]:
# Text-processing hyperparameters shared by the tokenizer, padding and Embedding layer
max_features = 631_377  # number of rows of the embedding matrix / input_dim of the Embedding layer
max_len = 1_000         # every token sequence is padded or truncated to this length
embedding_dim = 300     # length of each word vector

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# The Keras word_index is 1-based, so an uncapped Tokenizer emits indices up to
# len(word_index). The Embedding layer is created with input_dim=max_features and only
# has rows 0..max_features-1, so an index equal to max_features would be out of range.
# num_words=max_features makes texts_to_sequences drop every index >= max_features, which
# matches the `i < max_features` guard used when the embedding matrix is filled and
# the tokenizer setup of the GloVe section further below.
tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is fitted on the sub-training texts only; validation texts are merely transformed
tokenizer.fit_on_texts(text_subtrain)
X_subtrain = tokenizer.texts_to_sequences(text_subtrain)
X_val = tokenizer.texts_to_sequences(text_val)
# word_index still holds every distinct token seen during fitting, regardless of num_words
word_index = tokenizer.word_index
print("Included Token: {}".format(len(word_index)))

Included Token: 631377


In [None]:
# Distribution of sequence lengths (tokens per text) in the sub-training set,
# used as guidance for choosing max_len
num_words = pd.DataFrame({"words": [len(sequence) for sequence in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,509.529984
std,409.239714
min,14.0
25%,241.0
50%,399.0
75%,649.0
max,11553.0


In [None]:
# Bring every sequence to exactly max_len tokens:
# shorter sequences are padded at the front ("pre"), longer ones are cut at the end ("post")
pad_options = dict(maxlen=max_len, padding="pre", truncating="post")
X_subtrain = pad_sequences(X_subtrain, **pad_options)
X_val = pad_sequences(X_val, **pad_options)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 1000)
Shape of X_val: (37034, 1000)


In [None]:
# Prepare the Embedding Matrix: row i holds the fastText vector of the token with index i.
# Row 0 stays all-zero because word_index starts at 1.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    # Only indices that fit into the matrix (and thus into the Embedding layer) are filled
    if i >= max_features:
        continue
    embedding_vector = ft.get_word_vector(word)
    # fastText returns a vector for every input string, so the previous
    # "is not None and len(...) > 0" test was always true and words_not_found stayed empty.
    # An all-zero vector is treated as "no embedding available" instead.
    if embedding_vector is not None and np.any(embedding_vector):
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Rows whose components sum to exactly zero are counted as null embeddings (includes row 0)
print('Number of Null Word Embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (631377, 300)
Number of Null Word Embeddings: 218


##### Test LSTM vs GRU:

In [None]:
# Define the model
def create_model():
  """Build the baseline recurrent classifier: frozen embeddings -> GRU(32) -> sigmoid unit.

  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value
  from the notebook scope.

  Returns:
    The uncompiled Sequential model.
  """
  return Sequential([
      # Embedding weights are taken from embedding_matrix and kept fixed (trainable=False)
      Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
      GRU(32, kernel_initializer=GlorotUniform(seed=seed_value)),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
    model = create_model()
    model.compile(
        loss="binary_crossentropy",
        optimizer=RMSprop(),
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 1000, 200)         126275400 
_________________________________________________________________
gru (GRU)                    (None, 32)                22464     
_________________________________________________________________
dense_37 (Dense)             (None, 1)                 33        
Total params: 126,297,897
Trainable params: 22,497
Non-trainable params: 126,275,400
_________________________________________________________________


In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 10 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=10,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Training Accuracy: 0.793
Validation Accuracy: 0.777


Result: 
- LSTM: 79.1% vs. 77.3%
- GRU: 79.3% vs. 77.7%
-> i.e. GRU was better and converged faster

##### Test Regularization Techniques Before Dense Layer and After Input Layer:

In [None]:
# Define the model
def create_model():
  """Build the GRU classifier with regularization after the input and before the output.

  Layout: frozen embeddings -> GaussianNoise -> GRU(64) -> Dropout -> sigmoid unit.
  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value
  from the notebook scope.

  Returns:
    The uncompiled Sequential model.
  """
  return Sequential([
      # Embedding weights are initialized from embedding_matrix and kept fixed (trainable=False)
      Embedding(max_features, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_len, trainable=False),
      GaussianNoise(stddev=0.1),                 # regularization directly after the input
      GRU(64, kernel_initializer=GlorotUniform(seed=seed_value)),
      Dropout(rate=0.5, seed=seed_value),        # regularization directly before the dense layer
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
    model = create_model()
    model.compile(
        loss="binary_crossentropy",
        optimizer=Nadam(),
        metrics=["binary_accuracy"],
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 1000, 200)         126275400 
_________________________________________________________________
gaussian_noise_1 (GaussianNo (None, 1000, 200)         0         
_________________________________________________________________
gru_12 (GRU)                 (None, 64)                51072     
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 65        
Total params: 126,326,537
Trainable params: 51,137
Non-trainable params: 126,275,400
_________________________________________________________________


In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 10 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=10,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Results:
- Dropout in RNN = takes too long to train
- Dropout Before Dense = 80% vs. 78.2% (Dense=0.5)
- Dropout Before Dense + After Input: 80.5% vs. 78.3%; but took longer to train (Input=0.2, Dense=0.5)
- Batch Normalization Before Dense: was not useful; accuracy jumped around
- Layer Normalization Before Dense: 79.1% vs. 77.6%
- Layer Normalization + Dropout Before Dense: 79.9% vs. 78.1%
- Gaussian Noise Before Dense: 79.2% vs. 78%
-> i.e. adding only dropout provided the best results; maybe now it is possible to increase the capacity of the network

##### Fine-Tune Network Architecture:

In [None]:
# Define the model
def create_model():
  """Build the bidirectional GRU classifier used for the architecture comparison.

  Layout: frozen embeddings -> Bidirectional(GRU(64)) -> Dropout -> sigmoid unit.
  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value
  from the notebook scope.

  Returns:
    The uncompiled Sequential model.
  """
  return Sequential([
      # Embedding weights are initialized from embedding_matrix and kept fixed (trainable=False)
      Embedding(max_features, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_len, trainable=False),
      Bidirectional(GRU(64, kernel_initializer=GlorotUniform(seed=seed_value))),
      Dropout(rate=0.5, seed=seed_value),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
    model = create_model()
    model.compile(
        loss="binary_crossentropy",
        optimizer=Nadam(),
        metrics=["binary_accuracy"],
        steps_per_execution=100,  # run 100 batches per execution call
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, 1000, 200)         126275400 
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               102144    
_________________________________________________________________
dropout_31 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 1)                 129       
Total params: 126,377,673
Trainable params: 102,273
Non-trainable params: 126,275,400
_________________________________________________________________


In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=7,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Training Accuracy: 0.805
Validation Accuracy: 0.776


Results:
- 512 Units: 79.2% vs. 77.5%
- 256 Units: 81.6% vs. 77.7%
- 128 Units: 80.5% vs. 78%
- 64 Units: 80% vs. 78.2%
- 32 Units: 79.9% vs. 78%
- 2 Layers, 32 Units: 79.2% vs. 77.9%
- 2 Layers, 64 Units: 79.8% vs. 77.8%
- Bidirectional 64 Units: 80.5% vs. 77.6%
-> i.e. 1 Layer, 64 Units is the best


##### Test Recurrent Dropout:

In [None]:
# Define the model
def create_model():
  """Build the GRU classifier with input, recurrent and pre-output dropout.

  Layout: frozen embeddings -> GRU(256, dropout=0.5, recurrent_dropout=0.5)
  -> Dropout(0.5) -> sigmoid unit.
  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value
  from the notebook scope.

  Returns:
    The uncompiled Sequential model.
  """
  return Sequential([
      # Embedding weights are initialized from embedding_matrix and kept fixed (trainable=False)
      Embedding(max_features, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_len, trainable=False),
      GRU(256, kernel_initializer=GlorotUniform(seed=seed_value), dropout=0.5, recurrent_dropout=0.5),
      Dropout(rate=0.5, seed=seed_value),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
    model = create_model()
    model.compile(
        loss="binary_crossentropy",
        optimizer=Nadam(clipnorm=1.0),  # clip each gradient to a norm of at most 1.0
        metrics=["binary_accuracy"],
        # 338 = ceil(172824 / 512), i.e. the number of batches per epoch for the
        # sub-training set and batch size used below - presumably one epoch per execution call
        steps_per_execution=338,
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 200)         126275400 
_________________________________________________________________
gru (GRU)                    (None, 256)               351744    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 126,627,401
Trainable params: 352,001
Non-trainable params: 126,275,400
_________________________________________________________________


In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=7,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Training Accuracy: 0.809
Validation Accuracy: 0.788


Results:
- dropout=0.5, recurrent_dropout=0.5, 32 Units: underfits too much
- dropout=0.5, recurrent_dropout=0.5, 64 Units: 80.3% vs. 78.6%
- dropout=0.5, recurrent_dropout=0.5, 128 Units: 79.9% vs. 78.4%
- dropout=0.5, recurrent_dropout=0.5, 256 Units: 82.2% vs. 78.8%
- dropout=0.5, recurrent_dropout=0.5, 512 Units: jumps around too heavily
- dropout=0.5, recurrent_dropout=0.5, 64 Units, 2 Layers: 80.3% vs. 78.7%
- dropout=0.5, recurrent_dropout=0.5, 256 Units, 2 Layers: 82.1% vs. 78.8%
- dropout=0.5, recurrent_dropout=0.5, 64 Units, 3 Layers: 80.2% vs. 78.7%
-> i.e. dropout + 1 layer + 256 units provided the best results
-> adding additional layers did not help to improve performance
- dropout=0.3, recurrent_dropout=0.3, 64 Units: 79.4% vs. 78.4%
- dropout=0.3, recurrent_dropout=0.3, 256 Units: 80.4% vs. 78.4%
-> i.e. a high dropout of 0.5 provided the best results
- dropout=0.5, recurrent_dropout=0.5, dense_dropout=0.5, 256 Units: 81.4% vs. 78.8%
-> i.e. adding Dense dropout helps a little bit to reduce overfitting
- dropout=0.5, recurrent_dropout=0.5, dense_dropout=0.5, 256 Units; gradient-clipping: 80.9% vs. 78.8%
-> i.e. gradient clipping further reduced the overfitting effect

##### Re-try 300-dimensional embeddings with the regularized network:

In [None]:
# Define the model
def create_model():
  """Build the regularized GRU classifier (same layout as the recurrent-dropout experiment).

  Layout: frozen embeddings -> GRU(256, dropout=0.5, recurrent_dropout=0.5)
  -> Dropout(0.5) -> sigmoid unit. The embedding width is whatever embedding_dim
  and embedding_matrix are set to in the notebook scope; max_features, max_len
  and seed_value are read from there as well.

  Returns:
    The uncompiled Sequential model.
  """
  return Sequential([
      # Embedding weights are initialized from embedding_matrix and kept fixed (trainable=False)
      Embedding(max_features, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_len, trainable=False),
      GRU(256, kernel_initializer=GlorotUniform(seed=seed_value), dropout=0.5, recurrent_dropout=0.5),
      Dropout(rate=0.5, seed=seed_value),
      Dense(1, activation='sigmoid'),
  ])

In [None]:
# Build and compile the model inside the scope of `strategy`
# (defined earlier in the notebook; presumably a tf.distribute strategy - confirm)
with strategy.scope():
    model = create_model()
    model.compile(
        loss="binary_crossentropy",
        optimizer=Nadam(clipnorm=1.0),  # clip each gradient to a norm of at most 1.0
        metrics=["binary_accuracy"],
        # 338 = ceil(172824 / 512), i.e. the number of batches per epoch for the
        # sub-training set and batch size used below - presumably one epoch per execution call
        steps_per_execution=338,
    )

In [None]:
# Show summary of the model (layers, output shapes and parameter counts)
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 300)         189413100 
_________________________________________________________________
gru_1 (GRU)                  (None, 256)               428544    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 189,841,901
Trainable params: 428,801
Non-trainable params: 189,413,100
_________________________________________________________________


In [None]:
# Train on the sub-training split and validate on the validation split.
# Training stops once validation accuracy has not improved for 7 epochs;
# the weights of the best epoch are restored afterwards.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    patience=7,
    mode="max",
    restore_best_weights=True,
)
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy] for the model compiled above; index 1 is the accuracy
train_scores = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
val_scores = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_scores[1]))
print("Validation Accuracy: {:.3f}".format(val_scores[1]))

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Training Accuracy: 0.819
Validation Accuracy: 0.789


Result:
- 81.9% vs. 78.9%
-> i.e. using 300-dimensional embeddings minimally increased model performance
-> since 300-dimensional embeddings also worked better with CNNs, switch back again to 300 dimensions

#### e) Try Using Different Embeddings

##### Data Preparation:

In [None]:
# Extract the target column and build one text per campaign by joining
# all free-text columns with single spaces
text_columns = ["title", "blurb", "story", "risks", "creator_bio", "reward_description"]
y = kickstarter_df["campaign_successful"].to_numpy()
text = kickstarter_df[text_columns[0]]
for column in text_columns[1:]:
  text = text + " " + kickstarter_df[column]

# Sanity check: container types and number of samples of target and text
for value in (type(y), y.shape, type(text), len(text)):
  print(value)

<class 'numpy.ndarray'>
(246891,)
<class 'pandas.core.series.Series'>
246891


In [None]:
# Remove stopwords and periods
# (only "." is stripped here; any other punctuation is left in the text)
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# Remove periods as a literal string. The original call passed the regex r"\." without
# stating `regex=`; from pandas 2.0 on the default is regex=False, so the pattern would
# be searched literally (backslash + dot) and no period would ever be removed.
text = text.str.replace(".", "", regex=False)
# NOTE(review): the stopword lookup is case-sensitive because tokens are not lower-cased
# before the membership test, so capitalized stopwords are kept - confirm this is intended.
text = text.progress_apply(lambda x: " ".join([token for token in x.split() if token not in stopwords]))
# Downstream cells (train_test_split, Tokenizer) receive a plain NumPy array of strings
text = text.to_numpy()

In [None]:
# Split dataset into training, subtraining, validation, and test set
# (70 % sub-training, 15 % validation, remaining rows test)
n_rows = kickstarter_df.shape[0]
train_size = round(n_rows*0.7)
val_size = round(n_rows*0.15)
test_size = n_rows - val_size - train_size

# Step 1: carve off the test set; "train" = sub-training + validation rows
text_train, text_test, y_train, y_test = train_test_split(
    text,
    y,
    train_size=(train_size+val_size),
    test_size=test_size,
    shuffle=True,
    stratify=y,
    random_state=seed_value,
)
# Step 2: split the remaining rows into sub-training and validation; both steps are stratified on the target
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
    text_train,
    y_train,
    train_size=train_size,
    test_size=val_size,
    shuffle=True,
    stratify=y_train,
    random_state=seed_value,
)

# Report the size of every split (the X_* labels refer to the raw text arrays at this point)
print("Shape of X_train: {}".format(len(text_train)))
print("Shape of y_train: {}".format(y_train.shape))
print("Shape of X_subtrain: {}".format(len(text_subtrain)))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val: {}".format(len(text_val)))
print("Shape of y_val: {}".format(y_val.shape))
print("Shape of X_test: {}".format(len(text_test)))
print("Shape of y_test: {}".format(y_test.shape))

Shape of X_train: 209858
Shape of y_train: (209858,)
Shape of X_subtrain: 172824
Shape of y_subtrain: (172824,)
Shape of X_val: 37034
Shape of y_val: (37034,)
Shape of X_test: 37033
Shape of y_test: (37033,)


In [None]:
# Fetch the pre-trained "glove-twitter-200" vectors through the gensim downloader
from gensim import downloader as api
glove = api.load("glove-twitter-200")

In [None]:
# Text-processing hyperparameters shared by the tokenizer, padding and Embedding layer
max_features = 100_000  # tokenizer vocabulary cap / number of rows of the embedding matrix
max_len = 1_000         # every token sequence is padded or truncated to this length
embedding_dim = 200     # length of each word vector

In [None]:
# Convert texts into integer sequences (tokenization).
# The vocabulary is fitted on the sub-training texts only and capped via num_words=max_features;
# validation texts are merely transformed with that vocabulary.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(text_subtrain)
X_subtrain, X_val = (tokenizer.texts_to_sequences(texts) for texts in (text_subtrain, text_val))
# word_index maps every token seen during fitting to its integer index
word_index = tokenizer.word_index
print("Included Token: {}".format(tokenizer.num_words))

Included Token: 100000


In [None]:
# Distribution of sequence lengths (tokens per text) in the sub-training set,
# used as guidance for choosing max_len
num_words = pd.DataFrame({"words": [len(sequence) for sequence in X_subtrain]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,502.025529
std,402.349536
min,14.0
25%,237.0
50%,393.0
75%,640.0
max,11478.0


In [None]:
# Bring every sequence to exactly max_len tokens:
# shorter sequences are padded at the front ("pre"), longer ones are cut at the end ("post")
pad_options = dict(maxlen=max_len, padding="pre", truncating="post")
X_subtrain = pad_sequences(X_subtrain, **pad_options)
X_val = pad_sequences(X_val, **pad_options)

print("Shape of X_subtrain: {}".format(X_subtrain.shape))
print("Shape of X_val: {}".format(X_val.shape))

Shape of X_subtrain: (172824, 1000)
Shape of X_val: (37034, 1000)


In [None]:
# Prepare the Embedding Matrix: row i holds the GloVe vector of the token with index i.
# Rows of tokens without a GloVe vector (and row 0, since word_index starts at 1) stay all-zero.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    # word_index holds every token seen during fitting; only indices inside the vocabulary cap are used
    if i >= max_features:
        continue
    # Membership is tested on the KeyedVectors object itself: `glove.vocab` was removed in
    # gensim 4.x, whereas `word in glove` works on both gensim 3.x and 4.x
    if word in glove:
        embedding_matrix[i] = glove.get_vector(word)
    else:
        words_not_found.append(word)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
# Rows whose components sum to exactly zero are counted as null embeddings (includes row 0)
print('Number of Null Word Embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (100000, 300)
Number of Null Word Embeddings: 41096


##### CNN:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled CNN text classifier on top of the frozen pretrained embeddings.

  Architecture:
    Embedding (non-trainable, initialised from embedding_matrix)
    -> 4 x [Conv1D(64, 5, relu, same) -> AvgPool1D(2) -> GaussianNoise -> LayerNormalization -> Dropout]
    -> Flatten -> Dense(32, relu) -> GaussianNoise -> LayerNormalization -> Dropout
    -> Dense(1, sigmoid)

  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value from the
  notebook namespace.

  Returns:
    The uncompiled Sequential model.
  """
  def add_regularisation(net):
    # Noise -> normalisation -> dropout, appended after every conv stage and after the dense layer
    net.add(GaussianNoise(stddev=0.1))
    net.add(LayerNormalization())
    net.add(Dropout(rate=0.5, seed=seed_value))

  cnn = Sequential()
  cnn.add(Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))

  # Four identical convolutional stages; each pooling step halves the sequence length
  for _ in range(4):
    cnn.add(Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=GlorotUniform(seed=seed_value)))
    cnn.add(AvgPool1D(2))
    add_regularisation(cnn)

  # Classification head
  cnn.add(Flatten())
  cnn.add(Dense(32, activation='relu', kernel_initializer=GlorotUniform(seed=seed_value)))
  add_regularisation(cnn)
  cnn.add(Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform(seed=seed_value)))
  return cnn

In [None]:
# Compile the model
# The model is created and compiled inside the distribution strategy's scope.
# NOTE(review): `strategy` is not defined in this section — it has to come from an earlier cell.
# NOTE(review): create_model is defined twice in this notebook (CNN and RNN variant); this cell
# builds whichever definition was executed last, so run the CNN definition cell right before it.
with strategy.scope():
  model = create_model()
  # NOTE(review): unlike the RNN compile cell, Nadam is used here without clipnorm — confirm this is intended
  model.compile(optimizer=Nadam(), loss="binary_crossentropy", metrics=["binary_accuracy"])

In [None]:
# Show summary of the model (layer types, output shapes and parameter counts)
model.summary()

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, 1000, 300)         30000000  
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 1000, 64)          96064     
_________________________________________________________________
average_pooling1d_27 (Averag (None, 500, 64)           0         
_________________________________________________________________
gaussian_noise_27 (GaussianN (None, 500, 64)           0         
_________________________________________________________________
layer_normalization_29 (Laye (None, 500, 64)           128       
_________________________________________________________________
dropout_42 (Dropout)         (None, 500, 64)           0         
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 500, 64)         

In [None]:
# Train the model with early stopping on validation accuracy, then report accuracy on both splits
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[
        # Stop after 7 epochs without a better val_binary_accuracy and restore the best weights
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            patience=7,
            mode="max",
            restore_best_weights=True,
        )
    ],
)
# evaluate() yields [loss, binary_accuracy] for the compiled model; index 1 is the accuracy
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Training Accuracy: 0.834
Validation Accuracy: 0.784


##### RNN:

In [None]:
# Define the model
def create_model():
  """Build the uncompiled GRU text classifier on top of the frozen pretrained embeddings.

  Architecture:
    Embedding (non-trainable, initialised from embedding_matrix)
    -> GRU(256) with input and recurrent dropout of 0.5
    -> Dropout(0.5) -> Dense(1, sigmoid)

  Reads max_features, embedding_dim, embedding_matrix, max_len and seed_value from the
  notebook namespace.

  Returns:
    The uncompiled Sequential model.
  """
  # NOTE(review): the output Dense layer is the only layer here without a seeded
  # kernel_initializer (the CNN variant seeds its output layer) — confirm this is intended.
  return Sequential([
    Embedding(max_features, embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_len, trainable=False),
    GRU(256, kernel_initializer=GlorotUniform(seed=seed_value), dropout=0.5, recurrent_dropout=0.5),
    Dropout(rate=0.5, seed=seed_value),
    Dense(1, activation='sigmoid'),
  ])

In [None]:
# Compile the model
# The model is created and compiled inside the distribution strategy's scope.
# NOTE(review): `strategy` is not defined in this section — it has to come from an earlier cell.
# NOTE(review): create_model is defined twice in this notebook (CNN and RNN variant); this cell
# builds whichever definition was executed last, so run the RNN definition cell right before it.
with strategy.scope():
    model = create_model()
    # clipnorm=1.0 clips gradients by norm for the recurrent model.
    # NOTE(review): 338 looks like ceil(172824 / 512), i.e. the number of training batches per
    # epoch at batch_size=512 — confirm, and keep it in sync if the split or batch size changes.
    model.compile(optimizer=Nadam(clipnorm=1.0), loss="binary_crossentropy", metrics=["binary_accuracy"], steps_per_execution=338)

In [None]:
# Show summary of the model (layer types, output shapes and parameter counts)
model.summary()

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 1000, 300)         30000000  
_________________________________________________________________
gru_13 (GRU)                 (None, 256)               428544    
_________________________________________________________________
dropout_47 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 1)                 257       
Total params: 30,428,801
Trainable params: 428,801
Non-trainable params: 30,000,000
_________________________________________________________________


In [None]:
# Train the model with early stopping on validation accuracy, then report accuracy on both splits
model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    shuffle=True,
    callbacks=[
        # Stop after 7 epochs without a better val_binary_accuracy and restore the best weights
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            patience=7,
            mode="max",
            restore_best_weights=True,
        )
    ],
)
# evaluate() yields [loss, binary_accuracy] for the compiled model; index 1 is the accuracy
print("Training Accuracy: {:.3f}".format(model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]))
print("Validation Accuracy: {:.3f}".format(model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Training Accuracy: 0.820
Validation Accuracy: 0.789


##### Results:

- FastText 300d (training vs. validation accuracy): CNN = 79.8% vs. 78.3%; RNN = 81.9% vs. 78.9%
- Glove Twitter 200d: CNN = 82.1% vs. 78.2%; RNN = 82% vs. 78.9%
- Glove Wikipedia + Gigaword 300d: CNN = 80.8% vs. 78.3%; RNN = 80.3% vs. 78.4%
- Word2Vec Google News 300d: CNN = 83.4% vs. 78.4%; RNN = 82% vs. 78.9%
- Conclusion: the choice of embedding did not make a large difference (validation accuracy ranged only from 78.2% to 78.9%)
- Decision: stick with FastText, as it provided the best trade-off between good accuracy and low overfitting

#### f) Best-Found Model: NN Text

- Preprocessing: Stopwords Removed; All Text Attributes; No max_features restrictions; max_len=1000; embedding dimension = 300; FastText
- CNN: 4 Conv Layers (64 Filters, Window Size 5, ReLU, padding=same); AveragePooling (Size 2); Gaussian Noise; Layer Normalization; Dropout (Rate=0.5); 1 Dense Layer (32 Units)
- RNN: 1 GRU Layer (256 Units); Input/Output/Recurrent Dropout (Rate=0.5)
- Optimization: GlorotUniform; Nadam; clipnorm=1.0; Batch Size: 512