#### Setup:

In [None]:
# Install or upgrade TensorFlow and the standalone Keras package in the notebook environment.
# NOTE(review): the versions are unpinned, so a fresh run may install releases newer than
# the TensorFlow 2.4.1 shown in the recorded output of this notebook — consider pinning.
!pip install -U tensorflow keras

In [None]:
# Import general Python libraries
import pandas as pd
import numpy as np
import random
import sklearn
import seaborn as sns
import os
import matplotlib.pyplot as plt
import itertools

In [None]:
# Use one shared seed for every source of randomness in this notebook
seed_value = 0

# Environment values must be text, hence the str() conversion.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# presumably only affects child processes, not the already running kernel — confirm.
os.environ.update(PYTHONHASHSEED=str(seed_value))

# Seed the standard-library generator and NumPy's global generator
random.seed(seed_value)
np.random.seed(seed_value)

In [None]:
# Import sklearn-specific modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer, ENGLISH_STOP_WORDS

In [None]:
# Import tensorflow-specific modules
import tensorflow as tf
# Seed TensorFlow's global random state with the notebook-wide seed
tf.random.set_seed(seed_value)

# Report the framework versions in use (TensorFlow first, then tf.keras)
version_report = [
  "Tensorflow Version: {}".format(tf.__version__),
  "Keras Version: {}".format(tf.keras.__version__),
]
print("\n".join(version_report))

Tensorflow Version: 2.4.1
Keras Version: 2.4.0


In [None]:
# Import keras-specific modules
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adadelta, RMSprop, Adam, Adamax, Nadam
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.initializers import GlorotNormal, GlorotUniform, HeNormal, HeUniform, LecunNormal, LecunUniform
from tensorflow.keras.metrics import AUC, Precision, Recall

In [None]:
# Do not cap the number of columns shown when a DataFrame is displayed
pd.options.display.max_columns = None

In [None]:
# Extend sklearn's English stop words with three additional terms
stopwords = [*ENGLISH_STOP_WORDS, "kickstarter", "year", "dollar"]

# Display the size of the resulting list
len(stopwords)

321

In [None]:
# Mount Google Drive
# Attaches the Drive at the mount point /content/drive. Requires the google.colab
# package, so this cell only works inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### a) Prepare Dataset

In [None]:
# Load the Kickstarter text dataset; the first CSV column is used as the index.
# NOTE(review): the path is relative, so this presumably relies on the working directory
# pointing at the project folder (no directory change is visible in this notebook) — confirm.
kickstarter_df = pd.read_csv("04_Final Datasets/Kickstarter_Text.csv", index_col=0)

# Print the frame's shape, then the number of distinct index values, one per line
print(kickstarter_df.shape, kickstarter_df.index.unique().size, sep="\n")

# Show the first row as a sample
kickstarter_df.head(1)

(246891, 7)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...


In [None]:
# Pull the target column and the story text out of the frame as NumPy arrays
y, text = (kickstarter_df[column].to_numpy() for column in ("campaign_successful", "story"))

# Print type and shape of each array (target first, then text)
for array in (y, text):
  print(type(array))
  print(array.shape)

<class 'numpy.ndarray'>
(246891,)
<class 'numpy.ndarray'>
(246891,)


In [None]:
# Absolute split sizes: 70% subtraining, 15% validation, the remainder is the test set
n_rows = kickstarter_df.shape[0]
train_size = round(n_rows * 0.7)
val_size = round(n_rows * 0.15)
test_size = n_rows - val_size - train_size

# Both splits are shuffled and use the same fixed seed
split_options = dict(shuffle=True, random_state=seed_value)

# Step 1: separate the test set from the (subtraining + validation) pool, stratified on the target
text_train, text_test, y_train, y_test = train_test_split(
  text, y, train_size=train_size + val_size, test_size=test_size, stratify=y, **split_options)

# Step 2: divide the pool into subtraining and validation, again stratified
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
  text_train, y_train, train_size=train_size, test_size=val_size, stratify=y_train, **split_options)

# Print the shape of every resulting array
shape_report = (
  ("Shape of X_train: {}", text_train),
  ("Shape of y_train: {}", y_train),
  ("Shape of X_subtrain: {}", text_subtrain),
  ("Shape of y_subtrain: {}", y_subtrain),
  ("Shape of X_val: {}", text_val),
  ("Shape of y_val: {}", y_val),
  ("Shape of X_test: {}", text_test),
  ("Shape of y_test: {}", y_test),
)
for template, array in shape_report:
  print(template.format(array.shape))

Shape of X_train: (209858,)
Shape of y_train: (209858,)
Shape of X_subtrain: (172824,)
Shape of y_subtrain: (172824,)
Shape of X_val: (37034,)
Shape of y_val: (37034,)
Shape of X_test: (37033,)
Shape of y_test: (37033,)


#### b) Baseline

This model provides the baseline, which is used as the starting point for the hyperparameter search. The goal is to gradually improve the model performance.

**Hyperparameters:**
- CountVectorizer; No Limits; English Stop Words; No N-grams
- Batch_Size = 512
- No Regularization
- Weight Initializer = GlorotNormal
- Optimizer = RMSProp_centered

In [None]:
# Fit a word-level bag-of-words vocabulary on the subtraining texts only
# (unigrams, custom stop words, no document-frequency limits, raw counts as float32)
vectorizer = CountVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  binary=False,
  dtype=np.float32,
).fit(text_subtrain)

# Encode the subtraining and validation texts with the fitted vocabulary
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))

# Print shape, container type and dtype of both matrices
matrix_report = (
  ("Shape of X_subtrain: {}", (X_subtrain.shape,)),
  ("Datatype of X_subtrain: {}, {}", (type(X_subtrain), X_subtrain.dtype)),
  ("Shape of X_val: {}", (X_val.shape,)),
  ("Datatype of X_val: {}, {}", (type(X_val), X_val.dtype)),
)
for template, values in matrix_report:
  print(template.format(*values))

Shape of X_subtrain: (172824, 431211)
Datatype of X_subtrain: <class 'scipy.sparse.csr.csr_matrix'>, float32
Shape of X_val: (37034, 431211)
Datatype of X_val: <class 'scipy.sparse.csr.csr_matrix'>, float32


In [None]:
# Stop once validation accuracy has not improved for 5 epochs; configured to restore the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Baseline: a single sigmoid unit on the sparse bag-of-words input
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(centered=True), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Train for at most 100 epochs, validating on the held-out validation split
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.799
Validation Accuracy: 0.758


Result: model starts to overfit very quickly

#### c) Hyperparameter Tuning

##### Increase the patience to see whether the model was merely stuck on a plateau:

In [None]:
# Same setup as the baseline, but early stopping waits 10 epochs without improvement
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=10, restore_best_weights=True, verbose=0)

# Single sigmoid unit on the sparse input, Glorot-normal initial weights
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(centered=True), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Train for at most 100 epochs with validation after each epoch
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Training Accuracy: 0.782
Validation Accuracy: 0.757


Result: model was not stuck in a plateau

##### Tune different regularizers:

In [None]:
# Maps the penalty name to a constructor taking the regularization rate
make_regularizer = {
  "L1": lambda rate: L1(l1=rate),
  "L2": lambda rate: L2(l2=rate),
}

# Every combination of penalty type and rate; each round trains a fresh model
counter = 0
param_grid = itertools.product(["L1", "L2"], [0.001, 0.01, 0.1, 0, 1, 10, 100])
for counter, (regularizer, reg_rate) in enumerate(param_grid, start=1):
  print("Round: {}, Params: {} {}".format(counter, regularizer, reg_rate))
  reg = make_regularizer[regularizer](reg_rate)

  # New early-stopping callback per round (patience of 5 epochs on validation accuracy)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

  # Baseline architecture with the kernel penalty under test
  model = Sequential([
    Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
    Dense(1, activation="sigmoid", kernel_regularizer=reg, kernel_initializer=GlorotNormal(seed=seed_value)),
  ])
  model.compile(optimizer=RMSprop(centered=True), loss="binary_crossentropy", metrics=["binary_accuracy"])
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1,
    callbacks=[early_stopping])

  # evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
  train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: L1 0.001
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Training Accuracy: 0.742
Validation Accuracy: 0.738
Round: 2, Params: L1 0.01
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Training Accuracy: 0.691
Validation Accuracy: 0.691
Round: 3, Params: L1 0.1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Training Accuracy: 0.650
Validation Accuracy: 0.650
Round: 4, Params: L1 0
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Training Accuracy: 0.784
Validation Accuracy: 0.758
Round: 5, Params: L1 1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/10

Result: 
- only very low regularization didn't lead to a decrease in performance
- L2 regularization performed better than L1
- in general: regularization didn't help to decrease overfitting, but rather decreased both training and validation accuracy (probably the dimensionality is just too high)

##### Tune different optimizers:

In [None]:
# Maps each candidate name to a zero-argument factory so every round gets a new optimizer instance
optimizer_factories = {
  "RMSprop": RMSprop,
  "RMSprop_centered": lambda: RMSprop(centered=True),
  "Adam": Adam,
  "Adam_amsgrad": lambda: Adam(amsgrad=True),
  "Adamax": Adamax,
  "Nadam": Nadam,
}

# Train the baseline architecture once per optimizer (dict order = evaluation order)
counter = 0
for counter, optimizer in enumerate(optimizer_factories, start=1):
  print("Round: {}, Params: {}".format(counter, optimizer))
  opt = optimizer_factories[optimizer]()

  # New early-stopping callback per round (patience of 5 epochs on validation accuracy)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

  model = Sequential([
    Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
    Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)),
  ])
  model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["binary_accuracy"])
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1,
    callbacks=[early_stopping])

  # evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
  train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: RMSprop
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.800
Validation Accuracy: 0.764
Round: 2, Params: RMSprop_centered
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.798
Validation Accuracy: 0.758
Round: 3, Params: Adam
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.801
Validation Accuracy: 0.763
Round: 4, Params: Adam_amsgrad
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.824
Validation Accuracy: 0.760
Round: 5, Params: Adamax
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.775
Validation Accuracy: 0.758
Round: 6, Params: Nadam
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.801
Validation Accuracy: 0.762


Result:
- RMSprop led to the best result and the least overfitting (i.e. use RMSprop)

##### Tune different weight initializers:

In [None]:
# Maps each candidate name to its initializer class; all are built with the shared seed
initializer_classes = {
  "glorot_normal": GlorotNormal,
  "glorot_uniform": GlorotUniform,
  "he_normal": HeNormal,
  "he_uniform": HeUniform,
  "lecun_normal": LecunNormal,
  "lecun_uniform": LecunUniform,
}

# Train the model once per weight initializer, using plain RMSprop
counter = 0
for counter, initializer in enumerate(initializer_classes, start=1):
  print("Round: {}, Params: {}".format(counter, initializer))
  init = initializer_classes[initializer](seed=seed_value)

  # New early-stopping callback per round (patience of 5 epochs on validation accuracy)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

  model = Sequential([
    Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
    Dense(1, activation="sigmoid", kernel_initializer=init),
  ])
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512, epochs=100, verbose=1,
    callbacks=[early_stopping])

  # evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
  train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: glorot_normal
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.787
Validation Accuracy: 0.761
Round: 2, Params: glorot_uniform
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.785
Validation Accuracy: 0.760
Round: 3, Params: he_normal
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.788
Validation Accuracy: 0.764
Round: 4, Params: he_uniform
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.788
Validation Accuracy: 0.764
Round: 5, Params: lecun_normal
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.788
Validation Accuracy: 0.763
Round: 6, Params: lecun_uniform
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.799
Validation Accuracy: 0.762


Result:
- normal distributions worked a little bit better
- He was the best; Lecun the second-best
- however, the differences were only marginal

##### Tune different batch sizes:

In [None]:
# Train the model once per batch size; the same value is used for the input layer, fit and evaluate
counter = 0
for counter, batch_size in enumerate([16, 32, 64, 128, 256, 512, 1024], start=1):
  print("Round: {}, Params: {}".format(counter, batch_size))

  # New early-stopping callback per round (patience of 5 epochs on validation accuracy)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

  # Single sigmoid unit with He-normal initial weights, optimized with RMSprop
  model = Sequential([
    Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=batch_size),
    Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
  ])
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=batch_size, epochs=100, verbose=1,
    callbacks=[early_stopping])

  # evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
  train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=batch_size, verbose=0)
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=batch_size, verbose=0)
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: 16
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.767
Validation Accuracy: 0.744
Round: 2, Params: 32
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.775
Validation Accuracy: 0.751
Round: 3, Params: 64
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.782
Validation Accuracy: 0.755
Round: 4, Params: 128
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.786
Validation Accuracy: 0.759
Round: 5, Params: 256
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.784
Validation Accuracy: 0.759
Round: 6, Params: 512
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.799
Validation Accuracy: 0.761
Round: 7, Params: 1024
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training

Result: larger batch sizes led to the best results

##### Test different max_df variations:

In [None]:
# Count-based bag of words with an upper document-frequency limit (max_df=0.0001), fitted on subtraining only
vectorizer = CountVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=0.0001,
  binary=False,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary and report its size
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 388183
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Training Accuracy: 0.793
Validation Accuracy: 0.679


Results:
- max_df=0.01 -> almost no tokens were dropped -> reduce max_df further
- max_df=0.0001 -> reduced the number of tokens to some extent, but the accuracy dropped significantly
- i.e. tuning max_df is the wrong approach

##### Tune different min_df variations:

In [None]:
# Count-based bag of words with a lower document-frequency limit (min_df=0.0001), fitted on subtraining only
vectorizer = CountVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0001,
  max_df=1.0,
  binary=False,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary and report its size
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 43028
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.783
Validation Accuracy: 0.764


Results:
- min_df=0.0001 (17 documents) -> 43,000 tokens -> 76.4% accuracy (a little bit more overfitting)
- min_df=0.001 (172 documents) -> 13,600 tokens -> 76.1% accuracy (way less overfitting)
- min_df=0.01 (1720 documents) -> 3,000 tokens -> 75.1% accuracy (nearly no overfitting)
- min_df=0.1 (17,200 documents) -> 200 tokens -> 71% accuracy
- i.e. tune the vectorizer so that the vocabulary size is in the range of 10,000 - 50,000 tokens


##### Tune different max_features values:

In [None]:
# Count-based bag of words capped at 45,000 features, fitted on subtraining only
vectorizer = CountVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=45000,
  binary=False,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary and report its size
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 45000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.782
Validation Accuracy: 0.763


Results (training vs. validation accuracy):
- max_features=10,000 -> 77.7% vs. 76%
- max_features=20,000 -> 77.8% vs. 76.1%
- max_features=30,000 -> 78.1% vs. 76.3%
- max_features=35,000 -> 78.2% vs. 76.4%
- max_features=40,000 -> 78.4% vs. 76.4%
- max_features=45,000 -> 78.2% vs. 76.3%
- max_features=50,000 -> 78.4% vs. 76.3%

##### See if different types of vectorizers help:

In [None]:
# Binary presence/absence features (binary=True) instead of raw counts, capped at 35,000 features
vectorizer = CountVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=35000,
  binary=True,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary and report its size
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 35000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Training Accuracy: 0.789
Validation Accuracy: 0.766


In [None]:
# Tf-idf weighted features instead of raw counts, capped at 35,000 features
vectorizer = TfidfVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=35000,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))

# Sort the sparse indices in place (presumably required by the sparse Keras input — confirm)
for matrix in (X_subtrain, X_val):
  matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 35000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Training Accuracy: 0.798
Validation Accuracy: 0.767


Result: Tfidf-rescaling looks very promising

In [None]:
# Tf-idf features with sublinear term-frequency scaling (sublinear_tf=True), capped at 35,000 features
vectorizer = TfidfVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=35000,
  sublinear_tf=True,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))

# Sort the sparse indices in place (presumably required by the sparse Keras input — confirm)
for matrix in (X_subtrain, X_val):
  matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 35000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Training Accuracy: 0.798
Validation Accuracy: 0.769


Result: adding sublinear TF helped to increase performance

In [None]:
# Tf-idf features computed from binary occurrence (binary=True), capped at 35,000 features
vectorizer = TfidfVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=35000,
  binary=True,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))

# Sort the sparse indices in place (presumably required by the sparse Keras input — confirm)
for matrix in (X_subtrain, X_val):
  matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 35000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Training Accuracy: 0.804
Validation Accuracy: 0.769


Result: did not help to improve performance

In [None]:
# Tf-idf features with sublinear term frequency and L1 row normalization (norm="l1"), capped at 35,000 features
vectorizer = TfidfVectorizer(
  analyzer="word",
  ngram_range=(1,1),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=35000,
  sublinear_tf=True,
  norm="l1",
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))

# Sort the sparse indices in place (presumably required by the sparse Keras input — confirm)
for matrix in (X_subtrain, X_val):
  matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# This experiment allows up to 200 epochs instead of 100
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=200, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 35000
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 7

Result: using L1 norm extremely prolonged the training time and did not lead to a better result

Results:
- using Tfidf instead of count-based extremely increased model performance (model did not overfit anymore and was able to learn)
- also adding sublinear_tf helped to further increase model performance

##### Try to see if adding n-grams helps:

In [None]:
# Count-based features as in the baseline, but with unigrams and bigrams, capped at 100,000 features
vectorizer = CountVectorizer(
  analyzer="word",
  ngram_range=(1,2),
  stop_words=stopwords,
  min_df=0.0,
  max_df=1.0,
  max_features=100000,
  binary=False,
  dtype=np.float32,
).fit(text_subtrain)

# Encode both splits with the fitted vocabulary and report its size
X_subtrain, X_val = (vectorizer.transform(texts) for texts in (text_subtrain, text_val))
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# Early stopping on validation accuracy with a patience of 5 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_binary_accuracy', mode='max', patience=5, restore_best_weights=True, verbose=0)

# Single sigmoid unit with He-normal initial weights, optimized with RMSprop
model = Sequential([
  Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
  Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)),
])
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512, epochs=100, verbose=1,
  callbacks=[early_stopping])

# evaluate() yields [loss, binary_accuracy]; only the accuracy is reported
train_loss, train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_loss, val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 100000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.803
Validation Accuracy: 0.769


Result: adding 2-grams definitely helps to increase accuracy, but the model still overfits very quickly -> switch to Tfidf

In [None]:
# Same experiment with Tfidf weighting and sub-linear term frequency (unigrams + 2-grams)
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  min_df=0.0,
  max_df=1.0,
  max_features=100000,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# One sigmoid unit on the sparse document-term matrix (logistic regression)
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 100000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Training Accuracy: 0.828
Validation Accuracy: 0.777


Result: adding n-grams helped to increase model performance

In [None]:
# Hashing vectorizer covering 1- to 3-grams, hashed into 100,000 feature columns
vectorizer = HashingVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 3),
  analyzer="word",
  n_features=100000,
  dtype=np.float32,
)
# fit() is kept for symmetry with the other cells (the hashing vectorizer is stateless)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()

# One sigmoid unit on the sparse matrix; note this cell uses HeUniform, not HeNormal
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=HeUniform(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Training Accuracy: 0.813
Validation Accuracy: 0.761


Result: adding 3-grams was too much (i.e. stick to 2-grams)

##### Fine-Tune Number of Included 2-Grams:

In [None]:
# Fine-tune min_df: vocabulary size is controlled by the minimum document
# frequency only (no max_features cap in this cell)
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  min_df=0.0001,
  max_df=1.0,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# One sigmoid unit on the sparse document-term matrix (logistic regression)
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 264282
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Training Accuracy: 0.856
Validation Accuracy: 0.779


Results:
- min_df=0.0 -> 0 documents -> 15,000,000 tokens -> too high-dimensional; overfits strongly; extremely slow learning -> 94.1% vs. 78.1%
- min_df=0.1 -> 17,200 documents -> 200 tokens -> bad performance -> 72% vs. 71.7%
- min_df=0.01 -> 1,720 documents -> 3,000 tokens -> better performance, but still potential to increase the number of tokens -> 77% vs. 76.2%
- min_df=0.001 -> 172 documents -> 25,000 tokens -> 79.9% vs. 77.4%
- min_df=0.0001 -> 17 documents -> 250,000 tokens -> overfitted too much -> 85.6% vs. 77.9%
- optimal number lies somewhere between 50,000 and 200,000 tokens

In [None]:
# Fine-tune max_features: cap the Tfidf vocabulary directly instead of using min_df
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  max_features=150000,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

# One sigmoid unit on the sparse document-term matrix (logistic regression)
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Tokens: 150000
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Training Accuracy: 0.840
Validation Accuracy: 0.778


Results:
- max_features = 25,000 -> 79.4% vs. 77.3%
- max_features = 50,000 -> 80.9% vs. 77.4%
- max_features = 75,000 -> 82.2% vs. 77.6%
- max_features = 100,000 -> 82.9% vs. 77.7%
- max_features = 125,000 -> 82.8% vs. 77.7%
- max_features = 150,000 -> 84% vs. 77.8%
- i.e. max_features = 75,000 provides the best trade-off between good accuracy and low overfitting

In [None]:
# Final data transformation with the best settings found above
# (Tfidf, unigrams + 2-grams, 75,000 features, sub-linear term frequency)
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  max_features=75000,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

Tokens: 75000


In [None]:
# Logistic regression (one sigmoid unit) on the final Tfidf matrix
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Training Accuracy: 0.822
Validation Accuracy: 0.776


##### Fine-Tune Logistic Regression Based On Tfidf-Dataset with 2-Grams

In [None]:
# Fine-tune regularizers: grid over penalty type (L1 / L2) and penalty rate
regularizer_names = ["L1", "L2"]
reg_rates = [0.0001, 0.001, 0.01, 0.1, 0, 1, 10]
for counter, (regularizer, reg_rate) in enumerate(itertools.product(regularizer_names, reg_rates), start=1):
  print("Round: {}, Params: {} {}".format(counter, regularizer, reg_rate))
  # Only "L1" and "L2" occur in the grid, so everything that is not L1 is L2
  reg = L1(l1=reg_rate) if regularizer == "L1" else L2(l2=reg_rate)

  # Same logistic regression as before, with the kernel penalty added
  model = Sequential()
  model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
  model.add(Dense(1, activation="sigmoid", kernel_regularizer=reg, kernel_initializer=HeNormal(seed=seed_value)))
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

  # Fresh early-stopping callback per round (5 epochs patience, best weights restored)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
  )
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
  )

  # evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
  train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: L1 0.0001
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Training Accuracy: 0.732
Validation Accuracy: 0.728
Round: 2, Params: L1 0.001
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.650
Validation Accuracy: 0.650
Round: 3, Params: L1 0.01
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.650
Validation Accuracy: 0.650
Round: 4, Params: L1 0.1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Training Accuracy: 0.650
Validation A

Result: regularization led to a significant drop in model performance (i.e. don't use)

In [None]:
# Tune different optimizers: each entry pairs the printed label with a factory,
# so a fresh optimizer instance is created for every round
optimizer_factories = [
  ("RMSprop", lambda: RMSprop()),
  ("RMSprop_centered", lambda: RMSprop(centered=True)),
  ("Adam", lambda: Adam()),
  ("Adam_amsgrad", lambda: Adam(amsgrad=True)),
  ("Adamax", lambda: Adamax()),
  ("Nadam", lambda: Nadam()),
]
for counter, (optimizer, make_optimizer) in enumerate(optimizer_factories, start=1):
  print("Round: {}, Params: {}".format(counter, optimizer))
  opt = make_optimizer()

  # Same logistic regression as before; only the optimizer changes
  model = Sequential()
  model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
  model.add(Dense(1, activation="sigmoid", kernel_initializer=HeNormal(seed=seed_value)))
  model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["binary_accuracy"])

  # Fresh early-stopping callback per round (5 epochs patience, best weights restored)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
  )
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
  )

  # evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
  train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: RMSprop
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Training Accuracy: 0.823
Validation Accuracy: 0.776
Round: 2, Params: RMSprop_centered
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Training Accuracy: 0.821
Validation Accuracy: 0.776
Round: 3, Params: Adam
Epoch 1/100
Epoch 2/100


Result: stick with RMSprop, but differences were marginal

In [None]:
# Fine-tune different weight initializers: label -> initializer class,
# every class is instantiated with the same seed
initializer_classes = [
  ("glorot_normal", GlorotNormal),
  ("glorot_uniform", GlorotUniform),
  ("he_normal", HeNormal),
  ("he_uniform", HeUniform),
  ("lecun_normal", LecunNormal),
  ("lecun_uniform", LecunUniform),
]
for counter, (initializer, initializer_class) in enumerate(initializer_classes, start=1):
  print("Round: {}, Params: {}".format(counter, initializer))
  init = initializer_class(seed=seed_value)

  # Same logistic regression as before; only the kernel initializer changes
  model = Sequential()
  model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
  model.add(Dense(1, activation="sigmoid", kernel_initializer=init))
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

  # Fresh early-stopping callback per round (5 epochs patience, best weights restored)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
  )
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
  )

  # evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
  train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: glorot_normal
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Training Accuracy: 0.820
Validation Accuracy: 0.776
Round: 2, Params: glorot_uniform
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Training Accuracy: 0.821
Validation Accuracy: 0.776
Round: 3, Params: he_normal
Ep

Result: Glorot Normal performed the best, but differences were marginal

In [None]:
# Fine-tune different batch sizes; the same value is used for the input layer,
# for training and for both evaluations
batch_sizes = [16, 32, 64, 128, 256, 512, 1024]
for counter, batch_size in enumerate(batch_sizes, start=1):
  print("Round: {}, Params: {}".format(counter, batch_size))

  # Logistic regression with the Glorot Normal initializer
  model = Sequential()
  model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=batch_size))
  model.add(Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)))
  model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

  # Fresh early-stopping callback per round (5 epochs patience, best weights restored)
  early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
  )
  history = model.fit(
    X_subtrain, y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
  )

  # evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
  train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=batch_size, verbose=0)[1]
  print("Training Accuracy: {:.3f}".format(train_accuracy))
  val_accuracy = model.evaluate(X_val, y_val, batch_size=batch_size, verbose=0)[1]
  print("Validation Accuracy: {:.3f}".format(val_accuracy))

Round: 1, Params: 16
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Training Accuracy: 0.775
Validation Accuracy: 0.764
Round: 2, Params: 32
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Training Accuracy: 0.787
Validation Accuracy: 0.768
Round: 3, Params: 64
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Training Accuracy: 0.794
Validation Accuracy: 0.772
Round: 4, Params: 128
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Training Accuracy: 0.804
Validation Accuracy: 0.773
Round: 5, Params: 256
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
E

Result: batch size of 512 was the best

##### Test If Adding Additional Text Attributes Can Increase Model Performance:

###### 1. Include Title + Blurb

In [None]:
# Reload the text dataset; the first CSV column becomes the index
dataset_path = "04_Final Datasets/Kickstarter_Text.csv"
kickstarter_df = pd.read_csv(dataset_path, index_col=0)
# Shape and number of distinct index values (equal when every index value is unique)
unique_index_values = kickstarter_df.index.unique()
print(kickstarter_df.shape)
print(len(unique_index_values))
kickstarter_df.head(1)

(246891, 7)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...


In [None]:
# Merge Title + Blurb + Story into one document per campaign.
# Missing fields are replaced by "" first: with plain `+`, a single NaN in any of
# the columns turns the whole merged text into NaN, which the text vectorizer
# used later cannot process. Rows without missing values are unaffected.
text_columns = ["title", "blurb", "story"]
text_parts = kickstarter_df[text_columns].fillna("")
text_df = text_parts["title"] + " " + text_parts["blurb"] + " " + text_parts["story"]
text_df.head()

22821161    sentio golf putters. feel is the difference ch...
22823613    brainade we intend to create an online platfor...
22835897    the alphabet story children s book a richly il...
22845619    the gears lp from scratch. with you. after a y...
22848517    polynesian adventure the board game polynesian...
dtype: object

In [None]:
# Convert target variable and merged text to NumPy arrays
y = kickstarter_df["campaign_successful"].to_numpy()
text = text_df.to_numpy()

# Sanity check: type and shape of each array (target first, then text)
for array in (y, text):
  print(type(array))
  print(array.shape)

<class 'numpy.ndarray'>
(246891,)
<class 'numpy.ndarray'>
(246891,)


In [None]:
# Split dataset into training (= subtraining + validation), subtraining, validation, and test set
# Sizes: 70% subtraining, 15% validation, remainder test
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7 * 1)
val_size = round(n_samples * 0.15 * 1)
test_size = round(n_samples * 1) - val_size - train_size

# First split off the test set, then divide the rest into subtraining and validation;
# both splits are shuffled, stratified on the target and seeded
text_train, text_test, y_train, y_test = train_test_split(
  text, y,
  train_size=(train_size + val_size),
  test_size=test_size,
  shuffle=True,
  stratify=y,
  random_state=seed_value,
)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
  text_train, y_train,
  train_size=train_size,
  test_size=val_size,
  shuffle=True,
  stratify=y_train,
  random_state=seed_value,
)

# Report the shape of every split
shape_reports = [
  ("Shape of X_train: {}", text_train),
  ("Shape of y_train: {}", y_train),
  ("Shape of X_subtrain: {}", text_subtrain),
  ("Shape of y_subtrain: {}", y_subtrain),
  ("Shape of X_val: {}", text_val),
  ("Shape of y_val: {}", y_val),
  ("Shape of X_test: {}", text_test),
  ("Shape of y_test: {}", y_test),
]
for message, array in shape_reports:
  print(message.format(array.shape))

Shape of X_train: (209858,)
Shape of y_train: (209858,)
Shape of X_subtrain: (172824,)
Shape of y_subtrain: (172824,)
Shape of X_val: (37034,)
Shape of y_val: (37034,)
Shape of X_test: (37033,)
Shape of y_test: (37033,)


In [None]:
# Apply the best data transformation identified in the hyperparameter search
# (Tfidf, unigrams + 2-grams, 75,000 features, sub-linear term frequency)
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  max_features=75000,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

Tokens: 75000


In [None]:
# Evaluate with the best model identified in the hyperparameter search
# (logistic regression, Glorot Normal initializer, RMSprop, batch size 512)
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Training Accuracy: 0.827
Validation Accuracy: 0.778


Result: Including Title + Blurb increased model performance

###### 2. Include Title + Blurb + Risks

In [None]:
# Reload the text dataset; the first CSV column becomes the index
dataset_path = "04_Final Datasets/Kickstarter_Text.csv"
kickstarter_df = pd.read_csv(dataset_path, index_col=0)
# Shape and number of distinct index values (equal when every index value is unique)
unique_index_values = kickstarter_df.index.unique()
print(kickstarter_df.shape)
print(len(unique_index_values))
kickstarter_df.head(1)

(246891, 7)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...


In [None]:
# Merge Title + Blurb + Story + Risks into one document per campaign.
# Missing fields are replaced by "" first: with plain `+`, a single NaN in any of
# the columns turns the whole merged text into NaN, which the text vectorizer
# used later cannot process. Rows without missing values are unaffected.
text_columns = ["title", "blurb", "story", "risks"]
text_parts = kickstarter_df[text_columns].fillna("")
text_df = text_parts["title"] + " " + text_parts["blurb"] + " " + text_parts["story"] + " " + text_parts["risks"]
text_df.head()

22821161    sentio golf putters. feel is the difference ch...
22823613    brainade we intend to create an online platfor...
22835897    the alphabet story children s book a richly il...
22845619    the gears lp from scratch. with you. after a y...
22848517    polynesian adventure the board game polynesian...
dtype: object

In [None]:
# Convert target variable and merged text to NumPy arrays
y = kickstarter_df["campaign_successful"].to_numpy()
text = text_df.to_numpy()

# Sanity check: type and shape of each array (target first, then text)
for array in (y, text):
  print(type(array))
  print(array.shape)

<class 'numpy.ndarray'>
(246891,)
<class 'numpy.ndarray'>
(246891,)


In [None]:
# Split dataset into training (= subtraining + validation), subtraining, validation, and test set
# Sizes: 70% subtraining, 15% validation, remainder test
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7 * 1)
val_size = round(n_samples * 0.15 * 1)
test_size = round(n_samples * 1) - val_size - train_size

# First split off the test set, then divide the rest into subtraining and validation;
# both splits are shuffled, stratified on the target and seeded
text_train, text_test, y_train, y_test = train_test_split(
  text, y,
  train_size=(train_size + val_size),
  test_size=test_size,
  shuffle=True,
  stratify=y,
  random_state=seed_value,
)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
  text_train, y_train,
  train_size=train_size,
  test_size=val_size,
  shuffle=True,
  stratify=y_train,
  random_state=seed_value,
)

# Report the shape of every split
shape_reports = [
  ("Shape of X_train: {}", text_train),
  ("Shape of y_train: {}", y_train),
  ("Shape of X_subtrain: {}", text_subtrain),
  ("Shape of y_subtrain: {}", y_subtrain),
  ("Shape of X_val: {}", text_val),
  ("Shape of y_val: {}", y_val),
  ("Shape of X_test: {}", text_test),
  ("Shape of y_test: {}", y_test),
]
for message, array in shape_reports:
  print(message.format(array.shape))

Shape of X_train: (209858,)
Shape of y_train: (209858,)
Shape of X_subtrain: (172824,)
Shape of y_subtrain: (172824,)
Shape of X_val: (37034,)
Shape of y_val: (37034,)
Shape of X_test: (37033,)
Shape of y_test: (37033,)


In [None]:
# Apply the best data transformation identified in the hyperparameter search
# (Tfidf, unigrams + 2-grams, 75,000 features, sub-linear term frequency)
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  max_features=75000,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

Tokens: 75000


In [None]:
# Evaluate with the best model identified in the hyperparameter search
# (logistic regression, Glorot Normal initializer, RMSprop, batch size 512)
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Training Accuracy: 0.821
Validation Accuracy: 0.782


Results: adding Risks further increased accuracy

###### 3. Include Title + Blurb + Risks + Creator Bio

In [None]:
# Reload the text dataset; the first CSV column becomes the index
dataset_path = "04_Final Datasets/Kickstarter_Text.csv"
kickstarter_df = pd.read_csv(dataset_path, index_col=0)
# Shape and number of distinct index values (equal when every index value is unique)
unique_index_values = kickstarter_df.index.unique()
print(kickstarter_df.shape)
print(len(unique_index_values))
kickstarter_df.head(1)

(246891, 7)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...


In [None]:
# Merge Title + Blurb + Story + Risks + Creator Bio into one document per campaign.
# Missing fields are replaced by "" first: with plain `+`, a single NaN in any of
# the columns turns the whole merged text into NaN, which the text vectorizer
# used later cannot process. Rows without missing values are unaffected.
text_columns = ["title", "blurb", "story", "risks", "creator_bio"]
text_parts = kickstarter_df[text_columns].fillna("")
text_df = text_parts["title"] + " " + text_parts["blurb"] + " " + text_parts["story"] + " " + text_parts["risks"] + " " + text_parts["creator_bio"]
text_df.head()

22821161    sentio golf putters. feel is the difference ch...
22823613    brainade we intend to create an online platfor...
22835897    the alphabet story children s book a richly il...
22845619    the gears lp from scratch. with you. after a y...
22848517    polynesian adventure the board game polynesian...
dtype: object

In [None]:
# Convert target variable and merged text to NumPy arrays
y = kickstarter_df["campaign_successful"].to_numpy()
text = text_df.to_numpy()

# Sanity check: type and shape of each array (target first, then text)
for array in (y, text):
  print(type(array))
  print(array.shape)

<class 'numpy.ndarray'>
(246891,)
<class 'numpy.ndarray'>
(246891,)


In [None]:
# Split dataset into training (= subtraining + validation), subtraining, validation, and test set
# Sizes: 70% subtraining, 15% validation, remainder test
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7 * 1)
val_size = round(n_samples * 0.15 * 1)
test_size = round(n_samples * 1) - val_size - train_size

# First split off the test set, then divide the rest into subtraining and validation;
# both splits are shuffled, stratified on the target and seeded
text_train, text_test, y_train, y_test = train_test_split(
  text, y,
  train_size=(train_size + val_size),
  test_size=test_size,
  shuffle=True,
  stratify=y,
  random_state=seed_value,
)
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
  text_train, y_train,
  train_size=train_size,
  test_size=val_size,
  shuffle=True,
  stratify=y_train,
  random_state=seed_value,
)

# Report the shape of every split
shape_reports = [
  ("Shape of X_train: {}", text_train),
  ("Shape of y_train: {}", y_train),
  ("Shape of X_subtrain: {}", text_subtrain),
  ("Shape of y_subtrain: {}", y_subtrain),
  ("Shape of X_val: {}", text_val),
  ("Shape of y_val: {}", y_val),
  ("Shape of X_test: {}", text_test),
  ("Shape of y_test: {}", y_test),
]
for message, array in shape_reports:
  print(message.format(array.shape))

Shape of X_train: (209858,)
Shape of y_train: (209858,)
Shape of X_subtrain: (172824,)
Shape of y_subtrain: (172824,)
Shape of X_val: (37034,)
Shape of y_val: (37034,)
Shape of X_test: (37033,)
Shape of y_test: (37033,)


In [None]:
# Apply the best data transformation identified in the hyperparameter search
# (Tfidf, unigrams + 2-grams, 75,000 features, sub-linear term frequency)
vectorizer = TfidfVectorizer(
  stop_words=stopwords,
  ngram_range=(1, 2),
  analyzer="word",
  max_features=75000,
  sublinear_tf=True,
  dtype=np.float32,
)
vectorizer.fit(text_subtrain)
X_subtrain = vectorizer.transform(text_subtrain)
X_val = vectorizer.transform(text_val)
# Sort the column indices of both sparse matrices in place
# NOTE(review): presumably needed so Keras accepts the sparse input - confirm
for sparse_matrix in (X_subtrain, X_val):
  sparse_matrix.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

Tokens: 75000


In [None]:
# Evaluate with the best model identified in the hyperparameter search
# (logistic regression, Glorot Normal initializer, RMSprop, batch size 512)
model = Sequential()
model.add(Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512))
model.add(Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)))
model.compile(optimizer=RMSprop(), loss="binary_crossentropy", metrics=["binary_accuracy"])

# Stop when validation accuracy has not improved for 5 epochs; keep the best weights
early_stopping = tf.keras.callbacks.EarlyStopping(
  monitor="val_binary_accuracy", patience=5, verbose=0, mode="max", restore_best_weights=True
)
history = model.fit(
  X_subtrain, y_subtrain,
  validation_data=(X_val, y_val),
  batch_size=512,
  epochs=100,
  verbose=1,
  callbacks=[early_stopping],
)

# evaluate() returns [loss, binary_accuracy]; only the accuracy is reported
train_accuracy = model.evaluate(X_subtrain, y_subtrain, batch_size=512, verbose=0)[1]
print("Training Accuracy: {:.3f}".format(train_accuracy))
val_accuracy = model.evaluate(X_val, y_val, batch_size=512, verbose=0)[1]
print("Validation Accuracy: {:.3f}".format(val_accuracy))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Training Accuracy: 0.830
Validation Accuracy: 0.789


Result: including creator_bio increased model performance

###### 4. Include Title + Blurb + Risks + Creator Bio + Reward Description

In [None]:
# Import Dataset
# Read the text dataset from CSV; the first CSV column is used as the DataFrame index.
kickstarter_df = pd.read_csv("04_Final Datasets/Kickstarter_Text.csv", index_col=0)
# Print the (rows, columns) shape, then the number of distinct index values,
# so duplicated index entries would show up as a smaller second number.
print(kickstarter_df.shape, len(kickstarter_df.index.unique()), sep="\n")
# Display the first row as a quick look at the columns.
kickstarter_df.iloc[:1]

(246891, 7)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...


In [None]:
# Merge Title + Blurb + Story + Risks + Creator Bio + Reward Description together
# The columns are concatenated left to right in the order listed, separated by one space.
merged_columns = ["title", "blurb", "story", "risks", "creator_bio", "reward_description"]
text_df = kickstarter_df[merged_columns[0]]
for column in merged_columns[1:]:
    text_df = text_df + " " + kickstarter_df[column]
text_df.head()

22821161    sentio golf putters. feel is the difference ch...
22823613    brainade we intend to create an online platfor...
22835897    the alphabet story children s book a richly il...
22845619    the gears lp from scratch. with you. after a y...
22848517    polynesian adventure the board game polynesian...
dtype: object

In [None]:
# Convert dataset and target variable to Numpy Arrays
text = text_df.to_numpy()
y = kickstarter_df["campaign_successful"].to_numpy()

# Print type and shape of each array: target first, then the merged text.
for _array in (y, text):
    print(type(_array), _array.shape, sep="\n")

<class 'numpy.ndarray'>
(246891,)
<class 'numpy.ndarray'>
(246891,)


In [None]:
# Split dataset into training, subtraining, validation, and test set
# Absolute set sizes: 70% subtraining, 15% validation, remaining rows for test.
# NOTE(review): the trailing "* 1" looks like a leftover scaling knob (e.g. for running
# on a fraction of the data) -- confirm before removing it.
train_size = round(len(kickstarter_df) * 0.7 * 1)
val_size = round(len(kickstarter_df) * 0.15 * 1)
test_size = round(len(kickstarter_df) * 1) - val_size - train_size

# First split: separate the test set from the full training set (subtraining + validation).
text_train, text_test, y_train, y_test = train_test_split(
    text,
    y,
    train_size=(train_size + val_size),
    test_size=test_size,
    shuffle=True,
    stratify=y,
    random_state=seed_value,
)
# Second split: divide the full training set into subtraining and validation sets.
text_subtrain, text_val, y_subtrain, y_val = train_test_split(
    text_train,
    y_train,
    train_size=train_size,
    test_size=val_size,
    shuffle=True,
    stratify=y_train,
    random_state=seed_value,
)

# Report the shape of every resulting array.
for _template, _array in (
    ("Shape of X_train: {}", text_train),
    ("Shape of y_train: {}", y_train),
    ("Shape of X_subtrain: {}", text_subtrain),
    ("Shape of y_subtrain: {}", y_subtrain),
    ("Shape of X_val: {}", text_val),
    ("Shape of y_val: {}", y_val),
    ("Shape of X_test: {}", text_test),
    ("Shape of y_test: {}", y_test),
):
    print(_template.format(_array.shape))

Shape of X_train: (209858,)
Shape of y_train: (209858,)
Shape of X_subtrain: (172824,)
Shape of y_subtrain: (172824,)
Shape of X_val: (37034,)
Shape of y_val: (37034,)
Shape of X_test: (37033,)
Shape of y_test: (37033,)


In [None]:
# Perform best data transformation identified in hyperparameter search
# TF-IDF on word uni- and bi-grams with the custom stopword list, at most 75,000
# features, sublinear term-frequency scaling and float32 output.
vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    ngram_range=(1, 2),
    analyzer="word",
    max_features=75000,
    sublinear_tf=True,
    dtype=np.float32,
)
# Fit on the subtraining texts only and vectorize them in the same pass.
# fit_transform is equivalent to fit() followed by transform() on the same data,
# but avoids tokenizing the whole subtraining corpus a second time.
X_subtrain = vectorizer.fit_transform(text_subtrain)
# The validation texts are only transformed with the already-fitted vectorizer.
X_val = vectorizer.transform(text_val)
# Sort the column indices of every sparse row in place
# (presumably needed by the sparse Keras input used downstream -- TODO confirm).
X_subtrain.sort_indices()
X_val.sort_indices()
print("Tokens: {}".format(len(vectorizer.vocabulary_)))

Tokens: 75000


In [None]:
# Evaluate on best model identified in hyperparameter search
# Architecture: sparse input feeding a single sigmoid unit.
model = Sequential([
    Input(shape=(X_subtrain.shape[1],), sparse=True, batch_size=512),
    Dense(1, activation="sigmoid", kernel_initializer=GlorotNormal(seed=seed_value)),
])
model.compile(
    optimizer=RMSprop(),
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)
# Train for at most 100 epochs; stop once the validation accuracy has not improved
# for 5 consecutive epochs and restore the best weights seen so far.
history = model.fit(
    X_subtrain,
    y_subtrain,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=100,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            patience=5,
            verbose=0,
            mode="max",
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns the loss followed by the compiled metrics, so index 1 is the
# binary accuracy. Report it for the subtraining set first, then the validation set.
for _template, _features, _labels in (
    ("Training Accuracy: {:.3f}", X_subtrain, y_subtrain),
    ("Validation Accuracy: {:.3f}", X_val, y_val),
):
    print(_template.format(model.evaluate(_features, _labels, batch_size=512, verbose=0)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Training Accuracy: 0.841
Validation Accuracy: 0.795


Result: adding reward description further increased model performance

#### d) Best-Found Model: LR Text

**Preprocessing:**
- stopwords removed
- all text features included
- 1- and 2-grams (ngram_range=(1,2)), max_features=75000
- TFIDF Vectorizer + sublinear_tf 

**Model**:
- Activation = sigmoid (necessary to output probability between 0 and 1)
- GlorotNormal initialization
- RMSProp optimizer
- Batch Size: 512