#### Setup

In [None]:
!pip install -U tensorflow keras

In [None]:
!pip install -U talos

In [None]:
!pip install -U fasttext

In [None]:
# Import general Python libraries
import pandas as pd
import numpy as np
import random
import sklearn
import seaborn as sns
import os
import io
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Fix the seeds of every random number source used before TensorFlow is imported.
# One shared value keeps Python's `random`, NumPy and the hash-seed environment variable aligned.
seed_value = 0
os.environ["PYTHONHASHSEED"] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

In [None]:
# Import sklearn-specific modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Import TensorFlow, seed its random generator with the shared seed, and report the library versions in use.
import tensorflow as tf

tf.random.set_seed(seed_value)

version_lines = (
    ("Tensorflow Version: {}", tf.__version__),
    ("Keras Version: {}", tf.keras.__version__),
)
for template, version in version_lines:
    print(template.format(version))

Tensorflow Version: 2.4.1
Keras Version: 2.4.0


In [None]:
# Import keras-specific modules
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dropout, BatchNormalization, LayerNormalization, GaussianNoise, Activation
from tensorflow.keras.layers import Dense, Flatten, Concatenate, Average, Embedding, Conv1D, MaxPool1D, AvgPool1D, GlobalMaxPool1D, GlobalAvgPool1D, RNN, GRU, LSTM, SeparableConv1D, SimpleRNN, Bidirectional, LocallyConnected1D, LeakyReLU, Input
from tensorflow.keras.optimizers import Adadelta, RMSprop, Adam, Adamax, Nadam
from tensorflow.keras.regularizers import L1, L2
from tensorflow.keras.initializers import GlorotNormal, GlorotUniform, LecunNormal, LecunUniform, HeNormal, HeUniform, Constant
from tensorflow.keras.metrics import AUC, Precision, Recall
from tensorflow.keras.utils import plot_model
# Text preprocessing helpers are taken from tensorflow.keras, like every other Keras import in this cell,
# so the notebook does not depend on a separately installed (and possibly incompatible) standalone `keras` package.
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Import talos-specific modules
import talos

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Mount Google Drive
# Makes the user's Drive available under /content/drive inside the Colab runtime.
# NOTE(review): the dataset is later read from a relative path ("04_Final Datasets/...");
# presumably the working directory is changed to a Drive folder somewhere outside this notebook view - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Set up TPU configurations
# The TPU address is read from the COLAB_TPU_ADDR environment variable; a KeyError here means
# that variable is not set in the current runtime.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
# Order matters: connect to the cluster, initialize the TPU system, and only then build the strategy.
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# `strategy` is the distribution strategy object created from the resolved TPU cluster.
strategy = tf.distribute.TPUStrategy(resolver)
# List the logical TPU devices that TensorFlow can see (8 cores in the recorded run).
print("All devices: ", tf.config.list_logical_devices('TPU'))





INFO:tensorflow:Initializing the TPU system: grpc://10.29.128.210:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.29.128.210:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


All devices:  [LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:7', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:6', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:5', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:4', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:3', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:0', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:1', device_type='TPU'), LogicalDevice(name='/job:worker/replica:0/task:0/device:TPU:2', device_type='TPU')]


#### Prepare Dataset:

In [None]:
# Import Word Embeddings
import fasttext.util
# Fetch the pre-trained English fastText model; if_exists='ignore' is intended to skip the
# download when the file is already present (see fasttext.util.download_model).
fasttext.util.download_model('en', if_exists='ignore')  # English
# Load the 300-dimensional English model from the working directory; `ft` is used later to look up word vectors.
ft = fasttext.load_model('cc.en.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz





In [None]:
# Load the Kickstarter dataset (structured + text attributes); the first CSV column becomes the index.
dataset_path = "04_Final Datasets/Kickstarter_Structured_and_Text.csv"
kickstarter_df = pd.read_csv(dataset_path, index_col=0)

# Sanity check: the number of distinct index values should equal the number of rows.
print(kickstarter_df.shape)
print(kickstarter_df.index.unique().size)
kickstarter_df.head(1)

(246891, 41)
246891


Unnamed: 0,campaign_successful,title,blurb,story,risks,reward_description,creator_bio,goal,number_of_collaborators,funding_period,days_between_created_and_launched,launch_quartal,staff_pick,campaign_has_demo_video,campaign_has_environmental_commitments,number_of_images,number_of_videos,number_of_audios,number_of_interactives,number_of_words,number_of_links,creator_verified_identity,creator_fb_auth,creator_has_image,creator_allows_follows,number_of_creator_backings,number_of_creator_projects,facebook_linked,twitter_linked,instagram_linked,linkedin_linked,number_of_rewards,number_of_words_per_reward,lowest_pledge_level,highest_pledge_level,has_limited_rewards,has_shipped_rewards,has_restricted_shipping_rewards,avg_months_until_reward,location,category
22821161,0,sentio golf putters. feel is the difference,choose the feel you want with our patented flo...,sentio putters feature a unique floating face...,high tech process although we have made severa...,our eternal gratitude. every little bit helps ...,sentio golf is driven to produce the most adva...,50000.0,0,45,102,3,0,1,0,13,0,0,0,1378,2,1,0,1,1,0,1,0,0,0,0,8,36.75,5.0,1417.0,0,1,1,5.5,United States,Design_Product Design


In [None]:
# Concatenate all free-text attributes (space-separated, in the order listed) into one "text" column,
# then drop the individual text columns.
text_columns = ["title", "blurb", "story", "risks", "creator_bio", "reward_description"]
merged_text = kickstarter_df[text_columns[0]]
for column in text_columns[1:]:
    merged_text = merged_text + " " + kickstarter_df[column]
kickstarter_df["text"] = merged_text
kickstarter_df.drop(columns=text_columns, inplace=True)
print(kickstarter_df.shape)
kickstarter_df.head(1)

(246891, 36)


Unnamed: 0,campaign_successful,goal,number_of_collaborators,funding_period,days_between_created_and_launched,launch_quartal,staff_pick,campaign_has_demo_video,campaign_has_environmental_commitments,number_of_images,number_of_videos,number_of_audios,number_of_interactives,number_of_words,number_of_links,creator_verified_identity,creator_fb_auth,creator_has_image,creator_allows_follows,number_of_creator_backings,number_of_creator_projects,facebook_linked,twitter_linked,instagram_linked,linkedin_linked,number_of_rewards,number_of_words_per_reward,lowest_pledge_level,highest_pledge_level,has_limited_rewards,has_shipped_rewards,has_restricted_shipping_rewards,avg_months_until_reward,location,category,text
22821161,0,50000.0,0,45,102,3,0,1,0,13,0,0,0,1378,2,1,0,1,1,0,1,0,0,0,0,8,36.75,5.0,1417.0,0,1,1,5.5,United States,Design_Product Design,sentio golf putters. feel is the difference ch...


In [None]:
# Remove stopwords and punctuation (periods) from text
# Stopword list: scikit-learn's English stop words plus the stray token "s".
stopwords = set(ENGLISH_STOP_WORDS) | {"s"}
# regex=True is passed explicitly: the default of `Series.str.replace` changed to regex=False in pandas 2.0,
# in which case the pattern r"\." would be treated as a literal backslash-dot and no period would be removed.
kickstarter_df["text"] = kickstarter_df["text"].str.replace(r"\.", "", regex=True)
# Split on whitespace, drop stopword tokens, and re-join with single spaces (progress bar via tqdm).
kickstarter_df["text"] = kickstarter_df["text"].progress_apply(
    lambda text: " ".join(token for token in text.split() if token not in stopwords)
)

100%|██████████| 246891/246891 [00:44<00:00, 5566.01it/s]


In [None]:
# One-hot encode the categorical features; the launch quarter is cast to string first
# so that it is handled as a category instead of a number. All levels are kept (drop_first=False).
print("Before: {}".format(kickstarter_df.shape))
categorical_columns = ["launch_quartal", "location", "category"]
kickstarter_df["launch_quartal"] = kickstarter_df["launch_quartal"].map(str)
kickstarter_df = pd.get_dummies(
    kickstarter_df,
    columns=categorical_columns,
    prefix=list(categorical_columns),
    drop_first=False,
)
print("After: {}".format(kickstarter_df.shape))
kickstarter_df.head(1)

Before: (246891, 36)
After: (246891, 235)


Unnamed: 0,campaign_successful,goal,number_of_collaborators,funding_period,days_between_created_and_launched,staff_pick,campaign_has_demo_video,campaign_has_environmental_commitments,number_of_images,number_of_videos,number_of_audios,number_of_interactives,number_of_words,number_of_links,creator_verified_identity,creator_fb_auth,creator_has_image,creator_allows_follows,number_of_creator_backings,number_of_creator_projects,facebook_linked,twitter_linked,instagram_linked,linkedin_linked,number_of_rewards,number_of_words_per_reward,lowest_pledge_level,highest_pledge_level,has_limited_rewards,has_shipped_rewards,has_restricted_shipping_rewards,avg_months_until_reward,text,launch_quartal_1,launch_quartal_2,launch_quartal_3,launch_quartal_4,location_Africa,location_Australia,location_Belgium,location_Canada,location_China,location_Denmark,location_France,location_Germany,location_Hong Kong,location_Ireland,location_Italy,location_Japan,location_Latin and South America,location_Mexico,location_Netherlands,location_New Zealand,location_No Location,location_Norway,location_Oceania and Antarctica,location_Rest of Asia,location_Rest of Europe,location_Singapore,location_Spain,location_Sweden,location_Switzerland,location_United Kingdom,location_United States,category_Art_Ceramics,category_Art_Conceptual Art,category_Art_Digital Art,category_Art_Illustration,category_Art_Installations,category_Art_Mixed Media,category_Art_No Subcategory,category_Art_Painting,category_Art_Performance Art,category_Art_Public Art,category_Art_Sculpture,category_Art_Social Practice,category_Art_Textiles,category_Art_Video Art,category_Comics_Anthologies,category_Comics_Comic Books,category_Comics_Events,category_Comics_Graphic Novels,category_Comics_No Subcategory,category_Comics_Webcomics,category_Crafts_Candles,category_Crafts_Crochet,category_Crafts_DIY,category_Crafts_Embroidery,category_Crafts_Glass,category_Crafts_Knitting,category_Crafts_No 
Subcategory,category_Crafts_Pottery,category_Crafts_Printing,category_Crafts_Quilts,category_Crafts_Stationery,category_Crafts_Taxidermy,category_Crafts_Weaving,category_Crafts_Woodworking,category_Dance_No Subcategory,category_Dance_Performances,category_Dance_Residencies,category_Dance_Spaces,category_Dance_Workshops,category_Design_Architecture,category_Design_Civic Design,category_Design_Graphic Design,category_Design_Interactive Design,category_Design_No Subcategory,category_Design_Product Design,category_Design_Toys,category_Design_Typography,category_Fashion_Accessories,category_Fashion_Apparel,category_Fashion_Childrenswear,category_Fashion_Couture,category_Fashion_Footwear,category_Fashion_Jewelry,category_Fashion_No Subcategory,category_Fashion_Pet Fashion,category_Fashion_Ready-to-wear,category_Film & Video_Action,category_Film & Video_Animation,category_Film & Video_Comedy,category_Film & Video_Documentary,category_Film & Video_Drama,category_Film & Video_Experimental,category_Film & Video_Family,category_Film & Video_Fantasy,category_Film & Video_Festivals,category_Film & Video_Horror,category_Film & Video_Movie Theaters,category_Film & Video_Music Videos,category_Film & Video_Narrative Film,category_Film & Video_No Subcategory,category_Film & Video_Romance,category_Film & Video_Science Fiction,category_Film & Video_Shorts,category_Film & Video_Television,category_Film & Video_Thrillers,category_Film & Video_Webseries,category_Food_Bacon,category_Food_Community Gardens,category_Food_Cookbooks,category_Food_Drinks,category_Food_Events,category_Food_Farmer's Markets,category_Food_Farms,category_Food_Food Trucks,category_Food_No Subcategory,category_Food_Restaurants,category_Food_Small Batch,category_Food_Spaces,category_Food_Vegan,category_Games_Gaming Hardware,category_Games_Live Games,category_Games_Mobile Games,category_Games_No Subcategory,category_Games_Playing Cards,category_Games_Puzzles,category_Games_Tabletop Games,category_Games_Video 
Games,category_Journalism_Audio,category_Journalism_No Subcategory,category_Journalism_Photo,category_Journalism_Print,category_Journalism_Video,category_Journalism_Web,category_Music_Blues,category_Music_Chiptune,category_Music_Classical Music,category_Music_Comedy,category_Music_Country & Folk,category_Music_Electronic Music,category_Music_Faith,category_Music_Hip-Hop,category_Music_Indie Rock,category_Music_Jazz,category_Music_Kids,category_Music_Latin,category_Music_Metal,category_Music_No Subcategory,category_Music_Pop,category_Music_Punk,category_Music_R&B,category_Music_Rock,category_Music_World Music,category_Photography_Animals,category_Photography_Fine Art,category_Photography_Nature,category_Photography_No Subcategory,category_Photography_People,category_Photography_Photobooks,category_Photography_Places,category_Publishing_Academic,category_Publishing_Anthologies,category_Publishing_Art Books,category_Publishing_Calendars,category_Publishing_Children's Books,category_Publishing_Comedy,category_Publishing_Fiction,category_Publishing_Letterpress,category_Publishing_Literary Journals,category_Publishing_Literary Spaces,category_Publishing_No Subcategory,category_Publishing_Nonfiction,category_Publishing_Periodicals,category_Publishing_Poetry,category_Publishing_Radio & Podcasts,category_Publishing_Translations,category_Publishing_Young Adult,category_Publishing_Zines,category_Technology_3D Printing,category_Technology_Apps,category_Technology_Camera Equipment,category_Technology_DIY Electronics,category_Technology_Fabrication Tools,category_Technology_Flight,category_Technology_Gadgets,category_Technology_Hardware,category_Technology_Makerspaces,category_Technology_No Subcategory,category_Technology_Robots,category_Technology_Software,category_Technology_Sound,category_Technology_Space 
Exploration,category_Technology_Wearables,category_Technology_Web,category_Theater_Comedy,category_Theater_Experimental,category_Theater_Festivals,category_Theater_Immersive,category_Theater_Musical,category_Theater_No Subcategory,category_Theater_Plays,category_Theater_Spaces
22821161,0,50000.0,0,45,102,0,1,0,13,0,0,0,1378,2,1,0,1,1,0,1,0,0,0,0,8,36.75,5.0,1417.0,0,1,1,5.5,sentio golf putters feel difference choose fee...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Separate the target from the features and convert both to NumPy arrays.
# `pop` removes the target column from the frame in place and returns it.
y = kickstarter_df.pop("campaign_successful").to_numpy()
X = kickstarter_df.to_numpy()

# Report type and shape of the target first, then of the feature matrix.
for array in (y, X):
    print(type(array))
    print(array.shape)

<class 'numpy.ndarray'>
(246891,)
<class 'numpy.ndarray'>
(246891, 234)


In [None]:
# Group the columns by kind (text / numeric / binary) and record their positional indices,
# which the ColumnTransformer needs because it operates on a plain NumPy array.
columns = kickstarter_df.columns
feature_names = columns.values
text_feature_names = "text"
numeric_feature_names = [
    "goal", "number_of_collaborators", "funding_period", "days_between_created_and_launched",
    "number_of_images", "number_of_videos", "number_of_audios", "number_of_interactives",
    "number_of_words", "number_of_links", "number_of_creator_backings", "number_of_creator_projects",
    "number_of_rewards", "number_of_words_per_reward", "lowest_pledge_level", "highest_pledge_level",
    "avg_months_until_reward",
]
# Everything that is neither the text column nor a numeric column is treated as binary.
binary_feature_names = [
    name for name in feature_names
    if name != text_feature_names and name not in numeric_feature_names
]
text_features = columns.get_loc(text_feature_names)
numeric_features = list(map(columns.get_loc, numeric_feature_names))
binary_features = list(map(columns.get_loc, binary_feature_names))
# Should equal the total number of columns (numeric + binary + the single text column).
print(len(numeric_feature_names) + len(binary_feature_names) + 1)

234


In [None]:
# Split the data into train (= subtrain + validation) and test, then split train into subtrain and validation.
# Absolute sizes: 70% subtrain, 15% validation, and the remainder as test.
n_samples = kickstarter_df.shape[0]
train_size = round(n_samples * 0.7)
val_size = round(n_samples * 0.15)
test_size = n_samples - val_size - train_size

# Both splits are shuffled, stratified on the target, and seeded for reproducibility.
split_options = dict(shuffle=True, random_state=seed_value)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_size + val_size, test_size=test_size, stratify=y, **split_options
)
X_subtrain, X_val, y_subtrain, y_val = train_test_split(
    X_train, y_train, train_size=train_size, test_size=val_size, stratify=y_train, **split_options
)

shape_report = (
    ("Shape of X_train: {}", X_train),
    ("Shape of y_train: {}", y_train),
    ("Shape of X_subtrain: {}", X_subtrain),
    ("Shape of y_subtrain: {}", y_subtrain),
    ("Shape of X_val: {}", X_val),
    ("Shape of y_val: {}", y_val),
    ("Shape of X_test: {}", X_test),
    ("Shape of y_test: {}", y_test),
)
for message, array in shape_report:
    print(message.format(array.shape))

Shape of X_train: (209858, 234)
Shape of y_train: (209858,)
Shape of X_subtrain: (172824, 234)
Shape of y_subtrain: (172824,)
Shape of X_val: (37034, 234)
Shape of y_val: (37034,)
Shape of X_test: (37033, 234)
Shape of y_test: (37033,)


In [None]:
# Identity transformer for the binary features: plugs into a ColumnTransformer but leaves the columns untouched.
class NoTransformer(BaseEstimator, TransformerMixin):
    """Pass-through transformer that returns its input unchanged."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` as-is after asserting that it is a NumPy array."""
        is_ndarray = isinstance(X, np.ndarray)
        assert is_ndarray
        return X

In [None]:
# Preprocessing strategy:
#   - numeric features: Yeo-Johnson power transform with standardization, followed by PCA (decorrelation)
#   - binary features:  passed through unchanged
#   - remaining column (text): passed through unchanged and appended last
numeric_pipeline = Pipeline([
    ("powertransform", PowerTransformer(method="yeo-johnson", standardize=True)),
    ("pca", PCA()),
])
preprocessing = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, numeric_features),
        ("binary", NoTransformer(), binary_features),
    ],
    remainder="passthrough",
    verbose=True,
    n_jobs=-1,
)
# Fit on the sub-training set only, so no statistics leak from the validation data.
preprocessing.fit(X_subtrain)

X_subtrain = preprocessing.transform(X_subtrain)
X_val = preprocessing.transform(X_val)

In [None]:
# The ColumnTransformer emits its outputs in transformer order (numeric, binary, then the passthrough text column),
# so the list of feature names is rebuilt in that same order.
feature_names = [*numeric_feature_names, *binary_feature_names, text_feature_names]

In [None]:
# Validation: wrap the transformed sub-training matrix in a labelled frame to eyeball the new column order.
tmp = pd.DataFrame(data=X_subtrain, columns=feature_names)
print(tmp.shape)
tmp.head(3)

(172824, 234)


Unnamed: 0,goal,number_of_collaborators,funding_period,days_between_created_and_launched,number_of_images,number_of_videos,number_of_audios,number_of_interactives,number_of_words,number_of_links,number_of_creator_backings,number_of_creator_projects,number_of_rewards,number_of_words_per_reward,lowest_pledge_level,highest_pledge_level,avg_months_until_reward,staff_pick,campaign_has_demo_video,campaign_has_environmental_commitments,creator_verified_identity,creator_fb_auth,creator_has_image,creator_allows_follows,facebook_linked,twitter_linked,instagram_linked,linkedin_linked,has_limited_rewards,has_shipped_rewards,has_restricted_shipping_rewards,launch_quartal_1,launch_quartal_2,launch_quartal_3,launch_quartal_4,location_Africa,location_Australia,location_Belgium,location_Canada,location_China,location_Denmark,location_France,location_Germany,location_Hong Kong,location_Ireland,location_Italy,location_Japan,location_Latin and South America,location_Mexico,location_Netherlands,location_New Zealand,location_No Location,location_Norway,location_Oceania and Antarctica,location_Rest of Asia,location_Rest of Europe,location_Singapore,location_Spain,location_Sweden,location_Switzerland,location_United Kingdom,location_United States,category_Art_Ceramics,category_Art_Conceptual Art,category_Art_Digital Art,category_Art_Illustration,category_Art_Installations,category_Art_Mixed Media,category_Art_No Subcategory,category_Art_Painting,category_Art_Performance Art,category_Art_Public Art,category_Art_Sculpture,category_Art_Social Practice,category_Art_Textiles,category_Art_Video Art,category_Comics_Anthologies,category_Comics_Comic Books,category_Comics_Events,category_Comics_Graphic Novels,category_Comics_No Subcategory,category_Comics_Webcomics,category_Crafts_Candles,category_Crafts_Crochet,category_Crafts_DIY,category_Crafts_Embroidery,category_Crafts_Glass,category_Crafts_Knitting,category_Crafts_No 
Subcategory,category_Crafts_Pottery,category_Crafts_Printing,category_Crafts_Quilts,category_Crafts_Stationery,category_Crafts_Taxidermy,category_Crafts_Weaving,category_Crafts_Woodworking,category_Dance_No Subcategory,category_Dance_Performances,category_Dance_Residencies,category_Dance_Spaces,category_Dance_Workshops,category_Design_Architecture,category_Design_Civic Design,category_Design_Graphic Design,category_Design_Interactive Design,category_Design_No Subcategory,category_Design_Product Design,category_Design_Toys,category_Design_Typography,category_Fashion_Accessories,category_Fashion_Apparel,category_Fashion_Childrenswear,category_Fashion_Couture,category_Fashion_Footwear,category_Fashion_Jewelry,category_Fashion_No Subcategory,category_Fashion_Pet Fashion,category_Fashion_Ready-to-wear,category_Film & Video_Action,category_Film & Video_Animation,category_Film & Video_Comedy,category_Film & Video_Documentary,category_Film & Video_Drama,category_Film & Video_Experimental,category_Film & Video_Family,category_Film & Video_Fantasy,category_Film & Video_Festivals,category_Film & Video_Horror,category_Film & Video_Movie Theaters,category_Film & Video_Music Videos,category_Film & Video_Narrative Film,category_Film & Video_No Subcategory,category_Film & Video_Romance,category_Film & Video_Science Fiction,category_Film & Video_Shorts,category_Film & Video_Television,category_Film & Video_Thrillers,category_Film & Video_Webseries,category_Food_Bacon,category_Food_Community Gardens,category_Food_Cookbooks,category_Food_Drinks,category_Food_Events,category_Food_Farmer's Markets,category_Food_Farms,category_Food_Food Trucks,category_Food_No Subcategory,category_Food_Restaurants,category_Food_Small Batch,category_Food_Spaces,category_Food_Vegan,category_Games_Gaming Hardware,category_Games_Live Games,category_Games_Mobile Games,category_Games_No Subcategory,category_Games_Playing Cards,category_Games_Puzzles,category_Games_Tabletop Games,category_Games_Video 
Games,category_Journalism_Audio,category_Journalism_No Subcategory,category_Journalism_Photo,category_Journalism_Print,category_Journalism_Video,category_Journalism_Web,category_Music_Blues,category_Music_Chiptune,category_Music_Classical Music,category_Music_Comedy,category_Music_Country & Folk,category_Music_Electronic Music,category_Music_Faith,category_Music_Hip-Hop,category_Music_Indie Rock,category_Music_Jazz,category_Music_Kids,category_Music_Latin,category_Music_Metal,category_Music_No Subcategory,category_Music_Pop,category_Music_Punk,category_Music_R&B,category_Music_Rock,category_Music_World Music,category_Photography_Animals,category_Photography_Fine Art,category_Photography_Nature,category_Photography_No Subcategory,category_Photography_People,category_Photography_Photobooks,category_Photography_Places,category_Publishing_Academic,category_Publishing_Anthologies,category_Publishing_Art Books,category_Publishing_Calendars,category_Publishing_Children's Books,category_Publishing_Comedy,category_Publishing_Fiction,category_Publishing_Letterpress,category_Publishing_Literary Journals,category_Publishing_Literary Spaces,category_Publishing_No Subcategory,category_Publishing_Nonfiction,category_Publishing_Periodicals,category_Publishing_Poetry,category_Publishing_Radio & Podcasts,category_Publishing_Translations,category_Publishing_Young Adult,category_Publishing_Zines,category_Technology_3D Printing,category_Technology_Apps,category_Technology_Camera Equipment,category_Technology_DIY Electronics,category_Technology_Fabrication Tools,category_Technology_Flight,category_Technology_Gadgets,category_Technology_Hardware,category_Technology_Makerspaces,category_Technology_No Subcategory,category_Technology_Robots,category_Technology_Software,category_Technology_Sound,category_Technology_Space 
Exploration,category_Technology_Wearables,category_Technology_Web,category_Theater_Comedy,category_Theater_Experimental,category_Theater_Festivals,category_Theater_Immersive,category_Theater_Musical,category_Theater_No Subcategory,category_Theater_Plays,category_Theater_Spaces,text
0,-2.60096,-0.410025,-0.40477,1.1643,-0.0916838,-0.212521,-0.528896,2.29474,0.580159,0.953616,-0.148651,-0.430532,0.61289,0.8519,-0.351657,0.900323,0.139024,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,roadside greg stevie young couple coming home ...
1,-1.3901,-2.21121,0.641878,0.480697,-0.239074,-0.38711,2.10918,0.654295,-0.251728,-0.133694,-0.882519,0.938515,1.28653,-0.575927,0.198325,-0.318505,0.0271604,0,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,look short film thousands misunderstood kids w...
2,2.88439,-2.40946,2.01379,-0.189682,0.38569,0.0397768,-1.41573,0.18612,-2.2013,-0.502901,0.497166,-0.674055,0.354359,0.169478,1.20088,-1.1918,-0.339643,0,1,0,1,1,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,author lady windermere fan role playing game d...


In [None]:
# Split dataset into structured and text features
# The text column is the passthrough remainder of the ColumnTransformer and therefore the LAST column;
# slicing relative to the end avoids hard-coding the number of structured features (233 in the recorded run).
X_subtrain_structured = X_subtrain[:, :-1].astype("float32")
X_subtrain_text = X_subtrain[:, -1]
X_val_structured = X_val[:, :-1].astype("float32")
X_val_text = X_val[:, -1]
print("Shape of X_subtrain_structured: {}".format(X_subtrain_structured.shape))
print("Shape of X_subtrain_text: {}".format(X_subtrain_text.shape))
print("Shape of y_subtrain: {}".format(y_subtrain.shape))
print("Shape of X_val_structured: {}".format(X_val_structured.shape))
print("Shape of X_val_text: {}".format(X_val_text.shape))
print("Shape of y_val: {}".format(y_val.shape))

Shape of X_subtrain_structured: (172824, 233)
Shape of X_subtrain_text: (172824,)
Shape of y_subtrain: (172824,)
Shape of X_val_structured: (37034, 233)
Shape of X_val_text: (37034,)
Shape of y_val: (37034,)


In [None]:
# Parameters for text processing:
#   max_features  - number of rows of the embedding matrix (matches the vocabulary size reported in the recorded run)
#   max_len       - length every token sequence is padded / truncated to
#   embedding_dim - dimensionality of the word vectors (the loaded fastText model is cc.en.300)
max_features, max_len, embedding_dim = 631377, 1000, 300

In [None]:
# Convert Texts Into Integer Sequences (Tokenization)
# Keras word indices start at 1, so an unrestricted tokenizer emits indices up to len(word_index).
# The embedding matrix / Embedding layer only have `max_features` rows (valid indices 0..max_features-1),
# therefore the tokenizer is capped with num_words=max_features: only indices < max_features are emitted.
tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is learned from the sub-training texts only; unseen validation words are dropped.
tokenizer.fit_on_texts(X_subtrain_text)
X_subtrain_text = tokenizer.texts_to_sequences(X_subtrain_text)
X_val_text = tokenizer.texts_to_sequences(X_val_text)
# word_index always holds the full vocabulary, independent of num_words.
word_index = tokenizer.word_index
print("Included Token: {}".format(len(word_index)))

Included Token: 631377


In [None]:
# Summarise the token count per document to guide the choice of max_len.
num_words = pd.DataFrame({"words": [len(sequence) for sequence in X_subtrain_text]})
num_words.describe()

Unnamed: 0,words
count,172824.0
mean,509.529984
std,409.239714
min,14.0
25%,241.0
50%,399.0
75%,649.0
max,11553.0


In [None]:
# Bring every sequence to exactly max_len tokens:
# shorter ones are zero-padded at the front, longer ones are cut off at the end.
padding_options = dict(maxlen=max_len, padding="pre", truncating="post")
X_subtrain_text = pad_sequences(X_subtrain_text, **padding_options)
X_val_text = pad_sequences(X_val_text, **padding_options)

print("Shape of X_subtrain: {}".format(X_subtrain_text.shape))
print("Shape of X_val: {}".format(X_val_text.shape))

Shape of X_subtrain: (172824, 1000)
Shape of X_val: (37034, 1000)


In [None]:
# Build the embedding matrix: row i holds the fastText vector of the word with tokenizer index i.
# Row 0 stays all-zero (index 0 is the padding value); indices >= max_features are skipped.
print('Preparing Embedding Matrix...')
words_not_found = []
embedding_matrix = np.zeros((max_features, embedding_dim))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = ft.get_word_vector(word)
    if embedding_vector is None or len(embedding_vector) == 0:
        # No usable vector returned: remember the word and leave its row at zero.
        words_not_found.append(word)
        continue
    embedding_matrix[i] = embedding_vector

# A row whose entries sum to zero is counted as a "null" embedding.
row_sums = embedding_matrix.sum(axis=1)
print('Shape of Embedding Matrix: {}'.format(embedding_matrix.shape))
print('Number of Null Word Embeddings: %d' % np.sum(row_sums == 0))

Preparing Embedding Matrix...
Shape of Embedding Matrix: (631377, 300)
Number of Null Word Embeddings: 218


#### Combine CNN with FCNN:

##### 1. Average Ensemble of the Best Found Models

In [None]:
# Define the model
def create_model():
  """Build the averaging ensemble of the CNN text model and the structured-data MLP.

  Each branch ends in its own sigmoid unit; the model output is the average of
  the two branch predictions. Reads the notebook-level names max_len,
  max_features, embedding_dim, embedding_matrix and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def seeded_init():
    # New initializer object per layer, all sharing the notebook seed
    return HeUniform(seed=seed_value)

  def regularize(tensor):
    # Noise -> layer normalization -> dropout, applied after every text-branch stage
    tensor = GaussianNoise(stddev=0.1)(tensor)
    tensor = LayerNormalization()(tensor)
    return Dropout(rate=0.5, seed=seed_value)(tensor)

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch: frozen embedding followed by four conv/pool stages
  text = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  for _ in range(4):
    text = Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=seeded_init())(text)
    text = AvgPool1D(2)(text)  # pool size 2 is also the Keras default
    text = regularize(text)
  text = Flatten()(text)
  text = Dense(32, activation='relu', kernel_initializer=seeded_init())(text)
  text = regularize(text)
  output_text = Dense(1, activation='sigmoid', kernel_initializer=seeded_init())(text)

  # Structured Branch: three GELU layers, each followed by dropout
  structured = input_structured
  for units in (500, 350, 245):
    structured = Dense(units, activation="gelu", kernel_initializer=seeded_init())(structured)
    structured = Dropout(rate=0.5, seed=seed_value)(structured)
  output_structured = Dense(1, activation="sigmoid", kernel_initializer=seeded_init())(structured)

  # Average Predictions
  averaged = Average()([output_structured, output_text])

  # Define model
  return Model([input_structured, input_text], averaged)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Training Accuracy: 0.839
Validation Accuracy: 0.824


Results:
- Nadam: 84.2% vs. 82.8%
- Adam: 84.9% vs. 82.8%
- RMSprop: 83.8% vs. 82.7%
-> i.e. Nadam was the best
- Nadam + gradient clipping: 85.1% vs. 82.9%
-> i.e. gradient clipping helps a little bit
- same activations "relu": 85.1% vs. 82.6%
- same activations "gelu": 83.8% vs. 82.7%
-> i.e. different activations work best
- same weight initializers GlorotUniform: 84.1% vs. 82.8%
- same weight initializers HeUniform: 84.8% vs. 82.9%
-> i.e. HeUniform initializer worked best
- remove Gaussian Noise: 85.3% vs. 82.5% (i.e. Gaussian Noise helps)
- remove LayerNormalization: 84.2% vs. 82.6% (i.e. Layer Normalization helps)
- try BatchNormalization: 84.6% vs. 82.7% (i.e. Layer Normalization works better)
- remove Dropout: 83.9% vs. 82.4%
-> i.e. keep same regularizers as before

##### 2. Stacking Ensemble of Best Found Models

In [None]:
# Define the model
def create_model():
  """Build the stacking ensemble of the CNN text model and the structured-data MLP.

  Each branch produces its own sigmoid prediction; the two predictions are
  concatenated and fed to a final single-unit sigmoid layer. Reads the
  notebook-level names max_len, max_features, embedding_dim, embedding_matrix
  and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def he_init():
    # One fresh seeded initializer per layer
    return HeUniform(seed=seed_value)

  def noise_norm_drop(features):
    # Regularization stack used after every stage of the text branch
    features = GaussianNoise(stddev=0.1)(features)
    features = LayerNormalization()(features)
    return Dropout(rate=0.5, seed=seed_value)(features)

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch: frozen embedding, then four conv/pool stages and a small dense head
  branch_text = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  for _ in range(4):
    branch_text = Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=he_init())(branch_text)
    branch_text = AvgPool1D(2)(branch_text)  # pool size 2 is also the Keras default
    branch_text = noise_norm_drop(branch_text)
  branch_text = Flatten()(branch_text)
  branch_text = Dense(32, activation='relu', kernel_initializer=he_init())(branch_text)
  branch_text = noise_norm_drop(branch_text)
  output_text = Dense(1, activation='sigmoid', kernel_initializer=he_init())(branch_text)

  # Structured Branch
  branch_structured = input_structured
  for width in (500, 350, 245):
    branch_structured = Dense(width, activation="gelu", kernel_initializer=he_init())(branch_structured)
    branch_structured = Dropout(rate=0.5, seed=seed_value)(branch_structured)
  output_structured = Dense(1, activation="sigmoid", kernel_initializer=he_init())(branch_structured)

  # Stacking Ensemble: a learned combination of the two branch predictions
  stacked = Concatenate()([output_structured, output_text])
  prediction = Dense(1, activation="sigmoid", kernel_initializer=he_init())(stacked)

  # Define model
  return Model([input_structured, input_text], prediction)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Results:
- Averaging: 84.8% vs. 82.9%
- Stacking: 87.4% vs. 82.8%
-> i.e. Averaging Ensemble is better


##### 3. Add CNN Output to Structured Features

In [None]:
# Define the model
def create_model():
  """Build the model that appends flattened CNN text features to the structured features.

  The text sequence goes through a frozen embedding and four conv/pool stages;
  the flattened result is concatenated with the raw structured input and
  classified by a three-layer GELU network. Reads the notebook-level names
  max_len, max_features, embedding_dim, embedding_matrix and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def glorot():
    # One fresh seeded initializer per layer
    return GlorotUniform(seed=seed_value)

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch
  conv_features = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  for _ in range(4):
    conv_features = Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=glorot())(conv_features)
    conv_features = AvgPool1D(2)(conv_features)  # pool size 2 is also the Keras default
    conv_features = GaussianNoise(stddev=0.1)(conv_features)
    conv_features = LayerNormalization()(conv_features)
    conv_features = Dropout(rate=0.5, seed=seed_value)(conv_features)
  output_cnn = Flatten()(conv_features)

  # Concatenate Features
  hidden = Concatenate()([input_structured, output_cnn])

  # Dense Classifier
  for units in (500, 350, 245):
    hidden = Dense(units, activation="gelu", kernel_initializer=glorot())(hidden)
    hidden = Dropout(rate=0.5, seed=seed_value)(hidden)
  prediction = Dense(1, activation="sigmoid", kernel_initializer=glorot())(hidden)

  # Define model
  return Model([input_structured, input_text], prediction)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Results:
- Baseline:  86.2% vs. 82.9%
- same initializer HeUniform: 85.6% vs. 82.8%
- same initializer GlorotUniform:  84.7% vs. 82.9%
-> i.e. GlorotUniform worked best
- same activation relu: 86.5% vs. 82.9%
- same activation gelu: 84.6% vs. 82.5%
-> i.e. different activations worked best
- no clipnorm: 86.1% vs. 82.8%
-> i.e. clipnorm worked best
- Adam: 84.4% vs. 82.8%
- RMSprop: 83.9% vs. 82.6%
-> i.e. Nadam worked best
- add Dropout after Concat: 81.8% vs. 80.8% (i.e. this strategy doesn't work well)
- remove Dropout before Flatten: 85.1% vs. 82.4% (i.e. this strategy doesn't work well)
- remove all regularizers before Flatten: 86.5% vs. 82.6%
- remove gaussian noise: 85% vs. 82.7% (i.e. adding gaussian noise helps)
- remove layer normalization: 84.4% vs. 82.8% (i.e. layer normalization helps a little bit)
- remove regularizers in CNN: 83.5% vs. 82.2%
- remove regularizers in CNN+Dense: 84.8% vs. 82.5%
-> i.e. keep regularizers as they are


##### 4. Multi-Branch Network

In [None]:
# Define the model
def create_model():
  """Build the multi-branch network: CNN text branch + structured MLP branch + dense classifier.

  The text branch (frozen embedding, four conv/pool stages, one 490-unit GELU
  layer) and the structured branch (three GELU layers) are concatenated and
  classified by a [256, 64]-unit GELU network with a sigmoid output. Reads the
  notebook-level names max_len, max_features, embedding_dim, embedding_matrix
  and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def glorot():
    # One fresh seeded initializer per layer
    return GlorotUniform(seed=seed_value)

  def noise_norm_drop(features):
    # Regularization stack used after every stage of the text branch
    features = GaussianNoise(stddev=0.1)(features)
    features = LayerNormalization()(features)
    return Dropout(rate=0.5, seed=seed_value)(features)

  def gelu_stack(features, layer_sizes):
    # Dense(GELU) + Dropout(0.5) for each requested width
    for units in layer_sizes:
      features = Dense(units, activation="gelu", kernel_initializer=glorot())(features)
      features = Dropout(rate=0.5, seed=seed_value)(features)
    return features

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch
  text = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  for _ in range(4):
    text = Conv1D(64, 5, activation='relu', padding='same', kernel_initializer=glorot())(text)
    text = AvgPool1D(2)(text)  # pool size 2 is also the Keras default
    text = noise_norm_drop(text)
  text = Flatten()(text)
  text = Dense(490, activation='gelu', kernel_initializer=glorot())(text)
  output_cnn = noise_norm_drop(text)

  # Structured Branch
  output_mlp = gelu_stack(input_structured, (500, 350, 245))

  # Concatenate Features
  merged = Concatenate()([output_mlp, output_cnn])

  # Dense Classifier
  merged = gelu_stack(merged, (256, 64))
  prediction = Dense(1, activation="sigmoid", kernel_initializer=glorot())(merged)

  # Define model
  return Model([input_structured, input_text], prediction)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Results:
- Baseline: 86.7% vs. 82.9%
- add Dense layer (466 units) to CNN: 85.3% vs. 83%
-> i.e. adding Dense layer to CNN helps
- pre-process structured data before concatenating features: 85.8% vs. 83.2%
-> i.e. adding densely-connected classifier on top of the pre-processed features helps to increase model performance


#### Combine RNN with FCNN:

##### 1. Average Ensemble of the Best Found Models

In [None]:
# Define the model
def create_model():
  """Build the averaging ensemble of the GRU text model and the structured-data MLP.

  Each branch ends in its own sigmoid unit; the model output is the average of
  the two branch predictions. Reads the notebook-level names max_len,
  max_features, embedding_dim, embedding_matrix and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch: frozen embedding -> GRU -> dropout -> sigmoid
  sequence = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  encoded = GRU(64, kernel_initializer=GlorotUniform(seed=seed_value), dropout=0.5, recurrent_dropout=0.5)(sequence)
  encoded = Dropout(rate=0.5, seed=seed_value)(encoded)
  # This output layer uses the Dense default initializer (no explicit seed)
  output_text = Dense(1, activation='sigmoid')(encoded)

  # Structured Branch: three GELU layers, each followed by dropout
  hidden = input_structured
  for units in (500, 350, 245):
    hidden = Dense(units, activation="gelu", kernel_initializer=GlorotUniform(seed=seed_value))(hidden)
    hidden = Dropout(rate=0.5, seed=seed_value)(hidden)
  output_structured = Dense(1, activation="sigmoid", kernel_initializer=GlorotUniform(seed=seed_value))(hidden)

  # Average Predictions
  averaged = Average()([output_structured, output_text])

  # Define model
  return Model([input_structured, input_text], averaged)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Results:
- RNN with 256 units was not scalable; switch to second-best model with 64 units
- Dense Dropout 0.5: 84.4% vs. 82.8%
- Dense Dropout 0.3: 84.8% vs. 82.8%
- No Dense Dropout: 84% vs. 82.7%
-> i.e. dense dropout of 0.5 was the best
- dropout=0.3, recurrent dropout=0.3: 85.1% vs. 82.8%
-> i.e. dropout=0.5 + recurrent_dropout=0.5 was the best

##### 2. Stacking Ensemble of Best Found Models

In [None]:
# Define the model
def create_model():
  """Build the stacking ensemble of the GRU text model and the structured-data MLP.

  Each branch produces its own sigmoid prediction; the two predictions are
  concatenated and fed to a final single-unit sigmoid layer. Reads the
  notebook-level names max_len, max_features, embedding_dim, embedding_matrix
  and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def glorot():
    # One fresh seeded initializer per layer
    return GlorotUniform(seed=seed_value)

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch: frozen embedding -> GRU -> dropout -> sigmoid
  text = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  text = GRU(64, kernel_initializer=glorot(), dropout=0.5, recurrent_dropout=0.5)(text)
  text = Dropout(rate=0.5, seed=seed_value)(text)
  # This output layer uses the Dense default initializer (no explicit seed)
  output_text = Dense(1, activation='sigmoid')(text)

  # Structured Branch
  structured = input_structured
  for width in (500, 350, 245):
    structured = Dense(width, activation="gelu", kernel_initializer=glorot())(structured)
    structured = Dropout(rate=0.5, seed=seed_value)(structured)
  output_structured = Dense(1, activation="sigmoid", kernel_initializer=glorot())(structured)

  # Stacking ensemble: a learned combination of the two branch predictions
  stacked = Concatenate()([output_structured, output_text])
  prediction = Dense(1, activation="sigmoid", kernel_initializer=glorot())(stacked)

  # Define model
  return Model([input_structured, input_text], prediction)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Training Accuracy: 0.850
Validation Accuracy: 0.826


Result:
- Average: 84.4% vs. 82.8%
- Stacking: 85% vs. 82.6%
-> i.e. average ensemble is better

##### 3. Add RNN Output To Structured Features

In [None]:
# Define the model
def create_model():
  """Build the model that appends the GRU text encoding to the structured features.

  The text sequence is encoded by a frozen embedding and a 64-unit GRU; the
  encoding is concatenated with the raw structured input and classified by a
  three-layer GELU network with a sigmoid output. Reads the notebook-level
  names max_len, max_features, embedding_dim, embedding_matrix and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def glorot():
    # One fresh seeded initializer per layer
    return GlorotUniform(seed=seed_value)

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch
  embedded = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  encoded = GRU(64, kernel_initializer=glorot(), dropout=0.5, recurrent_dropout=0.5)(embedded)
  output_rnn = Dropout(rate=0.5, seed=seed_value)(encoded)

  # Merge RNN output to Structured Features
  hidden = Concatenate()([input_structured, output_rnn])

  # Dense Classifier
  for units in (500, 350, 245):
    hidden = Dense(units, activation="gelu", kernel_initializer=glorot())(hidden)
    hidden = Dropout(rate=0.5, seed=seed_value)(hidden)
  prediction = Dense(1, activation="sigmoid", kernel_initializer=glorot())(hidden)

  # Define model
  return Model([input_structured, input_text], prediction)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Training Accuracy: 0.846
Validation Accuracy: 0.830


Results:
- with dropout after RNN: 85.6% vs. 83.2%
- without dropout after RNN: 84.6% vs. 83%

##### 4. Multi-Branch Network

In [None]:
# Define the model
def create_model():
  """Build the multi-branch network: GRU text branch + structured MLP branch + dense classifier.

  The GRU encoding of the text and the output of the three-layer structured
  MLP are concatenated and classified by a [64, 32, 16]-unit GELU network with
  a sigmoid output. Reads the notebook-level names max_len, max_features,
  embedding_dim, embedding_matrix and seed_value.

  Returns:
    An uncompiled Keras Model with inputs [structured features, text sequence].
  """
  def glorot():
    # One fresh seeded initializer per layer
    return GlorotUniform(seed=seed_value)

  def gelu_stack(features, layer_sizes):
    # Dense(GELU) + Dropout(0.5) for each requested width
    for units in layer_sizes:
      features = Dense(units, activation="gelu", kernel_initializer=glorot())(features)
      features = Dropout(rate=0.5, seed=seed_value)(features)
    return features

  # Inputs
  input_structured = Input(shape=(233,))
  input_text = Input(shape=(max_len,))

  # Text Branch
  embedded = Embedding(max_features, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_text)
  encoded = GRU(64, kernel_initializer=glorot(), dropout=0.5, recurrent_dropout=0.5)(embedded)
  output_rnn = Dropout(rate=0.5, seed=seed_value)(encoded)

  # Structured Branch
  output_mlp = gelu_stack(input_structured, (500, 350, 245))

  # Dense Classifier
  merged = Concatenate()([output_mlp, output_rnn])
  merged = gelu_stack(merged, (64, 32, 16))
  prediction = Dense(1, activation="sigmoid", kernel_initializer=glorot())(merged)

  # Define model
  return Model([input_structured, input_text], prediction)

In [None]:
# Build and compile the model inside the distribution strategy's scope
with strategy.scope():
  model = create_model()
  model.compile(
      loss="binary_crossentropy",
      optimizer=Nadam(clipnorm=1.0),
      metrics=["binary_accuracy"],
      steps_per_execution=318,
  )

In [None]:
# Draw the model's layer graph, annotated with tensor shapes and layer names
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
# Print Keras' textual layer-by-layer summary of the model
model.summary()

In [None]:
# Train with early stopping on validation accuracy (best weights are restored),
# then report the accuracy on the training and validation splits
model.fit(
    [X_subtrain_structured, X_subtrain_text],
    y_subtrain,
    validation_data=([X_val_structured, X_val_text], y_val),
    epochs=100,
    batch_size=512,
    shuffle=True,
    verbose=1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_binary_accuracy",
            mode="max",
            patience=7,
            restore_best_weights=True,
        )
    ],
)
# evaluate() returns [loss, binary_accuracy]; index 1 picks the accuracy
print("Training Accuracy: {:.3f}".format(
    model.evaluate([X_subtrain_structured, X_subtrain_text], y_subtrain, verbose=0, batch_size=512)[1]))
print("Validation Accuracy: {:.3f}".format(
    model.evaluate([X_val_structured, X_val_text], y_val, verbose=0, batch_size=512)[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Training Accuracy: 0.847
Validation Accuracy: 0.829


Results:
- No Hidden Layer: 81.4% vs. 80.9%
- 1 Hidden Layer, 32 Units: 84.8% vs. 83%
- 1 Hidden Layer, 64 Units: 84.1% vs. 82.9%
- 1 Hidden Layer, 128 Units: 84.6% vs. 82.9%
- 1 Hidden Layer, 256 Units: 84.2% vs. 82.9%
- 2 Hidden Layers, [256, 64] Units: 85% vs. 83.1%
- 2 Hidden Layers, [128, 32] Units: 84.8% vs. 83%
- 2 Hidden Layers, [64, 16] Units: 85.5% vs. 83.1%
- 3 Hidden Layers, [256, 128, 64] Units: 84.7% vs. 82.9%
- 3 Hidden Layers, [64, 32, 16] Units: 84.7% vs. 82.9%

#### Best-Found Model: NN Structured + Text

**CNN-Based:**
- Multi-branch network consisting of Best NN Structured + Best NN text, cascaded into a Dense classifier
- No changes to structured branch
- added FCNN Layer with 490 units in text branch
- Dense Classifier = 2 Hidden Layers; [256, 64] Units; GELU; Dropout (0.5)
- Optimization: GlorotUniform; Nadam; clipnorm=1.0; Batch Size: 512


**RNN-Based:**
- The RNN output is added to the structured features, and the merged features are then processed with the Best NN Structured
- No changes to best-found NN Text; no changes to best found NN Structured
- Optimization: GlorotUniform; Nadam; clipnorm=1.0; Batch Size: 512
