This notebook explores applying recurrent neural networks (a simple RNN and an LSTM) to the post titles, to predict the `viral` label.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Embedding, Flatten, LSTM
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the post data (titles, scores, timestamps and the `viral` label)
# from the CSV that sits next to this notebook.
posts_df = pd.read_csv(filepath_or_buffer='./posts_scores_dates.csv')

# Preview the first five rows as a quick sanity check on the columns.
posts_df.head(n=5)

Unnamed: 0,id,title,created_utc,comments_old,score_old,post_time_utc,score,comments,percentile,viral
0,uu6g0w,[homemade] Polynesian (Chick-Fil-A sauce) chic...,1653077037,0,1,2022-05-20 16:03:57,69,7.0,0.83673,0
1,uu6cni,"[I ate] Scotch mutton pie, pub in Edinburgh",1653076799,0,1,2022-05-20 15:59:59,37,4.0,0.699491,0
2,uu6apo,[homemade] 🇲🇦,1653076639,0,1,2022-05-20 15:57:19,1,0.0,0.117647,0
3,uu644e,"[homemade] Chilli Paneer, Spinach, Potatoes wi...",1653076091,0,1,2022-05-20 15:48:11,16,2.0,0.443585,0
4,uu5x2y,"[Homemade] Tart - Salmon, spinach and goat cheese",1653075500,0,1,2022-05-20 15:38:20,1,0.0,0.117647,0


In [3]:
# (number of rows, number of columns) of the loaded frame.
posts_df.shape

(10795, 10)

In [4]:
# Per-column dtypes; `title` is the text input and `viral` the integer target used below.
posts_df.dtypes

id                object
title             object
created_utc        int64
comments_old       int64
score_old          int64
post_time_utc     object
score              int64
comments         float64
percentile       float64
viral              int64
dtype: object

In [5]:
# Features are the raw title strings; the target is the `viral` column.
X, y = posts_df['title'], posts_df['viral']

# Hold out a test set (scikit-learn's default size). Stratifying on the
# target keeps the class proportions the same in both splits, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=20,
)

In [6]:
# Word-level tokenizer with a capped vocabulary: when converting text to
# sequences, only the most frequent words (below index `num_words`) are kept
# and every other word is dropped.
tokenizer = Tokenizer(num_words=500)

In [7]:
# Build the word index from the training titles only, so no vocabulary
# statistics from the test split leak into preprocessing.
tokenizer.fit_on_texts(X_train)

In [8]:
# Peek at a few raw training titles while X_train still holds strings.
X_train.head()

10358         Closer Look Easter Chick Cupcakes [homemade]
3667     [Homemade] double stuffed bacon, egg, and chee...
1314                [Homemade] A sunny Sunday Charcuterie.
2388                                [homemade] Cheesesteak
6370     [homemade]If you want to eat mushrooms but hat...
Name: title, dtype: object

In [9]:
# Convert every title into a list of integer word indices using the fitted
# tokenizer.
# NOTE(review): this rebinds X_train / X_test from text to sequences, so the
# cell is not idempotent -- re-running it without first re-running the split
# would feed already-tokenized data back into the tokenizer.
X_train, X_test = [
    tokenizer.texts_to_sequences(titles) for titles in (X_train, X_test)
]

In [10]:
# Bring every sequence to a fixed length of 100 so the splits become
# rectangular arrays; shorter sequences are filled with 0 and longer ones
# are truncated (using pad_sequences' default padding/truncating sides).
X_train, X_test = [
    pad_sequences(sequences, maxlen=100) for sequences in (X_train, X_test)
]

Basic RNN model

In [11]:
# Simple-RNN baseline: embed each word index, run a SimpleRNN over the title,
# then classify with a small dense head ending in a single sigmoid unit.
#
# mask_zero=True makes the Embedding emit a mask for index 0, so the recurrent
# layer skips the padded timesteps instead of treating them as real tokens.
# The inputs are zero-padded to a fixed length, and the Keras Tokenizer never
# assigns index 0 to a word, so 0 is unambiguous as "padding" and
# input_dim=tokenizer.num_words still covers every index that can occur.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64, mask_zero=True),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [14]:
# Configure training for the binary target. The optimizer is spelled out
# here; 'rmsprop' is the value Keras uses when none is given.
# Accuracy and binary cross-entropy are tracked as metrics.
# NOTE(review): if `viral` posts are rare, accuracy alone can look good for a
# trivial classifier -- consider also tracking AUC / precision / recall.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc', 'bce'],
)

In [15]:
# Train for 10 epochs, evaluating on the held-out split after each epoch.
# NOTE(review): the test split doubles as validation data here, so these
# validation numbers are not an untouched final estimate.
model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f86a36c1490>

Basic LSTM model

In [16]:
# LSTM variant of the baseline: same embedding and dense head, with the
# recurrent layer swapped for an LSTM. Note that this rebinds `model`, so the
# previously trained network is no longer reachable under that name.
#
# mask_zero=True makes the Embedding emit a mask for index 0, so the LSTM
# skips the padded timesteps instead of treating them as real tokens. The
# inputs are zero-padded to a fixed length, and the Keras Tokenizer never
# assigns index 0 to a word, so input_dim=tokenizer.num_words still covers
# every index that can occur.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64, mask_zero=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with accuracy as the reported metric; the
# optimizer is left at the Keras default.
model.compile(loss='bce', metrics=['acc'])

# Train for 10 epochs, evaluating on the held-out split after each epoch.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f86a428a950>