# LSTM model


In [1]:
# Authenticate the Colab user; later cells copy files from gs:// buckets
# with gsutil.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [2]:
!gcloud auth login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?code_challenge=grRJZIi00nlnE2CrsnKpkdQuzTThUWqr-CtGED7U1fo&prompt=select_account&code_challenge_method=S256&access_type=offline&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&response_type=code&client_id=32555940559.apps.googleusercontent.com&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth


Enter verification code: <REDACTED>
If you need to use ADC, see:
  gcloud auth application-default --help

You are now logged in as [<REDACTED_EMAIL>].
Your current project is [None].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


To take a quick anonymous survey, run:
  $ gcloud survey



In [3]:
%env GCLOUD_PROJECT=reddit-master

env: GCLOUD_PROJECT=reddit-master


In [4]:
import numpy as np 
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import re
import plotly.graph_objs as go
import chart_studio.plotly as py
import cufflinks
import plotly.figure_factory as ff
import logging
import nltk
nltk.download('stopwords')
import keras.backend as K

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import log_loss
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
from tensorflow import metrics, local_variables_initializer
from keras.models import load_model

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using TensorFlow backend.


In [5]:
# importing my final ds

!gsutil cp gs://reddit_final_results/reddit_clear_df.pkl .

Copying gs://reddit_final_results/reddit_clear_df.pkl...
- [1 files][253.2 MiB/253.2 MiB]                                                
Operation completed over 1 objects/253.2 MiB.                                    


In [0]:
# Load the DataFrame stored in reddit_clear_df.pkl (copied from GCS above).
# Its 'body' and 'subreddit' columns are used below to build X and Y.
model_LSTM_df = pd.read_pickle('reddit_clear_df.pkl')

In [7]:
# importing my final tokenizer

!gsutil cp gs://reddit_models/reddit_tokenizer.pkl .

Copying gs://reddit_models/reddit_tokenizer.pkl...
- [1 files][ 47.0 MiB/ 47.0 MiB]                                                
Operation completed over 1 objects/47.0 MiB.                                     


In [0]:
# Restore the tokenizer object from the pickle copied from GCS above.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a bucket you control.
with open('reddit_tokenizer.pkl', 'rb') as file:
    tokenizer = pkl.load(file)

## Implementing the model

In [0]:
# Hyper-parameters shared by the preprocessing and model-building cells.
# First argument of the Embedding layer (vocabulary size).
MAX_NB_WORDS = 75000
# `maxlen` handed to pad_sequences and length of the model's Input.
MAX_SEQUENCE_LENGTH = 450
# Second argument of the Embedding layer (size of each word vector).
EMBEDDING_DIM = 100

In [11]:
# Turn every comment body into a sequence of token ids, then pad the sequences
# to MAX_SEQUENCE_LENGTH so they form one 2-D array (one row per comment).
X = tokenizer.texts_to_sequences(model_LSTM_df['body'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (1546818, 450)


In [12]:
# One-hot encode the subreddit label: one column per distinct subreddit.
# NOTE(review): label_decoder further down hard-codes an index -> name
# mapping; confirm it matches the column order produced by get_dummies here.
Y = pd.get_dummies(model_LSTM_df['subreddit']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (1546818, 10)


In [19]:
%%time

def train_dev_test_split(X, y, holdout_size=0.3, test_share=0.5, random_state=42):
    """Split features and labels into stratified train / validation / test sets.

    The data is split twice with ``train_test_split``: first a hold-out part
    is carved off the full data, then that hold-out part is divided into the
    validation and test sets. Both splits are stratified on the labels.

    Args:
        X: feature array, one row per sample.
        y: label array aligned with ``X``; also used for stratification.
        holdout_size: ``test_size`` of the first split, i.e. the share of the
            data reserved for validation + test (default 0.3).
        test_share: ``test_size`` of the second split, i.e. the share of the
            hold-out part that becomes the test set (default 0.5).
        random_state: seed passed to both splits so the result is reproducible
            (default 42).

    Returns:
        A tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y,
        test_size=holdout_size,
        random_state=random_state,
        stratify=y,
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout,
        test_size=test_share,
        random_state=random_state,
        stratify=y_holdout,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


# Materialise the three subsets and report the shape of each (features, labels) pair.
X_train, X_val, X_test, Y_train, Y_val, Y_test = train_dev_test_split(X, Y)

for features, labels in ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test)):
    print(features.shape, labels.shape)

(1082772, 450) (1082772, 10)
(232023, 450) (232023, 10)
(232023, 450) (232023, 10)
CPU times: user 20.7 s, sys: 890 ms, total: 21.6 s
Wall time: 21.6 s


# Training with auc 

In [0]:
# defining auc function

def auc(y_true, y_pred):
    """Custom Keras metric wrapping TensorFlow's ``metrics.auc``.

    NOTE: defining this function rebinds the name ``auc`` that the import
    cell pulled in from ``sklearn.metrics``.

    Args:
        y_true: ground-truth tensor supplied by Keras.
        y_pred: prediction tensor supplied by Keras.

    Returns:
        The second element of the pair returned by ``metrics.auc``
        (per the TF1 documentation that is the update op -- TODO confirm
        against the installed TensorFlow version).
    """
    _, auc_update_op = metrics.auc(y_true, y_pred)
    # metrics.auc creates local variables; initialise them in the Keras session.
    K.get_session().run(local_variables_initializer())
    return auc_update_op


In [0]:
K.clear_session()

# Build the network with Keras' functional API so that each layer can be
# inspected individually later on.
inpt = Input(shape=(MAX_SEQUENCE_LENGTH,))
# input_length uses the same constant the sequences were padded with, instead
# of reading it back from the global X (which a later cell rebinds).
emb = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(inpt)
dropout = SpatialDropout1D(0.2)(emb)
lstm = LSTM(30, dropout=0.2, recurrent_dropout=0.2)(dropout)
# 10 output units: one per column of the one-hot label matrix Y.
output = Dense(10, activation='softmax')(lstm)
model = Model(inputs=[inpt], outputs=[output])
# Track the custom `auc` metric defined above during training/evaluation.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[auc])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

In [0]:
%%time

# Upper bound on the number of epochs and the mini-batch size.
epochs, batch_size = 10, 150

# Stop training once val_loss fails to improve by at least 0.01 for one epoch.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=1)

history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping],
)

Train on 1082772 samples, validate on 232023 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
CPU times: user 7h 19min 40s, sys: 27min 11s, total: 7h 46min 51s
Wall time: 5h 16min 58s


In [0]:
model.save('auc_model_lstm_30_batchsize_150_10_subreddits.h5')

In [0]:
!gsutil cp auc_model_lstm_30_batchsize_150_10_subreddits.h5 gs://reddit_models

Copying file://auc_model_lstm_30_batchsize_150_10_subreddits.h5 [Content-Type=application/octet-stream]...
\
Operation completed over 1 objects/86.0 MiB.                                     


In [20]:
# Evaluate on the held-out test split. evaluate() returns [loss, metric]; this
# model was compiled with metrics=[auc], so the second value is AUC, not
# accuracy.
result_auc = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  AUC: {:0.3f}'.format(result_auc[0], result_auc[1]))

Test set
  Loss: 0.941
  Accuracy: 0.946


# Training with metric 'accuracy'

In [0]:
K.clear_session()

# Build the network with Keras' functional API so that each layer can be
# inspected individually later on.
inpt = Input(shape=(MAX_SEQUENCE_LENGTH,))
# input_length uses the same constant the sequences were padded with, instead
# of reading it back from the global X (which a later cell rebinds).
emb = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(inpt)
dropout = SpatialDropout1D(0.2)(emb)
lstm = LSTM(30, dropout=0.2, recurrent_dropout=0.2)(dropout)
# 10 output units: one per column of the one-hot label matrix Y.
output = Dense(10, activation='softmax')(lstm)
model = Model(inputs=[inpt], outputs=[output])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 450)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 450, 100)          7500000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 450, 100)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 30)                15720     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                310       
Total params: 7,516,030
Trainable params: 7,516,030
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
%%time

# Upper bound on the number of epochs and the mini-batch size.
epochs, batch_size = 10, 150

# Stop training once val_loss fails to improve by at least 0.01 for one epoch.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=1)

history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[early_stopping],
)

Train on 1082772 samples, validate on 232023 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
CPU times: user 6h 29min 16s, sys: 24min 22s, total: 6h 53min 39s
Wall time: 4h 37min 27s


In [0]:
model.save('accuracy_model_lstm_30_batchsize_150_10_subreddits.h5')

In [0]:
!gsutil cp accuracy_model_lstm_30_batchsize_150_10_subreddits.h5 gs://reddit_models

Copying file://accuracy_model_lstm_30_batchsize_150_10_subreddits.h5 [Content-Type=application/octet-stream]...
\
Operation completed over 1 objects/86.0 MiB.                                     


In [21]:
# In case you need to download the saved model

!gsutil cp gs://reddit_models/accuracy_model_lstm_30_batchsize_150_10_subreddits.h5 .

Copying gs://reddit_models/accuracy_model_lstm_30_batchsize_150_10_subreddits.h5...
- [1 files][ 86.0 MiB/ 86.0 MiB]                                                
Operation completed over 1 objects/86.0 MiB.                                     


In [0]:
model_acc = load_model("accuracy_model_lstm_30_batchsize_150_10_subreddits.h5")

In [23]:
# Score the reloaded accuracy model on the test split; the first value is the
# loss and the second is the metric the model was compiled with.
result_accr = model_acc.evaluate(X_test, Y_test)
test_loss, test_accuracy = result_accr[0], result_accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

Test set
  Loss: 0.941
  Accuracy: 0.698


# Testing the best model

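#### best model: auc_model_lstm_30_batchsize_150_10_subreddits

Note: the 0.946 reported for this model is its AUC metric, while the 0.698 reported for the other model is accuracy, so the two scores are not directly comparable; both models reach the same test loss (0.941).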

In [13]:
# In case you need to download the saved model

!gsutil cp gs://reddit_models/auc_model_lstm_30_batchsize_150_10_subreddits.h5 .

Copying gs://reddit_models/auc_model_lstm_30_batchsize_150_10_subreddits.h5...
- [1 files][ 86.0 MiB/ 86.0 MiB]                                                
Operation completed over 1 objects/86.0 MiB.                                     


In [0]:
# The saved model was compiled with the custom `auc` metric, so load_model
# needs a name -> function mapping for it; this dict is passed as
# `custom_objects` in the next cell.
dependencies = {
    'auc': auc
}

In [18]:
model = load_model("auc_model_lstm_30_batchsize_150_10_subreddits.h5", custom_objects=dependencies)

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [0]:
def label_decoder(subreddit):
    """Translate a class index (0-9) into the corresponding subreddit name.

    Args:
        subreddit: integer class index.

    Returns:
        The subreddit name stored for that index.

    Raises:
        KeyError: if the index is not one of the ten known classes.
    """
    names = (
        "Fitness",
        "atheism",
        "aww",
        "europe",
        "gaming",
        "movies",
        "nba",
        "politics",
        "science",
        "technology",
    )
    # Position in the tuple is the class index.
    index_to_name = dict(enumerate(names))
    return index_to_name[subreddit]

In [0]:
# Read the text to classify from the user, then acknowledge it after a blank line.
text = input("Please, enter the text of your blog: ")
print()
print("Text correctly entered, I'll give you that subreddit in a minute")

Please, enter the text of your blog: Multi-label classification originated from the investigation of text categorisation problem, where each document may belong to several predefined topics simultaneously. Multi-label classification of textual data is an important problem. Examples range from news articles to emails. For instance, this can be employed to find the genres that a movie belongs to, based on the summary of its plot.

Text correctly entered, I'll give you that subreddit in a minute


In [0]:
# Same values as the constants defined in the "Implementing the model"
# section -- presumably repeated so this cell can run without that section.
MAX_NB_WORDS = 75000
MAX_SEQUENCE_LENGTH = 450
EMBEDDING_DIM = 100

# Tokenise and pad the single user-supplied text exactly like the training data.
# NOTE(review): this rebinds X, replacing the full data tensor built earlier
# with a one-row array; re-run the earlier cells if the original X is needed.
X = tokenizer.texts_to_sequences([text])
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (1, 450)


In [0]:
result = model.predict(X)

def convert_to_cat(arr):
    """Return the index of the largest element of ``arr``.

    On ties the first occurrence wins; an empty sequence yields 0.
    """
    best_idx = 0
    for idx, value in enumerate(arr):
        # Strict comparison keeps the earliest index when values are equal.
        if value > arr[best_idx]:
            best_idx = idx
    return best_idx

# Map every row of the model's softmax output to the name of its
# highest-scoring subreddit.
category = [label_decoder(convert_to_cat(row)) for row in result]

# User-facing message (typo "sureddit" corrected).
print("you should publish your text into the subreddit: ", category)

you should publish your text into the sureddit:  ['technology']
