# Imports

In [1]:
import os

import keras
import pandas as pd

import utils.word_utils as wu
from utils.KerasModels import calculate_metrics

# --- Run configuration -------------------------------------------------------
SEED = 812          # passed to keras.utils.set_random_seed below
EPOCHS = 200        # upper bound on training epochs handed to model.fit
MAX_LEN = 100       # max_len given to the text vectorizer
VOCAB_SIZE = 15000  # vocab_size given to the text vectorizer

# Directory where the checkpoints of every WordGRU variant are written.
MODELS_DIR = os.path.join('..', 'models', 'wordGRU')

# Seed once, up front, so that runs are repeatable.
keras.utils.set_random_seed(SEED)

# Simple data

In [2]:
# Simple dataset: load the six splits, prepare the vectorizer, build the model.
data_path = os.path.join('..', 'data', 'preprocessed_url_simple')
splits = wu.prepare_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = splits

# Make sure the checkpoint directory exists before any training cell runs.
os.makedirs(MODELS_DIR, exist_ok=True)

# Only the training inputs are handed to the vectorizer helper.
text_vectorizer = wu.prepare_text_vectorizer(
    x_train,
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
)
model = wu.get_wordgru(text_vectorizer)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 128)          1920000   
                                                                 
 bidirectional (Bidirectiona  (None, 1024)             1972224   
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 1)                 1025  

In [3]:
# Train GRU model.
# EarlyStopping ends training once val_loss has gone 10 epochs without improving
# and rolls the in-memory model back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# ModelCheckpoint writes the model to MODELS_DIR/simple whenever val_loss improves.
# `restore_best_weights` is an EarlyStopping option, not a ModelCheckpoint one,
# so it must not be passed here.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "simple"), monitor='val_loss',
                                            save_best_only=True)
# The validation split drives both callbacks; EPOCHS is only an upper bound.
model.fit(x_train, y_train, batch_size=256, epochs=EPOCHS,
          validation_data=(x_valid, y_valid),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


Epoch 3/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\simple\assets


Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200


<keras.callbacks.History at 0x1cb36ecb4c0>

In [4]:
# Score the test split of the simple dataset.
probas = model.predict(x_test)
# Binarise the scores: strictly above 0.5 becomes 1, everything else 0.
above_threshold = probas > 0.5
y_pred = above_threshold.astype('int32')
# Kept under the name `results` so the summary cell can pick it up.
results = calculate_metrics(y_test, y_pred)
results



{'balanced_accuracy': 0.8045456768388106,
 'f1_score': 0.8019017432646594,
 'precision': 0.8135048231511254,
 'recall': 0.790625}

# Lemmatized data

In [5]:
# Lemmatized dataset: same pipeline as before, pointed at the lemmatized data.
data_path = os.path.join('..', 'data', 'lemmatized')
splits = wu.prepare_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = splits

# Make sure the checkpoint directory exists before any training cell runs.
os.makedirs(MODELS_DIR, exist_ok=True)

# A new vectorizer and a new model are created for this dataset;
# `text_vectorizer` and `model` from the earlier section are overwritten.
text_vectorizer = wu.prepare_text_vectorizer(
    x_train,
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
)
model = wu.get_wordgru(text_vectorizer)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_1 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding_1 (Embedding)     (None, 100, 128)          1920000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 1024)             1972224   
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 1025

In [6]:
# Train GRU model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "lemmatized"), monitor='val_loss',
                                            save_best_only=True, restore_best_weights=True)
model.fit(x_train, y_train, batch_size=256, epochs=EPOCHS,
          validation_data=(x_valid, y_valid),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\lemmatized\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\lemmatized\assets


Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


<keras.callbacks.History at 0x1cb43b87a00>

In [7]:
# Score the test split of the lemmatized dataset.
probas = model.predict(x_test)
# Binarise the scores: strictly above 0.5 becomes 1, everything else 0.
above_threshold = probas > 0.5
y_pred = above_threshold.astype('int32')
# Kept under the name `results_lem` so the summary cell can pick it up.
results_lem = calculate_metrics(y_test, y_pred)
results_lem



{'balanced_accuracy': 0.8287320275821597,
 'f1_score': 0.837295690936107,
 'precision': 0.7981586402266289,
 'recall': 0.88046875}

# Stemmed data

In [8]:
# Stemmed dataset: same pipeline as before, pointed at the stemmed data.
data_path = os.path.join('..', 'data', 'stemmed')
splits = wu.prepare_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = splits

# Make sure the checkpoint directory exists before any training cell runs.
os.makedirs(MODELS_DIR, exist_ok=True)

# A new vectorizer and a new model are created for this dataset;
# `text_vectorizer` and `model` from the earlier section are overwritten.
text_vectorizer = wu.prepare_text_vectorizer(
    x_train,
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
)
model = wu.get_wordgru(text_vectorizer)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization_2 (TextV  (None, 100)              0         
 ectorization)                                                   
                                                                 
 embedding_2 (Embedding)     (None, 100, 128)          1920000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 1024)             1972224   
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense_2 (Dense)             (None, 1)                 1025

In [9]:
# Train GRU model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "stemmed"), monitor='val_loss',
                                            save_best_only=True, restore_best_weights=True)
model.fit(x_train, y_train, batch_size=256, epochs=EPOCHS,
          validation_data=(x_valid, y_valid),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\stemmed\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\wordGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\wordGRU\stemmed\assets


Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


<keras.callbacks.History at 0x1cb41f189a0>

In [10]:
# Score the test split of the stemmed dataset.
probas = model.predict(x_test)
# Binarise the scores: strictly above 0.5 becomes 1, everything else 0.
above_threshold = probas > 0.5
y_pred = above_threshold.astype('int32')
# Kept under the name `results_stem` so the summary cell can pick it up.
results_stem = calculate_metrics(y_test, y_pred)
results_stem



{'balanced_accuracy': 0.8244363751956182,
 'f1_score': 0.8322749346283153,
 'precision': 0.7974230493915533,
 'recall': 0.8703125}

# Results

In [11]:
# Create csv with all results

# One row per dataset variant, in the order the experiments were run above.
results_all = pd.DataFrame([results, results_lem, results_stem])
results_all['model'] = 'WordGRU'
results_all['dataset'] = ['simple', 'lemmatized', 'stemmed']

# exist_ok=True replaces the racy "check, then create" pattern and matches how
# MODELS_DIR is created elsewhere in this notebook.
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)
# index=False keeps the row index out of the csv file.
results_all.to_csv(os.path.join(results_dir, 'word_gru.csv'), index=False)
results_all

Unnamed: 0,balanced_accuracy,f1_score,precision,recall,model,dataset
0,0.804546,0.801902,0.813505,0.790625,WordGRU,simple
1,0.828732,0.837296,0.798159,0.880469,WordGRU,lemmatized
2,0.824436,0.832275,0.797423,0.870313,WordGRU,stemmed
