# Training CharGRU model

In [1]:
import os

# TF_CPP_MIN_LOG_LEVEL must be in the environment before TensorFlow is first
# loaded (importing keras loads it here); set afterwards, the import-time C++
# log messages are not filtered. '2' hides INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import keras  # noqa: E402  (deliberately imported after the env var is set)
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402

from utils.KerasModels import buildCharGRUModel, prepare_data, proba_to_pred, calculate_metrics  # noqa: E402

# Fixed seed so that runs of this notebook are repeatable.
SEED = 812
keras.utils.set_random_seed(SEED)

# Directory that receives the best checkpoint of every model variant.
MODELS_DIR = os.path.join('..', 'models', 'charGRU')
# Upper bound on the number of training epochs passed to fit().
EPOCHS = 200

In [2]:
# Load the "simple" preprocessed URL data. prepare_data returns the
# train/validation/test features and labels plus the vocabulary size.
# (The log-level environment variable is already set in the first cell, so it
# is not repeated here.)
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')
(
    train_features, val_features, test_features,
    train_labels, val_labels, test_labels,
    vocab_size,
) = prepare_data(DATA_PATH)

# Character-level GRU model sized to this dataset's vocabulary.
model = buildCharGRUModel(vocab_size, embSize=32, inputSize=320)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 320)]             0         
                                                                 
 embedding (Embedding)       (None, 320, 32)           3360      
                                                                 
 bidirectional (Bidirectiona  (None, 1024)             1677312   
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 1)                 1025      
                                                                 
Total params: 1,681,697
Trainable params: 1,681,697
Non-trainable params: 0
___________________________________________________

In [3]:
# Train the CharGRU model on the "simple" dataset.
os.makedirs(MODELS_DIR, exist_ok=True)
# Stop once val_loss has not improved for 10 epochs and restore the weights of
# the best epoch into the in-memory model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Save the model to disk each time val_loss reaches a new best.
# `restore_best_weights` is an EarlyStopping option, not a ModelCheckpoint one,
# so it is intentionally not passed to the checkpoint callback.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "simple"), monitor='val_loss',
                                            save_best_only=True)
model.fit(np.array(train_features), np.array(train_labels), batch_size=256, epochs=EPOCHS,
          validation_data=(np.array(val_features), np.array(val_labels)),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 3/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 5/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 6/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 7/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 8/200
Epoch 9/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 10/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 11/200
Epoch 12/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 13/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 14/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 15/200
Epoch 16/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 17/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 18/200
Epoch 19/200
Epoch 20/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 21/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 22/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 23/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 24/200
Epoch 25/200
Epoch 26/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 27/200
Epoch 28/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200



INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\simple\assets


Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200


<keras.callbacks.History at 0x2af01bd2ac0>

In [4]:
# Score the trained "simple" model on the test split: predict probabilities,
# convert them with proba_to_pred, then compute the metrics against the labels.
test_inputs = np.array(test_features)
probas = model.predict(test_inputs)
y_pred = proba_to_pred(probas)
results = calculate_metrics(test_labels, y_pred)
results



{'balanced_accuracy': 0.8353861013302035,
 'f1_score': 0.8420262664165105,
 'precision': 0.8101083032490974,
 'recall': 0.8765625}

# Lemmatized data 

In [5]:
# Load the lemmatized variant of the data. prepare_data returns the
# train/validation/test features and labels plus the vocabulary size.
LEMMATIZED_DATA_PATH = os.path.join('..', 'data', 'lemmatized')
(
    lem_train_features, lem_val_features, lem_test_features,
    lem_train_labels, lem_val_labels, lem_test_labels,
    lem_vocab_size,
) = prepare_data(LEMMATIZED_DATA_PATH)

# Separate CharGRU model sized to the lemmatized vocabulary.
modelLEM = buildCharGRUModel(lem_vocab_size, embSize=32, inputSize=320)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 320)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 320, 32)           3360      
                                                                 
 bidirectional_1 (Bidirectio  (None, 1024)             1677312   
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 1,681,697
Trainable params: 1,681,697
Non-trainable params: 0
_________________________________________________

In [6]:
# Train the CharGRU model on the lemmatized dataset.
# Stop once val_loss has not improved for 10 epochs and restore the weights of
# the best epoch into the in-memory model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Save the model to disk each time val_loss reaches a new best.
# `restore_best_weights` is an EarlyStopping option, not a ModelCheckpoint one,
# so it is intentionally not passed to the checkpoint callback.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "lemmatized"), monitor='val_loss',
                                            save_best_only=True)
modelLEM.fit(np.array(lem_train_features), np.array(lem_train_labels), batch_size=256, epochs=EPOCHS,
             validation_data=(np.array(lem_val_features), np.array(lem_val_labels)),
             callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 3/200
Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 5/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 11/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 12/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 13/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 14/200



INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\lemmatized\assets


Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200


<keras.callbacks.History at 0x2af172b5550>

In [7]:
# Score the lemmatized model on its test split: predict probabilities,
# convert them with proba_to_pred, then compute the metrics against the labels.
lem_test_inputs = np.array(lem_test_features)
probas_lem = modelLEM.predict(lem_test_inputs)
y_pred_lem = proba_to_pred(probas_lem)
results_lem = calculate_metrics(lem_test_labels, y_pred_lem)
results_lem



{'balanced_accuracy': 0.8243856367370892,
 'f1_score': 0.8420682377769961,
 'precision': 0.7658349328214972,
 'recall': 0.93515625}

# Stemmed data

In [8]:
# Load the stemmed variant of the data. prepare_data returns the
# train/validation/test features and labels plus the vocabulary size.
STEMMED_DATA_PATH = os.path.join('..', 'data', 'stemmed')
(
    stem_train_features, stem_val_features, stem_test_features,
    stem_train_labels, stem_val_labels, stem_test_labels,
    stem_vocab_size,
) = prepare_data(STEMMED_DATA_PATH)

# Separate CharGRU model sized to the stemmed vocabulary.
modelSTEM = buildCharGRUModel(stem_vocab_size, embSize=32, inputSize=320)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 320)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 320, 32)           3360      
                                                                 
 bidirectional_2 (Bidirectio  (None, 1024)             1677312   
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense_2 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 1,681,697
Trainable params: 1,681,697
Non-trainable params: 0
_________________________________________________

In [9]:
# Train the CharGRU model on the stemmed dataset.
# Stop once val_loss has not improved for 10 epochs and restore the weights of
# the best epoch into the in-memory model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Save the model to disk each time val_loss reaches a new best.
# `restore_best_weights` is an EarlyStopping option, not a ModelCheckpoint one,
# so it is intentionally not passed to the checkpoint callback.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "stemmed"), monitor='val_loss',
                                            save_best_only=True)
modelSTEM.fit(np.array(stem_train_features), np.array(stem_train_labels), batch_size=256, epochs=EPOCHS,
              validation_data=(np.array(stem_val_features), np.array(stem_val_labels)),
              callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 3/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 5/200
Epoch 6/200
Epoch 7/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 8/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 9/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 10/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 11/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 12/200
Epoch 13/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 14/200
Epoch 15/200
Epoch 16/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 17/200
Epoch 18/200



INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charGRU\stemmed\assets


Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200


<keras.callbacks.History at 0x2af155fd6a0>

In [10]:
# Score the stemmed model on its test split: predict probabilities,
# convert them with proba_to_pred, then compute the metrics against the labels.
stem_test_inputs = np.array(stem_test_features)
probas_stem = modelSTEM.predict(stem_test_inputs)
y_pred_stem = proba_to_pred(probas_stem)
results_stem = calculate_metrics(stem_test_labels, y_pred_stem)
results_stem



{'balanced_accuracy': 0.8290988116197183,
 'f1_score': 0.8422952002887045,
 'precision': 0.7826961770623743,
 'recall': 0.91171875}

In [11]:
# Collect the test metrics of the three CharGRU variants into one table and
# write it to results/char_gru.csv (relative to the working directory).
RESULTS_DIR = 'results'

# One row per dataset variant, in the same order as the 'dataset' labels below.
results_all = pd.DataFrame([results, results_lem, results_stem])
results_all['model'] = 'CharGRU'
results_all['dataset'] = ['simple', 'lemmatized', 'stemmed']

# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the directory and creating it.
os.makedirs(RESULTS_DIR, exist_ok=True)
results_all.to_csv(os.path.join(RESULTS_DIR, 'char_gru.csv'), index=False)
results_all

Unnamed: 0,balanced_accuracy,f1_score,precision,recall,model,dataset
0,0.835386,0.842026,0.810108,0.876563,CharGRU,simple
1,0.824386,0.842068,0.765835,0.935156,CharGRU,lemmatized
2,0.829099,0.842295,0.782696,0.911719,CharGRU,stemmed
