# Imports

In [1]:
import os

import keras
import pandas as pd

import utils.word_utils as wu
from utils.KerasModels import calculate_metrics

# Single named seed so the value is easy to find and change; set_random_seed
# seeds Python's `random`, NumPy and the Keras backend in one call.
SEED = 812
keras.utils.set_random_seed(SEED)

# Directory that receives the model checkpoints written during training.
MODELS_DIR = os.path.join('..', 'models', 'wordCNN')
# Maximum number of epochs handed to model.fit.
EPOCHS = 200
# Forwarded as max_len / vocab_size to wu.prepare_text_vectorizer.
MAX_LEN = 100
VOCAB_SIZE = 15000

# Simple data

In [2]:
# Load the six data splits for the "simple" preprocessing variant and build
# the WordCNN model on top of a text vectorizer prepared from x_train.
data_path = os.path.join('..', 'data', 'preprocessed_url_simple')
splits = wu.prepare_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = splits

# Checkpoints are written under MODELS_DIR, so make sure it exists.
os.makedirs(MODELS_DIR, exist_ok=True)

text_vectorizer = wu.prepare_text_vectorizer(
    x_train,
    max_len=MAX_LEN,
    vocab_size=VOCAB_SIZE,
)
model = wu.get_wordcnn(text_vectorizer)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 100)         0           ['input_1[0][0]']                
 ization)                                                                                         
                                                                                                  
 embedding (Embedding)          (None, 100, 128)     1920000     ['text_vectorization[0][0]']     
                                                                                                  
 conv1d (Conv1D)                (None, 98, 128)      49280       ['embedding[0][0]']          

In [3]:
# Train CNN model.
# Stop once val_loss has not improved for 10 epochs and put the best-epoch
# weights back into the in-memory model.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True)
# Save the model to disk whenever val_loss improves. `restore_best_weights`
# is an EarlyStopping option, not a ModelCheckpoint parameter, so it is not
# passed here.
save_best = keras.callbacks.ModelCheckpoint(
    os.path.join(MODELS_DIR, "simple"), monitor='val_loss', save_best_only=True)
model.fit(x_train, y_train, batch_size=256, epochs=EPOCHS,
          validation_data=(x_valid, y_valid),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\wordCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\wordCNN\simple\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\wordCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\wordCNN\simple\assets


Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


<keras.callbacks.History at 0x24305aea970>

In [4]:
# Evaluate on the test split: outputs strictly above 0.5 become label 1,
# everything else label 0, and the metrics are computed on those labels.
probas = model.predict(x_test)
is_positive = probas > 0.5
y_pred = is_positive.astype('int32')
results = calculate_metrics(y_test, y_pred)
results



{'balanced_accuracy': 0.81819309956964,
 'f1_score': 0.8237968927624101,
 'precision': 0.7998528329654158,
 'recall': 0.84921875}

# Lemmatized data

In [5]:
# Same pipeline as before, this time fed with the lemmatized dataset.
data_path = os.path.join('..', 'data', 'lemmatized')

(x_train, y_train,
 x_valid, y_valid,
 x_test, y_test) = wu.prepare_data(data_path)

# Target directory for the checkpoints of this run.
os.makedirs(MODELS_DIR, exist_ok=True)

# A fresh vectorizer is prepared from this dataset's training texts.
text_vectorizer = wu.prepare_text_vectorizer(
    x_train, max_len=MAX_LEN, vocab_size=VOCAB_SIZE
)
model = wu.get_wordcnn(text_vectorizer)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_1 (TextVect  (None, 100)         0           ['input_2[0][0]']                
 orization)                                                                                       
                                                                                                  
 embedding_1 (Embedding)        (None, 100, 128)     1920000     ['text_vectorization_1[0][0]']   
                                                                                                  
 conv1d_3 (Conv1D)              (None, 98, 128)      49280       ['embedding_1[0][0]']      

In [6]:
# Train CNN model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "lemmatized"), monitor='val_loss',
                                            save_best_only=True, restore_best_weights=True)
model.fit(x_train, y_train, batch_size=256, epochs=EPOCHS,
          validation_data=(x_valid, y_valid),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


<keras.callbacks.History at 0x24360ef8ee0>

In [7]:
# Test-set evaluation of the model trained on lemmatized data.
probas = model.predict(x_test)
# Boolean mask (output > 0.5) cast to int32 gives the 0/1 predictions.
above_half = probas > 0.5
y_pred = above_half.astype('int32')
results_lem = calculate_metrics(y_test, y_pred)
results_lem



{'balanced_accuracy': 0.8096030174100157,
 'f1_score': 0.8130518234165067,
 'precision': 0.7992452830188679,
 'recall': 0.82734375}

# Stemmed data

In [8]:
# Third variant: load the stemmed dataset and build a new WordCNN for it.
data_path = os.path.join('..', 'data', 'stemmed')
stemmed_splits = wu.prepare_data(data_path)
x_train, y_train, x_valid, y_valid, x_test, y_test = stemmed_splits

os.makedirs(MODELS_DIR, exist_ok=True)

# Keyword options shared with the other dataset variants.
vectorizer_options = {'max_len': MAX_LEN, 'vocab_size': VOCAB_SIZE}
text_vectorizer = wu.prepare_text_vectorizer(x_train, **vectorizer_options)
model = wu.get_wordcnn(text_vectorizer)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization_2 (TextVect  (None, 100)         0           ['input_3[0][0]']                
 orization)                                                                                       
                                                                                                  
 embedding_2 (Embedding)        (None, 100, 128)     1920000     ['text_vectorization_2[0][0]']   
                                                                                                  
 conv1d_6 (Conv1D)              (None, 98, 128)      49280       ['embedding_2[0][0]']      

In [9]:
# Train CNN model.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "lemmatized"), monitor='val_loss',
                                            save_best_only=True, restore_best_weights=True)
model.fit(x_train, y_train, batch_size=256, epochs=EPOCHS,
          validation_data=(x_valid, y_valid),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\wordCNN\lemmatized\assets


Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200


<keras.callbacks.History at 0x2455dcd9c10>

In [10]:
# Test-set evaluation of the model trained on stemmed data.
probas = model.predict(x_test)
# Threshold at 0.5 (strictly greater) and cast the mask to int32 labels.
exceeds_threshold = probas > 0.5
y_pred = exceeds_threshold.astype('int32')
results_stem = calculate_metrics(y_test, y_pred)
results_stem



{'balanced_accuracy': 0.8201511150234742,
 'f1_score': 0.8249619482496195,
 'precision': 0.8041543026706232,
 'recall': 0.846875}

# Results

In [11]:
# Create csv with all results

# One row per dataset variant; the 'dataset' labels follow the same order as
# the metric dictionaries passed to the DataFrame constructor.
results_all = pd.DataFrame([results, results_lem, results_stem])
results_all['model'] = 'WordCNN'
results_all['dataset'] = ['simple', 'lemmatized', 'stemmed']

# exist_ok=True creates the directory only when missing, replacing the
# separate "exists?" check, and matches how MODELS_DIR is created.
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)
results_all.to_csv(os.path.join(results_dir, 'word_cnn.csv'), index=False)
results_all

Unnamed: 0,balanced_accuracy,f1_score,precision,recall,model,dataset
0,0.818193,0.823797,0.799853,0.849219,WordCNN,simple
1,0.809603,0.813052,0.799245,0.827344,WordCNN,lemmatized
2,0.820151,0.824962,0.804154,0.846875,WordCNN,stemmed
