# Training CharGRU model

In [1]:
import pandas as pd
import numpy as np 
import os 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from numpy import savetxt, loadtxt 
from utils.KerasModels import generateExpData
from utils.KerasModels import buildCharCNNModel, buildCharGRUModel, prepare_data, proba_to_pred, calculate_metrics
from sklearn.metrics import balanced_accuracy_score, make_scorer, f1_score, precision_score, recall_score





In [2]:
# Same TF_CPP_MIN_LOG_LEVEL value as in the imports cell, repeated here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Location of the "simple" preprocessed URL dataset, relative to this notebook.
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')

# prepare_data returns the three feature splits, the three label splits,
# and the vocabulary size, in that order.
(
    train_features, val_features, test_features,
    train_labels, val_labels, test_labels,
    vocab_size,
) = prepare_data(DATA_PATH)

# Build the model with buildCharGRUModel (embSize=32, inputSize=320).
model = buildCharGRUModel(
    vocab_size,
    embSize=32,
    inputSize=320,
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 320)]             0         
                                                                 
 embedding (Embedding)       (None, 320, 32)           3360      
                                                                 
 bidirectional (Bidirectiona  (None, 1024)             1677312   
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 1)                 1025      
                                                                 
Total params: 1,681,697
Trainable params: 1,681,697
Non-trainable params: 0
___________________________________________________

In [4]:
# Fit the model built by buildCharGRUModel on the simple-data training split,
# passing the validation split as validation_data.
model.fit(
    np.array(train_features),
    np.array(train_labels),
    batch_size=256,
    epochs=10,
    validation_data=(
        np.array(val_features),
        np.array(val_labels),
    ),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1473ab690>

In [5]:
# Run the trained model on the held-out test features.
probas = model.predict(
    np.array(test_features)
)

# Turn the raw model outputs into predictions via proba_to_pred.
y_pred = proba_to_pred(probas)

# Metrics for the simple dataset; kept in `results` for the summary table.
results = calculate_metrics(test_labels, y_pred)

# Bare expression so the notebook displays the metrics.
results



{'balanced_accuracy': 0.8337361738910609,
 'f1_score': 0.8468516542155816,
 'precision': 0.7847371023570133,
 'recall': 0.9196445818041337}

# Lemmatized data 

In [9]:
# Location of the lemmatized dataset, relative to this notebook.
LEMMATIZED_DATA_PATH = os.path.join('..', 'data', 'lemmatized')

# prepare_data returns the three feature splits, the three label splits,
# and the vocabulary size, in that order.
(
    lem_train_features, lem_val_features, lem_test_features,
    lem_train_labels, lem_val_labels, lem_test_labels,
    lem_vocab_size,
) = prepare_data(LEMMATIZED_DATA_PATH)

# Separate model instance for the lemmatized data, same builder settings.
modelLEM = buildCharGRUModel(
    lem_vocab_size,
    embSize=32,
    inputSize=320,
)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 320)]             0         
                                                                 
 embedding_3 (Embedding)     (None, 320, 32)           3360      
                                                                 
 bidirectional_3 (Bidirectio  (None, 1024)             1677312   
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 1024)              0         
                                                                 
 dense_3 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 1,681,697
Trainable params: 1,681,697
Non-trainable params: 0
_________________________________________________

In [10]:
# Fit the model built by buildCharGRUModel on the lemmatized training split,
# passing the lemmatized validation split as validation_data.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt
# during epoch 7 — re-run to completion before relying on later metrics.
modelLEM.fit(
    np.array(lem_train_features),
    np.array(lem_train_labels),
    batch_size=256,
    epochs=10,
    validation_data=(
        np.array(lem_val_features),
        np.array(lem_val_labels),
    ),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 4/81 [>.............................] - ETA: 9:02 - loss: 0.5486 - accuracy: 0.7109

KeyboardInterrupt: 

In [None]:
# Run the lemmatized-data model on its held-out test features.
probas_lem = modelLEM.predict(
    np.array(lem_test_features)
)

# Turn the raw model outputs into predictions via proba_to_pred.
y_pred_lem = proba_to_pred(probas_lem)

# Metrics for the lemmatized dataset; kept for the summary table.
results_lem = calculate_metrics(lem_test_labels, y_pred_lem)

# Bare expression so the notebook displays the metrics.
results_lem



{'balanced_accuracy': 0.9082764405352697,
 'f1_score': 0.9130673499267936,
 'precision': 0.8675013041210224,
 'recall': 0.9636855321614834}

# Stemmed data

In [8]:
# Location of the stemmed dataset, relative to this notebook.
STEMMED_DATA_PATH = os.path.join('..', 'data', 'stemmed')

# prepare_data returns the three feature splits, the three label splits,
# and the vocabulary size, in that order.
(
    stem_train_features, stem_val_features, stem_test_features,
    stem_train_labels, stem_val_labels, stem_test_labels,
    stem_vocab_size,
) = prepare_data(STEMMED_DATA_PATH)

# Separate model instance for the stemmed data, same builder settings.
modelSTEM = buildCharGRUModel(
    stem_vocab_size,
    embSize=32,
    inputSize=320,
)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 320)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 320, 32)           3360      
                                                                 
 bidirectional_2 (Bidirectio  (None, 1024)             1677312   
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense_2 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 1,681,697
Trainable params: 1,681,697
Non-trainable params: 0
_________________________________________________

In [None]:
# Fit the model built by buildCharGRUModel on the stemmed training split,
# passing the stemmed validation split as validation_data.
# NOTE(review): the saved output of this cell lists 25 epochs while this call
# requests epochs=10 — re-run the cell so code and output agree.
modelSTEM.fit(
    np.array(stem_train_features),
    np.array(stem_train_labels),
    batch_size=256,
    epochs=10,
    validation_data=(
        np.array(stem_val_features),
        np.array(stem_val_labels),
    ),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x14cb50d10>

In [None]:
# Run the stemmed-data model on its held-out test features.
probas_stem = modelSTEM.predict(
    np.array(stem_test_features)
)

# Turn the raw model outputs into predictions via proba_to_pred.
y_pred_stem = proba_to_pred(probas_stem)

# Metrics for the stemmed dataset; kept for the summary table.
results_stem = calculate_metrics(stem_test_labels, y_pred_stem)

# Bare expression so the notebook displays the metrics.
results_stem



{'balanced_accuracy': 0.9185041834188477,
 'f1_score': 0.9196649533599848,
 'precision': 0.9065490711202853,
 'recall': 0.9331659262120919}

In [None]:
# Collect the metrics of the three CharGRU runs into one table and save it as CSV.
# One row per dataset, in the same order as the 'dataset' labels below.
results_all = pd.DataFrame([results, results_lem, results_stem])
results_all['model'] = 'CharGRU'
results_all['dataset'] = ['simple', 'lemmatized', 'stemmed']

# exist_ok=True creates the directory only when it is missing, replacing the
# separate os.path.exists check.
os.makedirs('results', exist_ok=True)

# These rows are labelled 'CharGRU', so write them to a CharGRU-specific file;
# the previous name 'char_cnn.csv' did not match the model stored in the table.
results_all.to_csv(os.path.join('results', 'char_gru.csv'), index=False)

# Bare expression so the notebook displays the summary table.
results_all

Unnamed: 0,balanced_accuracy,f1_score,precision,recall,model,dataset
0,0.921062,0.921759,0.913497,0.930172,CharCNN,simple
1,0.908276,0.913067,0.867501,0.963686,CharCNN,lemmatized
2,0.918504,0.919665,0.906549,0.933166,CharCNN,stemmed
