# Training the character-level CNN (CharCNN) model

In [1]:
import os

import keras
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score

from utils.KerasModels import buildCharCNNModel
from utils.KerasModels import generateExpData

# Fix the random seed through Keras so repeated runs start from the same state.
keras.utils.set_random_seed(812)
# Directory under which every experiment's checkpoint is written.
MODELS_DIR = os.path.join('..', 'models', 'charCNN')
# Upper bound on the number of epochs passed to each fit() call.
EPOCHS = 200

In [2]:
def prepare_data(data_directory):
    """Load the CSV splits found in ``data_directory`` and convert them to model inputs.

    Reads ``train.csv``, ``validation.csv`` and ``test.csv`` (in that order), keeps
    the ``screen_name``, ``text`` and ``account.type`` columns, and passes each split
    through ``generateExpData``. The tokenizer is ``None`` for the training split;
    the tokenizer returned by each call is handed on to the next call.

    Returns:
        A 7-tuple ``(train_features, val_features, test_features, train_labels,
        val_labels, test_labels, vocab_size)``. The labels are plain lists in which
        "human" is encoded as 0 and "bot" as 1; ``vocab_size`` is the number of
        entries in the final tokenizer's ``word_index``.

    Raises:
        KeyError: if an ``account.type`` value is neither "human" nor "bot".
    """
    kept_columns = ["screen_name", "text", "account.type"]
    label_ids = {"human": 0, "bot": 1}

    # Stage 1: read every split from disk.
    raw_frames = [
        pd.read_csv(os.path.join(data_directory, file_name))
        for file_name in ('train.csv', 'validation.csv', 'test.csv')
    ]

    # Stage 2: narrow each split down to the columns used downstream.
    frames = [frame[kept_columns] for frame in raw_frames]

    # Stage 3: featurise the splits one after another, threading the tokenizer through.
    tokenizer = None
    features = []
    for frame in frames:
        encoded, tokenizer = generateExpData(frame, tokenizer=tokenizer)
        features.append(encoded)

    # Stage 4: translate the textual account type into its integer class id.
    labels = [
        frame["account.type"].apply(lambda name: label_ids[name]).tolist()
        for frame in frames
    ]

    vocab_size = len(tokenizer.word_index)

    return (*features, *labels, vocab_size)


def proba_to_pred(y_proba, threshold=0.5):
    """Turn predicted probabilities into hard 0/1 class predictions.

    Args:
        y_proba: Probabilities to binarise. Objects that already provide
            ``astype`` (e.g. NumPy arrays) are used as-is; anything else
            (lists, tuples, scalars) is first converted with ``np.asarray``.
        threshold: Cut-off value. Entries strictly greater than it become 1,
            all others 0. Defaults to 0.5.

    Returns:
        The integer-typed result of ``(y_proba > threshold)``, with the same
        shape as the input.
    """
    # Plain Python sequences support neither elementwise ``>`` nor ``astype``,
    # so coerce them; array-like inputs are left untouched.
    probabilities = y_proba if hasattr(y_proba, "astype") else np.asarray(y_proba)
    return (probabilities > threshold).astype(int)


def calculate_metrics(y_true, y_pred):
    """Score predictions against the ground truth with four scikit-learn metrics.

    Each metric function is called as ``metric(y_true, y_pred)`` with its
    default settings.

    Returns:
        A dict with the keys ``balanced_accuracy``, ``f1_score``, ``precision``
        and ``recall`` (in that order).
    """
    # (result key, scoring function) pairs; the order fixes the dict's key order.
    scorers = (
        ('balanced_accuracy', balanced_accuracy_score),
        ('f1_score', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {key: scorer(y_true, y_pred) for key, scorer in scorers}

In [3]:
# Load the 'preprocessed_url_simple' dataset and unpack features, labels and vocabulary size.
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')
train_features, val_features, test_features, train_labels, val_labels, test_labels, vocab_size = prepare_data(DATA_PATH)
# Create the checkpoint directory up front; exist_ok makes re-running this cell safe.
os.makedirs(MODELS_DIR, exist_ok=True)
# Build the CharCNN for this vocabulary (embedding size 32, input length 320).
model = buildCharCNNModel(vocab_size, embSize=32, inputSize=320)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 320)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 320, 32)      3360        ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 318, 128)     12416       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 317, 128)     16512       ['embedding[0][0]']              
                                                                                              

In [4]:
# Train CNN model.
# Stop once val_loss has not improved for 10 epochs and roll the in-memory model
# back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Persist the model whenever val_loss improves. `restore_best_weights` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "simple"), monitor='val_loss',
                                            save_best_only=True)
model.fit(np.array(train_features), np.array(train_labels), batch_size=256, epochs=EPOCHS,
          validation_data=(np.array(val_features), np.array(val_labels)),
          callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 3/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 5/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 6/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 7/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 8/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 9/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 10/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 11/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 12/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 13/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 14/200
Epoch 15/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 16/200
Epoch 17/200
Epoch 18/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 19/200
Epoch 20/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 21/200
Epoch 22/200
Epoch 23/200



INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\simple\assets


Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200


<keras.callbacks.History at 0x1bddaf69fd0>

In [5]:
# Evaluate on the test split: model outputs -> 0/1 predictions -> metric dict.
probas = model.predict(np.array(test_features))
y_pred = proba_to_pred(probas)
results = calculate_metrics(test_labels, y_pred)
# Bare last expression so the notebook displays the metrics.
results



{'balanced_accuracy': 0.8396909233176839,
 'f1_score': 0.8452830188679246,
 'precision': 0.8175182481751825,
 'recall': 0.875}

# Lemmatized data 

In [6]:
# Load the 'lemmatized' dataset; every name carries a lem_ prefix so the
# variables of the first experiment stay untouched.
LEMMATIZED_DATA_PATH = os.path.join('..', 'data', 'lemmatized')
lem_train_features, lem_val_features, lem_test_features, lem_train_labels, lem_val_labels, lem_test_labels, lem_vocab_size = prepare_data(
    LEMMATIZED_DATA_PATH)

# Same architecture settings as the first experiment.
# NOTE(review): verbose=False presumably suppresses the model summary printout - confirm in utils.KerasModels.
modelLEM = buildCharCNNModel(lem_vocab_size, embSize=32, inputSize=320, verbose=False)

In [7]:
# Train CNN model.
# Stop once val_loss has not improved for 10 epochs and roll the in-memory model
# back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Persist the model whenever val_loss improves. `restore_best_weights` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "lemmatized"), monitor='val_loss',
                                            save_best_only=True)
modelLEM.fit(np.array(lem_train_features), np.array(lem_train_labels), batch_size=256, epochs=EPOCHS,
             validation_data=(np.array(lem_val_features), np.array(lem_val_labels)),
             callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 3/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 5/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 6/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 7/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 8/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 9/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 10/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 11/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 12/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 13/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 14/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 15/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 16/200
Epoch 17/200
Epoch 18/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 19/200
Epoch 20/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 21/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 22/200
Epoch 23/200



INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\lemmatized\assets


Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200


<keras.callbacks.History at 0x1bddad76160>

In [8]:
# Evaluate the lemmatized-data model on its test split: model outputs -> 0/1 predictions -> metric dict.
probas_lem = modelLEM.predict(np.array(lem_test_features))
y_pred_lem = proba_to_pred(probas_lem)
results_lem = calculate_metrics(lem_test_labels, y_pred_lem)
# Bare last expression so the notebook displays the metrics.
results_lem



{'balanced_accuracy': 0.8338327709311424,
 'f1_score': 0.8384644621816799,
 'precision': 0.8164322723908216,
 'recall': 0.86171875}

# Stemmed data

In [9]:
# Load the 'stemmed' dataset; every name carries a stem_ prefix so the
# variables of the earlier experiments stay untouched.
STEMMED_DATA_PATH = os.path.join('..', 'data', 'stemmed')
stem_train_features, stem_val_features, stem_test_features, stem_train_labels, stem_val_labels, stem_test_labels, stem_vocab_size = prepare_data(
    STEMMED_DATA_PATH)

# Same architecture settings as the earlier experiments.
# NOTE(review): verbose=False presumably suppresses the model summary printout - confirm in utils.KerasModels.
modelSTEM = buildCharCNNModel(stem_vocab_size, embSize=32, inputSize=320, verbose=False)

In [10]:
# Train CNN model.
# Stop once val_loss has not improved for 10 epochs and roll the in-memory model
# back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Persist the model whenever val_loss improves. `restore_best_weights` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
save_best = keras.callbacks.ModelCheckpoint(os.path.join(MODELS_DIR, "stemmed"), monitor='val_loss',
                                            save_best_only=True)
modelSTEM.fit(np.array(stem_train_features), np.array(stem_train_labels), batch_size=256, epochs=EPOCHS,
              validation_data=(np.array(stem_val_features), np.array(stem_val_labels)),
              callbacks=[early_stopping, save_best])

Epoch 1/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 2/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 3/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 4/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 5/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 6/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 7/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 8/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 9/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 10/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 11/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 12/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 13/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 14/200
Epoch 15/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 16/200
Epoch 17/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 18/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 19/200
Epoch 20/200
Epoch 21/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 22/200
Epoch 23/200



INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


INFO:tensorflow:Assets written to: ..\models\charCNN\stemmed\assets


Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200


<keras.callbacks.History at 0x1bdde45ca90>

In [11]:
# Evaluate the stemmed-data model on its test split: model outputs -> 0/1 predictions -> metric dict.
probas_stem = modelSTEM.predict(np.array(stem_test_features))
y_pred_stem = proba_to_pred(probas_stem)
results_stem = calculate_metrics(stem_test_labels, y_pred_stem)
# Bare last expression so the notebook displays the metrics.
results_stem



{'balanced_accuracy': 0.8400986649061033,
 'f1_score': 0.8423892100192678,
 'precision': 0.8311787072243346,
 'recall': 0.85390625}

In [12]:
# Create csv with all results

# One row per experiment, in the same order as the dataset labels assigned below.
results_all = pd.DataFrame([results, results_lem, results_stem])
results_all['model'] = 'CharCNN'
results_all['dataset'] = ['simple', 'lemmatized', 'stemmed']

# exist_ok=True creates the directory only when needed, without a separate
# existence check (same pattern as used for MODELS_DIR).
os.makedirs('results', exist_ok=True)
results_all.to_csv(os.path.join('results', 'char_cnn.csv'), index=False)
# Bare last expression so the notebook renders the summary table.
results_all

Unnamed: 0,balanced_accuracy,f1_score,precision,recall,model,dataset
0,0.839691,0.845283,0.817518,0.875,CharCNN,simple
1,0.833833,0.838464,0.816432,0.861719,CharCNN,lemmatized
2,0.840099,0.842389,0.831179,0.853906,CharCNN,stemmed
