# Training CNN model

In [1]:
import pandas as pd
import numpy as np 
import os 
from numpy import savetxt, loadtxt 
import keras
from utils.KerasModels import generateExpData
from utils.KerasModels import buildCharCNNModel, buildCharGRUModel
from sklearn.metrics import balanced_accuracy_score, make_scorer, f1_score, precision_score, recall_score

# Fix the random seed once, before any model is built, so repeated runs of
# this notebook are comparable.
keras.utils.set_random_seed(812)




2023-11-21 21:38:58.667445: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def prepare_data(data_directory):
    """Load the train/validation/test splits and turn them into model inputs.

    Reads ``train.csv``, ``validation.csv`` and ``test.csv`` from
    ``data_directory``, keeps the ``screen_name``, ``text`` and
    ``account.type`` columns, and encodes each split with ``generateExpData``.
    The first call receives ``tokenizer=None``; each later call receives the
    tokenizer returned by the previous one (presumably so the tokenizer built
    on the training split is reused -- confirm in ``utils.KerasModels``).

    Args:
        data_directory: Directory containing the three CSV files.

    Returns:
        Tuple ``(train_features, val_features, test_features, train_labels,
        val_labels, test_labels, vocab_size)``. Labels are lists of ints
        (``"human"`` -> 0, ``"bot"`` -> 1) and ``vocab_size`` is
        ``len(tokenizer.word_index)`` of the tokenizer returned by the last
        ``generateExpData`` call.

    Raises:
        KeyError: If ``account.type`` holds a value other than ``"human"``
            or ``"bot"``.
    """
    split_files = ('train.csv', 'validation.csv', 'test.csv')
    wanted_columns = ["screen_name", "text", "account.type"]

    # Read every split before selecting columns, then keep only what is needed.
    raw_frames = [pd.read_csv(os.path.join(data_directory, file_name)) for file_name in split_files]
    frames = [raw[wanted_columns] for raw in raw_frames]

    # Encode splits in train -> validation -> test order, threading the
    # tokenizer returned by each call into the next one.
    tokenizer = None
    encoded_splits = []
    for frame in frames:
        encoded, tokenizer = generateExpData(frame, tokenizer = tokenizer)
        encoded_splits.append(encoded)

    # Translate the textual account type into the binary target.
    label_ids = {"human":0, "bot":1}
    label_lists = [
        frame["account.type"].apply(lambda kind: label_ids[kind]).tolist()
        for frame in frames
    ]

    vocab_size = len(tokenizer.word_index)

    return (*encoded_splits, *label_lists, vocab_size)

def proba_to_pred(y_proba, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 class labels.

    Args:
        y_proba: Predicted probabilities. Anything that supports elementwise
            ``>`` and ``.astype`` (e.g. a NumPy array) is used as-is; plain
            lists and tuples are converted to a NumPy array first.
        threshold: Decision threshold. Values strictly greater than it map
            to 1; everything else (including values equal to it) maps to 0.
            Defaults to 0.5.

    Returns:
        Integer labels with the same shape as ``y_proba``.
    """
    # Python sequences do not support elementwise comparison, so wrap them;
    # array-like inputs are left untouched to keep their original type.
    if isinstance(y_proba, (list, tuple)):
        y_proba = np.asarray(y_proba)
    return (y_proba > threshold).astype(int)

def calculate_metrics(y_true, y_pred):
    """Score predictions against the ground truth with four metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict with the keys ``'balanced_accuracy'``, ``'f1_score'``,
        ``'precision'`` and ``'recall'`` (in that order), each holding the
        value returned by the matching scikit-learn scoring function.
    """
    # Ordered (key, scorer) pairs; the order fixes the dict's key order.
    scorers = (
        ('balanced_accuracy', balanced_accuracy_score),
        ('f1_score', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {metric_name: scorer(y_true, y_pred) for metric_name, scorer in scorers}

In [4]:
# Load the 'preprocessed_url_simple' splits and build the CharCNN model for them.
DATA_PATH = os.path.join('..', 'data', 'preprocessed_url_simple')
(
    train_features,
    val_features,
    test_features,
    train_labels,
    val_labels,
    test_labels,
    vocab_size,
) = prepare_data(DATA_PATH)

model = buildCharCNNModel(
    vocab_size,
    embSize=32,
    inputSize=320,
)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 320)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 320, 32)      3360        ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 318, 128)     12416       ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 317, 128)     16512       ['embedding[0][0]']              
                                                                                              

In [6]:
# Fit the CNN for 25 epochs (batch size 256), passing the validation split
# as validation_data.
model.fit(
    np.array(train_features),
    np.array(train_labels),
    batch_size=256,
    epochs=25,
    validation_data=(np.array(val_features), np.array(val_labels)),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x14bd26810>

In [7]:
# Evaluate the trained model on the test split.
probas = model.predict(np.array(test_features))
y_pred = proba_to_pred(probas)  # turn predicted probabilities into 0/1 labels
results = calculate_metrics(test_labels, y_pred)
results  # bare expression so the notebook displays the metrics dict



{'balanced_accuracy': 0.9210620142719128,
 'f1_score': 0.9217591041776332,
 'precision': 0.9134971070852699,
 'recall': 0.9301719142360441}

# Lemmatized data 

In [9]:
# Load the lemmatized splits and build a separate CharCNN model for them.
LEMMATIZED_DATA_PATH = os.path.join('..', 'data', 'lemmatized')
(
    lem_train_features,
    lem_val_features,
    lem_test_features,
    lem_train_labels,
    lem_val_labels,
    lem_test_labels,
    lem_vocab_size,
) = prepare_data(LEMMATIZED_DATA_PATH)

modelLEM = buildCharCNNModel(
    lem_vocab_size,
    embSize=32,
    inputSize=320,
    verbose=False,
)

In [10]:
# Fit the CNN on the lemmatized data for 25 epochs (batch size 256), passing
# the lemmatized validation split as validation_data.
modelLEM.fit(
    np.array(lem_train_features),
    np.array(lem_train_labels),
    batch_size=256,
    epochs=25,
    validation_data=(np.array(lem_val_features), np.array(lem_val_labels)),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x14b8b39d0>

In [11]:
# Evaluate the model trained on lemmatized data on the lemmatized test split.
probas_lem = modelLEM.predict(np.array(lem_test_features))
y_pred_lem = proba_to_pred(probas_lem)  # turn predicted probabilities into 0/1 labels
results_lem = calculate_metrics(lem_test_labels, y_pred_lem)
results_lem  # bare expression so the notebook displays the metrics dict



{'balanced_accuracy': 0.9082764405352697,
 'f1_score': 0.9130673499267936,
 'precision': 0.8675013041210224,
 'recall': 0.9636855321614834}

# Stemmed data

In [12]:
# Load the stemmed splits and build a separate CharCNN model for them.
STEMMED_DATA_PATH = os.path.join('..', 'data', 'stemmed')
(
    stem_train_features,
    stem_val_features,
    stem_test_features,
    stem_train_labels,
    stem_val_labels,
    stem_test_labels,
    stem_vocab_size,
) = prepare_data(STEMMED_DATA_PATH)

modelSTEM = buildCharCNNModel(
    stem_vocab_size,
    embSize=32,
    inputSize=320,
    verbose=False,
)

In [13]:
# Fit the CNN on the stemmed data for 25 epochs (batch size 256), passing
# the stemmed validation split as validation_data.
modelSTEM.fit(
    np.array(stem_train_features),
    np.array(stem_train_labels),
    batch_size=256,
    epochs=25,
    validation_data=(np.array(stem_val_features), np.array(stem_val_labels)),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x14cb50d10>

In [14]:
# Evaluate the model trained on stemmed data on the stemmed test split.
probas_stem = modelSTEM.predict(np.array(stem_test_features))
y_pred_stem = proba_to_pred(probas_stem)  # turn predicted probabilities into 0/1 labels
results_stem = calculate_metrics(stem_test_labels, y_pred_stem)
results_stem  # bare expression so the notebook displays the metrics dict



{'balanced_accuracy': 0.9185041834188477,
 'f1_score': 0.9196649533599848,
 'precision': 0.9065490711202853,
 'recall': 0.9331659262120919}

In [19]:
# Collect the test metrics of the three CharCNN runs into one table and
# persist it as CSV.
RESULTS_DIR = 'results'

# One row per run; row order matches the 'dataset' labels assigned below.
results_all = pd.DataFrame([results, results_lem, results_stem])
results_all['model'] = 'CharCNN'
results_all['dataset'] = ['simple', 'lemmatized', 'stemmed']

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(RESULTS_DIR, exist_ok=True)
results_all.to_csv(os.path.join(RESULTS_DIR, 'char_cnn.csv'), index=False)
results_all

Unnamed: 0,balanced_accuracy,f1_score,precision,recall,model,dataset
0,0.921062,0.921759,0.913497,0.930172,CharCNN,simple
1,0.908276,0.913067,0.867501,0.963686,CharCNN,lemmatized
2,0.918504,0.919665,0.906549,0.933166,CharCNN,stemmed
