# DNN Model Notebook

```md
@author: miguelrocha
(Adapted by: Grupo 03)
```

In [8]:
# Notebook Imports
import numpy as np

from helpers.dataset import Dataset
from helpers.enums import ModelRunMode

In [9]:
# Model run mode: compared against ModelRunMode values in the parameters cell
# to decide which set of paths/settings is defined.
# Options:
#   ModelRunMode.TRAIN.value            (Train the model)
#   ModelRunMode.CLASSIFY.value         (Classify data)
mode = ModelRunMode.TRAIN.value

In [10]:
# Parameters cell
# Defines the CSV paths and the mode-specific settings for the selected run mode.
# Note: `test_size` only exists in train mode and `model_prefix` only in classify mode.
if mode == ModelRunMode.TRAIN.value:
    # Train mode
    input_csv = '../tarefa_1/clean_input_datasets/ai_human_input_sm.csv'            # CSV for training input (ID, Text)
    output_csv = '../tarefa_1/clean_output_datasets/ai_human_output_sm.csv'         # CSV for training output (ID, Label)
    test_size = 0.2                                                                 # Proportion of the dataset to use as test data
    # The values below are kept for reference only; they are not read by this notebook.
    # epochs = 100                                                                    # Number of epochs for training
    # learning_rate = 0.001                                                           # Learning rate for gradient descent
    # momentum = 0.9                                                                  # Momentum for gradient descent
    # verbose = 1                                                                     # Verbosity level for training
    # batch_size = 32                                                                 # Batch size for gradient descent
elif mode == ModelRunMode.CLASSIFY.value:
    # Classify mode
    input_csv = "../tarefa_1/clean_input_datasets/dataset2_inputs.csv"              # CSV with the texts to classify (ID, Text)
    output_csv = "../tarefa_1/classify_output_datasets/dataset2_outputs.csv"        # CSV for predictions output
    # NOTE(review): the prefix says "logreg" although this notebook builds a DNN — confirm.
    model_prefix = "logreg_model"                                                   # Prefix for loading the model files
else:
    print("The selected option is not valid. Options: \"train\" or \"classify\"!")
    # SystemExit has to be raised: merely instantiating it does nothing, and the
    # notebook would carry on without input_csv/output_csv being defined.
    raise SystemExit(1)

# DNN model

In [11]:
from helpers.layers import DenseLayer
from helpers.activation import SigmoidActivation, ReLUActivation
from helpers.losses import LossFunction, MeanSquaredError, BinaryCrossEntropy
from helpers.optimizer import Optimizer
from helpers.metrics import accuracy, mse
from helpers.dataset import Dataset
from helpers.regularizer import L1Regularizer

class NeuralNetwork:
    """Sequential neural network trained on shuffled mini-batches.

    Layers are appended with :meth:`add`; they are run in insertion order on
    the forward pass and in reverse order on the backward pass.
    """

    def __init__(self, epochs = 100, batch_size = 128, optimizer: Optimizer = None, verbose = False, loss: LossFunction = MeanSquaredError, metric:callable = mse):
        """Configure the training loop.

        Args:
            epochs: Number of passes over the training data performed by ``fit``.
            batch_size: Number of samples per mini-batch.
            optimizer: Optimizer handed to every layer that exposes ``initialize``.
            verbose: When truthy, ``fit`` prints loss and metric after each epoch.
            loss: Loss *class* (not an instance); it is instantiated here with no arguments.
            metric: Callable ``metric(y_true, y_pred)``; ``None`` disables metric reporting.
        """
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer
        self.verbose = verbose
        self.loss = loss()
        self.metric = metric

        # attributes
        self.layers = []      # layers in forward order
        self.history = {}     # epoch number -> {'loss': ..., 'metric': ...}

    def add(self, layer):
        """Append ``layer`` to the network and return ``self`` so calls can be chained.

        Every layer after the first gets its input shape from the previous
        layer's ``output_shape()``. Layers exposing ``initialize`` receive the
        network's optimizer.
        """
        if self.layers:
            layer.set_input_shape(input_shape=self.layers[-1].output_shape())
        if hasattr(layer, 'initialize'):
            layer.initialize(self.optimizer)
        self.layers.append(layer)
        return self

    def get_mini_batches(self, X, y = None,shuffle = True):
        """Yield ``(X_batch, y_batch)`` pairs of exactly ``batch_size`` samples.

        Only full batches are produced: when the number of samples is not a
        multiple of ``batch_size`` the trailing remainder (after shuffling) is
        skipped. ``y_batch`` is ``None`` when ``y`` is not given.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of samples. This is
                raised when the generator is first advanced.
        """
        n_samples = X.shape[0]
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if self.batch_size > n_samples:
            raise ValueError("Batch size cannot be greater than the number of samples")

        indices = np.arange(n_samples)
        if shuffle:
            np.random.shuffle(indices)

        for start in range(0, n_samples - self.batch_size + 1, self.batch_size):
            batch_idx = indices[start:start + self.batch_size]
            if y is not None:
                yield X[batch_idx], y[batch_idx]
            else:
                yield X[batch_idx], None

    def forward_propagation(self, X, training):
        """Feed ``X`` through every layer in order and return the final output.

        ``training`` is forwarded unchanged to each layer's ``forward_propagation``.
        """
        output = X
        for layer in self.layers:
            output = layer.forward_propagation(output, training)
        return output

    def backward_propagation(self, output_error):
        """Propagate ``output_error`` through the layers in reverse order.

        Returns the value produced by the first layer's ``backward_propagation``.
        """
        error = output_error
        for layer in reversed(self.layers):
            error = layer.backward_propagation(error)
        return error

    def fit(self, dataset):
        """Train the network on ``dataset`` (read through ``.X`` and ``.Y``).

        ``self.history`` is reset and then filled with one entry per epoch.
        The epoch loss and metric are computed from the outputs collected
        during the training forward passes (``training=True``), not from a
        separate evaluation pass.

        Returns:
            ``self``.
        """
        X = dataset.X
        y = dataset.Y
        # A 1-D target vector is turned into a column so it lines up with the network output.
        if np.ndim(y) == 1:
            y = np.expand_dims(y, axis=1)

        self.history = {}
        for epoch in range(1, self.epochs + 1):
            # store mini-batch data for epoch loss and quality metrics calculation
            output_x_ = []
            y_ = []
            for X_batch, y_batch in self.get_mini_batches(X, y):
                # Forward propagation
                output = self.forward_propagation(X_batch, training=True)
                # Backward propagation, seeded with the loss derivative for this batch
                error = self.loss.derivative(y_batch, output)
                self.backward_propagation(error)

                output_x_.append(output)
                y_.append(y_batch)

            output_x_all = np.concatenate(output_x_)
            y_all = np.concatenate(y_)

            # compute loss
            loss = self.loss.loss(y_all, output_x_all)

            if self.metric is not None:
                metric = self.metric(y_all, output_x_all)
                metric_s = f"{self.metric.__name__}: {metric:.4f}"
            else:
                metric_s = "NA"
                metric = 'NA'

            # save loss and metric for each epoch
            self.history[epoch] = {'loss': loss, 'metric': metric}

            if self.verbose:
                print(f"Epoch {epoch}/{self.epochs} - loss: {loss:.4f} - {metric_s}")

        return self

    def predict(self, dataset):
        """Return the network output for ``dataset`` with ``training=False``.

        ``dataset`` may be a ``Dataset`` (its ``.X`` is used) or the input
        data itself.
        """
        X = dataset.X if isinstance(dataset, Dataset) else dataset
        return self.forward_propagation(X, training=False)

    def score(self, dataset, predictions):
        """Evaluate ``predictions`` against ``dataset.Y`` with the configured metric.

        Raises:
            ValueError: If the network was created with ``metric=None``.
        """
        if self.metric is None:
            raise ValueError("No metric specified for the neural network.")
        return self.metric(dataset.Y, predictions)

# Hyperparameter fine-tuning function

In [12]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random

def hyperparameter_optimization(train_ds, validation_ds, param_grid, n_iter=10):
    '''
    Random search over ``param_grid`` for the best validation score.

    For each of the ``n_iter`` trials one value per hyperparameter is drawn
    independently with ``random.choice`` (so the same combination can be
    drawn more than once), a network is built and fitted on ``train_ds`` and
    then scored on ``validation_ds`` with the ``accuracy`` metric.

    Args:
        train_ds: Training dataset; ``train_ds.X.shape[1]`` gives the input width.
        validation_ds: Dataset used to score every candidate.
        param_grid: Mapping from hyperparameter name to a list of candidate
            values. The keys 'epochs', 'batch_size', 'learning_rate',
            'momentum', 'n_hidden' (list of hidden-layer sizes) and
            'dropout_rate' are required.
        n_iter: Number of random combinations to evaluate.

    Returns:
        The parameter dict with the highest validation score, or ``{}`` when
        ``n_iter`` is 0.
    '''
    best_acc = 0
    best_params = {}

    param_names = list(param_grid)
    param_combinations = [
        [random.choice(values) for values in param_grid.values()]
        for _ in range(n_iter)
    ]

    # The input width is the same for every candidate network.
    n_features = train_ds.X.shape[1]

    for params in tqdm(param_combinations):
        param_dict = dict(zip(param_names, params))

        net = NeuralNetwork(
            epochs=param_dict['epochs'],
            batch_size=param_dict['batch_size'],
            optimizer=Optimizer(learning_rate=param_dict['learning_rate'], momentum=param_dict['momentum']),
            verbose=False,
            loss=BinaryCrossEntropy,
            metric=accuracy
        )

        # Hidden stack: Dense + ReLU per entry; only the first layer needs the input shape.
        for i, units in enumerate(param_dict['n_hidden']):
            if i == 0:
                net.add(DenseLayer(units, (n_features,),dropout_rate=param_dict['dropout_rate']))
            else:
                net.add(DenseLayer(units,dropout_rate=param_dict['dropout_rate']))
            net.add(ReLUActivation())

        # Single sigmoid output unit for the binary target.
        net.add(DenseLayer(1))
        net.add(SigmoidActivation())

        net.fit(train_ds)
        out = net.predict(validation_ds)
        val_acc = net.score(validation_ds,out)

        print("Hiperparâmetros:", param_dict, end=" ")
        print("Acurácia:", val_acc)
        # Always keep the first evaluated candidate, so a run in which nothing
        # scores above 0 still returns a usable parameter set instead of {}.
        if not best_params or val_acc > best_acc:
            best_acc = val_acc
            best_params = param_dict

    print("Melhores Hiperparâmetros:", best_params)
    print("Melhor Acurácia:", best_acc)
    return best_params


# Preparing Data

In [13]:
# Load Datasets
# Dataset.prepare_train_test_bow returns the train/test matrices and labels plus the
# vocabulary, which a later cell reuses to vectorise other texts.
X_train, y_train, X_test, y_test, vocab = Dataset.prepare_train_test_bow(
    input_csv=input_csv,
    output_csv=output_csv,
    test_size=test_size,
    max_vocab_size=4000,
    min_freq=1,
)

# Carve a validation split (20%, fixed seed) out of the training portion,
# then wrap every split in a Dataset object.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
train_ds, validation_ds, test_ds = (
    Dataset(X=X_train, Y=y_train),
    Dataset(X=X_val, Y=y_val),
    Dataset(X=X_test, Y=y_test),
)

# Report the size of each split
print("Dataset loaded")
print(f"Train set has {train_ds.nrows()} rows and {train_ds.ncols()} columns")
print(f"Validation set has {validation_ds.nrows()} rows and {validation_ds.ncols()} columns")
print(f"Test set has {test_ds.nrows()} rows and {test_ds.ncols()} columns\n")

Dataset loaded
Train set has 19200 rows and 4000 columns
Validation set has 4800 rows and 4000 columns
Test set has 6000 rows and 4000 columns



# Fine-Tuning

In [14]:
# Candidate values for the random search; one value per key is drawn for each trial.
param_grid = dict(
    epochs=[20, 40],
    n_hidden=[[50, 25], [100, 50], [100]],   # hidden-layer sizes, one entry per layer
    dropout_rate=[0.1, 0.5],
    learning_rate=[0.01, 0.001, 0.005],
    momentum=[0.9, 0.5, 0.1],
    batch_size=[64],
)

# Evaluate 10 random combinations and keep the best-scoring one
best_params = hyperparameter_optimization(train_ds, validation_ds, param_grid, n_iter=10)


 10%|█         | 1/10 [01:49<16:29, 109.94s/it]

Hiperparâmetros: {'epochs': 40, 'n_hidden': [100, 50], 'dropout_rate': 0.1, 'learning_rate': 0.001, 'momentum': 0.5, 'batch_size': 64} Acurácia: 0.9885416666666667


 20%|██        | 2/10 [02:19<08:22, 62.81s/it] 

Hiperparâmetros: {'epochs': 20, 'n_hidden': [50, 25], 'dropout_rate': 0.1, 'learning_rate': 0.001, 'momentum': 0.5, 'batch_size': 64} Acurácia: 0.9845833333333334


 30%|███       | 3/10 [03:13<06:51, 58.81s/it]

Hiperparâmetros: {'epochs': 20, 'n_hidden': [100], 'dropout_rate': 0.1, 'learning_rate': 0.001, 'momentum': 0.9, 'batch_size': 64} Acurácia: 0.9852083333333334


 40%|████      | 4/10 [03:43<04:43, 47.27s/it]

Hiperparâmetros: {'epochs': 20, 'n_hidden': [50, 25], 'dropout_rate': 0.5, 'learning_rate': 0.01, 'momentum': 0.9, 'batch_size': 64} Acurácia: 0.99125


 50%|█████     | 5/10 [04:12<03:24, 40.80s/it]

Hiperparâmetros: {'epochs': 20, 'n_hidden': [50, 25], 'dropout_rate': 0.1, 'learning_rate': 0.001, 'momentum': 0.9, 'batch_size': 64} Acurácia: 0.984375


 60%|██████    | 6/10 [06:11<04:28, 67.16s/it]

Hiperparâmetros: {'epochs': 40, 'n_hidden': [100], 'dropout_rate': 0.1, 'learning_rate': 0.001, 'momentum': 0.5, 'batch_size': 64} Acurácia: 0.9889583333333334


 70%|███████   | 7/10 [07:11<03:14, 64.89s/it]

Hiperparâmetros: {'epochs': 20, 'n_hidden': [100], 'dropout_rate': 0.1, 'learning_rate': 0.005, 'momentum': 0.1, 'batch_size': 64} Acurácia: 0.9872916666666667


  return 1 / (1 + np.exp(-input))
 80%|████████  | 8/10 [09:16<02:48, 84.24s/it]

Hiperparâmetros: {'epochs': 40, 'n_hidden': [100, 50], 'dropout_rate': 0.1, 'learning_rate': 0.01, 'momentum': 0.5, 'batch_size': 64} Acurácia: 0.4685416666666667


 90%|█████████ | 9/10 [10:17<01:16, 76.70s/it]

Hiperparâmetros: {'epochs': 40, 'n_hidden': [50, 25], 'dropout_rate': 0.1, 'learning_rate': 0.01, 'momentum': 0.5, 'batch_size': 64} Acurácia: 0.9902083333333334


100%|██████████| 10/10 [10:46<00:00, 64.62s/it]

Hiperparâmetros: {'epochs': 20, 'n_hidden': [50, 25], 'dropout_rate': 0.5, 'learning_rate': 0.01, 'momentum': 0.9, 'batch_size': 64} Acurácia: 0.9922916666666667
Melhores Hiperparâmetros: {'epochs': 20, 'n_hidden': [50, 25], 'dropout_rate': 0.5, 'learning_rate': 0.01, 'momentum': 0.9, 'batch_size': 64}
Melhor Acurácia: 0.9922916666666667





# Training with best params

In [15]:
# Rebuild the network with the best hyperparameters found by the search
net = NeuralNetwork(
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    optimizer=Optimizer(learning_rate=best_params['learning_rate'], momentum=best_params['momentum']),
    verbose=True,
    loss=BinaryCrossEntropy,
    metric=accuracy
)

# Hidden stack: Dense + ReLU per entry; only the first layer needs the input shape.
n_features = train_ds.X.shape[1]
for i, units in enumerate(best_params['n_hidden']):
    if i == 0:
        net.add(DenseLayer(units, (n_features,),dropout_rate=best_params['dropout_rate']))
    else:
        net.add(DenseLayer(units,dropout_rate=best_params['dropout_rate']))
    net.add(ReLUActivation())

# Single sigmoid output unit for the binary target.
net.add(DenseLayer(1))
net.add(SigmoidActivation())

# Train, then evaluate once on the held-out test split.
# (A duplicated predict call whose result was discarded has been removed.)
net.fit(train_ds)
out = net.predict(test_ds)
acc = net.score(test_ds,out)
print("Test Accuracy:", acc)


Epoch 1/20 - loss: 0.4891 - accuracy: 0.7647
Epoch 2/20 - loss: 0.1992 - accuracy: 0.9295
Epoch 3/20 - loss: 0.1329 - accuracy: 0.9557
Epoch 4/20 - loss: 0.0987 - accuracy: 0.9653
Epoch 5/20 - loss: 0.0824 - accuracy: 0.9729
Epoch 6/20 - loss: 0.0629 - accuracy: 0.9791
Epoch 7/20 - loss: 0.0590 - accuracy: 0.9808
Epoch 8/20 - loss: 0.0478 - accuracy: 0.9844
Epoch 9/20 - loss: 0.0461 - accuracy: 0.9850
Epoch 10/20 - loss: 0.0372 - accuracy: 0.9877
Epoch 11/20 - loss: 0.0357 - accuracy: 0.9871
Epoch 12/20 - loss: 0.0305 - accuracy: 0.9901
Epoch 13/20 - loss: 0.0286 - accuracy: 0.9903
Epoch 14/20 - loss: 0.0262 - accuracy: 0.9916
Epoch 15/20 - loss: 0.0243 - accuracy: 0.9927
Epoch 16/20 - loss: 0.0201 - accuracy: 0.9930
Epoch 17/20 - loss: 0.0215 - accuracy: 0.9940
Epoch 18/20 - loss: 0.0194 - accuracy: 0.9935
Epoch 19/20 - loss: 0.0184 - accuracy: 0.9938
Epoch 20/20 - loss: 0.0154 - accuracy: 0.9946
Test Accuracy: 0.9908333333333333


# Test with another dataset

In [16]:
# Paths of the input (ID, Text) and output (ID, Label) CSVs used for this extra evaluation.
# NOTE(review): both paths are identical to the train-mode input_csv/output_csv in the
# parameters cell, so this is not an independent dataset — confirm the intended files.
input_test_csv = '../tarefa_1/clean_input_datasets/ai_human_input_sm.csv'
output_test_csv = '../tarefa_1/clean_output_datasets/ai_human_output_sm.csv'

In [17]:
import pandas as pd
def load_data(input_path, output_path, sep="\t"):
    """Read the input and output CSVs and join them on their ``ID`` column.

    Args:
        input_path: CSV with at least the columns ``ID`` and ``Text``.
        output_path: CSV with at least the columns ``ID`` and ``Label``.
        sep: Field separator used by both files (tab by default).

    Returns:
        A DataFrame holding the rows whose ``ID`` appears in both files,
        after rows with a missing ID/Text (input) or ID/Label (output) and
        repeated IDs have been removed from each side.
    """
    # Input side: discard incomplete rows, then keep one row per ID
    texts = (
        pd.read_csv(input_path, sep=sep)
        .dropna(subset=["ID", "Text"])
        .drop_duplicates(subset=["ID"])
    )
    # Output side: same clean-up, keyed on the Label column
    labels = (
        pd.read_csv(output_path, sep=sep)
        .dropna(subset=["ID", "Label"])
        .drop_duplicates(subset=["ID"])
    )
    # Join both sides on the ID column
    return texts.merge(labels, on="ID")

In [18]:
# Load test dataset
df_merged = load_data(input_test_csv, output_test_csv)
# Run every text through Dataset.clean_text before vectorising
df_merged["Text"] = df_merged["Text"].apply(Dataset.clean_text)

# Encode labels as floats: "AI" -> 1.0, any other label value -> 0.0
labels = np.where(df_merged["Label"] == "AI", 1.0, 0.0)
# NOTE(review): despite the name, these are the texts of the evaluation data loaded above
train_texts = df_merged["Text"].astype(str).tolist()
# Last expression of the cell: the first rows are shown as the cell output
df_merged.head()

Unnamed: 0,ID,Text,Label
0,1,cars cars around since became famous henry for...,AI
1,2,transportation large necessity countries world...,AI
2,3,americas love affair vehicles seems cooling sa...,AI
3,4,often ride car drive one motor vehicle work st...,AI
4,5,cars wonderful thing perhaps one worlds greate...,AI


In [19]:
# Vectorize text using the same vocabulary as the training set
X = Dataset.vectorize_text_bow(train_texts, vocab)

# Wrap Dataset object
# NOTE: this rebinds test_ds, replacing the held-out test split created in the data-preparation cell
test_ds = Dataset(X=X, Y=labels)

print("Test shape:", X.shape, labels.shape)
# Score the trained network on this data.
# NOTE(review): input_test_csv/output_test_csv point at the training CSVs, so this score
# includes rows the model was trained on — confirm before reporting it as test accuracy.
out = net.predict(test_ds)
acc = net.score(test_ds,out)
print("Test Accuracy:", acc)

Test shape: (30000, 4000) (30000,)
Test Accuracy: 0.9968333333333333
