### Install and import necessary libraries

In [None]:
!pip install torchtext==0.17.0
!pip install transformers

In [None]:
# Base
import glob
import os
import time
from functools import partial
import numpy as np
import pandas as pd
import string

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, balanced_accuracy_score
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification

In [None]:
# Download the NLTK stopword list and choose the device the experiments run on
nltk.download('stopwords')
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Default word-level tokenizer (torchtext "basic_english") and English stopword set
tokenizer = get_tokenizer("basic_english")
stop_words = {word for word in stopwords.words('english')}

### Define Global Parameters for models

In [None]:
# Default hyperparameters. These are module-level globals that the functions
# below read at call time; the "Run experiments" cell reassigns several of them.
MAX_WORDS = 256        # tokens kept per text: shorter inputs are padded, longer ones truncated
EPOCHS = 25            # maximum number of training epochs passed to TrainModel
LEARNING_RATE = 2e-5   # learning rate given to the AdamW optimizer
BATCH_SIZE = 32        # batch size used when the DataLoaders are created
EMBEDDING_DIM = 256    # size of the token embedding vectors
HIDDEN_DIM = 256       # hidden size of the recurrent / feed-forward layers
NUM_HEADS = 16         # number of attention heads in TransformerModel

### Models and dataset classes

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable collection and
    ``labels`` is an indexable collection of targets. Item ``idx`` is a dict
    holding every encoding field at position ``idx`` plus a ``'label'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['label'] = self.labels[idx]
        return sample

class RNNModel(nn.Module):
    """Unidirectional 4-layer Elman RNN classifier over token ids.

    Args:
        input_dim: vocabulary size (number of embedding rows).
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the RNN layers.
        output_dim: number of output classes.

    ``forward`` takes a ``(batch, seq_len)`` tensor of token ids and returns
    ``(batch, output_dim)`` logits computed from the top layer's output at
    the last time step.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, num_layers=4)
        self.linear = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        output, _ = self.rnn(embeddings)
        # Regularise the features, not the logits: dropout applied after the
        # linear layer would randomly zero class scores during training.
        features = self.dropout(output[:, -1, :])
        return self.linear(features)

    def initialize_weights(self):
        """Optionally re-initialise the RNN (orthogonal) and linear (Xavier) weights.

        Not called by ``__init__``; invoke it explicitly if this scheme is wanted.
        """
        for name, param in self.rnn.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(param)  # Orthogonal initialization for the RNN weights
            elif 'bias' in name:
                nn.init.constant_(param, 0)  # Set biases to 0
        nn.init.xavier_uniform_(self.linear.weight)  # Xavier initialization for the linear layer
        nn.init.constant_(self.linear.bias, 0)       # Zero the linear bias

class BiRNNModel(nn.Module):
    """Bidirectional 4-layer Elman RNN classifier over token ids.

    Args:
        input_dim: vocabulary size (number of embedding rows).
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of each RNN direction.
        output_dim: number of output classes.

    ``forward`` takes a ``(batch, seq_len)`` tensor of token ids and returns
    ``(batch, output_dim)`` logits computed from the concatenated final
    hidden states of the top layer's forward and backward directions.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True, num_layers=4, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)  # Multiply by 2 for bidirectional
        self.dropout = nn.Dropout(0.5)

    def forward(self, X_batch):
        embeddings = self.embedding_layer(X_batch)
        _, hidden = self.rnn(embeddings)
        # Use the final state of each direction. output[:, -1, :] would pair the
        # forward summary with a backward state that has seen only the last token.
        final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)  # (batch_size, hidden_dim * 2)
        # Dropout goes before the linear layer so logits are never zeroed directly.
        return self.linear(self.dropout(final_state))

    def initialize_weights(self):
        """Optionally re-initialise the RNN (orthogonal) and linear (Xavier) weights.

        Not called by ``__init__``; invoke it explicitly if this scheme is wanted.
        """
        for name, param in self.rnn.named_parameters():
            if 'weight' in name:
                nn.init.orthogonal_(param)  # Orthogonal initialization for the RNN weights
            elif 'bias' in name:
                nn.init.constant_(param, 0)  # Set biases to 0
        nn.init.xavier_uniform_(self.linear.weight)  # Xavier initialization for the linear layer
        nn.init.constant_(self.linear.bias, 0)       # Zero the linear bias

class LSTMModel(nn.Module):
    """Bidirectional 4-layer LSTM classifier over token ids.

    The classification features are the top layer's final forward and
    backward hidden states concatenated with a max-pool of the LSTM outputs
    over the sequence dimension, giving ``hidden_dim * 4`` features.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            num_layers=4,
            bidirectional=True,
        )
        # hidden_dim * 2 from the final states + hidden_dim * 2 from max pooling
        self.linear = nn.Linear(hidden_dim * 4, output_dim)
        self.dropout = nn.Dropout(0.5)

        # Apply the custom initialisation (orthogonal LSTM, Xavier linear) by default
        self.initialize_weights()

    def forward(self, X_batch):
        token_vectors = self.embedding_layer(X_batch)
        sequence_out, (final_hidden, _cell) = self.lstm(token_vectors)

        # Max over the sequence dimension: (batch_size, hidden_dim * 2)
        pooled = sequence_out.max(dim=1).values

        # Last layer's forward (-2) and backward (-1) final hidden states,
        # followed by the pooled outputs: (batch_size, hidden_dim * 4)
        features = torch.cat((final_hidden[-2], final_hidden[-1], pooled), dim=1)

        # Dropout on the features, then project to class logits
        return self.linear(self.dropout(features))

    def initialize_weights(self):
        """Orthogonal LSTM weights, zero LSTM biases, Xavier-uniform linear layer."""
        for param_name, tensor in self.lstm.named_parameters():
            if 'weight' in param_name:
                nn.init.orthogonal_(tensor)
                continue
            if 'bias' in param_name:
                nn.init.constant_(tensor, 0)
        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.constant_(self.linear.bias, 0)

class GRUModel(nn.Module):
    """Bidirectional 2-layer GRU classifier over token ids.

    The classification features are the top layer's final forward and
    backward hidden states concatenated with a max-pool of the GRU outputs
    over the sequence dimension, giving ``hidden_dim * 4`` features.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            num_layers=2,
            bidirectional=True,
        )
        self.linear = nn.Linear(hidden_dim * 4, output_dim)
        self.dropout = nn.Dropout(0.5)
        self.initialize_weights()

    def forward(self, X_batch):
        token_vectors = self.embedding_layer(X_batch)        # (batch_size, seq_len, embedding_dim)
        sequence_out, final_hidden = self.gru(token_vectors)  # sequence_out: (batch_size, seq_len, hidden_dim * 2)

        # Max over the sequence dimension: (batch_size, hidden_dim * 2)
        pooled = sequence_out.max(dim=1).values

        # Last layer's forward (-2) and backward (-1) final hidden states,
        # followed by the pooled outputs: (batch_size, hidden_dim * 4)
        features = torch.cat((final_hidden[-2], final_hidden[-1], pooled), dim=1)

        # Dropout on the features, then project to class logits
        return self.linear(self.dropout(features))

    def initialize_weights(self):
        """Orthogonal GRU weights, zero GRU biases, Xavier-uniform linear layer."""
        for param_name, tensor in self.gru.named_parameters():
            if 'weight' in param_name:
                nn.init.orthogonal_(tensor)
                continue
            if 'bias' in param_name:
                nn.init.constant_(tensor, 0)
        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.constant_(self.linear.bias, 0)

class TransformerModel(nn.Module):
    """Single self-attention layer followed by mean pooling and a 2-layer MLP.

    Args:
        input_dim: vocabulary size (number of embedding rows).
        embed_dim: token embedding size; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        hidden_dim: width of the hidden fully connected layer.
        output_dim: number of output classes.

    ``forward`` takes a ``(batch, seq_len)`` tensor of token ids and returns
    ``(batch, output_dim)`` logits. No positional encoding is added and no
    padding mask is passed to the attention layer.
    """

    def __init__(self, input_dim, embed_dim, num_heads, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim=embed_dim)
        # batch_first=True: inputs are (batch, seq, embed). With the default
        # (seq, batch, embed) layout the layer would attend across the samples
        # of a batch instead of across the tokens of each text.
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)                                   # (batch, seq, embed)
        attn_output, _ = self.attention(embedded, embedded, embedded)  # self-attention
        pooled = attn_output.mean(dim=1)                               # mean over tokens
        hidden = F.relu(self.fc1(pooled))
        return self.fc2(hidden)

class BERTClassifier(nn.Module):
    """Wrapper around the pretrained 'bert-base-uncased' sequence classifier.

    ``forward`` returns only the logits of the underlying model's output.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs['logits']

class RoBERTaClassifier(nn.Module):
    """Wrapper around the pretrained 'roberta-base' sequence classifier.

    ``forward`` returns only the logits of the underlying model's output.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.roberta = RobertaForSequenceClassification.from_pretrained(
            'roberta-base',
            num_labels=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        return outputs['logits']

### Supporting functions

In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

def create_plots(df):
    """Draw three side-by-side bar charts of per-dataset results.

    ``df`` must contain the columns 'Dataset', 'Test Accuracy',
    'Training Time' and 'Parameters'. The figure is shown with ``plt.show()``.
    """
    # (column plotted on the y axis, panel title)
    panels = (
        ('Test Accuracy', 'Test Accuracy by Dataset'),
        ('Training Time', 'Training Time by Dataset'),
        ('Parameters', 'Model Parameters by Dataset'),
    )

    sns.set(style="whitegrid")
    plt.figure(figsize=(16, 8))

    for position, (metric, title) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), position)
        sns.barplot(x='Dataset', y=metric, data=df)
        plt.title(title)
        plt.xticks(rotation=90)  # dataset names are long; rotate the tick labels

    plt.tight_layout()
    plt.show()

### Load & Prepare the Data

In [None]:
def load_data(file_path):
    """Load every ``*.csv`` file in ``file_path`` into a dict of DataFrames.

    Args:
        file_path: directory containing the CSV files.

    Returns:
        dict mapping each file's name without the ``.csv`` extension to its
        DataFrame. Files are read in sorted path order so the dataset order
        is the same on every platform and run.
    """
    csv_files = sorted(glob.glob(os.path.join(file_path, '*.csv')))
    # splitext strips only the final extension, so "reviews.v2.csv" keeps the
    # key "reviews.v2" instead of colliding with other "reviews.*" files.
    return {
        os.path.splitext(os.path.basename(path))[0]: pd.read_csv(path)
        for path in csv_files
    }

def yield_tokens(data_iter):
    """Lazily yield the token list of each text in ``data_iter``.

    Each text is lower-cased and passed to the module-level ``tokenizer``.
    Stopword and punctuation filtering is currently disabled.
    """
    for text in data_iter:
        lowered = text.lower()
        yield tokenizer(lowered)

def build_vocabulary(datasets):
    """Lazily yield token lists for every text in a sequence of datasets.

    Each dataset is an iterable of ``(label, text)`` pairs; the label is
    ignored. Texts are lower-cased and passed to the module-level
    ``tokenizer``. Stopword and punctuation filtering is currently disabled.
    """
    for dataset in datasets:
        yield from (tokenizer(text.lower()) for _label, text in dataset)

def collate_batch(batch, max_words, vocab):
    """Turn a list of ``(label, text)`` pairs into padded id and label tensors.

    Args:
        batch: sequence of ``(label, text)`` pairs.
        max_words: fixed sequence length; ``-1`` means "length of the longest
            text in this batch".
        vocab: callable mapping a token list to ids; ``vocab['<PAD>']`` is the
            padding id.

    Returns:
        ``(X, Y)`` on the module-level ``device``: X is an int64 tensor of
        shape (batch, max_words), Y a long tensor of labels.
    """
    labels, texts = zip(*batch)
    targets = torch.tensor(labels, dtype=torch.long)  # Targets in range [0,1,2,3]
    token_ids = [vocab(tokenizer(text)) for text in texts]

    if max_words == -1:
        max_words = max(map(len, token_ids))

    padded = []
    for ids in token_ids:
        missing = max_words - len(ids)
        if missing > 0:
            # Too short: right-pad with the <PAD> id
            padded.append(ids + [vocab['<PAD>']] * missing)
        else:
            # Long enough: keep only the first max_words ids
            padded.append(ids[:max_words])

    return torch.tensor(padded, dtype=torch.int64).to(device), targets.to(device)

def make_data_loaders(train_dataset, test_dataset, max_words, vocab):
    """Build train/validation/test DataLoaders that collate with ``collate_batch``.

    A random 95% / 5% split of ``train_dataset`` provides the training and
    validation sets. The batch size is the module-level ``BATCH_SIZE`` read
    at call time; only the training loader shuffles.

    Returns:
        ``[train_loader, valid_loader, test_loader]``
    """
    collate = partial(collate_batch, max_words=max_words, vocab=vocab)

    train_size = int(len(train_dataset) * 0.95)
    valid_size = len(train_dataset) - train_size
    train_subset, valid_subset = random_split(train_dataset, [train_size, valid_size])

    def build(dataset, shuffle):
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle, collate_fn=collate)

    return [
        build(train_subset, True),
        build(valid_subset, False),
        build(test_dataset, False),
    ]

def prepare_data_loaders(dataframes):
    """Build leave-one-dataset-out loaders for the vocabulary-based models.

    Each dataframe in turn is the test set, and all the others are
    concatenated to form the training set (from which ``make_data_loaders``
    takes a validation split).

    Args:
        dataframes: dict mapping dataset name to a DataFrame with 'TEXT' and
            'CLASS' columns. 'CLASS' values 'truthful' / 'deceptive' are
            encoded as 0 / 1 **in place** on the given frames.

    Returns:
        dict mapping dataset name to ``([train_loader, valid_loader,
        test_loader], vocab)``.

    Raises:
        ValueError: if fewer than two datasets are given, since there would
            be nothing left to train on.
    """
    if len(dataframes) < 2:
        raise ValueError("Need at least two datasets for cross-domain evaluation.")

    # Encode the labels once, in place. The replace is idempotent, so frames
    # that were already encoded by an earlier call are left unchanged.
    label_ids = {'truthful': 0, 'deceptive': 1}
    for frame in dataframes.values():
        frame['CLASS'] = frame['CLASS'].replace(label_ids)

    dataset_loaders = {}

    for dname, test_df in dataframes.items():
        # Training data: every dataset except the one held out for testing
        train_frames = [cur_df for cur_name, cur_df in dataframes.items() if cur_name != dname]
        train_df = pd.concat(train_frames, axis=0, ignore_index=True)

        train_dataset = list(zip(train_df['CLASS'], train_df['TEXT']))
        test_dataset = list(zip(test_df['CLASS'], test_df['TEXT']))

        # Build the vocabulary from the training texts only, so no information
        # from the held-out dataset leaks into the model's input space. Unseen
        # test tokens map to <UNK> through the default index.
        vocab = build_vocab_from_iterator(build_vocabulary([train_dataset]), min_freq=10, specials=["<PAD>", "<UNK>"])
        vocab.set_default_index(vocab["<UNK>"])

        loaders = make_data_loaders(train_dataset, test_dataset, MAX_WORDS, vocab)
        dataset_loaders[dname] = (loaders, vocab)

    return dataset_loaders

def prepare_data_loadersTransf(dataframes):
    """Build leave-one-dataset-out loaders for the BERT / RoBERTa models.

    Each dataframe in turn is the test set; the others are concatenated and
    split 95% / 5% into training and validation data. Texts are encoded with
    the module-level ``tokenizer``, which must be a Hugging Face tokenizer
    when this function is called. Sequences are truncated to ``MAX_WORDS``.

    Args:
        dataframes: dict mapping dataset name to a DataFrame with 'TEXT' and
            'CLASS' columns. 'CLASS' values 'truthful' / 'deceptive' are
            encoded as 0 / 1 **in place** on the given frames.

    Returns:
        dict mapping dataset name to ``([train_loader, valid_loader,
        test_loader], None)``. The second element is None because these
        models need no vocabulary.

    Raises:
        ValueError: if fewer than two datasets are given.
    """
    if len(dataframes) < 2:
        raise ValueError("Need at least two datasets for cross-domain evaluation.")

    # Encode the labels of EVERY frame up front. Encoding only the current
    # test frame would leave string labels in the training data unless an
    # earlier call had already converted them.
    label_ids = {'truthful': 0, 'deceptive': 1}
    for frame in dataframes.values():
        frame['CLASS'] = frame['CLASS'].replace(label_ids)

    def encode(texts):
        # `tokenizer` is looked up at call time, so rebinding the global works.
        return tokenizer(texts, return_tensors='pt', max_length=MAX_WORDS, padding=True, truncation=True)

    dataset_loaders = {}

    for dname, test_df in dataframes.items():
        # Test set: the held-out dataset
        X_test, y_test = test_df['TEXT'].tolist(), test_df['CLASS'].tolist()

        # Train set: every other dataset combined
        cross_df = [cur_df for cur_name, cur_df in dataframes.items() if dname != cur_name]
        cross_df = pd.concat(cross_df, axis=0, ignore_index=True)
        X_train, y_train = cross_df['TEXT'].tolist(), cross_df['CLASS'].tolist()
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.05)

        # Tokenize each split separately, after splitting
        train_dataset = TextClassificationDataset(encode(X_train), torch.tensor(y_train))
        valid_dataset = TextClassificationDataset(encode(X_valid), torch.tensor(y_valid))
        test_dataset = TextClassificationDataset(encode(X_test), torch.tensor(y_test))

        # Only training data is shuffled, matching make_data_loaders
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

        dataset_loaders[dname] = ([train_loader, valid_loader, test_loader], None)  # Vocab is not needed for these models

    return dataset_loaders

### Training & Evaluation

In [None]:
def EvaluateModel(model, loss_fn, val_loader, isTransf=False):
    """Run the model over ``val_loader`` in eval mode without gradients.

    Args:
        model: the classifier to evaluate.
        loss_fn: callable ``loss_fn(predictions, targets)``.
        val_loader: iterable of batches. With ``isTransf=False`` a batch is
            indexed as ``batch[0]`` (inputs) and ``batch[1]`` (targets);
            otherwise it is a dict with 'input_ids', 'attention_mask' and
            'label'.
        isTransf: selects the batch layout described above.

    Returns:
        ``(mean_loss, targets, predictions)``: the mean of the per-batch
        losses as a tensor, and the targets and arg-max predictions as
        NumPy arrays.
    """
    model.eval()
    batch_losses, targets, predictions = [], [], []

    with torch.no_grad():
        for batch in val_loader:
            if isTransf is not False:
                # Bert/Roberta: dict batches with ids and attention mask
                Y = batch['label'].to(device)
                logits = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                )
            else:
                # Simple models: (inputs, targets) batches
                X, Y = batch[0].to(device), batch[1].to(device)
                logits = model(X)

            batch_losses.append(loss_fn(logits, Y).item())
            targets.append(Y)
            predictions.append(logits.argmax(dim=-1))

        all_targets = torch.cat(targets)
        all_predictions = torch.cat(predictions)

    mean_loss = torch.tensor(batch_losses).mean()
    return mean_loss, all_targets.detach().cpu().numpy(), all_predictions.detach().cpu().numpy()

def TrainModel(model, loss_fn, optimizer, train_loader, valid_loader, epochs, patience, isTransf=False):
    """Train ``model`` with early stopping and restore the best weights.

    After every epoch the model is evaluated on ``valid_loader``. The weights
    with the lowest validation loss are kept and loaded back into the model
    before returning.

    Args:
        model: the classifier to train (modified in place).
        loss_fn: callable ``loss_fn(predictions, targets)``.
        optimizer: optimizer over the model's parameters.
        train_loader: iterable of training batches. With ``isTransf=False`` a
            batch is ``(inputs, targets)``; otherwise a dict with 'input_ids',
            'attention_mask' and 'label'.
        valid_loader: validation batches in the same layout.
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without a lower
            validation loss.
        isTransf: selects the batch layout described above.

    Returns:
        Mean wall-clock training time per epoch in seconds (validation time
        excluded), or 0.0 if no epoch was run.
    """
    best_valid_loss = float('inf')
    consecutive_no_improvement = 0
    best_model_state = None
    epoch_times = []

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_losses = []
        start_time = time.time()

        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch}"):

            # Move data to the device
            if isTransf is False:
                X, Y = batch[0], batch[1]
                X, Y = X.to(device), Y.to(device)

                # Forward pass
                Y_preds = model(X)
            else:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                Y = batch['label'].to(device)

                # Forward pass
                Y_preds = model(input_ids=input_ids, attention_mask=attention_mask)

            # Calculate loss
            loss = loss_fn(Y_preds, Y)
            epoch_losses.append(loss.item())

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()

            # Clip gradients to guard against exploding gradients (notably in LSTMs/GRUs)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        # Training time for the epoch
        epoch_times.append(time.time() - start_time)
        avg_train_loss = sum(epoch_losses) / len(epoch_losses) if epoch_losses else float('nan')

        # Validation Phase
        valid_loss, valid_actual, valid_preds = EvaluateModel(model, loss_fn, valid_loader, isTransf=isTransf)
        valid_loss = float(valid_loss)
        valid_accuracy = accuracy_score(valid_actual, valid_preds)

        print(f"Train Loss: {avg_train_loss:.3f} | Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_accuracy:.3f}")

        # Check for improvement
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            consecutive_no_improvement = 0
            # state_dict() returns references to the live parameters, so the
            # tensors must be copied; otherwise later epochs overwrite the
            # "best" snapshot and the restore below does nothing.
            best_model_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            consecutive_no_improvement += 1

        # Early stopping condition
        if consecutive_no_improvement >= patience:
            print(f"Early stopping after {epoch} epochs. No improvement for {patience} consecutive epochs.")
            break

    # Load the best model's state (if available)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return sum(epoch_times) / len(epoch_times) if epoch_times else 0.0

def classification_results(classifier, loaders, patience=3, isTransf=False):
    """Train ``classifier`` and report its metrics on the test loader.

    Uses cross-entropy loss and AdamW with the module-level ``LEARNING_RATE``,
    trains for at most ``EPOCHS`` epochs via ``TrainModel``, then evaluates
    on the test loader.

    Args:
        classifier: model to train and evaluate.
        loaders: ``(train_loader, valid_loader, test_loader)``.
        patience: early-stopping patience forwarded to ``TrainModel``.
        isTransf: batch layout flag forwarded to training and evaluation.

    Returns:
        dict with 'Parameters', 'Training Time' (the value returned by
        ``TrainModel``), 'Test Accuracy', 'Precision', 'Recall', 'F1-Score'
        and 'Balanced Accuracy'.
    """
    # Setup
    train_loader, valid_loader, test_loader = loaders
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(classifier.parameters(), lr=LEARNING_RATE)

    # Train, then evaluate on the held-out test data
    mean_training_time = TrainModel(classifier, loss_fn, optimizer, train_loader, valid_loader, EPOCHS, patience, isTransf=isTransf)
    _, y_true, y_pred = EvaluateModel(classifier, loss_fn, test_loader, isTransf=isTransf)

    # Weighted metrics are restricted to the labels that were actually predicted
    weighted = {'average': 'weighted', 'labels': np.unique(y_pred)}
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, **weighted)
    rec = recall_score(y_true, y_pred, **weighted)
    f1_weighted = f1_score(y_true, y_pred, **weighted)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    print(f"Test Accuracy : {acc:.3f} | Precision (Weighted): {prec:.3f} | Recall (Weighted): {rec:.3f} | F1-Score (Weighted): {f1_weighted:.3f} | Balanced Accuracy: {bal_acc:.3f}\n")

    summary = {}
    summary['Parameters'] = count_parameters(classifier)
    summary['Training Time'] = mean_training_time
    summary['Test Accuracy'] = acc
    summary['Precision'] = prec
    summary['Recall'] = rec
    summary['F1-Score'] = f1_weighted
    summary['Balanced Accuracy'] = bal_acc
    return summary

def classification_results_only(classifier, loaders, patience=3, isTransf=False):
    """Evaluate an already-trained ``classifier`` on the test loader.

    No training happens here, so 'Training Time' is reported as 0.

    Args:
        classifier: model to evaluate.
        loaders: ``(train_loader, valid_loader, test_loader)``; only the test
            loader is used.
        patience: unused; accepted so the signature matches
            ``classification_results``.
        isTransf: batch layout flag forwarded to ``EvaluateModel``.

    Returns:
        dict with 'Parameters', 'Training Time', 'Test Accuracy',
        'Precision', 'Recall', 'F1-Score' and 'Balanced Accuracy'.
    """
    # Evaluate
    test_loader = loaders[2] if len(loaders) == 3 else tuple(loaders)[2:3][0 if len(loaders) == 3 else 1]
    loss_fn = nn.CrossEntropyLoss()
    _, y_true, y_pred = EvaluateModel(classifier, loss_fn, test_loader, isTransf=isTransf)

    # Weighted metrics are restricted to the labels that were actually predicted
    weighted = {'average': 'weighted', 'labels': np.unique(y_pred)}
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, **weighted)
    rec = recall_score(y_true, y_pred, **weighted)
    f1_weighted = f1_score(y_true, y_pred, **weighted)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    print(f"Test Accuracy : {acc:.3f} | Precision (Weighted): {prec:.3f} | Recall (Weighted): {rec:.3f} | F1-Score (Weighted): {f1_weighted:.3f} | Balanced Accuracy: {bal_acc:.3f}\n")

    summary = {}
    summary['Parameters'] = count_parameters(classifier)
    summary['Training Time'] = 0
    summary['Test Accuracy'] = acc
    summary['Precision'] = prec
    summary['Recall'] = rec
    summary['F1-Score'] = f1_weighted
    summary['Balanced Accuracy'] = bal_acc
    return summary

def run_experiment(loaders, model_name, patience=3):
    results_list = []

    for dname, (loaders, vocab) in loaders.items():

        print(f'********Training for {dname}...********')
        match model_name:
            case "RNN Model":
                model = RNNModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, 2).to(device)
                isTransf = False
            case "BiRNN Model":
                model = BiRNNModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, 2).to(device)
                isTransf = False
            case "LSTM Model":
                model = LSTMModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, 2).to(device)
                isTransf = False
            case "GRU Model":
                model = GRUModel(len(vocab), EMBEDDING_DIM, HIDDEN_DIM, 2).to(device)
                isTransf = False
            case "Transformer Model":
                model = TransformerModel(len(vocab), EMBEDDING_DIM, NUM_HEADS, HIDDEN_DIM, 2).to(device)
                isTransf = False
            case "Bert Model":
                model = BERTClassifier(2).to(device)
                isTransf = True
            case "Roberta Model":
                model = RoBERTaClassifier(2).to(device)
                isTransf = True
            case _:
                pass

        # In case we train only
        results = classification_results(model, loaders, patience, isTransf=isTransf)
        #torch.save(model.state_dict(),f"models/{dname}_{model_name}")

        # In case we evaluate results
        # model.load_state_dict(torch.load(f"models/{dname}_{model_name}"))
        # results = classification_results_only(model, loaders, patience, isTransf=isTransf)

        results['Dataset'] = dname
        results_list.append(results)

    results_df = pd.DataFrame(results_list)
    results_df['Model'] = model_name
    print(f"\nResults DataFrame for {model_name}:")
    print(results_df)
    #create_plots(results_df)
    return results_df

### Run experiments

In [None]:
# In case we need data from Google Drive
from google.colab import drive
drive.mount('/content/drive')

# This should be replaced by the actual path to the datasets
dataframes = load_data('/content/drive/MyDrive/datasets/')
all_results = []

# ---- All models besides Bert/Roberta ----
# The hyperparameters are set BEFORE the loaders are built: the data loader
# helpers read MAX_WORDS and BATCH_SIZE when they are called, so assigning
# them afterwards would leave the loaders on the earlier default values.
MAX_WORDS = 256
LEARNING_RATE = 2e-3
BATCH_SIZE = 64
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_HEADS = 10  # only used by the Transformer model

tokenizer = get_tokenizer("basic_english")
stop_words = set(stopwords.words('english'))
dataset_loaders = prepare_data_loaders(dataframes)

for model_name in ("RNN Model", "BiRNN Model", "GRU Model", "LSTM Model", "Transformer Model"):
    all_results.append(run_experiment(dataset_loaders, model_name, patience=3))

# ---- Bert and Roberta require too many resources, scale down ----
MAX_WORDS = 256
EPOCHS = 5
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset_loaders = prepare_data_loadersTransf(dataframes)
all_results.append(run_experiment(dataset_loaders, "Bert Model", patience=3))

MAX_WORDS = 256
EPOCHS = 8
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset_loaders = prepare_data_loadersTransf(dataframes)
all_results.append(run_experiment(dataset_loaders, "Roberta Model", patience=3))

# Collect all DFs
all_results_df = pd.concat(all_results, axis=0, ignore_index=True)
# Create the output directory first so a long run is not lost to a missing folder
os.makedirs('results', exist_ok=True)
all_results_df.to_csv('results/cross_domain_full_results.csv')