# **Text Classification with BERT and RoBERTa**
**DATA103 S11 Group 4**
- GOZON, Jean Pauline D.
- JAMIAS, Gillian Nicole A.
- MARCELO, Andrea Jean C.
- REYES, Anton Gabriel G.
- VICENTE, Francheska Josefa

## Requirements and Imports

### Imports

**Basic Libraries**

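* `numpy` contains a large collection of mathematical functions
* `pandas` contains functions that are designed for data manipulation and data analysis
* `datasets` provides the Hugging Face `Dataset` and `DatasetDict` containers used to hold the train, validation, and test splits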



In [None]:
import numpy as np
import pandas as pd
import datasets

**Machine Learning Libraries**

* `torch` is an open-source ML library for building deep neural networks
* `transformers` provides pre-trained models, their tokenizers, and the `Trainer` API used for fine-tuning

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_lightning.callbacks import ProgressBarBase, RichProgressBar

In [None]:
from transformers import AutoTokenizer, BertTokenizerFast, AutoModelForSequenceClassification, TrainerCallback, TrainingArguments, Trainer

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss, accuracy_score
from transformers import EvalPrediction
import evaluate

In [None]:
import optuna

In [None]:
import pickle

In [None]:
# Load the cleaned corpus from disk; the bare `df` renders it in the notebook.
df = pd.read_csv('cleaned_data.csv')
df

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

## Preparing data for Feature Engineering

### Splitting the Dataset into Train, Validation, and Test Sets

In [None]:
# Model input: the 'text' column of the loaded frame.
X = df['text']
X

In [None]:
# Prediction target: the 'class' column of the loaded frame.
y = df['class']
y

In [None]:
# Hold out 20% of the samples as the test set. Stratifying on the label keeps
# the class proportions equal across splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Carve the validation set out of the current training data: 10% of what is
# left after the test split, again stratified on the label and seeded.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=y_train,
)

In [None]:
# Sanity check: the training features and labels must have matching lengths.
print('Train input  shape: ', X_train.shape)
print('Train output shape: ', y_train.shape)

In [None]:
# Sanity check: the validation features and labels must have matching lengths.
print('Val input  shape: ', X_val.shape)
print('Val output shape: ', y_val.shape)

In [None]:
# Sanity check: the test features and labels must have matching lengths.
print('Test input  shape: ', X_test.shape)
print('Test output shape: ', y_test.shape)

In [None]:
# Put the training text and labels side by side in one frame and replace the
# shuffled original row labels with a fresh 0..n-1 index.
train_df = pd.concat([X_train, y_train], axis='columns').reset_index(drop=True)
train_df

In [None]:
# Put the validation text and labels side by side in one frame and replace
# the shuffled original row labels with a fresh 0..n-1 index.
val_df = pd.concat([X_val, y_val], axis='columns').reset_index(drop=True)
val_df

In [None]:
# Put the test text and labels side by side in one frame and replace the
# shuffled original row labels with a fresh 0..n-1 index.
test_df = pd.concat([X_test, y_test], axis='columns').reset_index(drop=True)
test_df

### Creation of Dataset

In [None]:
# Wrap the training frame in a Hugging Face Dataset.
train_dataset = datasets.Dataset.from_pandas(df=train_df)
train_dataset

In [None]:
# Wrap the validation frame in a Hugging Face Dataset.
val_dataset = datasets.Dataset.from_pandas(df=val_df)
val_dataset

In [None]:
# Wrap the test frame in a Hugging Face Dataset.
test_dataset = datasets.Dataset.from_pandas(df=test_df)
test_dataset

In [None]:
# Bundle the three splits under the keys "train", "val" and "test" so that a
# single `map` call can tokenize all of them.
dataset = datasets.DatasetDict(
    train=train_dataset,
    val=val_dataset,
    test=test_dataset,
)

dataset

## Feature Engineering

### Function Definitions

In [None]:
MAX_LENGTH = 512

In [None]:
def preprocess_function(examples, tokenizer):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: Batch with a "text" column (the inputs) and a "class"
            column (the labels).
        tokenizer: Callable tokenizer applied to the "text" column.

    Returns:
        The tokenizer output, padded/truncated to ``MAX_LENGTH`` tokens, with
        an added "labels" entry holding the "class" values as a tensor.
    """
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # The label entry is stored under "labels" alongside the tokenizer output.
    tokenized["labels"] = torch.tensor(examples["class"])
    return tokenized

In [None]:
def create_encoded_dataset(tokenizer):
    """Tokenize every split of the global ``dataset`` with ``tokenizer``.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_function``.

    Returns:
        The mapped dataset with the original columns removed and its output
        format set to "torch".
    """
    # Drop the raw columns so only the fields produced by
    # preprocess_function remain.
    raw_columns = dataset['train'].column_names
    tokenized_splits = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_columns,
        fn_kwargs={"tokenizer": tokenizer},
    )
    tokenized_splits.set_format("torch")
    return tokenized_splits

### Tokenizing with BERT

In [None]:
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
bert_encoded_dataset = create_encoded_dataset (bert_tokenizer)

### Tokenizing with RoBERTa

In [None]:
model_checkpoint_roberta = 'roberta-base'

In [None]:
roberta_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_roberta)

In [None]:
roberta_encoded_dataset = create_encoded_dataset (roberta_tokenizer)

## Modeling and Evaluation

### Function Definitions

In [None]:
def compute_metrics(p: EvalPrediction):
    """Compute accuracy, macro-F1, precision and recall for one evaluation pass.

    Args:
        p: Evaluation output handed over by ``Trainer``; ``p.predictions``
            holds the model outputs and ``p.label_ids`` the reference labels.

    Returns:
        dict mapping 'Accuracy', 'F1 Macro Score', 'Precision' and 'Recall'
        to scalar scores (not the raw metric dictionaries).
    """
    logits, labels = p.predictions, p.label_ids
    # NOTE(review): predictions may arrive as a tuple when the model returns
    # several outputs; the logits are assumed to be the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # `evaluate.load` replaces the never-imported `load_metric`.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")

    # Each `compute` call returns a one-entry dict; unwrap it so the Trainer
    # logs plain numbers.
    accuracy = accuracy_metric.compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1_macro = f1_metric.compute(
        predictions=predictions, references=labels, average="macro"
    )["f1"]
    # NOTE(review): precision and recall keep the metric's default averaging,
    # as in the original calls. Pass `average=...` if 'class' has more than
    # two labels.
    precision = precision_metric.compute(
        predictions=predictions, references=labels
    )["precision"]
    recall = recall_metric.compute(
        predictions=predictions, references=labels
    )["recall"]

    return {
        'Accuracy': accuracy,
        'F1 Macro Score': f1_macro,
        'Precision': precision,
        'Recall': recall,
    }

### Defining the Hyperparameter Search Space

In [None]:
def optuna_hp_space(trial):
    """Define the hyperparameter search space sampled by Optuna.

    Args:
        trial: Optuna trial object used to draw one value per hyperparameter.

    Returns:
        dict with the sampled "learning_rate", "per_device_train_batch_size"
        and "num_train_epochs".
    """
    return {
        # Fine-tuning pre-trained transformers needs small step sizes; the
        # previous candidates (0.1, 0.01, 0.001) make training diverge.
        "learning_rate": trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5]),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16]),
        "num_train_epochs": trial.suggest_categorical("num_train_epochs", [2, 3, 4]),
    }

### BERT Model

#### Model Training 

In [None]:
model_checkpoint = 'bert-base-uncased'

In [None]:
# Load the pre-trained BERT checkpoint as a sequence classifier, then move it
# to the selected device. return_dict=False is passed through to the model
# configuration.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, return_dict=False
)
bert_model = bert_model.to(device)

In [None]:
# Training configuration for the BERT run: checkpoints go to "bert_trainer"
# every 20000 steps and evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="bert_trainer",
    save_strategy='steps',
    save_steps=20000,
    evaluation_strategy="epoch",
    # fp16 mixed precision needs a CUDA device. The notebook falls back to
    # the CPU when none is present, so enable it only when a GPU exists
    # instead of hard-coding True (which fails on a CPU-only machine).
    fp16=torch.cuda.is_available(),
    # NOTE(review): per the TrainingArguments docs this flag is not acted on
    # by Trainer itself; pass it to `trainer.train(...)` to actually resume.
    resume_from_checkpoint=True,
)

In [None]:
# Fine-tune BERT on the BERT-tokenized training split and evaluate on the
# validation split, scoring each evaluation with compute_metrics.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=bert_encoded_dataset['train'],
    eval_dataset=bert_encoded_dataset['val'],
    compute_metrics=compute_metrics,
    callbacks=[TrainerCallback()],
)

In [None]:
trainer.train()

#### Saving BERT base model

In [None]:
# Save the fine-tuned base BERT model.
path_for_models = './saved_models/BERTv1'
trainer.save_model(output_dir=path_for_models)

#### Hyperparameter Tuning

In [None]:
def model_init():
    """Return a freshly loaded classifier built from ``model_checkpoint``.

    Passed to ``Trainer`` as ``model_init`` so that a new model can be
    created on demand instead of reusing an already trained one.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
    return fresh_model

In [None]:
# Trainer used for the BERT hyperparameter search. It receives `model_init`
# rather than a model instance, so the model can be rebuilt from the
# checkpoint on demand.
trainer_tuning = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=bert_encoded_dataset['train'],
    eval_dataset=bert_encoded_dataset['val'],
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TrainerCallback()],
)

In [None]:
def compute_objective(metrics):
    """Return the value the search maximizes: the validation macro-F1.

    Args:
        metrics: Evaluation metrics reported by the Trainer; it prefixes the
            keys returned by ``compute_metrics`` with "eval_".

    Returns:
        The macro-F1 score of the evaluated trial.
    """
    return metrics["eval_F1 Macro Score"]


# Run three Optuna trials over the search space and keep the best run.
# `compute_objective` was previously referenced without being defined, which
# raised a NameError before the search could start.
best_trial = trainer_tuning.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=3,
    compute_objective=compute_objective,
)

In [None]:
best_trial

##### Saving BERT tuned model

In [None]:
# The search only reports the best hyperparameters. Apply them to the tuning
# trainer and retrain so that a model trained with the best configuration
# exists to be saved.
for hp_name, hp_value in best_trial.hyperparameters.items():
    setattr(trainer_tuning.args, hp_name, hp_value)
trainer_tuning.train()

# Save from `trainer_tuning`. Saving from `trainer` would write the base
# (untuned) model under the "tuned" path.
path_for_models = './saved_models/BERTv1_tuned'
trainer_tuning.save_model(path_for_models)

#### Evaluation

#### Feature Importance

### RoBERTa Model

#### Model Training 

In [None]:
model_checkpoint_roberta = 'roberta-base'

In [None]:
# Load the pre-trained RoBERTa checkpoint as a sequence classifier, then move
# it to the selected device.
# NOTE: despite its name, `bert_model` holds the RoBERTa model from here on.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_roberta, return_dict=False
)
bert_model = bert_model.to(device)

In [None]:
# Training configuration for the RoBERTa run: checkpoints go to
# "roberta_trainer" every 20000 steps and evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="roberta_trainer",
    save_strategy='steps',
    save_steps=20000,
    evaluation_strategy="epoch",
    # fp16 mixed precision needs a CUDA device. The notebook falls back to
    # the CPU when none is present, so enable it only when a GPU exists
    # instead of hard-coding True (which fails on a CPU-only machine).
    fp16=torch.cuda.is_available(),
    # NOTE(review): per the TrainingArguments docs this flag is not acted on
    # by Trainer itself; pass it to `trainer.train(...)` to actually resume.
    resume_from_checkpoint=True,
)

In [None]:
# Fine-tune RoBERTa on the RoBERTa-tokenized splits. The model must be fed
# ids produced by its own tokenizer; the previous version passed
# `bert_encoded_dataset`, i.e. ids from the BERT vocabulary.
trainer = Trainer(
    model=bert_model,  # holds the RoBERTa model loaded in the cell above
    args=training_args,
    train_dataset=roberta_encoded_dataset['train'],
    eval_dataset=roberta_encoded_dataset['val'],
    compute_metrics=compute_metrics,
    callbacks=[TrainerCallback()],
)

In [None]:
trainer.train()

#### Saving RoBERTa base model

In [None]:
# Save the fine-tuned base RoBERTa model.
path_for_models = './saved_models/RoBERTav1'
trainer.save_model(output_dir=path_for_models)

#### Hyperparameter Tuning

In [None]:
def model_init_roberta():
    """Return a freshly loaded classifier built from ``model_checkpoint_roberta``.

    Passed to ``Trainer`` as ``model_init`` so that a new model can be
    created on demand instead of reusing an already trained one.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint_roberta
    )
    return fresh_model

In [None]:
# Trainer used for the RoBERTa hyperparameter search. It receives
# `model_init_roberta` rather than a model instance, so the model can be
# rebuilt from the checkpoint on demand.
trainer_tuning = Trainer(
    model_init=model_init_roberta,
    args=training_args,
    train_dataset=roberta_encoded_dataset['train'],
    eval_dataset=roberta_encoded_dataset['val'],
    tokenizer=roberta_tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TrainerCallback()],
)

In [None]:
def compute_objective(metrics):
    """Return the value the search maximizes: the validation macro-F1.

    Args:
        metrics: Evaluation metrics reported by the Trainer; it prefixes the
            keys returned by ``compute_metrics`` with "eval_".

    Returns:
        The macro-F1 score of the evaluated trial.
    """
    return metrics["eval_F1 Macro Score"]


# Run three Optuna trials over the search space and keep the best run.
# `compute_objective` was previously referenced without being defined, which
# raised a NameError before the search could start.
best_trial_roberta = trainer_tuning.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=3,
    compute_objective=compute_objective,
)

In [None]:
best_trial_roberta

##### Saving RoBERTa tuned model

In [None]:
# The search only reports the best hyperparameters. Apply them to the tuning
# trainer and retrain so that a model trained with the best configuration
# exists to be saved.
for hp_name, hp_value in best_trial_roberta.hyperparameters.items():
    setattr(trainer_tuning.args, hp_name, hp_value)
trainer_tuning.train()

# Save from `trainer_tuning`. Saving from `trainer` would write the base
# (untuned) model under the "tuned" path.
path_for_models = './saved_models/RoBERTav1_tuned'
trainer_tuning.save_model(path_for_models)

#### Evaluation

#### Feature Importance