# **Insert Title Here**
**DATA103 S11 Group 4**
- GOZON, Jean Pauline D.
- JAMIAS, Gillian Nicole A.
- MARCELO, Andrea Jean C.
- REYES, Anton Gabriel G.
- VICENTE, Francheska Josefa

## Requirements and Imports

### Imports

**Basic Libraries**

* `numpy` contains a large collection of mathematical functions
* `pandas` contains functions that are designed for data manipulation and data analysis



In [2]:
import numpy as np
import pandas as pd
import datasets

**Machine Learning Libraries**

* `torch` is an open-source ML library for building deep neural networks
* `transformers` contains pre-trained models

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_lightning.callbacks import ProgressBarBase, RichProgressBar

In [5]:
from transformers import AutoTokenizer, BertTokenizerFast, AutoModelForSequenceClassification, TrainerCallback, TrainingArguments, Trainer

In [6]:
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss, accuracy_score
from transformers import EvalPrediction
import evaluate

In [7]:
import pickle

In [9]:
# Load the cleaned dataset from the working directory; the bare `df` on the
# last line makes the notebook render a preview of the frame.
# NOTE(review): the rendered preview shows an `Unnamed: 0` column, which
# suggests the CSV was written together with its index -- consider
# `index_col=0` if that column is not needed.
df = pd.read_csv ('cleaned_data.csv')
df

Unnamed: 0,class,text,text_token
0,0,"['its not a viable option, and youll be leavin...","['its', 'not', 'a', 'viable', 'option', 'and',..."
1,1,['it can be hard to appreciate the notion that...,"['it', 'can', 'be', 'hard', 'to', 'appreciate'..."
2,1,"['hi, so last night i was sitting on the ledge...","['hi', 'so', 'last', 'night', 'i', 'was', 'sit..."
3,1,['i tried to kill my self once and failed badl...,"['i', 'tried', 'to', 'kill', 'my', 'self', 'on..."
4,1,['hi nem3030. what sorts of things do you enjo...,"['hi', 'nem3030', 'what', 'sorts', 'of', 'thin..."
...,...,...,...
242155,0,if you don't like rock then your not going to ...,"['if', 'you', 'don', 't', 'like', 'rock', 'the..."
242156,0,you how you can tell i have so many friends an...,"['you', 'how', 'you', 'can', 'tell', 'i', 'hav..."
242157,0,pee probably tastes like salty tea😏💦‼️ can som...,"['pee', 'probably', 'tastes', 'like', 'salty',..."
242158,1,the usual stuff you find herei'm not posting t...,"['the', 'usual', 'stuff', 'you', 'find', 'here..."


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

## Feature Engineering

### Splitting the Dataset into Train, Validation, and Test Sets

In [None]:
# Model input: the `text` column of the loaded frame.
X = df ['text']
X

In [None]:
# Target: the `class` column of the loaded frame.
y = df ['class']
y

In [None]:
# Hold out 20% of the rows as the test set. The split is shuffled with a
# fixed seed and stratified on `y` so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Carve the validation set out of the remaining training rows: 10% of the
# training portion, again shuffled with a fixed seed and stratified on the
# training labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=42,
    shuffle=True,
    stratify=y_train,
)

In [None]:
# Sanity check: report the sizes of the training inputs and labels.
print('Train input  shape: ', X_train.shape)
print('Train output shape: ', y_train.shape)

In [None]:
# Sanity check: report the sizes of the validation inputs and labels.
print('Val input  shape: ', X_val.shape)
print('Val output shape: ', y_val.shape)

In [None]:
# Sanity check: report the sizes of the test inputs and labels.
print('Test input  shape: ', X_test.shape)
print('Test output shape: ', y_test.shape)

In [None]:
# Put the training text and labels back side by side, then replace the
# shuffled row labels with a fresh 0..n-1 index.
train_df = pd.concat([X_train, y_train], axis=1)
train_df = train_df.reset_index(drop=True)
train_df

In [None]:
# Put the validation text and labels back side by side, then replace the
# shuffled row labels with a fresh 0..n-1 index.
val_df = pd.concat([X_val, y_val], axis=1)
val_df = val_df.reset_index(drop=True)
val_df

In [None]:
# Put the test text and labels back side by side, then replace the
# shuffled row labels with a fresh 0..n-1 index.
test_df = pd.concat([X_test, y_test], axis=1)
test_df = test_df.reset_index(drop=True)
test_df

### Tokenizing with BERT

In [None]:
# Load the tokenizer published with the `bert-base-uncased` checkpoint, the
# same checkpoint name the classifier is loaded from later in the notebook.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Token length every example is padded/truncated to by `preprocess_function`.
# NOTE(review): presumably chosen to match the maximum input length of
# bert-base-uncased -- confirm before changing.
MAX_LENGTH = 512

In [None]:
# Convert the training DataFrame into a `datasets.Dataset`.
train_dataset = datasets.Dataset.from_pandas(train_df)
train_dataset

In [None]:
# Convert the validation DataFrame into a `datasets.Dataset`.
val_dataset = datasets.Dataset.from_pandas(val_df)
val_dataset

In [None]:
# Convert the test DataFrame into a `datasets.Dataset`.
test_dataset = datasets.Dataset.from_pandas(test_df)
test_dataset

In [None]:
# Bundle the three splits under the keys "train", "val" and "test" so they
# can be processed together.
dataset = datasets.DatasetDict(
    {"train": train_dataset, "val": val_dataset, "test": test_dataset}
)

dataset

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach their class labels.

    Args:
        examples: A batch mapping that provides a "text" entry (the raw
            strings to tokenize) and a "class" entry (the labels).

    Returns:
        The tokenizer output for the batch, padded/truncated to
        ``MAX_LENGTH`` tokens, with an extra "labels" entry holding the
        class values as a ``torch`` tensor.
    """
    texts = examples["text"]
    encoding = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    class_ids = examples['class']
    encoding["labels"] = torch.tensor(class_ids)
    return encoding

In [None]:
# Tokenize every split in batches; the original columns are removed so only
# the fields produced by `preprocess_function` remain.
encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

In [None]:
# Switch the encoded splits to the "torch" output format (per the `datasets`
# `set_format` API) so examples are handed to the model as tensors.
encoded_dataset.set_format("torch")

## Modeling and Evaluation

### BERT Model

#### Model Training 

In [None]:
# Load the pre-trained checkpoint with a sequence-classification head.
# `return_dict=False` makes the model return plain tuples.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased', return_dict=False
)
# Move the weights to the device selected earlier in the notebook.
bert_model = bert_model.to(device)

In [None]:
# Training configuration for the BERT fine-tuning run: checkpoints go to
# "bert_trainer" every 20000 steps and evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="bert_trainer",
    save_strategy='steps',
    save_steps=20000,
    # Mixed precision needs a CUDA device. The notebook explicitly allows a
    # CPU fallback when selecting `device`, so only enable fp16 when CUDA is
    # actually available instead of failing on CPU-only machines.
    fp16=torch.cuda.is_available(),
    evaluation_strategy="epoch",
    # NOTE(review): per the transformers documentation this argument is not
    # read by `Trainer` itself; to actually resume, pass
    # `resume_from_checkpoint=...` to `trainer.train()`.
    resume_from_checkpoint=True,
)

In [None]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def metrics(predictions, labels, threshold=0.5):
    """Compute F1, ROC AUC, Hamming loss and accuracy from raw logits.

    Two label layouts are handled:

    * Single-label: ``labels`` is a 1-D array of class indices and
      ``predictions`` is ``(n_samples, n_classes)``. The predicted class is
      the arg-max over the class axis and ROC AUC is computed from the
      softmax probabilities. Thresholding each logit here would yield a
      multilabel indicator matrix that scikit-learn refuses to compare
      against 1-D targets.
    * Multi-label: ``labels`` has the same shape as ``predictions``. A
      sigmoid is applied per logit and every probability at or above
      ``threshold`` becomes a positive prediction.

    Args:
        predictions: Raw logits (array-like or tensor).
        labels: Ground-truth labels in one of the two layouts above.
        threshold: Decision threshold for the multi-label layout; unused
            for single-label targets.

    Returns:
        dict with the keys ``f1_micro_average``, ``roc_auc``,
        ``hamming_loss_score``, ``f1_macro_average`` and ``accuracy``.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when ROC AUC is
            undefined because ``labels`` contains a single class.
    """
    logits = torch.as_tensor(predictions, dtype=torch.float32).detach().cpu()
    y_true = np.asarray(labels)

    if y_true.ndim == 1 and logits.ndim == 2:
        # Single-label targets: one class index per sample.
        class_probs = torch.softmax(logits, dim=-1).numpy()
        y_pred = class_probs.argmax(axis=-1)
        if class_probs.shape[1] == 2:
            # Binary case: score with the probability of the positive class.
            roc_auc = roc_auc_score(y_true, class_probs[:, 1])
        else:
            roc_auc = roc_auc_score(y_true, class_probs, multi_class='ovr')
    else:
        # Multi-label targets: independent sigmoid per label, then threshold.
        probs = torch.sigmoid(logits)
        y_pred = (probs >= threshold).numpy().astype(float)
        # Kept as in the referenced source: ROC AUC on the thresholded
        # predictions rather than on the probabilities.
        roc_auc = roc_auc_score(y_true, y_pred, average='micro')

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    hamming_loss_score = hamming_loss(y_true=y_true, y_pred=y_pred)
    accuracy = accuracy_score(y_true, y_pred)

    # Return as a dictionary (named `results` so it does not shadow this
    # function's own name).
    results = {
        'f1_micro_average': f1_micro_average,
        'roc_auc': roc_auc,
        'hamming_loss_score': hamming_loss_score,
        'f1_macro_average': f1_macro_average,
        'accuracy': accuracy,
    }
    return results

In [None]:
def compute_metrics(p: EvalPrediction):
    """Adapt an ``EvalPrediction`` to the ``metrics`` helper.

    Args:
        p: Evaluation output exposing ``predictions`` and ``label_ids``.
            When ``predictions`` is a tuple, its first element is used.

    Returns:
        The dictionary produced by ``metrics`` for these predictions.
    """
    if isinstance(p.predictions, tuple):
        preds = p.predictions[0]
    else:
        preds = p.predictions
    # The helper defined in this notebook is called `metrics`; the previous
    # name `multi_label_metrics` is not defined anywhere here and raised a
    # NameError on the first evaluation.
    return metrics(predictions=preds, labels=p.label_ids)

In [None]:
# Wire the model, the training configuration, the encoded train/val splits
# and the metric function into a single Trainer.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['val'],
    compute_metrics=compute_metrics,
    # NOTE(review): this registers a bare `TrainerCallback` instance; it
    # looks like a placeholder that adds no custom hooks -- confirm.
    callbacks=[TrainerCallback()],
)

In [None]:
# Start fine-tuning with the configuration assembled in `trainer`.
# NOTE(review): called without `resume_from_checkpoint`, so this presumably
# always starts from the pre-trained weights -- confirm that is intended.
trainer.train()

#### Saving BERT base model

In [None]:
# Save the trained model to a directory relative to the working directory.
path_for_models ='./saved_models/BERTv1'
trainer.save_model(path_for_models)

#### Hyperparameter Tuning

#### Evaluation

#### Feature Importance