# Under construction ... starting from here

In [7]:
! pip install transformers[torch] datasets evaluate swifter accelerate wandb

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-

In [8]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
# load standard modules/libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# load special modules/libraries
import os
import warnings
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from collections import Counter
import string
import re
from tqdm  import tqdm

# load pytorch modules/libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader

from sklearn.metrics import f1_score, roc_auc_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
from torch.utils.data import Dataset, DataLoader

class GFMDataset(Dataset):

  def __init__(self, description, targets, tokenizer, max_len):
    self.description = description
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.description)

  def __getitem__(self, item):
    description = str(self.description[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      description,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
      truncation = True
    )

    return {
      'review_text': description,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [11]:
from transformers import AutoTokenizer
MAX_LEN = 500
BATCH_SIZE = 20

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def load_and_preprocess_dataset(file_path):
    # Load dataset
    raw_df = pd.read_csv(file_path)

    # Subsample 2500 samples for each class where success == 1 and success == 0
    df = pd.concat([
        raw_df[raw_df['success'] == 1].sample(n=2500, random_state=42),
        raw_df[raw_df['success'] == 0].sample(n=2500, random_state=42)
    ], ignore_index=True)

    # Display class distribution
    class_distribution = df['success'].value_counts()
    print("Class Distribution:")
    print(class_distribution)

    # First, split the data into training and temporary data (temp_data)
    df_temp, df_test = train_test_split(df, test_size=0.2, random_state=42)

    # Now, split the temporary data into training and validation data
    df_train, df_val = train_test_split(df_temp, test_size=0.25, random_state=42)

    train_data = GFMDataset(description=df_train['Description'].to_numpy(),
                                    targets=df_train['success'].to_numpy(),
                                    tokenizer=tokenizer,
                                    max_len=MAX_LEN)
    train_data_loader = DataLoader(train_data, batch_size = BATCH_SIZE, num_workers = 4) #modify num_works as needed

    val_data = GFMDataset(description=df_val['Description'].to_numpy(),
                              targets=df_val['success'].to_numpy(),
                              tokenizer=tokenizer,
                              max_len=MAX_LEN)
    val_data_loader = DataLoader(val_data, batch_size=BATCH_SIZE, num_workers=4)  # modify num_works as needed

    test_data = GFMDataset(description=df_test['Description'].to_numpy(),
                                targets=df_test['success'].to_numpy(),
                                tokenizer=tokenizer,
                                max_len=MAX_LEN)
    test_data_loader = DataLoader(test_data, batch_size=BATCH_SIZE, num_workers=4)
    return train_data_loader, val_data_loader, test_data_loader

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import wandb

# Load and preprocess dataset
train_loader, valid_loader, test_loader = load_and_preprocess_dataset("/content/gdrive/MyDrive/ECE1786_Project/processed_en_dataset.csv")

# Load evaluation functions
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
auc = evaluate.load("roc_auc")
recall = evaluate.load("recall")
precision = evaluate.load("precision")

def compute_metrics(eval_pred):
    # predictions = eval_pred.predictions
    # labels = eval_pred.label_ids
    # # Assuming your model outputs logits, use softmax to get probabilities
    # probabilities = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    # predicted_labels = np.argmax(probabilities, axis=1)
    # # Compute different metrics
    # acc = accuracy.compute(predictions=predicted_labels, references=labels)["accuracy"]
    # f1_val = f1.compute(predictions=predicted_labels, references=labels)["f1"]
    # # auc_val = auc.compute(predictions={"prediction_scores": probabilities[:, 1]}, references=labels)  # Assuming binary classification
    # auc_val = roc_auc_score(labels, probabilities[:, 1])
    # recall_val = recall.compute(predictions=predicted_labels, references=labels)["recall"]
    # precision_val = precision.compute(predictions=predicted_labels, references=labels)["precision"]
    # # Calculate loss using cross-entropy
    # loss = F.cross_entropy(torch.tensor(predicted_labels), torch.tensor(labels).float()).item()
    # return {
    #     "loss": loss,
    #     'accuracy': acc,
    #     'f1_score': f1_val,
    #     'auc': auc_val,
    #     'recall': recall_val,
    #     'precision': precision_val
    # }
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    # Assuming your model outputs logits, use softmax to get probabilities
    probabilities = F.softmax(torch.tensor(predictions), dim=1)
    predicted_labels = torch.argmax(probabilities, dim=1)
    # Compute different metrics
    acc = accuracy.compute(predictions=predicted_labels.numpy(), references=labels)["accuracy"]
    f1_val = f1.compute(predictions=predicted_labels.numpy(), references=labels)["f1"]
    auc_val = roc_auc_score(labels, probabilities[:, 1])
    recall_val = recall.compute(predictions=predicted_labels.numpy(), references=labels)["recall"]
    precision_val = precision.compute(predictions=predicted_labels.numpy(), references=labels)["precision"]
    # Calculate loss using cross-entropy
    loss = F.cross_entropy(torch.tensor(predictions), torch.tensor(labels).long()).item()
    return {
        "loss": loss,
        'accuracy': acc,
        'f1_score': f1_val,
        'auc': auc_val,
        'recall': recall_val,
        'precision': precision_val
    }

id2label = {0: "UNSUCCESSFUL", 1: "SUCCESSFUL"}
label2id = {"UNSUCCESSFUL": 0, "SUCCESSFUL": 1}

# Initialize W&B
wandb.init()

# Load T5 model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/gdrive/MyDrive/ECE1786_Project/t5-output/",
    learning_rate=1e-5,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    report_to='wandb',
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_loader.dataset,
    eval_dataset=valid_loader.dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Class Distribution:
1    2500
0    2500
Name: success, dtype: int64




Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: ignored