In [1]:
!pip install pytorch_lightning
!pip install transformers[torch] datasets evaluate swifter accelerate wandb SentencePiece datasets

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.10.0 pytorch_lightning-2.1.2 torchmetrics-1.2.1
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading ev

In [2]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

train_df = pd.read_csv("/content/drive/My Drive/ece 1786/train.csv")
valid_df = pd.read_csv("/content/drive/My Drive/ece 1786/valid.csv")
test_df = pd.read_csv("/content/drive/My Drive/ece 1786/test.csv")

train_df["source_text"] = "goal is " + train_df["goal"].astype(int).astype(str) + ", " + train_df["source_text"]
valid_df["source_text"] = "goal is " + valid_df["goal"].astype(int).astype(str) + ", " + valid_df["source_text"]
test_df["source_text"] = "goal is " + test_df["goal"].astype(int).astype(str) + ", " + test_df["source_text"]

train_df

Mounted at /content/drive


Unnamed: 0,source_text,goal,target_text
0,"goal is 25000, sean alexander neighbor kids co...",25000.0,positive
1,"goal is 20000, name stacey close friend former...",20000.0,negative
2,"goal is 1500, seven years ago twentytwo years ...",1500.0,positive
3,"goal is 2500, begin nathaniel music shopping b...",2500.0,positive
4,"goal is 10000, march 27th senseless tragic sho...",10000.0,positive
...,...,...,...
63292,"goal is 10000, update yesterday busy day colly...",10000.0,negative
63293,"goal is 12000, matt age 43 looks feels like hi...",12000.0,negative
63294,"goal is 7500, many know little roman davitt be...",7500.0,positive
63295,"goal is 3000, everyone knows pamela lindsay sp...",3000.0,positive


In [3]:
import torch
import pytorch_lightning as pl
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5ForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
import numpy as np
from transformers import DataCollatorWithPadding
import wandb

class GFMDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['source_text']
        label = self.dataframe.iloc[idx]['target_text']  # Assuming label is 0 or 1

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_encoding = self.tokenizer(
            label,
            max_length=2,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            # 'decoder_input_ids': target_encoding['input_ids'].squeeze(),
            # 'decoder_attention_mask': target_encoding['attention_mask'].squeeze(),
            'labels': torch.tensor([1 if label == "positive" else 0])  # Encode label as a tensor
        }

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions[0]
    labels = eval_pred.label_ids

    # Model outputs logits, use softmax to get probabilities
    probabilities = np.exp(predictions) / np.exp(predictions).sum(axis=1, keepdims=True)
    predicted_labels = np.argmax(probabilities, axis=1)

    # Compute different metrics
    acc = accuracy_score(labels, predicted_labels)
    f1_val = f1_score(labels, predicted_labels)
    auc_val = roc_auc_score(labels, probabilities[:, 1])
    recall_val = recall_score(labels, predicted_labels)
    precision_val = precision_score(labels, predicted_labels)

    return {
        'accuracy': acc,
        'f1_score': f1_val,
        'auc': auc_val,
        'recall': recall_val,
        'precision': precision_val
    }

In [4]:
# Initialize the T5 model and tokenizer
t5_model = T5ForSequenceClassification.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Freeze all layers except classifier layers
for param in list(t5_model.parameters())[:-4]:
  param.require_grad = False

data_collator = DataCollatorWithPadding(tokenizer=t5_tokenizer)


# Create datasets and dataloaders
max_length = 512
train_dataset = GFMDataset(train_df, t5_tokenizer, max_length)
# take only the first half of the val dataset to avoid Cuda memory error; (same with test dataset as well).
val_dataset = GFMDataset(valid_df.iloc[:int(0.5*len(valid_df))], t5_tokenizer, max_length) 
test_dataset = GFMDataset(test_df.iloc[:int(0.5*len(test_df))], t5_tokenizer, max_length)
test_dataset2 = GFMDataset(test_df.iloc[int(0.5*len(test_df)):], t5_tokenizer, max_length)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
t5_model

T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_featu

In [6]:
list(t5_model.named_parameters())[-4:]

[('classification_head.dense.weight',
  Parameter containing:
  tensor([[-1.2343e-02, -3.2641e-02, -3.0195e-02,  ..., -1.0061e-02,
           -1.2756e-02,  3.8715e-02],
          [ 1.1838e-04,  3.2202e-02, -5.3289e-02,  ...,  1.5630e-02,
            8.3477e-03,  2.9511e-02],
          [ 2.2011e-02, -1.8075e-02,  4.5547e-03,  ..., -1.1903e-02,
            1.6148e-02,  1.3318e-02],
          ...,
          [-1.4536e-02, -1.7940e-01, -4.5960e-02,  ...,  1.7403e-03,
           -6.8194e-02, -2.2127e-02],
          [ 1.3160e-02, -7.7931e-02,  7.4191e-02,  ..., -3.1059e-02,
            5.8592e-02, -1.6870e-02],
          [-9.1884e-03,  3.6063e-03, -5.2137e-02,  ..., -2.1815e-02,
           -4.3882e-02,  5.6135e-03]], requires_grad=True)),
 ('classification_head.dense.bias',
  Parameter containing:
  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [None]:
# Initialize W&B
wandb.init()


training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/ece 1786/t5-small-train-clf-only",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",  # Use a cosine learning rate schedule
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.1,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,  # Log metrics every 100 steps
    save_strategy="steps",
    save_steps = 500,
    load_best_model_at_end=True,
    metric_for_best_model = "accuracy",
    greater_is_better = True,
    report_to='wandb',
    save_total_limit = 1,
)


# Set up PyTorch Lightning Trainer
trainer = Trainer(
    model=t5_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=t5_tokenizer,
       data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss,Accuracy,F1 Score,Auc,Recall,Precision
500,0.6393,0.615039,0.649899,0.673734,0.729517,0.721493,0.631905
1000,0.5911,0.582471,0.686552,0.676409,0.756031,0.653885,0.700541
1500,0.5617,0.597154,0.667846,0.562583,0.778272,0.426337,0.82681
2000,0.5502,0.587859,0.697927,0.642323,0.781512,0.541372,0.789551
2500,0.5593,0.553968,0.716127,0.708992,0.787305,0.690212,0.728823
3000,0.5486,0.557468,0.714358,0.700424,0.790622,0.666498,0.737989
3500,0.5563,0.547992,0.716886,0.725086,0.794395,0.745207,0.706023
4000,0.5565,0.546396,0.715369,0.703996,0.794894,0.67558,0.734907
4500,0.5267,0.552646,0.715116,0.703655,0.796081,0.675076,0.734761
5000,0.5397,0.549968,0.715369,0.717653,0.798703,0.721998,0.71336


Step,Training Loss,Validation Loss,Accuracy,F1 Score,Auc,Recall,Precision
500,0.6393,0.615039,0.649899,0.673734,0.729517,0.721493,0.631905
1000,0.5911,0.582471,0.686552,0.676409,0.756031,0.653885,0.700541
1500,0.5617,0.597154,0.667846,0.562583,0.778272,0.426337,0.82681
2000,0.5502,0.587859,0.697927,0.642323,0.781512,0.541372,0.789551
2500,0.5593,0.553968,0.716127,0.708992,0.787305,0.690212,0.728823
3000,0.5486,0.557468,0.714358,0.700424,0.790622,0.666498,0.737989
3500,0.5563,0.547992,0.716886,0.725086,0.794395,0.745207,0.706023
4000,0.5565,0.546396,0.715369,0.703996,0.794894,0.67558,0.734907
4500,0.5267,0.552646,0.715116,0.703655,0.796081,0.675076,0.734761
5000,0.5397,0.549968,0.715369,0.717653,0.798703,0.721998,0.71336


In [None]:
trainer.predict(val_dataset)

In [None]:
trainer.predict(test_dataset)

In [None]:
trainer.predict(test_dataset2)