<a href="https://colab.research.google.com/github/elana1fel/Dreaddit_implementation/blob/main/Contrastive_Learning_for_stress_clasification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/elana1fel/Dreaddit_implementation.git

Cloning into 'Dreaddit_implementation'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 11 (delta 1), reused 6 (delta 1), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [2]:
! pip install transformers
! pip install torchsampler
! pip install pytorch-metric-learning


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 14.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 73.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 24.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchsampler
  Downloading torchsampler-0.1.2-py3-none-any.whl (5.6 kB)
Installing collected packages: torchsampler
Success

In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
import torch.nn as nn

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from transformers import AutoTokenizer, AutoModelForSequenceClassification,\
                         AdamW, get_scheduler, AutoModel
from sklearn.metrics import precision_recall_fscore_support
from tqdm import tqdm
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchsampler import ImbalancedDatasetSampler
from pytorch_metric_learning import losses
import sys
import os


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [4]:
import torch

torch.cuda.is_available()

True

# Adding Vader as a feature

In [5]:
def get_vader_feature(df):
    sid = SentimentIntensityAnalyzer()
    df['neg'] = df['text'].apply(lambda review: sid.polarity_scores(review)['neg'])
    df['pos'] = df['text'].apply(lambda review: sid.polarity_scores(review)['pos'])
    df['neu'] = df['text'].apply(lambda review: sid.polarity_scores(review)['neu'])
    df['compound'] = df['text'].apply(lambda review: sid.polarity_scores(review)['compound'])

# StressDataset class

In [6]:
class StressDataset(Dataset):
    def __init__(self, file_path, mode):
        super().__init__()
        self.mode = mode
        df = pd.read_csv(file_path)
        self.labels = df['label'].tolist()
        self.data = {}
        get_vader_feature(df)
        for idx, row in df.iterrows():
            self.data[idx] = (row['text'], row['neg'], row['neu'], row['pos'], row['compound'], row['label'])
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
          text, neg, neu, pos, compound, label = self.data[idx]
          vad_score = [neg, neu, pos, compound]
          return (text, torch.tensor(vad_score), torch.tensor(label, dtype=torch.long))


    def get_labels(self):
        return self.labels


# model class

In [7]:
class Model(nn.Module):
    def __init__(self, pretrained_type, config):
        super().__init__()

        num_labels = 2
        self.pretrained_model = AutoModel.from_pretrained(pretrained_type, num_labels=num_labels)
        self.dense = nn.Linear(768, config['hidden'])
        self.dropout = nn.Dropout(config['dropout'])
        self.classifier = nn.Linear(config['hidden'], num_labels)

        vad_dim = 128
        self.vad_embedding = nn.Sequential(
            nn.Linear(4, vad_dim),
            nn.GELU()
        )

        self.pool_embedding = nn.Sequential(
            nn.Linear(config['hidden'] + vad_dim, config['hidden']),
            nn.GELU()
        )

        self.gelu = nn.GELU()

        torch.nn.init.orthogonal_(self.dense.weight)
        torch.nn.init.orthogonal_(self.classifier.weight)

    def forward(self, vad_score, **pretrained_text):
        vad_embedding = self.vad_embedding(vad_score)
        outputs = self.pretrained_model(**pretrained_text).last_hidden_state
        pretrained_output = outputs[:, 0, :]
        pretrained_output = self.gelu(self.dense(pretrained_output))

        pooled_output = torch.cat((vad_embedding, pretrained_output), dim=1)
        pooled_output = self.pool_embedding(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        return logits, pretrained_output, vad_embedding

In [8]:
res_path = "/content/Dreaddit_implementation/models"
if not os.path.isdir(res_path):
  os.makedirs(res_path)


# Supervised Contrastive Learning

In [9]:
import random

MODEL = {
    "roberta":{
        "pretrain": "cardiffnlp/twitter-roberta-base-sentiment",
        "name": "twitter-roberta-base-sentiment"
    }
}

os.environ["TOKENIZERS_PARALLELISM"] = "false"
PRINT_MODEL = True

def set_seed():
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

def prepare_data(train_path, dev_path):
    train_data = StressDataset(train_path, mode='train')
    dev_data = StressDataset(dev_path, mode='val')
    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, sampler=ImbalancedDatasetSampler(train_data))
    dev_dataloader = DataLoader(dev_data, batch_size=1, shuffle=False)
    return train_dataloader, dev_dataloader

def train():
    model_type = 'roberta'
    train_path='/content/Dreaddit_implementation/dreaddit-train.csv'
    dev_path='/content/Dreaddit_implementation/dreaddit-test.csv'

    set_seed()
    config = {
        'dropout': DROPOUT,
        'hidden': HIDDEN
    }
    train_dataloader, dev_dataloader = prepare_data(train_path, dev_path)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"running on: {device}")

    model = Model(MODEL[model_type]["pretrain"], config).to(device)

    #print(model)
    tokenizer = AutoTokenizer.from_pretrained(MODEL[model_type]["pretrain"])
    optimizer = AdamW(model.parameters(), lr=LR)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=WARM_UP, num_training_steps=len(train_dataloader)*EPOCHS)
    criterion = nn.CrossEntropyLoss() 
    #loss_func = losses.SupConLoss().to(device)
    loss_func = nn.TripletMarginLoss(margin=1.0).to(device)
    # check trained parameters
    print("Parameters to train:", sum(p.numel() for p in model.parameters() if p.requires_grad))

    best_f1 = 0
    pbar = tqdm(range(EPOCHS), desc='Epoch: ')
    for epoch in pbar:
        model.train()
        total_loss = 0
        for data in train_dataloader:
            optimizer.zero_grad()
            text, vad_score, label = list(data[0]), data[1].to(device), data[2].to(device)
            input_text = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            
            logits, pretrained_output, vad_embedding = model(vad_score=vad_score, **input_text)

            ce_loss = criterion(logits, label)

            if torch.numel(torch.unique(label))==2:
              bs, emb = vad_embedding.shape[0], vad_embedding.shape[1]
              ones_args = torch.argwhere(label).reshape(-1,).tolist()
              zeros_args = torch.argwhere(1-label).reshape(-1,).tolist()
              
              base_vad = []
              pos_vad = []
              neg_vad = []
              for i in range(len(zeros_args)-1):
                for j in range(i+1, len(zeros_args)):
                  base_vad.append(vad_embedding[zeros_args[i]])
                  pos_vad.append(vad_embedding[zeros_args[j]])
                  neg_vad.append(vad_embedding[random.choice(ones_args)])
              for i in range(len(ones_args)-1):
                for j in range(i+1, len(ones_args)):
                  base_vad.append(vad_embedding[ones_args[i]])
                  pos_vad.append(vad_embedding[ones_args[j]])
                  neg_vad.append(vad_embedding[random.choice(zeros_args)])
              base_vad = torch.stack(base_vad)
              pos_vad = torch.stack(pos_vad)
              neg_vad = torch.stack(neg_vad)
              '''scl_pretrained_loss = loss_func(pretrained_output, label)
              scl_vad_loss = loss_func(vad_embedding, label)'''
              scl_vad_loss = loss_func(base_vad, pos_vad, neg_vad)

              base_pretrained = []
              pos_pretrained = []
              neg_pretrained = []
              for i in range(len(zeros_args)-1):
                for j in range(i+1, len(zeros_args)):
                  base_pretrained.append(pretrained_output[zeros_args[i]])
                  pos_pretrained.append(pretrained_output[zeros_args[j]])
                  neg_pretrained.append(pretrained_output[random.choice(ones_args)])
              for i in range(len(ones_args)-1):
                for j in range(i+1, len(ones_args)):
                  base_pretrained.append(pretrained_output[ones_args[i]])
                  pos_pretrained.append(pretrained_output[ones_args[j]])
                  neg_pretrained.append(pretrained_output[random.choice(zeros_args)])
              base_pretrained = torch.stack(base_pretrained)
              pos_pretrained = torch.stack(pos_pretrained)
              neg_pretrained = torch.stack(neg_pretrained)
              scl_pretrained_loss = loss_func(base_pretrained, pos_pretrained, neg_pretrained)
            else:
              scl_pretrained_loss = 0
              scl_vad_loss = 0
            loss = LAMBDA * ce_loss + (1-LAMBDA) * scl_pretrained_loss + (LAMBDA2) * scl_vad_loss

            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        scheduler.step()
    
        model.eval()
        pred = []
        labels = []
        for data in dev_dataloader:
            text, vad_score, label = list(data[0]), data[1].to(device), data[2].to(device)
            input_text = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            with torch.no_grad():
                logits, pretrained_output, vad_embedding = model(vad_score=vad_score, **input_text)

            pred.append(torch.argmax(logits, dim=-1).cpu().numpy())
            labels.append(label.cpu().numpy())
        precision, recall, f1, support = precision_recall_fscore_support(labels, pred, average='weighted', zero_division=1)
        precision = round(precision, 4)
        recall = round(recall, 4)
        f1 = round(f1, 4)
        avg_loss = round(total_loss/len(train_dataloader), 4)
        pbar.set_description(f"Epoch: {epoch}, F1 score: {f1}, Loss: {avg_loss}", refresh=True)
        print(f"epoch: {epoch}, f1: {f1}, train loss: {avg_loss}, precision: {precision}, recall: {recall}, support: {support}")
        if f1 > best_f1:
            print(f"best_f1_weighted: {f1}")
            print(f"best_precision_weighted: {precision}")
            print(f"best_recall_weighted: {recall}")
            best_f1 = f1
            if f1 >= 0.8:
                torch.save(model.state_dict(), f"{res_path}/{MODEL[model_type]['name']}_{f1}.pt")




In [13]:
EPOCHS = 10
LR = 4e-5
BATCH_SIZE = 8
SEED = 42
WARM_UP = 5
HIDDEN = 512
DROPOUT = 0.1
LAMBDA = 0.6
LAMBDA2 = 0.1

train()

running on: cuda


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Parameters to train: 125369218


Epoch: 0, F1 score: 0.3032, Loss: 1.0333:  10%|█         | 1/10 [01:01<09:17, 61.93s/it]

epoch: 0, f1: 0.3032, train loss: 1.0333, precision: 0.3013, recall: 0.3077, support: None
best_f1_weighted: 0.3032
best_precision_weighted: 0.3013
best_recall_weighted: 0.3077


Epoch: 1, F1 score: 0.8221, Loss: 0.7219:  10%|█         | 1/10 [02:03<09:17, 61.93s/it]

epoch: 1, f1: 0.8221, train loss: 0.7219, precision: 0.823, recall: 0.8224, support: None
best_f1_weighted: 0.8221
best_precision_weighted: 0.823
best_recall_weighted: 0.8224


Epoch: 2, F1 score: 0.8043, Loss: 0.504:  30%|███       | 3/10 [03:07<07:19, 62.80s/it]

epoch: 2, f1: 0.8043, train loss: 0.504, precision: 0.8102, recall: 0.8056, support: None


Epoch: 3, F1 score: 0.7573, Loss: 0.4114:  40%|████      | 4/10 [04:09<06:14, 62.39s/it]

epoch: 3, f1: 0.7573, train loss: 0.4114, precision: 0.7942, recall: 0.765, support: None


Epoch: 4, F1 score: 0.8266, Loss: 0.3399:  40%|████      | 4/10 [05:11<06:14, 62.39s/it]

epoch: 4, f1: 0.8266, train loss: 0.3399, precision: 0.8277, recall: 0.8266, support: None
best_f1_weighted: 0.8266
best_precision_weighted: 0.8277
best_recall_weighted: 0.8266


Epoch: 5, F1 score: 0.8112, Loss: 0.2619:  60%|██████    | 6/10 [06:14<04:09, 62.35s/it]

epoch: 5, f1: 0.8112, train loss: 0.2619, precision: 0.8126, recall: 0.8112, support: None


Epoch: 6, F1 score: 0.8041, Loss: 0.2354:  70%|███████   | 7/10 [07:15<03:05, 61.89s/it]

epoch: 6, f1: 0.8041, train loss: 0.2354, precision: 0.8076, recall: 0.8042, support: None


Epoch: 7, F1 score: 0.783, Loss: 0.2156:  80%|████████  | 8/10 [08:16<02:02, 61.48s/it]

epoch: 7, f1: 0.783, train loss: 0.2156, precision: 0.7987, recall: 0.7846, support: None


Epoch: 8, F1 score: 0.7915, Loss: 0.189:  90%|█████████ | 9/10 [09:16<01:01, 61.16s/it]

epoch: 8, f1: 0.7915, train loss: 0.189, precision: 0.798, recall: 0.793, support: None


Epoch: 9, F1 score: 0.7919, Loss: 0.1797: 100%|██████████| 10/10 [10:18<00:00, 61.84s/it]

epoch: 9, f1: 0.7919, train loss: 0.1797, precision: 0.804, recall: 0.7944, support: None



