In [None]:
!pip install --upgrade pip
!pip install sentencepiece datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [None]:
# Print the GPU attached to this runtime together with its driver/CUDA versions and memory usage.
!nvidia-smi

Thu Mar  9 05:56:22 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    21W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
import torch
import numpy as np
from sklearn.metrics import classification_report,accuracy_score
import pandas as pd
from scipy.special import softmax
from collections import Counter
import torch.nn as nn
from google.colab import drive
# Mount Google Drive: the CSV inputs, checkpoints and logs used below all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the unlabelled self-training corpus and the labelled validation split from Drive.
# Only the first column of each header-less CSV is used.
# Training entries are coerced to str; validation texts are kept exactly as read.
train = [
    str(entry)
    for entry in pd.read_csv('/content/drive/MyDrive/val/df_sentiment_train_text.csv', header=None).values[:, 0].tolist()
]
val = pd.read_csv('/content/drive/MyDrive/val/df_sentiment_val_text.csv', header=None).values[:, 0].tolist()

# Translate the label names into class ids in one pass; any other value is kept unchanged.
val_label = [
    2 if name == 'positive' else 1 if name == 'neutral' else 0 if name == 'negative' else name
    for name in pd.read_csv('/content/drive/MyDrive/val/df_sentiment_val_label.csv', header=None).values[:, 0].tolist()
]

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container. Values may be tensors (tokenizer called with
            ``return_tensors='pt'``) or nested Python lists.
        labels: Sequence of numeric class ids, one per example.

    Attributes:
        encodings: The mapping passed in, stored by reference.
        labels: Tensor built from ``labels`` and placed on ``device``.
        device: ``'cuda'`` when a GPU is available, otherwise ``'cpu'``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.labels = torch.tensor(labels).to(self.device)

    @staticmethod
    def _copy_as_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning on every call, so
        tensors are copied with ``clone().detach()`` and only plain Python data
        goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoded feature of example ``idx`` plus its ``'labels'`` entry."""
        item = {key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._copy_as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# subclassing the Trainer class and overriding the compute_loss function with a custom loss function to balance the classes
class CustomTrainer(Trainer):
    """``Trainer`` whose loss is cross-entropy with per-class weights.

    Args:
        class_weights: Tensor with one weight per class, forwarded to
            ``nn.CrossEntropyLoss(weight=...)``. ``None`` disables weighting.
        **kwargs: Passed unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, **kwargs):
        super().__init__(**kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted because recent ``transformers``
                releases pass it as a keyword; it is not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Follow the device of the logits instead of assuming 'cuda', so the
        # same code also runs on a CPU-only runtime.
        labels = inputs.get("labels").to(logits.device)
        weight = self.class_weights
        if weight is not None:
            weight = weight.to(logits.device)
        # compute custom loss (one weight per label)
        loss_fct = nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# self training class
class SelfTraining:
    # first initialize the class with the model and tokenizer
    def __init__(self,model,tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.CUDA = True
        self.LR = 2e-5
        self.EPOCHS = 5
        self.BATCH_SIZE =100
        self.MAX_TRAINING_EXAMPLES = -1
        self.imbalanced = True
        self.best_acc = 0
        self.acc = []
        self.class_weights = torch.tensor([1,2,10],dtype=torch.float32).to('cuda') if torch.cuda.is_available() else torch.tensor(class_weights,dtype=torch.float32)

    # load the unlabeled training data and validation data with labels
    def load_data(self,train,val,val_label):
        self.train_data = train
        self.val_data = val
        self.val_data_label = val_label

    # preprocess the data
    def preprocess(self,corpus):
        outcorpus = []
        for text in corpus:
            new_text = []
            for t in text.split(" "):
                t = '@user' if t.startswith('@') and len(t) > 1 else t
                t = 'http' if t.startswith('http') else t
                new_text.append(t)
            new_text = " ".join(new_text)
            outcorpus.append(new_text)
        return outcorpus

    # forward pass
    def forward(self,text, cuda=True):
        text = self.preprocess(text)
        encoded_input = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        if cuda:
            encoded_input.to('cuda')
            output = self.model(**encoded_input)
            scores = output[0].detach().cpu().numpy()
        else:
            output = self.model(**encoded_input)
            scores = output[0].detach().numpy()

        scores = softmax(scores, axis=-1)
        return scores

    # predict the labels and construct MyDataset for training data
    def predict(self):
        if self.CUDA:
            self.model.to('cuda')
        corpus = self.train_data
        dl = DataLoader(corpus, batch_size=self.BATCH_SIZE)
        all_preds = []
        all_scores = []

        for idx,batch in enumerate(dl):
            text = self.preprocess(batch)
            scores = self.forward(text, cuda=self.CUDA)
            preds = np.argmax(scores, axis=-1)
            all_preds.extend(preds)
            all_scores.extend(np.max(scores, axis=-1))

        print('done')
        train = pd.DataFrame(corpus)
        train['labels'] = all_preds
        train['scores'] = all_scores
        train.columns = ['texts','labels','scores']
        train = train[train['scores'] > 0.9].reset_index()
        if len(train) == 0:
            print("No new training examples found")
            return False

        else:
            train_label = train['labels'].values.tolist()
            train_label = [2 if i == 'positive' else i for i in train_label]
            train_label = [1 if i == 'neutral' else i for i in train_label]
            train_label = [0 if i == 'negative' else i for i in train_label]
            if self.imbalanced:
                class_counts = np.bincount(train_label)
                class_weights = [sum(class_counts) / count for count in class_counts]
                self.class_weights = torch.tensor(class_weights,dtype=torch.float32).to('cuda') if torch.cuda.is_available() else torch.tensor(class_weights,dtype=torch.float32)
            train_text = train['texts'].values.tolist()
            train_encodings = self.tokenizer(train_text, return_tensors='pt', padding=True, truncation=True)
            if self.CUDA:
                train_encodings.to('cuda')
            train_dataset = MyDataset(train_encodings, train_label)
            self.train_dataset = train_dataset
            print("New training examples found: ",len(train_dataset))
            return True

    # encode the validation data and construct MyDataset for it
    def encode_val(self):
        val_encodings = self.tokenizer(self.val_data, return_tensors='pt', padding=True, truncation=True)
        if self.CUDA:
            val_encodings.to('cuda')
        val_dataset = MyDataset(val_encodings, self.val_data_label)
        self.val_dataset = val_dataset

    # train the model iteratively with a parameter for number of iterations
    def train(self,iterations=2):
        """Run iterative self-training with only the classifier head trainable.

        Each iteration calls ``predict()`` to rebuild ``self.train_dataset``
        from confident pseudo-labels, trains with ``CustomTrainer``, and
        evaluates on the validation set. The accuracy is appended to
        ``self.acc``. A checkpoint is written when the accuracy beats
        ``self.best_acc`` and always after the final iteration. The loop stops
        early when ``predict()`` returns ``False``.

        Args:
            iterations: Maximum number of self-training rounds.
        """
        if self.CUDA:
            self.model.to('cuda')
        # Freeze every parameter, then re-enable the classification head only.
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.classifier.parameters():
            param.requires_grad = True

        self.encode_val()
        training_args = TrainingArguments(
            output_dir='/content/drive/MyDrive/results',                   # output directory
            num_train_epochs=self.EPOCHS,                  # total number of training epochs
            learning_rate=self.LR,                         # configured LR (previously set but never passed on)
            per_device_train_batch_size=self.BATCH_SIZE,   # batch size per device during training
            per_device_eval_batch_size=self.BATCH_SIZE,    # batch size for evaluation
            warmup_steps=0,                         # number of warmup steps for learning rate scheduler
            weight_decay=0.01,                        # strength of weight decay
            logging_dir='/content/drive/MyDrive/results/logs',                     # directory for storing logs
            logging_steps=10,                         # when to print log
            load_best_model_at_end=True,              # load or not best model at the end
            evaluation_strategy='steps',              # when to evaluate
            dataloader_pin_memory=False
        )
        for i in range(iterations):
            round_no = i + 1
            print("Iteration: ",round_no)
            if not self.predict():
                # No confident pseudo-labels left: nothing to train on.
                break
            print(len(self.train_dataset))
            trainer = CustomTrainer(
                model=self.model,                              # the instantiated 🤗 Transformers model to be trained
                args=training_args,                       # training arguments, defined above
                train_dataset=self.train_dataset,              # training dataset
                eval_dataset=self.val_dataset,                # evaluation dataset
                class_weights=self.class_weights
            )
            trainer.train()

            # evaluate on the validation set
            test_preds_raw, test_labels , _ = trainer.predict(self.val_dataset)
            test_preds = np.argmax(test_preds_raw, axis=-1)
            print(classification_report(test_labels, test_preds, digits=3))

            # keep the accuracy of this iteration for comparison later
            self.acc.append(accuracy_score(test_labels, test_preds))
            print("Accuracy: ",str(self.acc[-1])," - iteration: ",str(round_no))

            # Save when this is the best model so far, and always after the
            # last iteration; one combined check avoids writing it twice.
            is_best = self.acc[-1] > self.best_acc
            if is_best:
                self.best_acc = self.acc[-1]
            if is_best or round_no == iterations:
                trainer.save_model('/content/drive/MyDrive/results/iteration_'+str(round_no))
                print("Model saved! - iteration: ",str(round_no))

    def full_train(self,iterations=2):
        """Fine-tune every model parameter on the labelled CSVs stored on Drive.

        Reads texts and labels from ``df_sentiment_test_text.csv`` and
        ``df_sentiment_test_label.csv``, stores them as ``self.train_dataset``,
        and unfreezes all parameters. It then repeats train/evaluate
        ``iterations`` times on that same dataset. The accuracy of each round
        is appended to ``self.acc``. A checkpoint is written when the accuracy
        beats ``self.best_acc`` and always after the final round.

        Args:
            iterations: Number of train/evaluate rounds.
        """
        data = pd.read_csv('/content/drive/MyDrive/val/df_sentiment_test_text.csv',header=None).values[:,0].tolist()
        data_label = pd.read_csv('/content/drive/MyDrive/val/df_sentiment_test_label.csv',header=None).values[:,0].tolist()
        # Same name -> id mapping the notebook applies to the validation labels.
        # Presumably this file uses the same label names (TODO confirm); any
        # other value passes through unchanged.
        label_ids = {'positive': 2, 'neutral': 1, 'negative': 0}
        data_label = [label_ids.get(label, label) for label in data_label]
        # Use this instance's tokenizer; the original reached for the notebook
        # global `test`, which only worked when the instance had that name.
        data_encodings = self.tokenizer(data, return_tensors='pt', padding=True, truncation=True)
        if self.CUDA:
            self.model.to('cuda')
            data_encodings.to('cuda')
        self.train_dataset = MyDataset(data_encodings, data_label)
        for param in self.model.parameters():
            param.requires_grad = True

        self.encode_val()
        training_args = TrainingArguments(
            output_dir='/content/drive/MyDrive/results',                   # output directory
            num_train_epochs=self.EPOCHS,                  # total number of training epochs
            learning_rate=self.LR,                         # configured LR (previously set but never passed on)
            per_device_train_batch_size=self.BATCH_SIZE,   # batch size per device during training
            per_device_eval_batch_size=self.BATCH_SIZE,    # batch size for evaluation
            warmup_steps=0,                         # number of warmup steps for learning rate scheduler
            weight_decay=0.01,                        # strength of weight decay
            logging_dir='/content/drive/MyDrive/results/logs',                     # directory for storing logs
            logging_steps=10,                         # when to print log
            load_best_model_at_end=True,              # load or not best model at the end
            evaluation_strategy='steps',              # when to evaluate
            dataloader_pin_memory=False
        )
        for i in range(iterations):
            round_no = i + 1
            print("Iteration: ",round_no)
            print(len(self.train_dataset))
            trainer = CustomTrainer(
                model=self.model,                              # the instantiated 🤗 Transformers model to be trained
                args=training_args,                       # training arguments, defined above
                train_dataset=self.train_dataset,              # training dataset
                eval_dataset=self.val_dataset,                # evaluation dataset
                class_weights=self.class_weights
            )
            trainer.train()

            # evaluate on the validation set
            test_preds_raw, test_labels , _ = trainer.predict(self.val_dataset)
            test_preds = np.argmax(test_preds_raw, axis=-1)
            print(classification_report(test_labels, test_preds, digits=3))

            # keep the accuracy of this round for comparison later
            self.acc.append(accuracy_score(test_labels, test_preds))
            print("Accuracy: ",str(self.acc[-1])," - iteration: ",str(round_no))

            # Save when this is the best model so far, and always after the
            # last round; one combined check avoids writing it twice.
            is_best = self.acc[-1] > self.best_acc
            if is_best:
                self.best_acc = self.acc[-1]
            if is_best or round_no == iterations:
                trainer.save_model('/content/drive/MyDrive/results/iteration_'+str(round_no))
                print("Model saved! - iteration: ",str(round_no))


In [None]:
# The tokenizer is fetched by the checkpoint name in MODEL, while the weights are loaded from a directory on Drive.
# NOTE(review): presumably '/content/drive/MyDrive/results/base' holds a checkpoint of the same architecture as MODEL,
# so that tokenizer and weights match -- confirm.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment" # use this to finetune the sentiment classifier
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/results/base',num_labels=3)

In [None]:
# Wrap the loaded model and tokenizer in the self-training driver.
test = SelfTraining(model,tokenizer)

In [None]:
# Hand the unlabelled training texts and the labelled validation split to the driver.
test.load_data(train,val,val_label)

In [None]:
# Fine-tune all model weights on the labelled CSVs that full_train reads from Drive.
# Each of the 100 rounds trains for the instance's EPOCHS epochs and then evaluates on the validation set.
test.full_train(iterations=100)

In [None]:
# NOTE(review): presumably this disconnects and releases the Colab VM once training has finished,
# so nothing placed after this cell would run -- confirm against the google.colab documentation.
from google.colab import runtime
runtime.unassign()