In [8]:
import os
import re
import string

import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [23]:
# English stop-word set from NLTK; clean_text drops every token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))
# Porter stemmer applied by clean_text to each remaining token.
stemmer = PorterStemmer()

# Phrases that clean_text deletes from (lower-cased) input text.
# NOTE(review): these look like the generation prompts echoed back in the synthetic data — confirm.
prompts = {
    'spam': 'text message with advertisement or offer',
    'non_spam':'text message from a friend or family says',
    'real_news':'recently published political news title',
    'fake_news':'fake political news title',
    'happy_tweet':'tweet as a happy person',
    'sad_tweet': 'tweet as a sad person'
}

def clean_text(text):
    """Normalise a raw string into space-separated, stemmed, lower-case tokens.

    Steps, in order: lower-case, delete every phrase listed in ``prompts``,
    strip digit runs, strip URL-like tokens, strip everything that is not an
    ASCII letter or whitespace, drop tokens found in ``stop_words``, stem the
    survivors with ``stemmer``, and collapse whitespace.
    """
    cleaned = text.lower()
    for phrase in prompts.values():
        cleaned = cleaned.replace(phrase, "")

    # Character-level scrubbing: digits first, then URLs, then any non-letter.
    cleaned = re.sub(r'\d+', '', cleaned)
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r'[^a-zA-Z\s]', '', cleaned)

    # Token-level pass: filter stop words and stem what is left.
    kept_tokens = [token for token in cleaned.split() if token not in stop_words]
    stemmed_tokens = [stemmer.stem(token) for token in kept_tokens]

    return re.sub(r'\s+', ' ', ' '.join(stemmed_tokens)).strip()


In [None]:
# Synthetic-data variants. The values are used verbatim as path components
# when Utils builds synthetic-data and model-checkpoint paths.
ZERO_SHOT = "zs"
FEW_SHOT = "fs"
PARAPHRASE = "pp"

# Identifiers of the LLM that generated the synthetic data; also used as path components.
GPT2 = "gpt2"
FLAN = "flan"
LLAMA = "llama"

class Utils:
    """Stateless helpers: CSV loading, DataLoader construction and project path resolution."""

    @staticmethod
    def load_data(file_path):
        """Load a CSV with ``text`` and ``y`` columns and return cleaned texts plus labels.

        Rows containing any missing value are dropped *before* cleaning:
        ``clean_text`` calls ``.lower()`` on its argument and would raise on a
        NaN cell, so cleaning first made the subsequent ``dropna`` useless.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            tuple[list, list]: ``(texts, labels)``, index-aligned.
        """
        df = pd.read_csv(file_path).dropna()
        texts = df['text'].apply(clean_text).tolist()
        return texts, df['y'].tolist()

    @staticmethod
    def create_data_loader(texts, labels, tokenizer, max_len, batch_size, shuffle=True):
        """Wrap texts/labels in a ``TextDataset`` and return a ``DataLoader`` over it.

        Args:
            texts: Sequence of input strings.
            labels: Sequence of labels aligned with ``texts``.
            tokenizer: Tokenizer handed to ``TextDataset``.
            max_len: Maximum token length handed to ``TextDataset``.
            batch_size: Number of samples per batch.
            shuffle: Whether to reshuffle every epoch. Defaults to ``True``
                (the historical behaviour); pass ``False`` for evaluation
                loaders when a stable order is wanted.
        """
        dataset = TextDataset(texts, labels, tokenizer, max_len)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    @staticmethod
    def get_mini_train_data(task):
        """Return the path of the original (non-synthetic) mini training CSV for ``task``."""
        return f"../data/orig/processed/train/{task}-data-mini.csv"

    @staticmethod
    def get_syn_train_data(llm, synthetic_data_type, task):
        """Return the path of the synthetic training CSV for the given LLM, variant and task."""
        return f"../data/syn/{llm}/{synthetic_data_type}/auto-{task}-data.csv"

    @staticmethod
    def get_mini_test_data(task):
        """Return the path of the original test CSV for ``task``."""
        return f"../data/orig/processed/test/{task}-data.csv"

    @staticmethod
    def get_dataset(llm, synthetic_data_type, task):
        """Pick the training CSV path.

        Synthetic data is used when ``synthetic_data_type`` is one of
        ``ZERO_SHOT``, ``PARAPHRASE`` or ``FEW_SHOT``; any other value falls
        back to the original mini training set.
        """
        if synthetic_data_type in (ZERO_SHOT, PARAPHRASE, FEW_SHOT):
            return Utils.get_syn_train_data(llm, synthetic_data_type, task)
        return Utils.get_mini_train_data(task)

    @staticmethod
    def get_model_path(llm, synthetic_data_type, task):
        """Return the checkpoint path for the model trained on this LLM/variant/task combination."""
        return f"../models/{llm}/{task}-{synthetic_data_type}-{llm}-model.pth"

class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for sample ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        # Pad/truncate to exactly max_len tokens and ask for PyTorch tensors back.
        encoded = self.tokenizer.encode_plus(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Flatten each encoded tensor to 1-D for this single sample.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item
    
class TextClassifier(torch.nn.Module):
    """Thin ``nn.Module`` wrapper around ``BertForSequenceClassification``."""

    def __init__(self, model_name, num_labels):
        super().__init__()
        # Load the pretrained checkpoint with a classification head sized to num_labels.
        self.bert = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Forward everything to the wrapped model and return its output unchanged."""
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return bert_output

def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader`` and report accuracy and mean loss.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and ``labels``;
            its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches, each a mapping with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        loss_fn: Unused. The loss is taken from ``outputs.loss``; the parameter
            is kept so existing callers keep working.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64
        tensor and ``mean_loss`` is the NumPy mean of the per-batch losses.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    # Count samples actually processed instead of trusting len(data_loader.dataset):
    # the two differ with drop_last=True or a sub-sampling sampler, and plain
    # iterables of batches have no .dataset attribute at all.
    samples_seen = 0

    for data in tqdm(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)

        correct_predictions += torch.sum(preds == labels)
        samples_seen += labels.size(0)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if samples_seen == 0:
        # Previously this surfaced as "'int' object has no attribute 'double'".
        raise ValueError("data_loader yielded no samples; cannot compute training metrics")

    return correct_predictions.double() / samples_seen, np.mean(losses)

def eval_model(model, data_loader, device):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        tuple: ``(accuracy, report)`` as produced by ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model = model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            # Labels are deliberately not passed: only the logits are needed here.
            batch_logits = model(input_ids=batch_ids, attention_mask=batch_mask).logits
            batch_preds = torch.max(batch_logits, dim=1)[1]

            predictions.extend(batch_preds.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    return accuracy, report


In [3]:
class Config:
    """Default fine-tuning hyper-parameters; ``train_model`` reads them at call time."""
    MODEL_NAME = 'bert-base-uncased'  # checkpoint loaded for both tokenizer and model
    LEARNING_RATE = 2e-5
    EPOCHS = 5
    BATCH_SIZE = 16
    MAX_LEN = 32  # token length every example is padded/truncated to

# Shared tokenizer for the checkpoint named in Config.MODEL_NAME; reused by every data loader.
tokenizer = BertTokenizer.from_pretrained(Config.MODEL_NAME)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_model(file_path, max_len=None, batch_size=None, epochs=None, learning_rate=None):
    """Fine-tune a BERT classifier on the CSV at ``file_path`` and return it.

    The data is split 80/20 into train/validation with a fixed seed. After
    every epoch the validation accuracy is printed; the full classification
    report is printed after the last epoch only.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        file_path: CSV readable by ``Utils.load_data``.
        max_len: Token length per example. Defaults to ``Config.MAX_LEN``.
        batch_size: Batch size for both loaders. Defaults to ``Config.BATCH_SIZE``.
        epochs: Number of training epochs. Defaults to ``Config.EPOCHS``.
        learning_rate: Optimizer learning rate. Defaults to ``Config.LEARNING_RATE``.

    Returns:
        TextClassifier: The trained model, already moved to ``device``.

    Raises:
        ValueError: If no rows could be loaded from ``file_path``.
    """
    # Resolve overrides at call time so callers that rebind or mutate the
    # global Config (the historical way to tune a run) keep working.
    max_len = Config.MAX_LEN if max_len is None else max_len
    batch_size = Config.BATCH_SIZE if batch_size is None else batch_size
    epochs = Config.EPOCHS if epochs is None else epochs
    learning_rate = Config.LEARNING_RATE if learning_rate is None else learning_rate

    # Load data
    texts, labels = Utils.load_data(file_path)
    if not texts:
        raise ValueError(f"No usable rows found in {file_path}")
    num_classes = len(set(labels))  # Number of unique labels

    print("Tokenizing and Splitting")

    # Split into train and validation sets
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
    train_data_loader = Utils.create_data_loader(train_texts, train_labels, tokenizer, max_len, batch_size)
    val_data_loader = Utils.create_data_loader(val_texts, val_labels, tokenizer, max_len, batch_size)

    # Initialize Model
    model = TextClassifier(Config.MODEL_NAME, num_labels=num_classes)
    model = model.to(device)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Built once instead of once per epoch; train_epoch accepts it for
    # interface reasons but takes the loss from the model output.
    loss_fn = torch.nn.CrossEntropyLoss()

    print("Training")

    # Training Loop
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device)
        print(f"Train loss: {train_loss}, Train accuracy: {train_acc}")

        val_acc, val_report = eval_model(model, val_data_loader, device)
        print(f"Validation accuracy: {val_acc}")
        if epoch == epochs - 1:
            print(f"Classification report:\n{val_report}")

    return model

# Trained models, filled in by the training loop and keyed by task name.
models = {}

In [4]:
# Training-set variant: ZERO_SHOT / FEW_SHOT / PARAPHRASE select synthetic data;
# any other value (e.g. "n") makes Utils.get_dataset fall back to the original mini set.
DATASET_TYPE = ZERO_SHOT # zs, fs, pp, n
LLM = LLAMA

# Per-task hyper-parameters; the data paths are attached in the loop below.
tasks = {
    'news': {'MAX_LEN': 32, 'BATCH_SIZE': 10, 'EPOCHS': 3},
    'spam': {'MAX_LEN': 48, 'BATCH_SIZE': 50, 'EPOCHS': 3},
    'sentiment': {'MAX_LEN': 48, 'BATCH_SIZE': 40, 'EPOCHS': 4},
}

for task in tasks:
    tasks[task].update({
        'train_data': Utils.get_dataset(LLM, DATASET_TYPE, task),
        'test_data': Utils.get_mini_test_data(task),
    })

# Collects one (task, test accuracy) row per evaluated task.
df_out = pd.DataFrame(columns=['Task', 'Accuracy'])
tasks

{'news': {'MAX_LEN': 32,
  'BATCH_SIZE': 10,
  'EPOCHS': 3,
  'train_data': '../data/syn/llama/zs/auto-news-data.csv',
  'test_data': '../data/orig/processed/test/news-data.csv'},
 'spam': {'MAX_LEN': 48,
  'BATCH_SIZE': 50,
  'EPOCHS': 3,
  'train_data': '../data/syn/llama/zs/auto-spam-data.csv',
  'test_data': '../data/orig/processed/test/spam-data.csv'},
 'sentiment': {'MAX_LEN': 48,
  'BATCH_SIZE': 40,
  'EPOCHS': 4,
  'train_data': '../data/syn/llama/zs/auto-sentiment-data.csv',
  'test_data': '../data/orig/processed/test/sentiment-data.csv'}}

In [5]:
# Sanity check: preview the checkpoint path that will be used for the "spam" task.
Utils.get_model_path(LLM, DATASET_TYPE, "spam")

'../models/llama/spam-zs-llama-model.pth'

In [7]:
# Tasks excluded from this run.
SKIP_TASKS = {'sentiment'}

for task in tasks:
    if task in SKIP_TASKS:
        continue
    task_cfg = tasks[task]

    # train_model reads its hyper-parameters from the global Config at call
    # time, so apply this task's settings there before training.
    Config.MAX_LEN = task_cfg['MAX_LEN']  # Maximum sequence length for BERT
    Config.BATCH_SIZE = task_cfg['BATCH_SIZE']
    Config.EPOCHS = task_cfg['EPOCHS']

    # Create the checkpoint directory *before* training: torch.save does not
    # create missing directories, and failing after training loses the model.
    model_path = Utils.get_model_path(LLM, DATASET_TYPE, task)
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    print("Starting training: ", task)
    print("Train Data on: ", task_cfg['train_data'])
    models[task] = train_model(task_cfg['train_data'])
    torch.save(models[task].state_dict(), model_path)
    print("Ending training: ", task)

    print("Starting TEST")
    test_texts, test_labels = Utils.load_data(task_cfg['test_data'])
    print("Tokenize")
    test_data_loader = Utils.create_data_loader(test_texts, test_labels, tokenizer, Config.MAX_LEN, Config.BATCH_SIZE)
    print("Evaluate")
    # eval_model returns (accuracy, classification_report); both are kept under 'predictions'.
    tasks[task]['predictions'] = eval_model(models[task], test_data_loader, device)
    print("Test Accuracy: ", tasks[task]['predictions'][0])
    df_out.loc[len(df_out)] = [task, tasks[task]['predictions'][0]]


df_out

Starting training:  news
Train Data on:  ../data/syn/llama/zs/auto-news-data.csv
Tokenizing and Splitting


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training
Epoch 1/3


100%|██████████| 11/11 [00:51<00:00,  4.71s/it]


Train loss: 0.6051501658829775, Train accuracy: 0.7407407407407407
Validation accuracy: 1.0
Epoch 2/3


100%|██████████| 11/11 [00:52<00:00,  4.81s/it]


Train loss: 0.2592696493322199, Train accuracy: 0.9907407407407407
Validation accuracy: 1.0
Epoch 3/3


100%|██████████| 11/11 [00:49<00:00,  4.50s/it]


Train loss: 0.056214520877057854, Train accuracy: 1.0
Validation accuracy: 1.0
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        13

    accuracy                           1.00        28
   macro avg       1.00      1.00      1.00        28
weighted avg       1.00      1.00      1.00        28

Ending training:  news
Starting TEST
Tokenize
Evaluate
Test Accuracy:  0.5217391304347826
Starting training:  spam
Train Data on:  ../data/syn/llama/zs/auto-spam-data.csv
Tokenizing and Splitting


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training
Epoch 1/3


100%|██████████| 20/20 [08:40<00:00, 26.05s/it]


Train loss: 0.29955361485481263, Train accuracy: 0.9293873312564901
Validation accuracy: 1.0
Epoch 2/3


100%|██████████| 20/20 [08:22<00:00, 25.15s/it]


Train loss: 0.032298611896112564, Train accuracy: 1.0
Validation accuracy: 1.0
Epoch 3/3


 55%|█████▌    | 11/20 [05:18<04:20, 28.96s/it]


KeyboardInterrupt: 