In [16]:
from IPython.display import display, HTML
from tqdm import tqdm
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import GPTNeoModel, AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import os
from sklearn.preprocessing import normalize
import seaborn as sns
from sklearn.model_selection import train_test_split
from ignite.contrib.handlers import PiecewiseLinear
from transformers import AdamW
from ignite.engine import Engine, Events
from ignite.contrib.handlers import ProgressBar
from ignite.metrics import Accuracy
import pandas as pd
from datasets import Dataset
import matplotlib.pyplot as plt
from ignite.metrics import Precision, Recall
from ignite.handlers import Checkpoint, global_step_from_engine
from captum.attr import LayerIntegratedGradients
from captum.attr import visualization as viz
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("DEVICE:", device)

# When true, the script loads a saved checkpoint and runs the evaluation /
# attribution branch instead of training.
USE_CHECKPOINT = True

def save_plot(arr, lbl):
    """Draw ``arr`` against its positional index and write it to ``<lbl>.png``.

    Args:
        arr: Sequence of values to plot on the y-axis.
        lbl: Used both as the plot title and as the output file stem.
    """
    positions = np.arange(len(arr))

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(positions, arr, label='Data')
    ax.set_title(lbl)
    ax.set_xlabel('X-axis')
    ax.set_ylabel('Y-axis')
    ax.legend()

    # Persist the figure next to the script as a PNG named after the label.
    fig.savefig(f"{lbl}.png")
    
def read_data():
    """Load ``posts.csv`` and split it into train/test text and targets.

    The ``title`` and ``text`` columns are joined with a space into a
    ``combined`` column. Rows whose combined text is missing are dropped and
    the targets are re-aligned to the surviving rows.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` as pandas Series. The
        ``X_*`` entries hold the combined text and the ``y_*`` entries the
        integer ``target`` values for the same index labels.
    """
    data = pd.read_csv("posts.csv", delimiter=",", dtype={'target': int})

    data['combined'] = data['title'] + " " + data['text']

    # data, _ = train_test_split(data, test_size=0.9)               # when testing the code, use this line to do a sanity check on 10% of data
    train_data, test_data = train_test_split(data, test_size=0.1)

    # Drop rows with missing text first, then pick the targets by the same
    # index labels so features and labels stay aligned.
    X_train = train_data['combined'].dropna()
    y_train = train_data.loc[X_train.index, 'target']

    X_test = test_data['combined'].dropna()
    y_test = test_data.loc[X_test.index, 'target']

    # The original computed the split and then discarded it; hand it back.
    return X_train, y_train, X_test, y_test

# Tokenizers already loaded, keyed by checkpoint name, so that repeated calls
# (one per batch when used with ``Dataset.map``) do not reload from disk.
_TOKENIZER_CACHE = {}


def tokenize_function(examples):
    """Tokenize the ``combined`` text of a batch of examples.

    Args:
        examples: Mapping with a ``"combined"`` entry holding the text(s) to
            tokenize.

    Returns:
        The tokenizer output for ``examples["combined"]``, padded to the
        tokenizer's maximum length and truncated.
    """
    checkpoint_name = "distilbert-base-uncased"
    tokenizer = _TOKENIZER_CACHE.get(checkpoint_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
        _TOKENIZER_CACHE[checkpoint_name] = tokenizer
    return tokenizer(examples["combined"], padding="max_length", truncation=True)

def parse_dataset(df):
    """Convert a posts DataFrame into a tokenized, torch-formatted dataset.

    Rows containing missing values are dropped, the ``combined`` text is
    tokenized, the raw text / index columns are removed and ``target`` is
    renamed to ``labels``.

    Args:
        df: DataFrame with at least ``combined`` and ``target`` columns.

    Returns:
        A ``datasets.Dataset`` formatted as torch tensors.
    """
    df = df.dropna()
    dataset = Dataset.from_pandas(df)
    dataset = dataset.map(tokenize_function, batched=True)

    # Only drop the helper columns that are actually present: the CSV index
    # column and the pandas index column do not exist for every input frame,
    # and ``remove_columns`` raises on unknown names.
    unwanted = ['title', 'text', 'Unnamed: 0', '__index_level_0__', 'combined']
    present = [name for name in unwanted if name in dataset.column_names]
    dataset = dataset.remove_columns(present)

    dataset = dataset.rename_column("target", "labels")
    dataset.set_format("torch")
    return dataset

def df_to_dataloader():
    """Build train/eval DataLoaders from ``posts.csv``.

    Returns:
        tuple: ``(train_dataloader, eval_dataloader, train_df, test_df)``
        where the DataFrames are the untokenized 90/10 split (seeded with
        ``random_state=42``) that the loaders were built from.
    """
    posts = pd.read_csv("posts.csv", delimiter=",", dtype={'target': int})
    posts['combined'] = posts['title'] + " " + posts['text']

    train_df, test_df = train_test_split(posts, test_size=0.1, random_state=42)

    # Tokenize train first, then test, and wrap each split in a loader; only
    # the training loader shuffles.
    train_dataloader = DataLoader(parse_dataset(train_df), shuffle=True, batch_size=32)
    eval_dataloader = DataLoader(parse_dataset(test_df), batch_size=32)

    return train_dataloader, eval_dataloader, train_df, test_df

def print_pred_summary(model, df):
    """Run ``model`` over ``df``, print its accuracy and save a confusion matrix.

    The frame is tokenized with ``parse_dataset``, predictions are taken as
    the argmax of the logits, and the confusion matrix is written to
    ``conf_mat2.png``.

    Args:
        model: Sequence-classification model returning an object with a
            ``logits`` attribute when called with the batch tensors.
        df: DataFrame accepted by ``parse_dataset``.
    """
    tokenized = parse_dataset(df)
    print(tokenized)
    loader = DataLoader(tokenized, batch_size=16)
    print("Printing examples...")
    y_true = []
    y_pred = []

    # Inference must not run with dropout active.
    model.eval()
    for step, batch in enumerate(loader, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)

        # Keep results on the CPU so sklearn can consume them regardless of
        # where the model lives.
        y_pred.append(predictions.cpu())
        y_true.append(batch['labels'].cpu())
        print(step)

    print(y_pred)
    print(y_true)
    y_pred_flat = [value for chunk in y_pred for value in chunk.tolist()]
    y_true_flat = [value for chunk in y_true for value in chunk.tolist()]

    acc_score = accuracy_score(y_true_flat, y_pred_flat)
    print("Acc score:", acc_score)

    # Cover every class the model can emit; a fixed 0..3 list drops
    # predictions of any higher class from the matrix. Falls back to four
    # classes when the model exposes no ``config.num_labels``.
    num_labels = getattr(getattr(model, "config", None), "num_labels", 4)
    class_ids = list(range(num_labels))

    cm = confusion_matrix(y_true_flat, y_pred_flat, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=class_ids)
    disp.plot()
    plt.savefig('conf_mat2.png')
    
        

# Build the data loaders and keep the raw splits for later inspection.
train_dataloader, eval_dataloader, train_df, test_df = df_to_dataloader()

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5, problem_type="single_label_classification")
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 12
num_training_steps = num_epochs * len(train_dataloader)

# Learning rate decays linearly from 5e-5 at step 0 to 0 at the last step.
milestones_values = [
        (0, 5e-5),
        (num_training_steps, 0.0),
    ]
lr_scheduler = PiecewiseLinear(
        optimizer, param_name="lr", milestones_values=milestones_values
    )

# The model is pinned to the CPU. The previous conditional selected the CPU
# in both branches; the attribution code further down builds its inputs with
# device='cpu' and calls .numpy() on model outputs, so it relies on this.
# NOTE(review): switch to CUDA only together with that code.
device = torch.device("cpu")
model.to(device)

# Freeze the DistilBERT encoder; only parameters outside ``model.distilbert``
# keep ``requires_grad=True``.
for param in model.distilbert.parameters():
    param.requires_grad = False


# Loss of every training iteration, in order; plotted after training.
loss_values = []


def train_step(engine, batch):
    """Run one optimization step on ``batch`` and return its loss tensor.

    Args:
        engine: The ignite engine driving the loop (unused here).
        batch: Mapping of tensor name to tensor, as yielded by the loader.
    """
    model.train()
    inputs = {name: tensor.to(device) for name, tensor in batch.items()}

    loss = model(**inputs).loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    loss_values.append(loss.item())
    return loss


trainer = Engine(train_step)

# Update the learning rate at the start of every iteration.
trainer.add_event_handler(Events.ITERATION_STARTED, lr_scheduler)

pbar = ProgressBar()
pbar.attach(trainer)

def evaluate_step(engine, batch):
    """Predict classes for ``batch`` and print the batch accuracy.

    Args:
        engine: The ignite engine driving the loop (unused here).
        batch: Mapping of tensor name to tensor; must contain ``"labels"``.

    Returns:
        dict: ``{'y_pred': predicted class ids, 'y': true labels}``, both
        tensors on ``device``.
    """
    model.eval()

    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    predictions = torch.argmax(outputs.logits, dim=-1)
    labels = batch["labels"]

    print("ACC:")
    # sklearn expects (y_true, y_pred) and host-side data.
    print(accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy()))
    return {'y_pred': predictions, 'y': labels}

train_evaluator = Engine(evaluate_step)
validation_evaluator = Engine(evaluate_step)


def _one_hot_output(output):
    """Adapt ``evaluate_step`` output to the (scores, labels) pair Accuracy needs.

    ``evaluate_step`` returns class ids; ignite's multiclass ``Accuracy``
    takes per-class scores, so the ids are expanded to one-hot rows.
    """
    scores = nn.functional.one_hot(output['y_pred'], num_classes=model.config.num_labels)
    return scores, output['y']


# Without an attached metric ``state.metrics`` stays empty and the accuracy
# histories below would never be filled.
for _evaluator in (train_evaluator, validation_evaluator):
    Accuracy(output_transform=_one_hot_output).attach(_evaluator, "accuracy")

to_save = {'model': model, 'optimizer': optimizer, 'trainer': trainer}
checkpoint_dir = "checkpoints_distilbert2/"

# Keep only the most recent checkpoint, named after the trainer's step.
checkpoint = Checkpoint(
    to_save,
    checkpoint_dir,
    n_saved=1,
    global_step_transform=global_step_from_engine(trainer),
)
# A checkpoint is written each time the training-set evaluation finishes.
train_evaluator.add_event_handler(Events.COMPLETED, checkpoint)


# Accuracy histories, one entry per evaluated epoch; plotted after training.
train_acc = []
val_acc = []


@trainer.on(Events.EPOCH_COMPLETED)
def log_training_results(engine):
    """Evaluate on the training set every second epoch and record accuracy."""
    if engine.state.epoch % 2 == 0:
        train_evaluator.run(train_dataloader)
        metrics = train_evaluator.state.metrics
        train_acc.append(metrics["accuracy"])
        print(f"Training Results - Epoch: {engine.state.epoch}  ")
        print(f"Accuracy: {metrics['accuracy']:.4f}")


def log_validation_results(engine):
    """Evaluate on the held-out set every second epoch and record accuracy."""
    if engine.state.epoch % 2 == 0:
        validation_evaluator.run(eval_dataloader)
        metrics = validation_evaluator.state.metrics
        val_acc.append(metrics["accuracy"])
        print(f"Validation Results - Epoch: {engine.state.epoch}")
        print(f"Accuracy: {metrics['accuracy']:.4f}")


trainer.add_event_handler(Events.EPOCH_COMPLETED, log_validation_results)

model.to(device)
if USE_CHECKPOINT:
    # Restore model / optimizer / trainer state, then report test metrics and
    # show a token-attribution example.
    # NOTE(review): this reads from 'checkpoints_distilbert/' while new
    # checkpoints are written to ``checkpoint_dir`` — confirm that is intended.
    checkpoint_fp = './checkpoints_distilbert/' + "checkpoint_4.pt"
    checkpoint = torch.load(checkpoint_fp, map_location=device)
    Checkpoint.load_objects(to_load=to_save, checkpoint=checkpoint)
    print_pred_summary(model, test_df)

    def model_output(inputs):
        """Return the first element of the model output (the logits) for ``inputs``."""
        return model(inputs)[0]

    # Attribute with respect to the embedding layer of the encoder.
    model_input = model.distilbert.embeddings
    lig = LayerIntegratedGradients(model_output, model_input)

    # Loaded once and shared by every call below instead of per call.
    attribution_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    def construct_input_and_baseline(text):
        """Build the token ids for ``text`` and an all-padding baseline.

        Returns:
            tuple: ``(input_ids, baseline_input_ids, token_list)`` where both
            id tensors have shape ``(1, len)`` and the baseline keeps the
            CLS/SEP tokens but replaces every text token with the pad token.
        """
        # presumably 510 leaves room for CLS and SEP within a 512-token limit
        # — TODO confirm against the model config.
        max_length = 510
        baseline_token_id = attribution_tokenizer.pad_token_id
        sep_token_id = attribution_tokenizer.sep_token_id
        cls_token_id = attribution_tokenizer.cls_token_id

        text_ids = attribution_tokenizer.encode(text, max_length=max_length, truncation=True, add_special_tokens=False)

        input_ids = [cls_token_id] + text_ids + [sep_token_id]
        token_list = attribution_tokenizer.convert_ids_to_tokens(input_ids)

        baseline_input_ids = [cls_token_id] + [baseline_token_id] * len(text_ids) + [sep_token_id]
        # Place the tensors wherever the model lives rather than assuming CPU.
        return (
            torch.tensor([input_ids], device=device),
            torch.tensor([baseline_input_ids], device=device),
            token_list,
        )

    text = 'This movie is superb'
    input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)

    print(f'original text: {input_ids}')
    print(f'baseline text: {baseline_input_ids}')

    def summarize_attributions(attributions):
        """Sum attributions over the last dimension and scale by their norm."""
        attributions = attributions.sum(dim=-1).squeeze(0)
        attributions = attributions / torch.norm(attributions)

        return attributions

    def interpret_text(text, true_class):
        """Display per-token attributions of ``text`` towards ``true_class``."""
        input_ids, baseline_input_ids, all_tokens = construct_input_and_baseline(text)
        attributions, delta = lig.attribute(inputs=input_ids,
                                            baselines=baseline_input_ids,
                                            return_convergence_delta=True,
                                            target=int(true_class)
                                            )
        attributions_sum = summarize_attributions(attributions)

        # One forward pass without gradients; report a real probability
        # (softmax) rather than the largest raw logit.
        with torch.no_grad():
            probabilities = torch.softmax(model(input_ids)[0], dim=-1).squeeze(0)
        pred_class = int(torch.argmax(probabilities))
        pred_prob = probabilities[pred_class].item()

        score_vis = viz.VisualizationDataRecord(
                            word_attributions=attributions_sum,
                            pred_prob=pred_prob,
                            pred_class=pred_class,
                            true_class=true_class,
                            attr_class=text,
                            attr_score=attributions_sum.sum(),
                            raw_input_ids=all_tokens,
                            convergence_score=delta)

        # visualize_text renders the table itself; wrapping its return value
        # in HTML(...) again is redundant.
        viz.visualize_text([score_vis])

    # Explain one training example, picked by position after dropping rows
    # with missing values.
    example = 15
    train_df = train_df.dropna()
    text = train_df['combined'].iloc[example]
    true_class = train_df['target'].iloc[example]
    print(text)
    print(true_class)

    interpret_text(text, true_class)

else:
    # Train from scratch and save the loss / accuracy curves.
    trainer.run(train_dataloader, max_epochs=num_epochs)

    save_plot(loss_values, 'Loss')
    save_plot(train_acc, 'train')
    save_plot(val_acc, 'val')



DEVICE: cpu


Map: 100%|██████████| 5042/5042 [00:05<00:00, 920.75 examples/s]
Map: 100%|██████████| 565/565 [00:00<00:00, 663.53 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 28/28 [00:00<00:00, 98.41 examples/s] 


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 28
})
Printing examples...
1
2


NameError: name 'my_list' is not defined