<a href="https://colab.research.google.com/github/gulayoklan/Ceng463-Assignment2/blob/main/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup, AutoTokenizer

In [None]:
file_path = "orientation-it-train.tsv"

# Read the tab-separated training file.
df = pd.read_csv(file_path, sep='\t')

# Restrict the frame to the two text variants and the target column, then
# discard every row in which any of those three fields is missing.
required_columns = ['text_en', 'text', 'label']
df = df.loc[:, required_columns].dropna()

# Store the target as plain integers.
df = df.astype({'label': int})



In [None]:
# 80/10/10 stratified split: first hold out 20% of the data, then cut that
# hold-out in half to obtain the validation and test sets.
train, temp = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=42
)
val, test = train_test_split(
    temp, test_size=0.5, stratify=temp['label'], random_state=42
)

# NOTE: a bare expression is only rendered when it is the last statement of a
# cell, so this preview produces no visible output here.
test.head()

print(df['label'].unique())
split_reports = (
    ("Train label counts:\n", train),
    ("Val   label counts:\n", val),
    ("Test  label counts:\n", test),
)
for header, frame in split_reports:
    print(header, frame['label'].value_counts())

In [None]:
# Oversample the training split so both classes are equally represented.
# Only `train` is touched; validation and test keep their natural distribution.
from sklearn.utils import resample

print("Before oversampling:")
class_counts = train['label'].value_counts()
print(class_counts)

# Work out which class is the majority from the data instead of assuming it is
# label 1, so the cell stays correct if the class balance is ever reversed.
majority_label = class_counts.idxmax()
train_majority = train[train.label == majority_label]
train_minority = train[train.label != majority_label]

if 0 < len(train_minority) < len(train_majority):
    # Upsample minority class
    train_minority_upsampled = resample(
        train_minority,
        replace=True,                     # sample with replacement
        n_samples=len(train_majority),    # to match majority class
        random_state=42                   # reproducible results
    )
else:
    # Already balanced (or only one class present): resampling with
    # replacement would only throw away unique rows, so keep the data as is.
    train_minority_upsampled = train_minority

# Combine majority class with upsampled minority class
train_oversampled = pd.concat([train_majority, train_minority_upsampled])

print("After oversampling:")
print(train_oversampled['label'].value_counts())
# Shuffle the oversampled training data
train = train_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

Before oversampling:
label
1    1663
0    1030
Name: count, dtype: int64
After oversampling:
label
1    1663
0    1663
Name: count, dtype: int64


In [None]:
# Tokenizer and classifier are loaded from the same checkpoint name so that
# the vocabulary used for encoding matches the one the model was trained with.
bert_checkpoint = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(
    bert_checkpoint,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess(data, tokenizer, text_column="text_en", max_length=512):
    """Tokenize one data split and pack it into a ``TensorDataset``.

    Args:
        data: DataFrame holding the texts in ``text_column`` and the integer
            targets in a ``label`` column.
        tokenizer: Callable Hugging Face style tokenizer; it is invoked once on
            the full list of texts and must return ``input_ids`` and
            ``attention_mask`` tensors.
        text_column: Name of the column to encode. Defaults to ``"text_en"``,
            the column this function originally hard-coded.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        ``TensorDataset(input_ids, attention_masks, labels)`` with one row per
        input row, in the same order as ``data``.

    Raises:
        ValueError: If ``data`` contains no rows.
    """
    sentences = data[text_column].tolist()
    if not sentences:
        raise ValueError("preprocess() received an empty data split")
    labels = torch.tensor(data["label"].values)

    # A single batched call replaces the per-sentence ``encode_plus`` loop:
    # same padding/truncation settings, but the result is already stacked to
    # shape (num_rows, max_length), so no ``torch.cat`` is needed.
    encoded = tokenizer(
        sentences,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    return TensorDataset(encoded["input_ids"], encoded["attention_mask"], labels)


In [None]:
# Tokenize each split once up front.
train_data = preprocess(train, tokenizer)
val_data = preprocess(val, tokenizer)
test_data = preprocess(test, tokenizer)

batch_size = 8

# Training batches are drawn in a fresh random order every epoch; without a
# random sampler the model would see the exact same batch sequence each epoch.
train_dataloader = DataLoader(
    train_data,
    sampler=RandomSampler(train_data),
    batch_size=batch_size,
)
# Validation and test stay in dataset order. The test evaluation cell compares
# predictions against `test.label.values` positionally, so order must be kept.
validation_dataloader = DataLoader(
    val_data,
    sampler=SequentialSampler(val_data),
    batch_size=batch_size,
)
test_dataloader = DataLoader(
    test_data,
    sampler=SequentialSampler(test_data),
    batch_size=batch_size,
)



In [None]:
# Prefer the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Move the classifier onto the selected device (as the last expression of the
# cell this also displays the module).
model.to(device)

In [None]:
# AdamW over every model parameter, with the fine-tuning hyper-parameters
# named explicitly.
learning_rate = 2e-5
adam_epsilon = 1e-8
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=adam_epsilon,
)

In [None]:

epochs = 6

# The scheduler is stepped once per training batch, so the schedule has to
# span every batch of every epoch.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: Array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
        labels: NumPy array of gold class ids; it is flattened before the
            comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    gold_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == gold_classes)
    return n_correct / len(gold_classes)

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to a whole number of seconds and formatted through
    ``datetime.timedelta``, so durations of a day or more come out as
    ``"1 day, h:mm:ss"``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Seed every RNG that is consulted during fine-tuning. The classifier head was
# already initialised when the model was loaded in an earlier cell, so these
# seeds only cover what happens from this point on.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of metrics per epoch.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()
# Best validation accuracy seen so far; decides when the checkpoint is rewritten.
best_eval_accuracy = 0

for epoch_i in range(0, epochs):

    # ---------------- Training ----------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for batch in train_dataloader:
        # Each batch is (input_ids, attention_mask, labels) as packed by preprocess().
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        output = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ---------------- Validation ----------------
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    total_eval_correct = 0
    total_eval_examples = 0
    total_eval_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            output = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels)
        total_eval_loss += output.loss.item()
        logits = output.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Count correct predictions instead of averaging per-batch accuracies:
        # the final batch can be smaller than the rest, and a mean of batch
        # means would give its examples too much weight.
        total_eval_correct += int(np.sum(np.argmax(logits, axis=1) == label_ids))
        total_eval_examples += len(label_ids)

    avg_val_accuracy = total_eval_correct / total_eval_examples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Keep only the best epoch on disk. The whole module object is pickled
    # (not just a state_dict) because the loading cell does a plain
    # torch.load("bert_model") and expects a ready-to-use model back.
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# Restore the best-validation checkpoint written by the training loop.
# The file holds a fully pickled module, so `weights_only=False` is required:
# recent PyTorch releases default to weights-only loading, which rejects it.
# NOTE(security): full unpickling can execute arbitrary code - only load
# checkpoint files produced by this notebook.
model = torch.load("bert_model", map_location=device, weights_only=False)
# Make sure dropout is disabled for the evaluation that follows.
model.eval();

  model=torch.load("bert_model")


In [None]:
# Collect the predicted class of every test example, batch by batch.
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        # Only ids and attention mask are needed; labels are not passed, so
        # the model returns logits without computing a loss.
        b_input_ids, b_input_mask = batch[0].to(device), batch[1].to(device)
        output = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )
        logits = output.logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        predictions.extend(list(pred_flat))

# Gold labels in test-set order, compared positionally with the predictions.
labels = list(test.label.values)

report = classification_report(labels, predictions)
print(report)
# How often each class was predicted.
print(np.unique(predictions, return_counts=True))

              precision    recall  f1-score   support

           0       0.82      0.69      0.75       129
           1       0.83      0.91      0.86       208

    accuracy                           0.82       337
   macro avg       0.82      0.80      0.81       337
weighted avg       0.82      0.82      0.82       337

(array([0, 1]), array([108, 229]))


## LLM Inference

In [None]:
from huggingface_hub import login

# Interactive Hugging Face Hub authentication: running this cell renders a
# login widget instead of reading a token from the code.
# NOTE(review): presumably needed so the Llama checkpoint loaded in the next
# cell can be downloaded - confirm.
login() # Prompts for the access token

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import pipeline

# Checkpoint used for the prompt-based classification below.
model_name = "meta-llama/Llama-3.1-8B"

# Text-generation pipeline; device=-1 keeps the model on the CPU.
generator = pipeline(task="text-generation", model=model_name, device=-1)

In [None]:
def classify_with_generation(
    generator_pipeline,
    text,
    candidate_labels,
    max_new_tokens=32,
    do_sample=True,
    repetition_penalty=1.0
):
    """Classify one speech by prompting a text-generation pipeline.

    Args:
        generator_pipeline: Callable taking ``(prompt, **generation_kwargs)``
            and returning a list whose first item is a dict with a
            ``"generated_text"`` entry.
        text: The parliamentary speech to classify.
        candidate_labels: Label strings to look for in the model's answer
            (matched case-insensitively).
        max_new_tokens: Upper bound on generated tokens.
        do_sample: Whether to sample; when true a temperature of 0.2 is used.
        repetition_penalty: Forwarded unchanged to the pipeline.

    Returns:
        ``(best_label, generated_text)`` where ``best_label`` is the candidate
        that appears first in the model's completion (``None`` if none does)
        and ``generated_text`` is the raw string returned by the pipeline.
    """
    prompt2 = (
        "You are a highly intelligent political analyst and language model trained to classify parliamentary texts based on the speaker's political orientation.\n "
        "Your task is to determine whether the speaker's orientation is 'right' or 'left' based on the provided parliamentary speech. Respond with only the label ('right' or 'left') and nothing else.\n"
        f"Here is the parliamentary text:{text}\n"
        "Label:"
    )

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "repetition_penalty": repetition_penalty,
    }
    # Temperature only has a meaning when sampling, so it is left out for
    # greedy decoding rather than being sent as an ignored setting.
    if do_sample:
        generation_kwargs["temperature"] = 0.2

    # The pipeline returns a list of dicts, e.g. [{'generated_text': "..."}].
    output = generator_pipeline(prompt2, **generation_kwargs)
    generated_text = output[0]["generated_text"]

    # Search only the model's continuation. The returned text normally echoes
    # the prompt, and both the prompt and the speech itself may contain the
    # words "left"/"right"; matching on them would produce bogus labels.
    # If the prompt is not echoed, the whole string is taken as the completion.
    if generated_text.startswith(prompt2):
        completion = generated_text[len(prompt2):]
    else:
        completion = generated_text
    completion_lower = completion.lower()

    # Pick the label mentioned earliest, independent of candidate order.
    best_label = None
    best_position = None
    for label in candidate_labels:
        position = completion_lower.find(label.lower())
        if position != -1 and (best_position is None or position < best_position):
            best_label = label
            best_position = position

    return best_label, generated_text

def llm_inference(texts, labels=None, generator_pipeline=None):
    """Run prompt-based classification over a sequence of texts.

    Args:
        texts: Sequence of speeches to classify.
        labels: Gold labels aligned with ``texts``. Defaults to the notebook's
            ``test.label.values`` (the previous hard-coded behaviour).
        generator_pipeline: Text-generation pipeline to query. Defaults to the
            notebook-level ``generator``.

    Returns:
        ``(predictions, true_labels, wrong_label)``. ``predictions`` holds 0
        for an answer of "left" and 1 for "right"; ``true_labels`` holds the
        gold label of each example that received such an answer;
        ``wrong_label`` counts the examples for which neither label was found.
        Those examples are left out of both lists.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """
    # Resolve the notebook globals at call time so that the function can also
    # be used with another split or another pipeline.
    if labels is None:
        labels = test.label.values
    if generator_pipeline is None:
        generator_pipeline = generator
    if len(texts) != len(labels):
        raise ValueError(
            "texts and labels must have the same length "
            f"(got {len(texts)} and {len(labels)})"
        )

    candidate_labels = ["left", "right"]
    label_to_id = {"left": 0, "right": 1}

    predictions = []
    true_labels = []
    wrong_label = 0
    for text, label in zip(texts, labels):
        best_label, _raw_output = classify_with_generation(
            generator_pipeline=generator_pipeline,
            text=text,
            candidate_labels=candidate_labels,
            max_new_tokens=3,
            do_sample=False,
            repetition_penalty=1.2
        )
        if best_label in label_to_id:
            predictions.append(label_to_id[best_label])
            true_labels.append(label)
        else:
            # The answer contained neither label; count it and skip the example.
            wrong_label += 1
    return predictions, true_labels, wrong_label





In [None]:
# Sanity check: show the gold label of the second test example. `labels` is the
# list built from `test.label.values` in the BERT test-evaluation cell, so that
# cell has to be run first.
print(labels[1])

In [None]:
# Classify the test speeches twice: once from the `text` column (results kept
# under the `it_` prefix) and once from the `text_en` column.
it_predictions, it_true_labels, it_wrong_label = llm_inference(test["text"].values)

# NOTE: `predictions` is rebound here and no longer refers to the BERT test
# predictions produced earlier in the notebook.
predictions, true_labels, wrong_label = llm_inference(test["text_en"].values)


In [None]:
# Results of the run on the `text` column.
it_report = classification_report(it_true_labels, it_predictions)
print(it_report)

# Number of examples for which no label could be extracted from the answer.
print(it_wrong_label)

# How often each class was predicted.
it_prediction_counts = np.unique(it_predictions, return_counts=True)
print(it_prediction_counts)

              precision    recall  f1-score   support

           0       0.39      0.19      0.25       129
           1       0.62      0.82      0.71       208

    accuracy                           0.58       337
   macro avg       0.51      0.50      0.48       337
weighted avg       0.53      0.58      0.53       337

0
(array([0, 1]), array([ 61, 276]))


In [None]:
# Results of the run on the `text_en` column.
en_report = classification_report(true_labels, predictions)
print(en_report)

# Number of examples for which no label could be extracted from the answer.
print(wrong_label)

# How often each class was predicted.
en_prediction_counts = np.unique(predictions, return_counts=True)
print(en_prediction_counts)