## **Fine-Tuning DistilBERT**
### **Davit Davtyan**

#### **Imports and Setup**

In [8]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import accelerate
from datasets import Dataset

#### **1. Data Loading**

In [9]:
# Identifier of the DialogSum dataset passed to `load_dataset`.
DIALOGSUM_DATASET_ID = "knkarthick/dialogsum"

# Load every available split and print the split/feature overview.
ds = load_dataset(DIALOGSUM_DATASET_ID)
print(ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})


#### **2. Data Preprocessing**

In [10]:
# Cluster assignments keyed by dialogue `id`; the CSV must provide at least
# the `id` and `cluster` columns used below.
clusters = pd.read_csv('dialogsum_clustered.csv')


def _prepare_split(split, cluster_table):
    """Build the modelling frame for one dataset split.

    Args:
        split: One split of `ds` (anything `pd.DataFrame` accepts) with
            `id`, `dialogue` and `summary` columns.
        cluster_table: DataFrame with `id` and `cluster` columns.

    Returns:
        DataFrame restricted to rows that have a cluster assignment, with
        `cluster` stored as int and a new `combined_text` column.
    """
    frame = pd.DataFrame(split).merge(cluster_table, on='id', how='left')
    # A left merge leaves NaN in `cluster` for ids missing from the CSV.
    # Previously only the test split dropped those rows; train/validation
    # would have carried NaN labels into training.
    frame = frame.dropna(subset=['cluster'])
    # NaN forces pandas to store the column as float; restore integer class
    # ids so the classifier receives integer labels.
    frame['cluster'] = frame['cluster'].astype(int)
    frame['combined_text'] = frame['dialogue'] + " [SEP] " + frame['summary']
    return frame


train_df = _prepare_split(ds['train'], clusters)
validation_df = _prepare_split(ds['validation'], clusters)
test_df = _prepare_split(ds['test'], clusters)

In [11]:
def _texts_and_labels(frame):
    """Return (`combined_text` values, `cluster` values) of `frame` as two plain lists."""
    return frame['combined_text'].tolist(), frame['cluster'].tolist()


# Plain-Python views of each split: input texts and their cluster labels.
train_texts, train_labels = _texts_and_labels(train_df)
validation_texts, validation_labels = _texts_and_labels(validation_df)
test_texts, test_labels = _texts_and_labels(test_df)

#### **3. Text Vectorization**

In [12]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased'
# Size of the classification head (number of target classes).
NUM_LABELS = 20

tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=NUM_LABELS,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_and_format(examples):
    """Tokenize a batch of `combined_text` strings and attach integer labels.

    Args:
        examples: Batch mapping with a `combined_text` column (texts to
            tokenize) and a `cluster` column (class id per example).

    Returns:
        The tokenizer output for the batch, padded/truncated to 512 tokens,
        with an added `labels` entry holding one int per example.

    Raises:
        TypeError / ValueError: If a `cluster` value is missing (None/NaN)
            or otherwise cannot be converted to int.
    """
    tokenized_inputs = tokenizer(
        examples['combined_text'],
        padding="max_length",
        truncation=True,
        max_length=512
    )
    # `cluster` can arrive as float (a column that ever held NaN is stored as
    # float by pandas). Cast to int so the labels are integer class ids rather
    # than floats, which the classification loss cannot consume.
    tokenized_inputs['labels'] = [int(label) for label in examples['cluster']]
    return tokenized_inputs

def _tokenized_dataset(frame):
    """Wrap a DataFrame in a `Dataset` and run `tokenize_and_format` over it in batches."""
    return Dataset.from_pandas(frame).map(tokenize_and_format, batched=True)


# Tokenized, label-carrying datasets for each split.
train_dataset = _tokenized_dataset(train_df)
validation_dataset = _tokenized_dataset(validation_df)
test_dataset = _tokenized_dataset(test_df)


Map: 100%|██████████| 12460/12460 [00:41<00:00, 302.48 examples/s]
Map: 100%|██████████| 500/500 [00:02<00:00, 220.92 examples/s]
Map: 100%|██████████| 500/500 [00:01<00:00, 349.76 examples/s]


#### **4. Model Training**

In [14]:
# Training configuration: one epoch, evaluation at the end of each epoch,
# checkpoints/results written under ./results.
training_config = {
    'output_dir': './results',
    'evaluation_strategy': "epoch",
    'learning_rate': 5e-5,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 16,
    'num_train_epochs': 1,
    'weight_decay': 0.01,
}
training_args = TrainingArguments(**training_config)



In [15]:
def _accuracy_and_weighted_f1(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores; the
            predicted class is the argmax over axis 1) and `label_ids`
            (reference labels).

    Returns:
        Dict with keys `accuracy` and `f1`.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(eval_pred.label_ids, predicted_classes),
        'f1': f1_score(eval_pred.label_ids, predicted_classes, average='weighted'),
    }


# Trainer wired to the training split, with the validation split used for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=_accuracy_and_weighted_f1,
)

In [None]:
# Fine-tune the model using the configuration held by `trainer`.
trainer.train()

# Evaluate on the test split instead of the trainer's default eval dataset,
# then print the returned metrics.
evaluation_results = trainer.evaluate(eval_dataset=test_dataset)
print(evaluation_results)

Epoch,Training Loss,Validation Loss
