## **SBERT with Custom Neural Network**
### **Davit Davtyan**

#### **Imports and Setup**

In [13]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer

#### **1. Data Loading**

In [14]:
ds = load_dataset("knkarthick/dialogsum")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})


#### **2. Data Preprocessing**

In [15]:
clusters = pd.read_csv('dialogsum_clustered.csv')

train_df = pd.DataFrame(ds['train'])
validation_df = pd.DataFrame(ds['validation'])
test_df = pd.DataFrame(ds['test'])

train_df = train_df.merge(clusters, on='id', how='left')
validation_df = validation_df.merge(clusters, on='id', how='left')
test_df = test_df.merge(clusters, on='id', how='left').dropna(subset=['cluster'])

train_df['combined_text'] = train_df['dialogue'] + " [SEP] " + train_df['summary']
validation_df['combined_text'] = validation_df['dialogue'] + " [SEP] " + validation_df['summary']
test_df['combined_text'] = test_df['dialogue'] + " [SEP] " + test_df['summary']

In [16]:
train_texts = train_df['combined_text'].tolist()
train_labels = train_df['cluster'].tolist()

validation_texts = validation_df['combined_text'].tolist()
validation_labels = validation_df['cluster'].tolist()

test_texts = test_df['combined_text'].tolist()
test_labels = test_df['cluster'].tolist()

#### **3. Text Vectorization**

In [17]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [6]:
# Convert texts to embeddings
train_embeddings = sbert_model.encode(train_texts, convert_to_tensor=True)
validation_embeddings = sbert_model.encode(validation_texts, convert_to_tensor=True)
test_embeddings = sbert_model.encode(test_texts, convert_to_tensor=True)

# Convert labels to tensors
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
test_labels = torch.tensor(test_labels)

In [7]:
# Save the embeddings
torch.save(train_embeddings, 'train_embeddings.pt')
torch.save(validation_embeddings, 'validation_embeddings.pt')
torch.save(test_embeddings, 'test_embeddings.pt')

# Save the labels as well
torch.save(train_labels, 'train_labels.pt')
torch.save(validation_labels, 'validation_labels.pt')
torch.save(test_labels, 'test_labels.pt')

In [18]:
# Load the embeddings
train_embeddings = torch.load('train_embeddings.pt')
validation_embeddings = torch.load('validation_embeddings.pt')
test_embeddings = torch.load('test_embeddings.pt')

# Load the labels
train_labels = torch.load('train_labels.pt')
validation_labels = torch.load('validation_labels.pt')
test_labels = torch.load('test_labels.pt')

In [23]:
batch_size = 32

train_dataset = TensorDataset(train_embeddings, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

validation_dataset = TensorDataset(validation_embeddings, validation_labels)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size)

test_dataset = TensorDataset(test_embeddings, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

#### **4. Model Definition**

In [24]:
class NeuralNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

model = NeuralNet(input_dim=384, num_classes=20)  # 384 for 'all-MiniLM-L6-v2' embeddings

#### **5. Model Training**

In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # Validation accuracy
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for inputs, labels in validation_loader:
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Accuracy: {100 * correct / total:.2f}%')

train_model(num_epochs=10)

Epoch [1/10], Loss: 1.9273, Validation Accuracy: 33.20%
Epoch [2/10], Loss: 1.9729, Validation Accuracy: 36.20%
Epoch [3/10], Loss: 2.1730, Validation Accuracy: 36.60%
Epoch [4/10], Loss: 2.1014, Validation Accuracy: 39.00%
Epoch [5/10], Loss: 1.8482, Validation Accuracy: 39.40%
Epoch [6/10], Loss: 2.5078, Validation Accuracy: 41.40%
Epoch [7/10], Loss: 2.0506, Validation Accuracy: 40.20%
Epoch [8/10], Loss: 1.8443, Validation Accuracy: 39.20%
Epoch [9/10], Loss: 1.6120, Validation Accuracy: 41.40%
Epoch [10/10], Loss: 1.6391, Validation Accuracy: 41.00%


#### **6. Model Evaluation**

In [26]:
model.eval()
test_preds = []
with torch.no_grad():
    for inputs, _ in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        test_preds.extend(predicted.numpy())

test_f1 = f1_score(test_labels.numpy(), test_preds, average='weighted')
test_accuracy = accuracy_score(test_labels.numpy(), test_preds)
print(f'Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}')

Test Accuracy: 0.3820, Test F1 Score: 0.3768


### **END**