In [1]:
import glob
import pandas as pd
from src.utils import map_category
import numpy as np
from datasets import load_from_disk


In [2]:
# Load the processed dataset that an earlier pipeline step saved to disk.
stream_data = load_from_disk("data/processed/stream_data")


In [3]:
# Switch the dataset's output format to pandas, then materialise the
# training and validation splits in full.
stream_data.set_format(type="pandas")
train_df, valid_df = (
    stream_data[split_name][:] for split_name in ("train", "validation")
)


In [4]:
# Optional (disabled): add a readable `label_name` column and show the class distribution.
# def label_int2str(row):
#     return stream_data["train"].features["label"].int2str(row)
# train_df["label_name"] = train_df["label"].apply(label_int2str)
# train_df["label_name"].value_counts(ascending=True)


In [5]:
# Inputs are the "text" column, targets are the "label" column.
X_train = train_df["text"]
y_train = train_df["label"]
X_valid = valid_df["text"]
y_valid = valid_df["label"]

# Class names as declared on the dataset's label feature.
labels = stream_data["train"].features["label"].names


In [6]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence encoder; it is fine-tuned together with a classification
# head further down in this notebook.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [7]:
import torch.nn as nn

# Define classification head
class ClassificationHead(nn.Module):
    """Single linear layer that turns a sentence embedding into class logits.

    Args:
        embedding_dim: Size of the incoming sentence embedding.
        num_classes: Number of output logits.
    """

    def __init__(self, embedding_dim, num_classes):
        super().__init__()
        self.linear = nn.Linear(embedding_dim, num_classes)

    def forward(self, features):
        """Apply the linear layer to ``features['sentence_embedding']``."""
        return self.linear(features['sentence_embedding'])

# Take the number of classes from the dataset's label feature rather than
# hard-coding it, so the head's output size always matches the label set
# that the training targets are drawn from.
num_classes = stream_data["train"].features["label"].num_classes
classification_head = ClassificationHead(model.get_sentence_embedding_dimension(), num_classes)

# Combine the SentenceTransformer model and the classification head.
class SentenceTransformerWithHead(nn.Module):
    """Chain ``transformer`` and ``head`` into one module.

    Args:
        transformer: Module called first on the raw input.
        head: Module called on whatever ``transformer`` returns.
    """

    def __init__(self, transformer, head):
        super().__init__()
        self.transformer = transformer
        self.head = head

    def forward(self, input):
        """Return ``head(transformer(input))``."""
        return self.head(self.transformer(input))

# Wrap encoder and head in one module so a single optimizer can update both.
model_with_head = SentenceTransformerWithHead(model, classification_head)


In [8]:
import os
# Debugging aids. CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous
# so errors surface at the failing call; it slows training, so drop it once the
# loop is stable.
# NOTE(review): TORCH_USE_CUDA_DSA is meant to enable device-side assertions —
# confirm it has any effect on a prebuilt PyTorch wheel.
os.environ["TORCH_USE_CUDA_DSA"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from sentence_transformers import InputExample
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import time
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Use CUDA if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move models to device once
model = model.to(device)
model_with_head = model_with_head.to(device)

# training parameters
num_epochs = 5
batch_size = 2
learning_rate = 2e-5

# Wrap every (text, label) pair in an InputExample for the DataLoaders
train_examples = [InputExample(texts=[s], label=l) for s, l in zip(X_train, y_train)]
valid_examples = [InputExample(texts=[s], label=l) for s, l in zip(X_valid, y_valid)]


def collate_fn(batch):
    """Turn a list of InputExample objects into (list of texts, label tensor)."""
    texts = [example.texts[0] for example in batch]
    batch_targets = torch.tensor([example.label for example in batch])
    return texts, batch_targets


def encode_batch(texts):
    """Tokenize ``texts`` with the encoder and move the tensors to ``device``.

    Only ``input_ids`` and ``attention_mask`` are forwarded to the model.
    """
    tokenized = model.tokenize(texts)
    return {key: tokenized[key].to(device) for key in ("input_ids", "attention_mask")}


train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_examples, batch_size=batch_size, collate_fn=collate_fn)

# Class-weighted cross entropy: weights are computed from the training labels
y_train_array = y_train.values if isinstance(y_train, pd.Series) else y_train
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train_array), y=y_train_array)
class_weights = torch.FloatTensor(class_weights).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers implementation
# used by default, so the hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    model_with_head.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
)
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
train_loss_list = []
val_loss_list = []
best_val_f1 = 0

for epoch in range(num_epochs):
    # Training phase
    model_with_head.train()
    epoch_start_time = time.time()
    train_loss = 0.0

    # The batch targets are called `batch_labels` so the notebook-level
    # `labels` list of class names is not overwritten by this loop.
    for step, (texts, batch_labels) in enumerate(train_dataloader):
        batch_labels = batch_labels.to(device)
        optimizer.zero_grad()

        # Encode text and pass through classification head
        logits = model_with_head(encode_batch(texts))

        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        if step % 100 == 0:
            # 1-based epoch number, consistent with the end-of-epoch summary
            print(f"Epoch {epoch + 1}/{num_epochs}, Step {step}/{steps_per_epoch}, Loss: {loss.item():.4f}")

    # Calculate average training loss for the epoch
    avg_train_loss = train_loss / steps_per_epoch
    train_loss_list.append(avg_train_loss)

    # Validation phase
    model_with_head.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for texts, batch_labels in valid_dataloader:
            batch_labels = batch_labels.to(device)

            logits = model_with_head(encode_batch(texts))
            loss = criterion(logits, batch_labels)

            val_loss += loss.item()
            _, predicted = torch.max(logits, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

            # Collect predictions for F1 score
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    # Calculate metrics
    val_loss = val_loss / len(valid_dataloader)
    accuracy = 100 * correct / total
    val_loss_list.append(val_loss)
    val_f1 = f1_score(all_labels, all_preds, average="weighted")

    epoch_time = time.time() - epoch_start_time
    print(f'Epoch {epoch+1}/{num_epochs}, Time: {epoch_time:.2f}s')
    print(f'Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}')
    print(f'Accuracy: {accuracy:.2f}%, F1 Score: {val_f1:.4f}')

    # Save both the encoder model and the full model with head whenever the
    # validation F1 improves
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        # The directory name uses the 0-based epoch index (epoch-0 is the
        # first epoch); the path is printed so the right one can be reloaded.
        model_save_path = f'data/interim/epoch-{epoch}'
        model.save(model_save_path)  # Save encoder

        # Save the full model with classification head
        torch.save({
            'model_state_dict': model_with_head.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, f'{model_save_path}_with_head.pt')
        print(f"Saved best model with F1: {val_f1:.4f} -> {model_save_path}")


Using device: cuda




Epoch 0, Step 0/4457, Loss: 3.0041
Epoch 0, Step 100/4457, Loss: 2.9559
Epoch 0, Step 200/4457, Loss: 2.8772
Epoch 0, Step 300/4457, Loss: 2.4159
Epoch 0, Step 400/4457, Loss: 2.5326
Epoch 0, Step 500/4457, Loss: 3.0054
Epoch 0, Step 600/4457, Loss: 2.3365
Epoch 0, Step 700/4457, Loss: 2.3214
Epoch 0, Step 800/4457, Loss: 2.3561
Epoch 0, Step 900/4457, Loss: 2.6650
Epoch 0, Step 1000/4457, Loss: 2.4817
Epoch 0, Step 1100/4457, Loss: 2.8405
Epoch 0, Step 1200/4457, Loss: 2.4017
Epoch 0, Step 1300/4457, Loss: 2.3789
Epoch 0, Step 1400/4457, Loss: 2.1779
Epoch 0, Step 1500/4457, Loss: 2.7591
Epoch 0, Step 1600/4457, Loss: 2.3966
Epoch 0, Step 1700/4457, Loss: 2.8235
Epoch 0, Step 1800/4457, Loss: 2.7049
Epoch 0, Step 1900/4457, Loss: 3.0158
Epoch 0, Step 2000/4457, Loss: 2.3519
Epoch 0, Step 2100/4457, Loss: 2.3287
Epoch 0, Step 2200/4457, Loss: 2.2698
Epoch 0, Step 2300/4457, Loss: 2.5066
Epoch 0, Step 2400/4457, Loss: 2.8131
Epoch 0, Step 2500/4457, Loss: 2.6059
Epoch 0, Step 2600/4457,

In [9]:
# Save the final model
# model_final_save_path='data/interim/st_ft_'
# model.save(model_final_save_path)


In [10]:
# Reload a fine-tuned encoder checkpoint written by the training cell.
# NOTE(review): the path is hard-coded. Training saves to
# data/interim/epoch-{epoch} with a 0-based epoch index and only when the
# validation F1 improves, so confirm "epoch-3" is the checkpoint you want.
# This also rebinds `model`; `model_with_head` keeps the in-memory encoder.
model = SentenceTransformer("data/interim/epoch-3")


In [11]:
# Pass a plain list of strings: with a pandas Series, positional access inside
# `encode` only works when the index happens to be 0..n-1.
X_train_embed = model.encode(X_train.tolist(), show_progress_bar=True)


Batches:   0%|          | 0/279 [00:00<?, ?it/s]

In [12]:
# Pass a plain list of strings: with a pandas Series, positional access inside
# `encode` only works when the index happens to be 0..n-1.
X_valid_embed = model.encode(X_valid.tolist(), show_progress_bar=True)


Batches:   0%|          | 0/35 [00:00<?, ?it/s]

In [13]:
# Show the shapes of both embedding arrays as a quick sanity check.
X_train_embed.shape, X_valid_embed.shape


((8914, 384), (1114, 384))

In [14]:
from sklearn.metrics import f1_score
from sklearn.utils.extmath import density
from time import time

def benchmark(clf, custom_name=False):
    """Fit ``clf`` on the training embeddings and score it on the validation set.

    Reads the notebook globals ``X_train_embed``, ``y_train``,
    ``X_valid_embed`` and ``y_valid``.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        custom_name: Label for the result row. When falsy, the estimator's
            class name is used instead.

    Returns:
        Tuple ``(description, weighted_f1, train_time, test_time)``; both
        times are in seconds.
    """
    # Imported locally under its own name: the notebook binds the global
    # `time` to the module in one cell and to the function in another, so
    # relying on that global breaks depending on which cell ran last.
    from time import perf_counter

    print("_" * 80)
    print("Training: ")
    print(clf)
    fit_start = perf_counter()
    clf.fit(X_train_embed, y_train)
    train_time = perf_counter() - fit_start
    print(f"train time: {train_time:.3}s")

    predict_start = perf_counter()
    pred = clf.predict(X_valid_embed)
    test_time = perf_counter() - predict_start
    print(f"inference time:  {test_time:.3}s")

    weighted_f1 = f1_score(y_valid, pred, average="weighted")
    print(f"Weighted F1 score:    {weighted_f1:.3}")

    # Only estimators that expose coefficients get the extra diagnostics.
    if hasattr(clf, "coef_"):
        print(f"dimensionality: {clf.coef_.shape[1]}")
        print(f"density: {density(clf.coef_)}")
        print()

    print()
    clf_descr = str(custom_name) if custom_name else clf.__class__.__name__
    return clf_descr, weighted_f1, train_time, test_time


In [15]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier

# Fixed seed for the estimators that shuffle or sample internally, so the
# benchmark numbers are the same on every run.
BENCHMARK_SEED = 42

# Each entry is benchmark()'s (name, weighted F1, train time, inference time).
results_embed = []
for clf, name in (
    (LogisticRegression(class_weight="balanced"), "Logistic Regression"),
    (RidgeClassifier(class_weight="balanced"), "Ridge Classifier"),
    (KNeighborsClassifier(), "kNN"),
    (XGBClassifier(), "XGBoost"),
    (LinearSVC(class_weight="balanced", random_state=BENCHMARK_SEED), "Linear SVC"),
    (SGDClassifier(class_weight="balanced", random_state=BENCHMARK_SEED), "SGD Classifier"),
    (NearestCentroid(), "NearestCentroid"),
):
    print("=" * 80)
    print(name)
    results_embed.append(benchmark(clf, name))


Logistic Regression
________________________________________________________________________________
Training: 
LogisticRegression(class_weight='balanced')
train time: 7.57s
inference time:  0.00116s
Weighted F1 score:    0.811
dimensionality: 384
density: 1.0


Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(class_weight='balanced')
train time: 0.0737s
inference time:  0.0305s
Weighted F1 score:    0.808
dimensionality: 384
density: 1.0


kNN
________________________________________________________________________________
Training: 
KNeighborsClassifier()
train time: 0.00202s
inference time:  0.155s
Weighted F1 score:    0.817

XGBoost
________________________________________________________________________________
Training: 
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping