In [None]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import torch
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
import torch
import tools as tl
from sklearn.svm import SVC

In [27]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [28]:
# Load the dataset
file_path = '../../datasets/BBC News/BBC News Train.csv'  # Replace with your actual file path
bbc_df = pd.read_csv(file_path)

# Create the test set (735 rows) and training set (remaining rows)
test_set = bbc_df.sample(n=735, random_state=42).reset_index(drop=True)  # Test set with 735 samples
train_set = bbc_df.drop(test_set.index).reset_index(drop=True)  # Training set with remaining rows

In [29]:
len(train_set), len(test_set)

(755, 735)

In [5]:
train_set.head(3)

Unnamed: 0,ArticleId,Text,Category
0,2031,uk young top euro earnings league british chil...,business
1,76,tech helps disabled speed demons an organisati...,tech
2,1860,camera phones are must-haves four times more...,tech


In [6]:
# Initialize distilroberta tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
model = RobertaModel.from_pretrained('distilroberta-base')
# Generate embeddings
print("Generating embeddings...")
embeddings_train = tl.generate_embeddings(train_set['Text'].tolist(), tokenizer, model)
embeddings_test = tl.generate_embeddings(test_set['Text'].tolist(), tokenizer, model)

embeddings_test = torch.tensor(embeddings_test)
embeddings_train = torch.tensor(embeddings_train)
print("Embeddings generated!")

Generating embeddings...


Generating Embeddings: 100%|██████████| 48/48 [00:46<00:00,  1.04batch/s]
Generating Embeddings: 100%|██████████| 46/46 [00:47<00:00,  1.03s/batch]

Embeddings generated!



  embeddings_test = torch.tensor(embeddings_test)
  embeddings_train = torch.tensor(embeddings_train)


In [20]:
# Fit an SVM model to the reduced embeddings
svm_model = SVC(kernel='linear')
svm_model.fit(embeddings_train, train_set['Category'])

# Predict the labels
predicted_labels = svm_model.predict(embeddings_test)

# Add the predicted labels to the dataframe
test_set['cluster'] = predicted_labels
test_set['embedding'] = embeddings_test.tolist()

In [21]:
test_set.to_csv('outputs/bbc_svm.csv', index=False)

In [22]:
bbc_svm = pd.read_csv('outputs/bbc_svm.csv')

### Accuracy

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

def calculate_metrics(y_true, y_pred):
    """
    Calculate various classification metrics
    
    Parameters:
    y_true: True labels (actual categories)
    y_pred: Predicted labels
    
    Returns:
    dict: Dictionary containing various metrics
    """
    # Calculate individual metrics
    accuracy = (y_true == y_pred).mean()
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    
    # Get detailed classification report
    report = classification_report(y_true, y_pred)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    metrics = {
        'accuracy': accuracy * 100,
        'f1_score': f1 * 100,
        'precision': precision * 100,
        'recall': recall * 100,
        'detailed_report': report,
        'confusion_matrix': conf_matrix
    }
    
    return metrics

# Using your existing data
metrics = calculate_metrics(
    bbc_svm['Category'],
    bbc_svm['cluster']
)

# Print results
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print("\nDetailed Classification Report:")
print(metrics['detailed_report'])

Accuracy: 98.64%
F1 Score: 98.64%
Precision: 98.65%
Recall: 98.64%

Detailed Classification Report:
               precision    recall  f1-score   support

     business       0.99      0.97      0.98       169
entertainment       0.99      0.98      0.99       128
     politics       0.99      0.99      0.99       140
        sport       0.99      1.00      1.00       162
         tech       0.96      0.99      0.97       136

     accuracy                           0.99       735
    macro avg       0.99      0.99      0.99       735
 weighted avg       0.99      0.99      0.99       735

