In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import umap
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.svm import SVC
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel

import tools as tl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Probe for the PyTorch MPS backend; when it is present, allocate a one-element
# tensor on that device as a smoke test and show it.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


### Loading Dataset

In [3]:
# Fetch both splits of the 20 newsgroups corpus, asking sklearn to strip
# headers, footers and quotes from every post.
strip_parts = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=strip_parts)
newsgroups_test = fetch_20newsgroups(subset='test', remove=strip_parts)

# One row per post. The integer targets are translated to their newsgroup
# names while the frames are built, so 'label' holds names from the start.
df_train = pd.DataFrame({
    'text': newsgroups_train.data,
    'label': [newsgroups_train.target_names[idx] for idx in newsgroups_train.target],
})
df_test = pd.DataFrame({
    'text': newsgroups_test.data,
    'label': [newsgroups_test.target_names[idx] for idx in newsgroups_test.target],
})

In [4]:
# Row counts of the two splits, displayed as a (train, test) tuple
df_train.shape[0], df_test.shape[0]

(11314, 7532)

### Embedding text

In [5]:
# Load the pretrained distilroberta tokenizer and encoder from the same checkpoint
checkpoint = 'distilroberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

# Embed both splits with the project helper (train first, then test)
print("Generating embeddings...")
train_texts = df_train['text'].tolist()
test_texts = df_test['text'].tolist()
embeddings_train = tl.generate_embeddings(train_texts, tokenizer, model)
embeddings_test = tl.generate_embeddings(test_texts, tokenizer, model)

# Wrap the helper's results as torch tensors (test is converted before train).
# NOTE(review): the saved cell output echoes both torch.tensor(...) conversions as
# warning source lines -- confirm what type tl.generate_embeddings returns and
# whether torch.tensor is the right conversion for it.
embeddings_test, embeddings_train = torch.tensor(embeddings_test), torch.tensor(embeddings_train)
print("Embeddings generated!")

Generating embeddings...


Generating Embeddings: 100%|██████████| 708/708 [10:51<00:00,  1.09batch/s]
Generating Embeddings: 100%|██████████| 471/471 [07:16<00:00,  1.08batch/s]

Embeddings generated!



  embeddings_test = torch.tensor(embeddings_test)
  embeddings_train = torch.tensor(embeddings_train)


### Training SVM

In [7]:
# Keep the training embeddings alongside the training rows
df_train['embedding'] = embeddings_train.tolist()

# Train a linear-kernel SVM directly on the embeddings
# (no dimensionality reduction is applied in this cell); fit() returns the estimator.
svm_model = SVC(kernel='linear').fit(embeddings_train, df_train['label'])

# Classify the held-out embeddings
predicted_labels = svm_model.predict(embeddings_test)

# Attach the predictions (stored under 'cluster') and the embeddings to the test rows
df_test['cluster'] = predicted_labels
df_test['embedding'] = embeddings_test.tolist()

In [9]:
# Persist the test frame (including the 'cluster' and 'embedding' columns added
# after prediction) as CSV without the index column.
# DataFrame.to_csv does not create missing parent directories, so make sure
# 'outputs/' exists first; otherwise a fresh checkout fails on Restart & Run All.
output_path = Path('outputs/svm_news.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_test.to_csv(output_path, index=False)

### Evaluation

In [10]:
# Score the SVM predictions (stored in the 'cluster' column) against the true
# newsgroup labels. The scorers come from the notebook's top import cell rather
# than a mid-notebook import, so every dependency is visible in one place.
# average='weighted' averages the per-class scores weighted by class support.
y_true = df_test['label']
y_pred = df_test['cluster']

precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Report each score as a percentage with two decimals
for metric_name, value in (("Precision", precision), ("Recall", recall), ("F1 Score", f1)):
    print(f"{metric_name}: {value * 100:.2f}%")

Precision: 64.43%
Recall: 64.17%
F1 Score: 64.06%
