## Lab6-Assignment: Topic Classification

Use the same training, development, and test partitions of the the 20 newsgroups text dataset as in Lab6.4-Topic-classification-BERT.ipynb 

* Fine-tune and examine the performance of another transformer-based pretrained language models, e.g., RoBERTa, XLNet

* Compare the performance of this model to the results achieved in Lab6.4-Topic-classification-BERT.ipynb and to a conventional machine learning approach (e.g., SVM, Naive Bayes) using bag-of-words or other engineered features of your choice. 
Describe the differences in performance in terms of Precision, Recall, and F1-score evaluation metrics.

In [3]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationArgs, ClassificationModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load only the four specific categories we need
categories = ["alt.atheism", "comp.graphics", "sci.med", "sci.space"]

# Strip out headers, footers, and quoted text to prevent overfitting
train_groups = fetch_20newsgroups(
    subset="train",
    remove=("headers", "footers", "quotes"),
    categories=categories,
    random_state=8,
)
test_groups = fetch_20newsgroups(
    subset="test",
    remove=("headers", "footers", "quotes"),
    categories=categories,
    random_state=8,
)

In [None]:
train_df = pd.DataFrame({"text": train_groups.data, "labels": train_groups.target})
test_df = pd.DataFrame({"text": test_groups.data, "labels": test_groups.target})

train_df, dev_df = train_test_split(
    train_df, test_size=0.1, random_state=0, stratify=train_df[["labels"]]
)

In [10]:
model_args = ClassificationArgs()

model_args.overwrite_output_dir = True  

# Enable evaluation during training to monitor performance
model_args.evaluate_during_training = True  

# Training parameters
model_args.num_train_epochs = 10  # Train for 10 epochs
model_args.train_batch_size = 32  # Process 32 samples per batch
model_args.learning_rate = 4e-6  # Learning rate for optimization
model_args.max_seq_length = 256  # Max token length per input (the higher the number, the longer it takes)

# Early stopping helps prevent overfitting by stopping training 
# when validation loss stops improving
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01  # Minimum improvement in loss required to continue training
model_args.early_stopping_metric = "eval_loss"  # The metric to monitor
model_args.early_stopping_metric_minimize = True  # Lower eval_loss is better
model_args.early_stopping_patience = 2  # Stop training if no improvement in 2 evaluations

# Run validation every 32 training steps to track progress
model_args.evaluate_during_training_steps = 32


In [20]:
model = ClassificationModel(
    model_type = "roberta",
    model_name = "roberta-large",
    num_labels = 4,
    args = model_args,
    use_cuda = torch.cuda.is_available(), 
)

print("\n".join(str(model.args).split(",")))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ClassificationArgs(adafactor_beta1=None
 adafactor_clip_threshold=1.0
 adafactor_decay_rate=-0.8
 adafactor_eps=(1e-30
 0.001)
 adafactor_relative_step=True
 adafactor_scale_parameter=True
 adafactor_warmup_init=True
 adam_betas=(0.9
 0.999)
 adam_epsilon=1e-08
 best_model_dir='outputs/best_model'
 cache_dir='cache_dir/'
 config={}
 cosine_schedule_num_cycles=0.5
 custom_layer_parameters=[]
 custom_parameter_groups=[]
 dataloader_num_workers=0
 do_lower_case=False
 dynamic_quantize=False
 early_stopping_consider_epochs=False
 early_stopping_delta=0.01
 early_stopping_metric='eval_loss'
 early_stopping_metric_minimize=True
 early_stopping_patience=2
 encoding=None
 eval_batch_size=100
 evaluate_during_training=True
 evaluate_during_training_silent=True
 evaluate_during_training_steps=32
 evaluate_during_training_verbose=False
 evaluate_each_epoch=True
 fp16=False
 gradient_accumulation_steps=1
 learning_rate=4e-06
 local_rank=-1
 logging_steps=50
 loss_type=None
 loss_args={}
 manual_se

In [None]:
training_results = model.train_model(train_df, eval_df = dev_df) # this fine tuning takes a lot of time, pls run it and lmk if it works

history = training_results[1]

  0%|          | 0/4 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa

## SVM with BoW and TF-IDF

In [None]:
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [43]:
# Merging train and validation sets from above as validation set isn't needed with SVM/NB
X_train = pd.concat([train_df["text"], dev_df["text"]])
X_test = test_df["text"]
y_train = pd.concat([train_df["labels"], dev_df["labels"]])
y_test = test_df["labels"]

In [47]:
# BoW
vectorizer = CountVectorizer()

# Without merging train and validation
'''
X_train_bow = vectorizer.fit_transform(train_df["text"])
X_dev_bow = vectorizer.transform(dev_df["text"])
X_test_bow = vectorizer.transform(test_df["text"])

y_train, y_dev, y_test = train_df["labels"], dev_df["labels"], test_df["labels"]
'''

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

svm_model = svm.LinearSVC()
svm_model.fit(X_train_bow, y_train)

y_pred = svm_model.predict(X_test_bow)

print(classification_report(y_test, y_pred, target_names=categories))

               precision    recall  f1-score   support

  alt.atheism       0.74      0.70      0.72       319
comp.graphics       0.72      0.84      0.78       389
      sci.med       0.80      0.69      0.74       396
    sci.space       0.71      0.73      0.72       394

     accuracy                           0.74      1498
    macro avg       0.74      0.74      0.74      1498
 weighted avg       0.74      0.74      0.74      1498





In [None]:
# TF-IDF
vectorizer = TfidfVectorizer()
# vectorizer = TfidfVectorizer(min_df=2) -> Slightly better performance

X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

svm_model = svm.LinearSVC()
svm_model.fit(X_train_tf, y_train)

y_pred = svm_model.predict(X_test_tf)

print(classification_report(y_test, y_pred, target_names=categories))

               precision    recall  f1-score   support

  alt.atheism       0.84      0.77      0.80       319
comp.graphics       0.88      0.88      0.88       389
      sci.med       0.88      0.83      0.85       396
    sci.space       0.77      0.86      0.82       394

     accuracy                           0.84      1498
    macro avg       0.84      0.84      0.84      1498
 weighted avg       0.84      0.84      0.84      1498



## NB with BoW and TF-IDF

In [None]:
# BoW
vectorizer = CountVectorizer()

X_train_bow_nb = vectorizer.fit_transform(X_train)
X_test_bow_nb = vectorizer.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_bow_nb, y_train)

y_pred = nb_model.predict(X_test_bow_nb)

print(classification_report(y_test, y_pred, target_names=categories))

               precision    recall  f1-score   support

  alt.atheism       0.78      0.89      0.83       319
comp.graphics       0.91      0.88      0.90       389
      sci.med       0.84      0.90      0.87       396
    sci.space       0.91      0.77      0.84       394

     accuracy                           0.86      1498
    macro avg       0.86      0.86      0.86      1498
 weighted avg       0.87      0.86      0.86      1498



In [None]:
# TF-IDF
vectorizer = TfidfVectorizer()
# vectorizer = TfidfVectorizer(min_df=2) -> Slightly better performance

X_train_tf_nb = vectorizer.fit_transform(X_train)
X_test_tf_nb = vectorizer.transform(X_test)

nb_model = MultinomialNB()
nb_model.fit(X_train_tf_nb, y_train)

y_pred = nb_model.predict(X_test_tf_nb)

print(classification_report(y_test, y_pred, target_names=categories))

               precision    recall  f1-score   support

  alt.atheism       0.88      0.71      0.79       319
comp.graphics       0.90      0.87      0.88       389
      sci.med       0.76      0.91      0.83       396
    sci.space       0.83      0.82      0.83       394

     accuracy                           0.83      1498
    macro avg       0.84      0.83      0.83      1498
 weighted avg       0.84      0.83      0.83      1498

