In [19]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.7.1+cu128
True


In [56]:
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from wordcloud import WordCloud

In [57]:
path = "../scraper/arxiv_physics_2025-07-02_16-53-00.csv"
df = pd.read_csv(path)

In [58]:
target_categories = ["hep-th", "hep-ph", "gr-qc"]
X = df['title']
y = df['primary_category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)

In [59]:
train_texts = X_train.tolist()
train_labels = y_train.tolist()
test_texts = X_test.tolist()
test_labels = y_test.tolist()

In [60]:
label_list = sorted(set(train_labels))
label_to_id = {l: i for i, l in enumerate(label_list)}
train_labels = [label_to_id[l] for l in train_labels]
test_labels = [label_to_id[l] for l in test_labels]

In [61]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=256)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [49]:
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_dataset = Dataset.from_dict({'text': test_texts, 'label': test_labels})

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


Map:   0%|          | 0/1208 [00:00<?, ? examples/s]

Map:   0%|          | 0/303 [00:00<?, ? examples/s]

In [50]:
metric = evaluate.load('accuracy')


In [51]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [64]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',  # Changed from 'evaluation_strategy'
    save_strategy='epoch',  # Changed from 'save_strategy'
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)


In [65]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [66]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.608783,0.768977
2,No log,0.568155,0.782178
3,No log,0.77633,0.79868


TrainOutput(global_step=453, training_loss=0.5105053225889901, metrics={'train_runtime': 98.9784, 'train_samples_per_second': 36.614, 'train_steps_per_second': 4.577, 'total_flos': 134089175509632.0, 'train_loss': 0.5105053225889901, 'epoch': 3.0})

In [67]:
trainer.evaluate()

{'eval_loss': 0.7763304710388184,
 'eval_accuracy': 0.7986798679867987,
 'eval_runtime': 1.8065,
 'eval_samples_per_second': 167.725,
 'eval_steps_per_second': 21.035,
 'epoch': 3.0}