In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. src.utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [3]:
from datasets import DatasetDict, Dataset
from transformers import pipeline
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from src.utils import map_category


In [4]:
# Locate the interim JSON-lines shards. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them: the row order feeds the seeded
# train/validation/test split and must be the same on every machine.
data_path = "data/interim/part-*.json"
json_files = sorted(glob.glob(data_path))
if not json_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(f"No interim data files match {data_path!r}")

# Read every shard (one JSON object per line) and stack them into one frame.
papers_df = pd.concat(
    [pd.read_json(file, lines=True) for file in json_files],
    ignore_index=True,
)

# Derive the classification target by passing each main_category through the
# project helper map_category (presumably collapses sub-archives into
# top-level arXiv groups -- confirm in src/utils).
papers_df["arxiv_main_category"] = papers_df["main_category"].apply(map_category)
# Model input: title and abstract joined by a newline.
papers_df["text"] = papers_df["title"] + "\n" + papers_df["summary"]
# Keep only the two columns used downstream.
papers_df = papers_df[["text", "arxiv_main_category"]]


In [5]:
# Stage 1: carve off a stratified 20% hold-out; the remaining 80% is the
# training set. Stratifying keeps the category mix equal in both parts.
train_df, temp_df = train_test_split(
    papers_df,
    stratify=papers_df["arxiv_main_category"],
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the hold-out in half (again stratified), which yields
# validation and test sets of roughly 10% of the original data each.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["arxiv_main_category"],
    test_size=0.5,
    random_state=42,
)


In [None]:
# Print split sizes (absolute and as a share of the full corpus) to verify
total_examples = len(papers_df)
print(f"Training set: {len(train_df)} examples ({len(train_df)/total_examples*100:.1f}%)")
print(f"Validation set: {len(val_df)} examples ({len(val_df)/total_examples*100:.1f}%)")
print(f"Test set: {len(test_df)} examples ({len(test_df)/total_examples*100:.1f}%)")

# The full label set, computed once and reused for every split below
all_categories = set(papers_df["arxiv_main_category"].unique())
print(f"Total categories to classify: {len(all_categories)}")

# Dictionary of DataFrames for easy iteration
split_dfs = {
    "train": train_df,
    "validation": val_df,
    "test": test_df
}

for split_name, df in split_dfs.items():
    # Categories that actually occur in this split
    split_categories = set(df["arxiv_main_category"].unique())

    # Categories of the full corpus that this split lacks entirely
    missing_categories = all_categories - split_categories

    print(f"\nSplit: {split_name}")
    print(f"Number of unique categories: {len(split_categories)}")
    print(f"Categories present: {len(split_categories)}/{len(all_categories)}")
    # Surface the gap explicitly; a split without some category cannot be
    # used to evaluate (or learn) that category.
    if missing_categories:
        print(f"Missing categories: {sorted(map(str, missing_categories))}")


Training set: 5442 examples (80.0%)
Validation set: 680 examples (10.0%)
Test set: 681 examples (10.0%)
Total categories to classify: 16

Split: train
Number of unique categories: 16
Categories present: 16/16

Split: validation
Number of unique categories: 16
Categories present: 16/16

Split: test
Number of unique categories: 16
Categories present: 16/16


In [7]:
# Wrap the three pandas splits in a Hugging Face DatasetDict. The pandas index
# is dropped (preserve_index=False) so it does not become an extra column.
data = DatasetDict({
    split_key: Dataset.from_pandas(split_frame, preserve_index=False)
    for split_key, split_frame in (
        ("train", train_df),
        ("validation", val_df),
        ("test", test_df),
    )
})

# Show the resulting structure (columns and row counts per split)
print(data)



DatasetDict({
    train: Dataset({
        features: ['text', 'arxiv_main_category'],
        num_rows: 5442
    })
    validation: Dataset({
        features: ['text', 'arxiv_main_category'],
        num_rows: 680
    })
    test: Dataset({
        features: ['text', 'arxiv_main_category'],
        num_rows: 681
    })
})


In [8]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence encoder used as a frozen feature extractor
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the train split first, then the test split, each with a progress bar.
# The validation split is not embedded here.
train_embeddings, test_embeddings = (
    model.encode(data[split_key]["text"], show_progress_bar=True)
    for split_key in ("train", "test")
)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/171 [00:00<?, ?it/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [9]:
# Sanity check on the encoder output: expect one row per training document.
train_embeddings.shape


(5442, 768)

In [10]:
from sklearn.linear_model import LogisticRegression

# Baseline 1: logistic regression trained directly on the frozen embeddings.
train_labels = data["train"]["arxiv_main_category"]
clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, train_labels)


In [12]:
# Score the fitted classifier on the held-out test embeddings.
y_pred = clf.predict(test_embeddings)
# Some rare categories are never predicted, which makes their precision
# undefined. zero_division=0 reports those scores as 0 (the same value the
# default produces) without emitting an UndefinedMetricWarning per metric.
print(classification_report(data["test"]["arxiv_main_category"], y_pred, zero_division=0))


              precision    recall  f1-score   support

    astro-ph       0.82      0.86      0.84        43
    cond-mat       0.81      0.76      0.78        50
          cs       0.88      0.95      0.91       290
        econ       0.50      0.25      0.33         4
        eess       0.59      0.42      0.49        31
       gr-qc       0.58      0.64      0.61        11
         hep       0.77      0.80      0.79        30
        math       0.84      0.84      0.84       116
     math-ph       0.00      0.00      0.00         2
        nlin       0.00      0.00      0.00         2
        nucl       1.00      0.29      0.44         7
     physics       0.69      0.69      0.69        42
       q-bio       1.00      0.40      0.57         5
       q-fin       0.00      0.00      0.00         2
    quant-ph       0.88      0.88      0.88        32
        stat       0.71      0.71      0.71        14

    accuracy                           0.83       681
   macro avg       0.63   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.svm import SVC

# Baseline 2: support vector classifier (default settings) on the same
# frozen embeddings.
clf = SVC(random_state=42)
clf.fit(train_embeddings, data["train"]["arxiv_main_category"])
y_pred = clf.predict(test_embeddings)
# Some rare categories are never predicted, which makes their precision
# undefined. zero_division=0 reports those scores as 0 (the same value the
# default produces) without emitting an UndefinedMetricWarning per metric.
print(classification_report(data["test"]["arxiv_main_category"], y_pred, zero_division=0))


              precision    recall  f1-score   support

    astro-ph       0.85      0.93      0.89        43
    cond-mat       0.84      0.86      0.85        50
          cs       0.87      0.97      0.92       290
        econ       0.00      0.00      0.00         4
        eess       0.58      0.45      0.51        31
       gr-qc       0.58      0.64      0.61        11
         hep       0.92      0.80      0.86        30
        math       0.88      0.84      0.86       116
     math-ph       0.00      0.00      0.00         2
        nlin       0.00      0.00      0.00         2
        nucl       1.00      1.00      1.00         7
     physics       0.80      0.67      0.73        42
       q-bio       1.00      0.20      0.33         5
       q-fin       0.00      0.00      0.00         2
    quant-ph       0.82      0.84      0.83        32
        stat       0.73      0.57      0.64        14

    accuracy                           0.85       681
   macro avg       0.62   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

# Average the embeddings of all training documents in each target label.
# The labels are passed to groupby as a separate key array instead of being
# stacked onto the embedding matrix: np.hstack of floats with strings coerces
# every column to strings, after which the mean cannot be computed.
train_labels = np.asarray(data["train"]["arxiv_main_category"])
df = pd.DataFrame(train_embeddings).groupby(train_labels).mean()
averaged_target_embeddings = df.values

# Find the best matching label centroid for every evaluation document
sim_matrix = cosine_similarity(test_embeddings, averaged_target_embeddings)
# argmax yields row positions in the centroid matrix; translate them back to
# category names via the groupby index so they are comparable to the labels.
y_pred = df.index.to_numpy()[np.argmax(sim_matrix, axis=1)]

# Evaluate against the real target column (this dataset has no "label" column)
print(classification_report(data["test"]["arxiv_main_category"], y_pred))


In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

# Baseline 3: nearest-centroid classification by cosine similarity.
# One row per training document: embedding columns plus its category label.
embeddings_df = pd.DataFrame(train_embeddings)
embeddings_df["category"] = data["train"]["arxiv_main_category"]

# One centroid (mean embedding) per category; the group index holds the names
grouped = embeddings_df.groupby("category").mean()
averaged_target_embeddings = grouped.values
categories = grouped.index.tolist()

# Similarity of every test document to every centroid; the most similar
# centroid's position is mapped back to its category name.
sim_matrix = cosine_similarity(test_embeddings, averaged_target_embeddings)
y_pred_indices = sim_matrix.argmax(axis=1)
y_pred = [categories[position] for position in y_pred_indices]

# Per-category precision / recall / F1 on the test split
print(classification_report(data["test"]["arxiv_main_category"], y_pred))


              precision    recall  f1-score   support

    astro-ph       0.86      0.84      0.85        43
    cond-mat       0.83      0.68      0.75        50
          cs       0.97      0.74      0.84       290
        econ       0.17      1.00      0.29         4
        eess       0.30      0.65      0.41        31
       gr-qc       0.32      0.64      0.42        11
         hep       0.95      0.67      0.78        30
        math       0.91      0.66      0.77       116
     math-ph       0.07      0.50      0.12         2
        nlin       0.11      1.00      0.20         2
        nucl       0.54      1.00      0.70         7
     physics       0.67      0.43      0.52        42
       q-bio       0.27      0.80      0.40         5
       q-fin       0.40      1.00      0.57         2
    quant-ph       0.78      0.88      0.82        32
        stat       0.41      0.86      0.56        14

    accuracy                           0.72       681
   macro avg       0.53   