## Metadata and Packages

 * @ Author: Andreas-Nizar Granitzer
 * @ Create Time: 2025-11-24 22:56:46
 * @ Description: This notebook documents the model deployment to Hugging Face.

In [None]:
## Load packages
from transformers import BertForSequenceClassification, BertTokenizer  # tools for NLP with transformers
import torch  # for tensor computation and deep learning
import json  # for json files
import pandas as pd
from huggingface_hub import login, create_repo, upload_file, upload_folder, logout
from dotenv import load_dotenv
import os
from pathlib import Path

## Setup environment

In [None]:
# Load environment variables from a local .env file, if one exists.
# HF_TOKEN must be defined there or already be present in the process environment.
load_dotenv()

# Read the Hugging Face access token from the environment
hf_token = os.getenv("HF_TOKEN")

# Fail fast with an actionable message instead of handing a missing token to login(),
# which would otherwise fall back to an interactive prompt and stall a "Run All".
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

# Authenticate this session against the Hugging Face Hub
login(token=hf_token)

print("✅ Successfully logged into Hugging Face!")

## Create repo on Hugging Face

In [None]:
# Target repository on the Hub, written as "<namespace>/<repository name>"
repo_name = "asjc-classification/scibert_multilabel_asjc_classifier"

# Create the repository; all options are passed by keyword for readability.
create_repo(
    repo_id=repo_name,
    token=hf_token,
    private=True,    # switch to False to publish the repository publicly
    exist_ok=True,   # re-running the cell must not fail when the repo already exists
)

## Convert model to compatible format

In [None]:
 # Define the number of labels
num_labels = 307  # size of the classification head; must match the fine-tuned checkpoint

# Load the fine-tuned weights onto the CPU (the .pth file is not in the GitHub repository).
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted
# source; consider passing weights_only=True if the installed torch version supports it.
model_path = "scibert_multilabel_asjc_classifier.pth"
checkpoint = torch.load(model_path, map_location=torch.device("cpu"))

# The checkpoint is either a bare state_dict or a dictionary that stores the
# state_dict under the "model_state_dict" key.
if "model_state_dict" in checkpoint:
    state_dict = checkpoint["model_state_dict"]
else:
    state_dict = checkpoint

# Build the SciBERT architecture with the right head size and inject the fine-tuned
# weights. output_loading_info=True additionally returns a report of which keys
# could not be matched, so a partial load does not go unnoticed.
model, loading_info = BertForSequenceClassification.from_pretrained(
    'allenai/scibert_scivocab_uncased',
    num_labels=num_labels,
    state_dict=state_dict,
    output_loading_info=True,
)

# Missing or mismatched keys mean that part of the model is NOT the fine-tuned one
# (e.g. because of a key prefix in the checkpoint). Stop here rather than uploading it.
missing_keys = list(loading_info.get("missing_keys", []))
mismatched_keys = list(loading_info.get("mismatched_keys", []))
if missing_keys or mismatched_keys:
    raise RuntimeError(
        "Fine-tuned weights were only partially loaded: "
        f"{len(missing_keys)} missing key(s) (e.g. {missing_keys[:5]}), "
        f"{len(mismatched_keys)} mismatched key(s) (e.g. {mismatched_keys[:5]})."
    )

print("Model loaded successfully!")

## Store model and tokenizer in a local directory

In [None]:
# Local folder that collects the files written by this notebook
SAVE_DIRECTORY = "scibert_multilabel_asjc_classifier"

# Write the fine-tuned model into the folder first
model.save_pretrained(SAVE_DIRECTORY)

# Fetch the SciBERT tokenizer (lower-casing enabled) and store it next to the model
base_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = BertTokenizer.from_pretrained(base_checkpoint, do_lower_case=True)
tokenizer.save_pretrained(SAVE_DIRECTORY)

print(f"Model saved to {SAVE_DIRECTORY}")

## Generate label mappings and modify config

In [None]:
# Input CSV with one row per category (semicolon-separated); adjust the path if needed
csv_file = "Categories.csv"

# Output files written next to the saved model
labels_file = Path(SAVE_DIRECTORY) / "labels.json"
mappings_file = Path(SAVE_DIRECTORY) / "label_mappings.json"

# Read the label names from the "SUBJECT TERM" column; row order defines the label ids
df = pd.read_csv(csv_file, sep=";")
labels = df["SUBJECT TERM"].tolist()

# --- Sanity checks: the mapping must be a clean bijection that fits the model head ---
# Empty cells would be written as NaN, which is not valid JSON.
if df["SUBJECT TERM"].isna().any():
    raise ValueError(f"{csv_file}: column 'SUBJECT TERM' contains empty values.")

# Duplicate names would collapse in label2id and leave gaps in id2label.
duplicated_labels = df.loc[df["SUBJECT TERM"].duplicated(), "SUBJECT TERM"].unique().tolist()
if duplicated_labels:
    raise ValueError(f"{csv_file}: duplicate labels found: {duplicated_labels[:10]}")

# The number of labels has to equal the size of the classification head.
if len(labels) != num_labels:
    raise ValueError(
        f"{csv_file} provides {len(labels)} labels, but the model was built "
        f"with num_labels={num_labels}."
    )

# Create both lookup directions (ids follow the row order of the CSV)
label2id = {label: i for i, label in enumerate(labels)}
id2label = dict(enumerate(labels))

# Save labels.json (plain list of label names)
with open(labels_file, "w", encoding="utf-8") as f:
    json.dump(labels, f, indent=2, ensure_ascii=False)

# Save label_mappings.json (integer keys of id2label are serialized as strings)
with open(mappings_file, "w", encoding="utf-8") as f:
    json.dump({"label2id": label2id, "id2label": id2label}, f, indent=2, ensure_ascii=False)

print(f"Labels and mappings saved in {SAVE_DIRECTORY}")

In [None]:
# Files inside the save directory that this cell reads and rewrites
config_path = Path(SAVE_DIRECTORY) / "config.json"
label_mappings_path = Path(SAVE_DIRECTORY) / "label_mappings.json"

# Parse the existing model config and the label mappings file
config = json.loads(config_path.read_text(encoding="utf-8"))
label_mappings = json.loads(label_mappings_path.read_text(encoding="utf-8"))

# label2id is taken as-is; id2label is derived from it (with string keys) so that
# the two directions cannot drift apart.
name_to_id = label_mappings["label2id"]
config.update(
    {
        "label2id": name_to_id,
        "id2label": {str(idx): name for name, idx in name_to_id.items()},
        "problem_type": "multi_label_classification",  # specify problem type
        "threshold": 0.3,                              # threshold for multi-label classification
    }
)

# Write the patched config back to the same file
config_path.write_text(
    json.dumps(config, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

print("config.json updated successfully!")

## Push local folder to Hugging Face

In [None]:
# Upload the whole local folder to the Hub repository.
# This cell can be re-run on its own whenever files in the folder change
# (e.g., the model card).
# The folder, repository and token are taken from the variables defined earlier
# (SAVE_DIRECTORY, repo_name, hf_token) so that they cannot drift apart from the
# values used when saving the model and creating the repository.
upload_folder(
    folder_path=SAVE_DIRECTORY,
    repo_id=repo_name,
    token=hf_token,
)