# Upload Model to Hugging Face Hub

This notebook uploads your trained model to Hugging Face Hub.

## Setup

1. **Get Hugging Face Token**:
   - Go to https://huggingface.co/settings/tokens
   - Create a new token with **"write"** permissions
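   - Copy the token — you will either paste it into `HF_TOKEN` in the Configuration cell below or enter it when prompted at login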

2. **Update Configuration**:
   - Update `MODEL_PATH` to point to your trained model
   - Update `REPO_ID` if you want a different repository name

3. **Run all cells**


In [None]:
# Install the Hugging Face Hub client into the kernel's environment.
# -U upgrades an already-installed copy; -q keeps pip's output quiet.
%pip install huggingface_hub -U -q


In [None]:
# Mount Google Drive at /content/drive so that the model directory referenced
# by MODEL_PATH in the configuration cell is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


## Configuration

**Update these values:**


In [None]:
# ==========================================
# CONFIGURATION
# ==========================================
# Everything a user is expected to edit lives in this cell; the cells below
# read MODEL_PATH, REPO_ID and HF_TOKEN from here.

# Path to your trained model in Google Drive
# Update this to your actual model path
MODEL_PATH = "/content/drive/MyDrive/Indo_Religiolect_V2/model_final"  # IndoBERT
# MODEL_PATH = "/content/drive/MyDrive/Indo_Religiolect_V2/model_sahabat_ai"  # Sahabat-AI

# Hugging Face repository ID (your username/repo-name)
REPO_ID = "dansachs/indo-religiolect-bert"

# Your Hugging Face token (get from https://huggingface.co/settings/tokens)
# Option 1: Paste token here (will be visible in notebook)
# Do not save or share the notebook with a real token pasted here.
# Leave it empty to be prompted for the token in the login cell instead.
HF_TOKEN = ""  # Paste your token here

# Option 2: Use environment variable (more secure)
# Set this in Colab: Runtime → Change runtime type → Environment variables
# NOTE(review): confirm this menu path still exists in current Colab.
# Or use: import os; os.environ["HF_TOKEN"] = "your_token_here"


In [None]:
import os
from pathlib import Path
from huggingface_hub import HfApi, login
from huggingface_hub.utils import HfHubHTTPError
import json

# Export the token (if one was pasted into the configuration cell) so the
# login cell can read it from the environment. Whitespace picked up by
# copy/paste is stripped; an empty or whitespace-only value means
# "no token provided".
hf_token = HF_TOKEN.strip() if HF_TOKEN else ""
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

# Validate the model location up front so a wrong path fails here with a
# clear message instead of part-way through the upload.
model_path = Path(MODEL_PATH)
if not model_path.exists():
    raise FileNotFoundError(f"❌ Model path does not exist: {MODEL_PATH}")
if not model_path.is_dir():
    # upload_folder needs a directory; a path to a single file is a config error.
    raise NotADirectoryError(f"❌ Model path is not a directory: {MODEL_PATH}")

print(f"📂 Model path: {MODEL_PATH}")
print(f"📦 Repository: {REPO_ID}")
print("\n✅ Configuration ready!")


In [None]:
# Check that the model directory contains what the upload needs.
print("🔍 Checking model files...")

# Files that must be present. Each one is actually verified on disk before
# it is reported as found.
required_files = ["config.json"]
missing_files = [name for name in required_files if not (model_path / name).exists()]
if missing_files:
    raise FileNotFoundError(
        f"❌ Required file(s) not found in {model_path}: {', '.join(missing_files)}"
    )
for name in required_files:
    print(f"   ✅ Found: {name}")

# The weights may be stored in either format; pytorch_model.bin is checked
# first, so it is the one reported when both exist.
weight_candidates = ("pytorch_model.bin", "model.safetensors")
model_file = next(
    (name for name in weight_candidates if (model_path / name).exists()),
    None,
)
if model_file is None:
    raise FileNotFoundError("❌ Model file not found (pytorch_model.bin or model.safetensors)")
print(f"   ✅ Found: {model_file}")

# The tokenizer is optional: warn, but do not abort, when it is missing.
if (model_path / "tokenizer_config.json").exists():
    print("   ✅ Found: tokenizer files")
else:
    print("   ⚠️  Tokenizer files not found (will upload model only)")

# The label map is optional and only reported when present.
if (model_path / "label_map.json").exists():
    print("   ✅ Found: label_map.json")

print("\n✅ All required files found!")


In [None]:
# Log in to Hugging Face, preferring a token from the environment and
# falling back to the interactive prompt when none is set.
print("🔐 Logging in to Hugging Face...")

env_token = os.getenv("HF_TOKEN")  # read once so the check and the login use the same value
if env_token:
    print("   ✅ Using token from environment")
    login(token=env_token, add_to_git_credential=True)
else:
    print("   💡 You'll be prompted to enter your token.")
    print("   💡 Get one at: https://huggingface.co/settings/tokens")
    login()

print("✅ Logged in successfully!")


In [None]:
# Initialize the API client and make sure the target repository exists.
api = HfApi()

print(f"📦 Checking repository: {REPO_ID}")
try:
    api.repo_info(REPO_ID, repo_type="model")
except HfHubHTTPError as e:
    # The HTTP status lives on the attached response, not on the exception
    # itself; the response may be absent, hence the defensive getattr.
    status = getattr(e.response, "status_code", None)
    if status != 404:
        # Anything other than "not found" (auth, network, ...) is a real error.
        raise
    print("   ✅ Repository doesn't exist. Creating it...")
    # Uploading requires an existing repository, so create it now.
    # exist_ok=True keeps this cell safe to re-run.
    api.create_repo(repo_id=REPO_ID, repo_type="model", exist_ok=True)
    overwrite = False
else:
    print("   ⚠️  Repository already exists. Will update it.")
    overwrite = True

print("\n✅ Ready to upload!")


In [None]:
# Upload the whole model directory to the Hub repository.
print("📤 Uploading model files...")
print(f"   From: {MODEL_PATH}")
print(f"   To: {REPO_ID}")
print("   This may take a few minutes...\n")

try:
    api.upload_folder(
        folder_path=str(model_path),
        repo_id=REPO_ID,
        repo_type="model",
        ignore_patterns=["*.log", "*.png", "__pycache__", "*.pyc"],  # Skip logs and cache
    )
    print("\n✅ Model files uploaded successfully!")
except Exception as e:
    # Report the failure in the notebook output, then re-raise so that
    # "Run all" stops here instead of continuing with a missing upload.
    print(f"\n❌ Error uploading files: {e}")
    raise


In [None]:
# Create and upload model card (README.md)
print("📝 Creating model card...")

# Load training info if available; an absent file simply leaves the dict
# empty and the card falls back to the defaults below.
training_info = {}
info_file = model_path / "training_info.json"
if info_file.exists():
    # Explicit encoding so the read does not depend on the platform default.
    with open(info_file, 'r', encoding='utf-8') as f:
        training_info = json.load(f)

# Load label map if available
label_map = {}
label_file = model_path / "label_map.json"
if label_file.exists():
    with open(label_file, 'r', encoding='utf-8') as f:
        label_map = json.load(f)

# Values interpolated into the card, with fallbacks for missing metadata.
model_name = training_info.get("model_name", "indolem/indobert-base-uncased")
num_epochs = training_info.get("num_epochs", "N/A")
train_samples = training_info.get("train_samples", "N/A")

# Format train_samples: thousands separators for integers, plain text otherwise
if isinstance(train_samples, int):
    train_samples_str = f"{train_samples:,}"
else:
    train_samples_str = str(train_samples)

# Format label_map: use the saved mapping when present, else the default labels
if label_map:
    labels_str = str(label_map)
else:
    labels_str = "Islam (0), Catholic (1), Protestant (2)"

# NOTE: this is an f-string, so literal braces in the embedded Python and
# BibTeX snippets are doubled, and the backslash in \url is escaped once so
# the rendered card contains a single backslash.
# NOTE(review): the usage example hard-codes the default label mapping —
# confirm it matches label_map.json when that file is present.
model_card = f"""---
license: apache-2.0
base_model: {model_name}
tags:
  - indonesian
  - classification
  - religiolect
  - bert
  - text-classification
---

# Indo-Religiolect-BERT

A fine-tuned Indonesian BERT model for classifying religious texts into:
- **Islam** (Muslim)
- **Catholic**
- **Protestant**

## Model Details

- **Base Model**: `{model_name}`
- **Task**: Sequence Classification
- **Language**: Indonesian
- **Labels**: {labels_str}
- **Training Epochs**: {num_epochs}
- **Training Samples**: {train_samples_str}

## Training Data

Trained on ~2 million Indonesian religious text sentences collected from:
- Catholic websites
- Islamic websites  
- Protestant websites

## Usage

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load model
tokenizer = AutoTokenizer.from_pretrained("{REPO_ID}")
model = AutoModelForSequenceClassification.from_pretrained("{REPO_ID}")

# Predict
text = "Allah adalah Tuhan yang Maha Esa"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
outputs = model(**inputs)
prediction = torch.argmax(outputs.logits, dim=-1).item()

label_map = {{0: 'Islam', 1: 'Catholic', 2: 'Protestant'}}
print(f"Prediction: {{label_map[prediction]}}")
```

## Performance

Model performance metrics are available in the training logs.

## Citation

If you use this model, please cite:
```
@misc{{indo-religiolect-bert,
  author = {{Dan Sachs}},
  title = {{Indo-Religiolect-BERT: Indonesian Religious Text Classifier}},
  year = {{2024}},
  publisher = {{Hugging Face}},
  howpublished = {{\\url{{https://huggingface.co/{REPO_ID}}}}}
}}
```
"""

# Upload model card
print("📤 Uploading model card (README.md)...")
try:
    api.upload_file(
        path_or_fileobj=model_card.encode('utf-8'),
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="model",
    )
    print("✅ Model card uploaded!")
except Exception as e:
    # Best effort: the model files are uploaded in a separate step, so a
    # failed card upload is reported but does not abort the notebook.
    print(f"⚠️  Could not upload model card: {e}")


In [None]:
# Display a summary: where the model lives and how to load it.
print("\n" + "="*60)
print("✅ UPLOAD COMPLETE!")
print("="*60)
print("\n🌐 View your model at:")
print(f"   https://huggingface.co/{REPO_ID}")

# Echo the training metadata loaded for the model card, when there is any.
if training_info:
    print("\n📊 Training Information:")
    for key, value in training_info.items():
        if key != "label_map":  # Skip label_map in summary
            print(f"   {key}: {value}")

print("\n💡 To use this model:")
print("   from transformers import AutoTokenizer, AutoModelForSequenceClassification")
print(f"   tokenizer = AutoTokenizer.from_pretrained('{REPO_ID}')")
print(f"   model = AutoModelForSequenceClassification.from_pretrained('{REPO_ID}')")

print("\n✨ Your model is now publicly available on Hugging Face!")
