# Inference Notebook: Cultural Classification (Colab + Drive)

# ✅ 1. Install required packages

In [2]:
# Colab setup cell: install the libraries this notebook relies on.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whatever `pip` the shell finds.
# TODO: pin versions (ideally the ones used when the model was trained) so a
# fresh runtime reproduces the same results.
%pip install -q transformers datasets evaluate scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m20.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# ✅ 2. Mount Google Drive

In [3]:
# Mount Google Drive at /content/drive; every path used later in this notebook
# lives below that mount point.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


# ✅ 3. Import libraries

In [4]:
import os
import json
import pickle
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ✅ 4. Define paths

In [5]:
# Every artefact sits under one shared Drive folder; each path is derived from
# that single root so moving the folder means editing one line.
_SHARED_ROOT = "/content/drive/MyDrive/CulturalIA_shared_folder"
_DATASET_DIR = f"{_SHARED_ROOT}/Dataset"

# Directory handed to `from_pretrained` for both tokenizer and model.
MODEL_DIR = f"{_SHARED_ROOT}/Models/lm_based"
# CSV with the rows to classify.
CSV_PATH = f"{_DATASET_DIR}/test_unlabeled.csv"
# Pickled caches, looked up by each row's "item" value.
WIKIDATA_CACHE_PATH = f"{_DATASET_DIR}/wikidata_cache_ultra.pkl"
SUMMARY_CACHE_PATH = f"{_DATASET_DIR}/wiki_summary_cache.pkl"

# ✅ 5. Load model and tokenizer

In [6]:
# Restore the tokenizer and the sequence-classification model stored in MODEL_DIR.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# .eval() puts the module in inference mode (dropout off) and returns the
# module itself, so it can be chained onto the load.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).eval()
model  # last expression of the cell: display the architecture summary

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

# ✅ 6. Load caches

In [7]:
import os
import pickle

def _load_cache(path, missing_message):
    """Return the object unpickled from ``path``.

    When no file exists at ``path``, print ``missing_message`` and return an
    empty dict instead.
    """
    if not os.path.exists(path):
        print(missing_message)
        return {}
    # NOTE(review): pickle.load can execute arbitrary code while loading; only
    # point this at cache files produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Both caches are optional: if a file is missing the notebook falls back to an
# empty dict and carries on.
wikidata_cache = _load_cache(
    WIKIDATA_CACHE_PATH,
    "⚠️ Wikidata cache not found. Proceeding with empty metadata.",
)
summary_cache = _load_cache(
    SUMMARY_CACHE_PATH,
    "⚠️ Summary cache not found. Proceeding with empty summaries.",
)

# Text builder that tolerates missing row keys and missing cache entries.
def build_text(x):
    """Serialise one row into the " | "-separated tagged string given to the tokenizer.

    Args:
        x: Mapping-like row exposing ``.get`` (a dict, or a pandas Series when
            called through ``df.apply(build_text, axis=1)``). Keys read:
            item, category, type, subcategory, name, description.

    Returns:
        A string of "[TAG] value" fields joined by " | ". Metadata counters
        missing from ``wikidata_cache`` render as 0; missing row keys and a
        missing summary render as an empty string.
    """
    summary = summary_cache.get(x.get("item", ""), "")
    meta = wikidata_cache.get(x.get("item", ""), {})

    # (tag, key) pairs in output order: first the cached metadata counters,
    # then the columns taken from the row itself.
    meta_tags = (
        ("ATTACHMENT", "attachment"),
        ("SPREAD", "spread"),
        ("SPECIFICITY", "specificity"),
        ("LANGUAGES", "n_languages"),
        ("INSTANCEOF", "n_instanceof"),
        ("SUBCLASSOF", "n_subclassof"),
        ("DESCRIBEDBY", "n_describedby"),
    )
    row_tags = (
        ("CATEGORY", "category"),
        ("TYPE", "type"),
        ("SUBCATEGORY", "subcategory"),
        ("NAME", "name"),
        ("DESC", "description"),
    )

    parts = [f"[{tag}] {meta.get(key, 0)}" for tag, key in meta_tags]
    parts.extend(f"[{tag}] {x.get(key, '')}" for tag, key in row_tags)
    parts.append(f"[WIKI] {summary}")
    return " | ".join(parts)


⚠️ Wikidata cache not found. Proceeding with empty metadata.
⚠️ Summary cache not found. Proceeding with empty summaries.


# ✅ 7. Define label mapping and input builder

In [8]:
# Class names, positioned by the integer id the classifier outputs.
# NOTE(review): the order is assumed to match the label ids used when the model
# was trained; confirm against the training code or model config.
labels = [
    "cultural agnostic",
    "cultural representative",
    "cultural exclusive",
]
id2label = dict(enumerate(labels))

def build_text(x):
    """Serialise one row into the " | "-separated tagged string given to the tokenizer.

    Row columns are read with ``.get`` so this definition behaves the same as
    the ``build_text`` defined alongside the cache loading. Otherwise running
    this cell would silently replace that tolerant builder with a variant that
    raises ``KeyError`` as soon as a column is missing.

    Args:
        x: Mapping-like row exposing ``.get`` (a dict, or a pandas Series when
            called through ``df.apply(build_text, axis=1)``). Keys read:
            item, category, type, subcategory, name, description.

    Returns:
        A string of "[TAG] value" fields joined by " | ". Metadata counters
        missing from ``wikidata_cache`` render as 0; missing row keys and a
        missing summary render as an empty string.
    """
    # NOTE(review): a cell pandas read as NaN is formatted as the literal
    # "nan"; confirm this matches how the training text was built.
    item_id = x.get("item", "")
    summary = summary_cache.get(item_id, "")
    meta = wikidata_cache.get(item_id, {})
    fields = [
        f"[ATTACHMENT] {meta.get('attachment', 0)}",
        f"[SPREAD] {meta.get('spread', 0)}",
        f"[SPECIFICITY] {meta.get('specificity', 0)}",
        f"[LANGUAGES] {meta.get('n_languages', 0)}",
        f"[INSTANCEOF] {meta.get('n_instanceof', 0)}",
        f"[SUBCLASSOF] {meta.get('n_subclassof', 0)}",
        f"[DESCRIBEDBY] {meta.get('n_describedby', 0)}",
        f"[CATEGORY] {x.get('category', '')}",
        f"[TYPE] {x.get('type', '')}",
        f"[SUBCATEGORY] {x.get('subcategory', '')}",
        f"[NAME] {x.get('name', '')}",
        f"[DESC] {x.get('description', '')}",
        f"[WIKI] {summary}",
    ]
    return " | ".join(fields)


# The test CSV is read and enriched in the following "Load and enrich test data"
# section. The identical read_csv/apply pair is intentionally not repeated in
# this cell: it only parsed the file and serialised every row a second time.

# ✅ 8. Load and enrich test data

In [9]:
# Read the unlabeled test rows and add a "text" column holding the tagged
# string build_text produces for each row (axis=1 means one call per row).
df = pd.read_csv(CSV_PATH).assign(
    text=lambda frame: frame.apply(build_text, axis=1)
)

# ✅ 9. Tokenize inputs

In [10]:
# Tokenize all rows in a single call: pad to a common length, truncate inputs
# that are too long, and return PyTorch tensors.
texts = df["text"].tolist()
encodings = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# ✅ 10. Run inference

In [11]:
# Run the classifier in mini-batches. Pushing the whole test set through one
# forward pass makes activation memory grow with the dataset size and can
# exhaust RAM; slicing the already-padded tensors keeps peak memory bounded
# while giving the same per-row predictions.
INFERENCE_BATCH_SIZE = 32  # rows per forward pass; lower it if memory is tight

# Use whatever device the model currently lives on, so inputs and weights match.
device = next(model.parameters()).device
num_rows = encodings["input_ids"].shape[0]

logit_chunks = []
with torch.no_grad():  # inference only: no autograd bookkeeping
    for start in range(0, num_rows, INFERENCE_BATCH_SIZE):
        stop = start + INFERENCE_BATCH_SIZE
        batch = {name: tensor[start:stop].to(device) for name, tensor in encodings.items()}
        # Move results back to CPU so later cells can call .numpy() on them.
        logit_chunks.append(model(**batch).logits.cpu())

logits = torch.cat(logit_chunks, dim=0)
probs = torch.nn.functional.softmax(logits, dim=-1)  # per-class probabilities
preds = torch.argmax(probs, dim=1)                   # most probable class id per row

# ✅ 11. Add predictions to DataFrame

In [12]:
# Store the predicted class ids, then translate them to label names via id2label.
predicted_ids = preds.numpy()
df["predicted_label_id"] = predicted_ids
df["predicted_label"] = df["predicted_label_id"].map(id2label)

# ✅ 12. Save predictions to Drive

In [13]:
OUTPUT_PATH = "/content/drive/MyDrive/CulturalIA_shared_folder/Outputs/CulturalIA_output_modello1.csv"
# to_csv does not create missing folders; make sure the output directory exists
# so a finished inference run is not lost to a missing "Outputs" folder.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
# index=False: write only the DataFrame columns, not the row index.
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Predictions saved to: {OUTPUT_PATH}")

✅ Predictions saved to: /content/drive/MyDrive/CulturalIA_shared_folder/Outputs/CulturalIA_output_modello1.csv
