In [1]:
!pip install Wikidata
!pip install datasets

Collecting Wikidata
  Downloading Wikidata-0.8.1-py3-none-any.whl.metadata (3.0 kB)
Downloading Wikidata-0.8.1-py3-none-any.whl (29 kB)
Installing collected packages: Wikidata
Successfully installed Wikidata-0.8.1
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDown

In [2]:
import torch, random, requests, os, pickle
import numpy as np
from wikidata.client import Client
from datasets import load_dataset
from itertools import islice
from google.colab import userdata

In [3]:
# Seed every random number generator used by the notebook with the same value,
# so that repeated runs start from the same random state.
SEED = 42
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(SEED)

In [4]:
def dump(file_name, result):
    """Pickle ``result`` into ``file_name``, replacing any previous dump.

    :param file_name: path of the dump file to (re)create
    :param result: the object to serialize with pickle
    """
    stale_dump_present = os.path.exists(file_name)
    if stale_dump_present:
        # drop the previous dump before writing the new one
        os.remove(file_name)
    with open(file_name, 'wb') as sink:
        print("dumping", file_name)
        # noinspection PyTypeChecker
        pickle.dump(result, sink)

def load(file_name):
    """Read back an object previously written by pickle to ``file_name``.

    NOTE: unpickling executes code embedded in the file, so this must only be
    used on dump files produced by a trusted source.

    :param file_name: path of the dump file to read
    :return: the unpickled object
    """
    with open(file_name, 'rb') as source:
        print("loading", file_name)
        # noinspection PyTypeChecker
        restored = pickle.load(source)
    return restored

In [5]:
def wikipedia_pages(sitelinks):
    """Return the site prefixes of the sitelink keys that end with ``wiki``.

    Keys starting with ``commons`` are skipped. Only the trailing ``wiki``
    suffix is stripped, so a key that contains ``wiki`` elsewhere keeps it
    (e.g. ``wikidatawiki`` -> ``wikidata``, not ``data``).

    :param sitelinks: mapping whose keys are site identifiers such as ``enwiki``
    :return: list of prefixes, in the iteration order of ``sitelinks``
    """
    suffix = "wiki"
    return [
        site_key.removesuffix(suffix)
        for site_key in sitelinks.keys()
        if site_key.endswith(suffix) and not site_key.startswith("commons")
    ]

def build_claims(claims):
    """Map every property id to the number of values claimed for it.

    :param claims: mapping from property id to a sized collection of values
    :return: dict from property id to the size of its collection
    """
    return {prop_id: len(values) for prop_id, values in claims.items()}

class Entity:
    """A dataset item enriched with Wikidata metadata and the Wikipedia text.

    :param entity_id: the Wikidata identifier of the item
    :param dataset_item: dataset row providing 'label', 'name', 'description',
        'type', 'category' and 'subcategory'
    :param wiki_data: object exposing a ``data`` mapping that may contain
        'labels', 'descriptions', 'aliases', 'sitelinks' and 'claims'
    :param wiki_text: text of the Wikipedia page (may be None)
    """

    def __init__(self, entity_id, dataset_item, wiki_data, wiki_text):
        self.entity_id = entity_id
        # fields copied as they are from the dataset row
        for field in ('label', 'name', 'description', 'type', 'category', 'subcategory'):
            setattr(self, field, dataset_item[field])
        self.wiki_text = wiki_text
        raw = wiki_data.data
        # Languages: only the keys of each section are kept
        self.labels = list(raw.get("labels", {}))
        self.descriptions = list(raw.get("descriptions", {}))
        self.aliases = list(raw.get("aliases", {}))
        self.wikipedia_pages = wikipedia_pages(raw.get("sitelinks", {}))
        # Properties: property id -> number of claimed values
        self.claims = build_claims(raw.get("claims", {}))

    def __str__(self):
        return "".join([self.entity_id, ": ", self.label, " - ", self.name])

# Endpoint of the English Wikipedia API, queried by get_wiki_text().
API_URL = "https://en.wikipedia.org/w/api.php"

def extract_entity_id(url):
    """Return the last '/'-separated segment of ``url``, ignoring surrounding whitespace.

    :param url: the item URL (the whole string is returned if it has no '/')
    :return: the trailing path segment
    """
    _, _, last_segment = url.strip().rpartition("/")
    return last_segment

def get_wiki_text(en_wiki, timeout=30):
    """Fetch the plain-text extract of a page from the English Wikipedia API.

    :param en_wiki: the ``enwiki`` sitelink entry (a mapping with a 'title'
        key), or a falsy value when the item has no English page
    :param timeout: seconds to wait for the HTTP request before giving up;
        without it a stalled connection would block forever
    :return: None when ``en_wiki`` is falsy, otherwise the page extract
        ('' when the response carries no extract)
    :raises requests.RequestException: on network failures, timeouts or
        HTTP error statuses
    """
    if not en_wiki:
        return None
    title = en_wiki["title"]
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": True,
        "titles": title,
        "format": "json",
        "redirects": 1
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
    res = requests.get(API_URL, params=params, headers=headers, timeout=timeout)
    # surface HTTP errors explicitly instead of failing later on an unexpected body
    res.raise_for_status()
    payload = res.json()
    # only one title is requested, so the first page of the result is the one we want
    page = next(iter(payload["query"]["pages"].values()))
    # Keep the original text as it is.
    # The text will be processed in other methods,
    # such as processed_dataset#tokenize().
    return page.get("extract", "")

class EntityFactory:
    """Creates Entity objects from dataset rows, using a Wikidata client.

    :param client: object whose ``get(entity_id, load=True)`` returns the
        Wikidata record of an item
    """

    def __init__(self, client):
        self.client = client

    def create(self, item):
        """Build the Entity for one dataset row.

        Best effort: any failure while loading the data is printed and turned
        into a None result, so that the caller can skip the row.

        :param item: dataset row; its 'item' value is the URL of the entity
        :return: the Entity, or None if it could not be loaded
        """
        entity_id = extract_entity_id(item['item'])
        try:
            wikidata = self.client.get(entity_id, load=True)
            en_wiki = wikidata.data.get("sitelinks", {}).get("enwiki")
            wiki_text = get_wiki_text(en_wiki)
            return Entity(entity_id, item, wikidata, wiki_text)
        except Exception as error:
            print("Error loading id:", entity_id, error)
            return None

# Local pickle dumps of the base (unprocessed) training and validation entities,
# written and read by NLPDataset.
TRAINING_FILE_NAME = "training.bin"
VALIDATION_FILE_NAME = "validation.bin"

def create_set(dataset, factory, limit, file_name):
    """Create the entities for (at most) the first ``limit`` items of ``dataset``.

    Items for which the factory returns None are skipped. Progress is printed
    every 10 items.

    :param dataset: sized iterable of dataset rows
    :param factory: object whose ``create(item)`` returns an entity or None
    :param limit: maximum number of rows to consume; None means all of them
    :param file_name: only used to label the progress messages
    :return: list of the created entities
    """
    total = len(dataset) if limit is None else limit
    entities = []
    for position, item in enumerate(islice(dataset, total), start=1):
        entity = factory.create(item)
        if entity is not None:
            entities.append(entity)
        if position % 10 == 0:
            print("creating", file_name, position, "/", total)
    return entities

class NLPDataset:
    """Training and validation entities, built once and then cached on disk.

    The entities are loaded from the local dump files when both exist and
    ``force_reload`` is False; in that case the limits are not applied.
    Otherwise the project dataset is downloaded, every row is turned into an
    entity and the results are dumped.

    :param training_limit: max number of training rows to use (None = all)
    :param validation_limit: max number of validation rows to use (None = all)
    :param force_reload: rebuild the entities even if the dumps exist
    """

    def __init__(self, training_limit=None, validation_limit=None, force_reload=False):
        dumps_available = os.path.exists(TRAINING_FILE_NAME) and os.path.exists(VALIDATION_FILE_NAME)
        if dumps_available and not force_reload:
            # by default load the dataset from a local dump
            self.training_set = load(TRAINING_FILE_NAME)
            self.validation_set = load(VALIDATION_FILE_NAME)
            return

        # load the project dataset
        dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset', token=userdata.get('HUGGINGFACE_TOKEN'))
        # a factory object is used to create our entities
        factory = EntityFactory(Client())

        self.training_set = create_set(dataset['train'], factory, training_limit, TRAINING_FILE_NAME)
        self.validation_set = create_set(dataset['validation'], factory, validation_limit, VALIDATION_FILE_NAME)
        dump(TRAINING_FILE_NAME, self.training_set)
        dump(VALIDATION_FILE_NAME, self.validation_set)

    def __str__(self):
        return f"training: {len(self.training_set)}. validation: {len(self.validation_set)}"

In [6]:
# Build the base training/validation entities (or load them from the local dumps).
nlp_dataset = NLPDataset()

loading training.bin
loading validation.bin


In [7]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->torchtext==0.6.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->torchtext==0.6.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->torchtext==0.6.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->torchtext==0.6.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->torchtext==0.6.0)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->torc

In [8]:
import collections, nltk, string
import pandas as pd
from torchtext.vocab import GloVe
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import IterableDataset

In [9]:
def type_vector(base_type):
    """Encode the item type as a one-element float32 vector.

    :param base_type: the type of the item
    :return: array([1.]) when the type is 'entity', array([0.]) otherwise
    """
    flag = 1 if base_type == 'entity' else 0
    return np.array([flag], dtype=np.float32)

def label_to_number(label):
    """Convert a textual label into its numeric class id.

    :param label: 'cultural agnostic', 'cultural representative' or
        'cultural exclusive'
    :return: 0, 1 or 2 respectively
    :raises ValueError: for any other value, including non-string ones
    """
    class_ids = {
        'cultural agnostic': 0,
        'cultural representative': 1,
        'cultural exclusive': 2,
    }
    try:
        return class_ids[label]
    except (KeyError, TypeError):
        # TypeError covers unhashable values; the message is formatted with
        # repr so that non-string labels are reported instead of crashing
        raise ValueError(f'label not supported: {label!r}') from None

class ProcessedEntity:
    """An entity after text processing, holding the inputs of the feature vectors.

    The ``*_vector`` attributes that depend on the dictionaries start as None
    and are filled in later, once the dictionaries have been built.

    :param base: the source entity
    :param desc_text: tokens of the processed description
    :param wiki_text: tokens of the processed Wikipedia text
    """

    def __init__(self, base: Entity, desc_text, wiki_text):
        self.base_entity = base.entity_id + ": " + base.name
        # processed fields
        self.desc_text = desc_text
        self.wiki_text = wiki_text
        self.labels_text = base.labels
        self.descriptions_text = base.descriptions
        self.aliases_text = base.aliases
        self.pages_text = base.wikipedia_pages
        # "map" is used to denote a Python dictionary,
        # because "dictionary" already denotes a word (term) dictionary
        self.claims_map = base.claims
        self.subcategory = base.subcategory
        self.category = base.category

        # built later (when the dictionaries are finalized);
        # the subcategory vector includes implicitly the category,
        # since the subcategory values have been ordered by category
        for pending in ('desc_vector', 'wiki_vector', 'labels_vector', 'descriptions_vector',
                        'aliases_vector', 'pages_vector', 'claims_vector', 'subcategory_vector'):
            setattr(self, pending, None)
        # in this case we can assume that we have only two types (entity vs concept)
        self.type_vector = type_vector(base.type)
        self.desc_glove_vector = None
        self.output_label = label_to_number(base.label)

    def __str__(self):
        return self.base_entity + f" < {len(self.desc_text)}, {len(self.wiki_text)} >"

    def dataset_item(self):
        """Return the model inputs, the numeric label and the entity name as a dict."""
        return dict(
            desc=self.desc_vector,
            wiki=self.wiki_vector,
            labels=self.labels_vector,
            descriptions=self.descriptions_vector,
            aliases=self.aliases_vector,
            pages=self.pages_vector,
            claims=self.claims_vector,
            category=self.subcategory_vector,
            type=self.type_vector,
            desc_glove=self.desc_glove_vector,
            output_label=self.output_label,
            base=self.base_entity,
        )

class CategoryTable:
    """Assigns a numeric id to every subcategory, with ids ordered by category.

    Subcategories are collected with include(); build() then sorts them by
    category and numbers them from 0.
    """

    def __init__(self):
        self.subcategories_entered = {}  # to avoid duplicates
        self.subcategories = []
        self.categories = []
        self.subcategory_to_id = None  # computed on build

    def include(self, processed_entity: ProcessedEntity):
        """Register the (subcategory, category) pair of the entity, once per subcategory."""
        subcategory = processed_entity.subcategory
        if subcategory not in self.subcategories_entered:
            self.subcategories_entered[subcategory] = True
            self.subcategories.append(subcategory)
            self.categories.append(processed_entity.category)

    def build(self):
        """Sort the collected subcategories by category, print the table and number them."""
        df = pd.DataFrame({
            'subcategory': self.subcategories,
            'category': self.categories
        }).sort_values('category')
        print(df.to_markdown())
        # the id is the position of the subcategory in the sorted table
        self.subcategory_to_id = {name: position for position, name in enumerate(df['subcategory'])}

    def subcat_to_vector(self, subcategory):
        """Return the id of the subcategory as a one-element float32 vector."""
        return np.array([self.subcategory_to_id[subcategory]], dtype=np.float32)

    def length(self):
        """Return the number of known subcategories (available after build)."""
        return len(self.subcategory_to_id)

class Dictionary:
    """Vocabulary that maps words to vector positions.

    Usage: call include() with every token list, then build() or
    build_no_limits() once, then convert token lists or frequency maps into
    fixed-size vectors with words_to_vector() / map_to_vector().
    """

    def __init__(self):
        self.occurrences = []
        self.unk_token = None
        self.word_to_id = None

    def include(self, tokenized_text):
        """Record the tokens of one text (only valid before the dictionary is built)."""
        self.occurrences.extend(tokenized_text)

    def build(self, max_vocab_size, unk_token):
        """Keep the ``max_vocab_size - 1`` most common words plus ``unk_token``.

        :param max_vocab_size: upper bound for the vocabulary size, unknown
            token included
        :param unk_token: token reserved for out of vocabulary words; it must
            not be among the kept words
        """
        self.unk_token = unk_token
        counter = collections.Counter(self.occurrences)
        self.word_to_id = {key: index for index, (key, _) in enumerate(counter.most_common(max_vocab_size - 1))}
        assert unk_token not in self.word_to_id
        # Use the next free id: it equals max_vocab_size - 1 when the vocabulary
        # is full, and stays within length() when fewer words were seen.
        self.word_to_id[unk_token] = len(self.word_to_id)
        self.occurrences = None # free memory space

    def build_no_limits(self):
        """Keep every recorded word, most common first, without an unknown token."""
        counter = collections.Counter(self.occurrences)
        self.word_to_id = {key: index for index, (key, _) in enumerate(counter.most_common())}
        self.occurrences = None # free memory space

    def length(self):
        """Return the vocabulary size, which is also the size of the produced vectors."""
        return len(self.word_to_id)

    def words_to_vector(self, words):
        """Return the float32 vector counting the occurrences of each known word.

        Words that are not in the vocabulary, and the unknown token itself,
        are ignored.
        """
        vector = np.zeros(self.length(), dtype=np.float32)
        for word in words:
            if word == self.unk_token:
                continue
            position = self.word_to_id.get(word)
            if position is not None:
                vector[position] += 1
        return vector

    # Using map to denote a Python dictionary,
    # since the dictionary is already use for a word (term) dictionary
    def map_to_vector(self, dictionary):
        """Return the float32 vector holding the given frequency of each known word.

        :param dictionary: mapping from word to its frequency; words that are
            not in the vocabulary, and the unknown token itself, are ignored
        """
        vector = np.zeros(self.length(), dtype=np.float32)
        for word, freq in dictionary.items():
            if word == self.unk_token:
                continue
            position = self.word_to_id.get(word)
            if position is not None:
                vector[position] = freq
        return vector

# Filler token appended by GloveProcessing.tokens() to token lists shorter than the context size.
PAD_TOKEN = '<PAD>'

class GloveProcessing:
    """Turns a token list into a single flat vector of GloVe embeddings.

    :param context_size: the number of tokens that are embedded for each text
    """

    def __init__(self, context_size):
        self.glove = GloVe(name='6B', dim=100)
        self.context_size = context_size

    def words_to_vect(self, words):
        """Embed the first ``context_size`` tokens and flatten the result to one dimension."""
        fixed_size_tokens = self.tokens(words)
        embeddings = self.glove.get_vecs_by_tokens(fixed_size_tokens)
        return embeddings.view(-1)

    def tokens(self, words):
        """Truncate or pad ``words`` so that exactly ``context_size`` tokens are returned."""
        kept = words[:self.context_size]
        # a non-positive count produces no padding at all
        padding = [PAD_TOKEN] * (self.context_size - len(words))
        return kept + padding

UNK = '<UNK>' # the token to be used for out of vocabulary words
# maximum size (unknown token included) of the description vocabulary
DESC_VOCAB_SIZE = 4_000
# maximum size (unknown token included) of the Wikipedia text vocabulary
WIKI_VOCAB_SIZE = 10_000
# NOTE: despite the name, this value is passed to GloveProcessing as context_size,
# i.e. the number of description tokens that are embedded, not the embedding dimension
GLOVE_EMBEDDING_SIZE = 20

class Dictionaries:
    """All the vocabularies and tables needed to vectorize a ProcessedEntity.

    Every entity is first registered with include(); build() then freezes the
    vocabularies, and finalize() fills the vectors of an entity.
    """

    def __init__(self):
        # descriptions and wiki text words are in 2 different vector spaces
        self.desc = Dictionary()
        self.wiki = Dictionary()
        # the same language-keys dictionary is shared by:
        # labels_text, descriptions_text, aliases_text, pages_text
        self.languages = Dictionary()
        self.claims = Dictionary()
        self.category_table = CategoryTable()
        # extra: glove embeddings of the description
        self.glove_desc = GloveProcessing(GLOVE_EMBEDDING_SIZE)

    def include(self, processed_entity: ProcessedEntity):
        """Register the words, language keys, claim ids and subcategory of the entity."""
        self.desc.include(processed_entity.desc_text)
        self.wiki.include(processed_entity.wiki_text)
        language_fields = (
            processed_entity.labels_text,
            processed_entity.descriptions_text,
            processed_entity.aliases_text,
            processed_entity.pages_text,
        )
        for language_keys in language_fields:
            self.languages.include(language_keys)
        self.claims.include(list(processed_entity.claims_map))
        self.category_table.include(processed_entity)

    def build(self):
        """Freeze all the vocabularies; to be called after every entity has been included."""
        self.desc.build(DESC_VOCAB_SIZE, UNK)
        self.wiki.build(WIKI_VOCAB_SIZE, UNK)
        # claims and languages are not too large: so they are not limited
        self.claims.build_no_limits()
        self.languages.build_no_limits()
        self.category_table.build()

    def finalize(self, processed_entity: ProcessedEntity):
        """Compute and store every feature vector of the entity; requires build()."""
        language_vector = self.languages.words_to_vector
        processed_entity.desc_vector = self.desc.words_to_vector(processed_entity.desc_text)
        processed_entity.wiki_vector = self.wiki.words_to_vector(processed_entity.wiki_text)
        processed_entity.labels_vector = language_vector(processed_entity.labels_text)
        processed_entity.descriptions_vector = language_vector(processed_entity.descriptions_text)
        processed_entity.aliases_vector = language_vector(processed_entity.aliases_text)
        processed_entity.pages_vector = language_vector(processed_entity.pages_text)
        processed_entity.claims_vector = self.claims.map_to_vector(processed_entity.claims_map)
        processed_entity.subcategory_vector = self.category_table.subcat_to_vector(processed_entity.subcategory)
        processed_entity.desc_glove_vector = self.glove_desc.words_to_vect(processed_entity.desc_text)

class IterableEntities(IterableDataset):
    """Iterable dataset yielding the dataset item of each processed entity, in order."""

    def __init__(self, processed_entities: list[ProcessedEntity]):
        self.processed_entities = processed_entities

    def __iter__(self):
        yield from (entity.dataset_item() for entity in self.processed_entities)

    def __len__(self):
        return len(self.processed_entities)

# Local pickle dumps of the processed training and validation entities,
# written and read by ProcessedDataset.
TRAINING_PROC_FILE_NAME = "training-proc.bin"
VALIDATION_PROC_FILE_NAME = "validation-proc.bin"

def text_process(text, stop):
    """Lower-case, tokenize and lemmatize ``text``, dropping the tokens in ``stop``.

    :param text: the raw text, or None
    :param stop: container of the tokens to discard (checked before lemmatization)
    :return: list of lemmatized tokens; empty when ``text`` is None
    """
    result = []
    if text is None:
        return result
    # one lemmatizer serves the whole text: creating it per token is wasted work
    lemmatizer = WordNetLemmatizer()
    for sentence in nltk.sent_tokenize(text.lower()):
        result.extend(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(sentence) if token not in stop)
    return result

def create_processed(entity, dictionaries, stop):
    """Process the texts of ``entity`` and register the result in ``dictionaries``.

    :param entity: the source entity, providing ``description`` and ``wiki_text``
    :param dictionaries: object whose ``include`` collects the processed entity
    :param stop: tokens to discard during text processing
    :return: the new ProcessedEntity
    """
    desc_tokens = text_process(entity.description, stop)
    wiki_tokens = text_process(entity.wiki_text, stop)
    processed = ProcessedEntity(entity, desc_tokens, wiki_tokens)
    dictionaries.include(processed)
    return processed

class ProcessedDataset(NLPDataset):
    """NLPDataset extended with the processed (vectorized) entities.

    The processed sets are loaded from the local dump files when both exist
    and ``force_reload`` is False; otherwise they are computed by processing()
    and dumped.
    """

    def __init__(self, training_limit=None, validation_limit=None, force_reload=False):
        super().__init__(training_limit, validation_limit, force_reload)
        dumps_available = os.path.exists(TRAINING_PROC_FILE_NAME) and os.path.exists(VALIDATION_PROC_FILE_NAME)
        if dumps_available and not force_reload:
            # by default load the dataset from a local dump
            self.processed_training_set = load(TRAINING_PROC_FILE_NAME)
            self.processed_validation_set = load(VALIDATION_PROC_FILE_NAME)
            return
        self.processed_training_set, self.processed_validation_set = self.processing()
        dump(TRAINING_PROC_FILE_NAME, self.processed_training_set)
        dump(VALIDATION_PROC_FILE_NAME, self.processed_validation_set)

    def processing(self):
        """Tokenize every entity, build the dictionaries and compute all the vectors.

        Both the training and the validation entities contribute to the
        dictionaries, which are built only after all the texts are processed.

        :return: the pair (processed training set, processed validation set)
        """
        for resource in ('stopwords', 'wordnet', 'punkt_tab'):
            nltk.download(resource)
        stop = set(stopwords.words('english') + list(string.punctuation) + ['==', "''", '``', "'s", '==='])
        print("processing the data")
        dictionaries = Dictionaries()

        def process_split(split_name, entities):
            # from the base data, create the list of processed entities of one split
            print(split_name, "set text processing started")
            processed = []
            for count, entity in enumerate(entities, start=1):
                processed.append(create_processed(entity, dictionaries, stop))
                if count % 100 == 0:
                    print(split_name, "set processed", count, "entities")
            print(split_name, "set text processing ended")
            return processed

        processed_training_set = process_split("training", self.training_set)
        processed_validation_set = process_split("validation", self.validation_set)
        print("building dictionaries")
        # when we've collected all the words for the two spaces, we can build them
        dictionaries.build()
        print("text to vector started")
        # build the vectors from the texts
        for processed_split in (processed_training_set, processed_validation_set):
            for entity in processed_split:
                dictionaries.finalize(entity)
        print("text to vector finished")
        return processed_training_set, processed_validation_set

    def training(self):
        """Return the processed training set wrapped as an iterable dataset."""
        return IterableEntities(self.processed_training_set)

    def validation(self):
        """Return the processed validation set wrapped as an iterable dataset."""
        return IterableEntities(self.processed_validation_set)

In [10]:
# Build the processed (vectorized) entities, or load them from the local dumps.
processed_dataset = ProcessedDataset()

loading training.bin
loading validation.bin


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


processing the data


.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:25<00:00, 15747.93it/s]


training set text processing started
training set processed 100 entities
training set processed 200 entities
training set processed 300 entities
training set processed 400 entities
training set processed 500 entities
training set processed 600 entities
training set processed 700 entities
training set processed 800 entities
training set processed 900 entities
training set processed 1000 entities
training set processed 1100 entities
training set processed 1200 entities
training set processed 1300 entities
training set processed 1400 entities
training set processed 1500 entities
training set processed 1600 entities
training set processed 1700 entities
training set processed 1800 entities
training set processed 1900 entities
training set processed 2000 entities
training set processed 2100 entities
training set processed 2200 entities
training set processed 2300 entities
training set processed 2400 entities
training set processed 2500 entities
training set processed 2600 entities
training s

In [11]:
import torch, csv
from torch import nn
from torch.utils.data import DataLoader
from huggingface_hub import PyTorchModelHubMixin

In [12]:
def rescale_vector_layer(params):
    """Create the input branch used to rescale one frequency vector field.

    :param params: pair (in_features, out_features) of the linear projection
    :return: a linear layer followed by its own ReLU
    """
    source_size, target_size = params
    stages = [nn.Linear(source_size, target_size), nn.ReLU()]
    return nn.Sequential(*stages)

class MultiModalModel(nn.Module, PyTorchModelHubMixin,
                      repo_url="fax4ever/culturalitems-no-transformer",
                      pipeline_tag="text-classification",
                      license="apache-2.0"):
    """Classifier with one input branch per feature, merged by a common head.

    Every feature of a dataset item goes through its own projection; the
    projected features are concatenated and classified into 3 classes.

    :param params: configuration mapping; the frequency-vector and glove keys
        hold (in_features, out_features) pairs, the other keys hold layer sizes
        and the dropout rate
    :param device: the device hosting the layers and the inputs
    """

    def __init__(self, params, device) -> None:
        super().__init__()
        self.device = device
        # individual input layers for frequency vectors
        for field in ('desc', 'wiki', 'labels', 'descriptions', 'aliases', 'pages', 'claims'):
            setattr(self, field, rescale_vector_layer(params[field]).to(device))
        # individual input layers for scalar value
        self.category = nn.Linear(params['category_dim'], params['category_scale']).to(device)
        self.type_proj = nn.Linear(params['type_dim'], params['type_scale']).to(device)
        # glove
        self.desc_glove = rescale_vector_layer(params['desc_glove']).to(device)
        # common classifier
        head = [
            nn.Linear(params['total_scale'], params['hidden_layers']),
            nn.ReLU(),
            nn.Dropout(params['dropout']),
            nn.Linear(params['hidden_layers'], 3),
        ]
        self.classifier = nn.Sequential(*head).to(device)

    def forward(self, dataset_items):
        """Return the class scores for a batch of dataset items.

        :param dataset_items: mapping from feature name to its batched tensor
        :return: the output of the common classifier, 3 scores per item
        """
        def project(layer, key):
            return layer(dataset_items[key].to(self.device))

        # the order of the branches defines the layout of the classifier input
        branches = (
            (self.desc, 'desc'),
            (self.desc_glove, 'desc_glove'),
            (self.wiki, 'wiki'),
            (self.labels, 'labels'),
            (self.descriptions, 'descriptions'),
            (self.aliases, 'aliases'),
            (self.pages, 'pages'),
            (self.claims, 'claims'),
            (self.category, 'category'),
            (self.type_proj, 'type'),
        )
        combined = torch.cat([project(layer, key) for layer, key in branches], dim=1)
        return self.classifier(combined)

In [13]:
def number_to_label(label):
    """Convert a numeric class id into its textual label.

    :param label: 0, 1 or 2 (any value comparing equal to them is accepted)
    :return: 'cultural agnostic', 'cultural representative' or
        'cultural exclusive' respectively
    :raises ValueError: for any other value
    """
    names = ('cultural agnostic', 'cultural representative', 'cultural exclusive')
    for number, name in enumerate(names):
        if label == number:
            return name
    # the value is formatted, not concatenated: adding a number to a string
    # would raise TypeError instead of the intended ValueError
    raise ValueError(f'label not supported: {label!r}')

# Load the published weights and switch explicitly to inference mode:
# the classifier contains an nn.Dropout layer, which must be disabled
# to get deterministic predictions.
model = MultiModalModel.from_pretrained("fax4ever/culturalitems-no-transformer")
model.eval()
matching = 0

# Write one CSV row per validation entity and count the correct predictions.
with open('no-transformer-inference.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["entity", "true label", "prediction", "correct"])
    with torch.no_grad():
        validation = ProcessedDataset().validation()
        # no batch size is given to the DataLoader, so element [0] of each
        # batched field is read below
        for entity in DataLoader(validation):
            # move the result to the CPU before converting it to numpy,
            # so that this also works when the model runs on another device
            prediction = model(entity).detach().argmax(dim=1).cpu().numpy()[0]
            true_label = entity['output_label'].numpy()[0]
            is_match = prediction == true_label
            if is_match:
                matching = matching + 1
            base_ = entity['base'][0]
            writer.writerow([base_, number_to_label(true_label), number_to_label(prediction), is_match])

total = len(validation)
# avoid a division by zero on an empty validation set
accuracy = matching / total if total else 0.0
print('matched', matching, 'on', total, '(', accuracy, ')')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/9.43M [00:00<?, ?B/s]

loading training.bin
loading validation.bin
loading training-proc.bin
loading validation-proc.bin
matched 222 on 300 ( 0.74 )
