<a href="https://colab.research.google.com/github/ericyoc/huggingface-model-metadata-blockchain-poc/blob/main/huggingface_model_metadata_blockchain_poc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import random
import csv
import hashlib
import datetime
import json
from prettytable import PrettyTable
from transformers import AutoModel, AutoTokenizer, AutoConfig

In [2]:
# Hugging Face Hub endpoint queried for the JSON model listing from which
# the random sample of models is drawn.
MODEL_LIST_URL = "https://huggingface.co/api/models"

In [3]:
class Block:
    """A single chain entry: a payload plus the hash link to its predecessor.

    The SHA-256 digest is computed once, during construction, and stored in
    ``hash``. It covers the string forms of ``index``, ``timestamp``, ``data``
    and ``previous_hash``, in that order and with no separator between them.
    """

    def __init__(self, index, timestamp, data, previous_hash):
        self.index = index
        self.timestamp = timestamp
        self.data = data
        self.previous_hash = previous_hash
        # Must come last: the digest reads the four attributes set above.
        self.hash = self.calculate_hash()

    def calculate_hash(self):
        """Return the hexadecimal SHA-256 digest over this block's fields."""
        digest = hashlib.sha256()
        # Feeding the fields one by one is equivalent to hashing their
        # concatenation.
        for field in (self.index, self.timestamp, self.data, self.previous_hash):
            digest.update(str(field).encode('utf-8'))
        return digest.hexdigest()

class Blockchain:
    """An in-memory list of ``Block`` objects that starts with a genesis block."""

    def __init__(self):
        self.chain = [self.create_genesis_block()]

    def create_genesis_block(self):
        """Build block 0, timestamped with ``datetime.datetime.now()``."""
        return Block(0, datetime.datetime.now(), "Genesis Block", "0")

    def add_block(self, data):
        """Append a new block holding ``data``, linked to the current last block."""
        tail = self.chain[-1]
        self.chain.append(
            Block(len(self.chain), datetime.datetime.now(), data, tail.hash)
        )

    def print_chain(self):
        """Print every block's fields to stdout, one blank line after each block."""
        for entry in self.chain:
            labelled_fields = (
                ("Block #", entry.index),
                ("Timestamp:", entry.timestamp),
                ("Data:", entry.data),
                ("Previous Hash:", entry.previous_hash),
                ("Hash:", entry.hash),
            )
            for label, value in labelled_fields:
                print(label, value)
            print()

    def serialize_chain(self):
        """Return the chain as a list of plain dicts, one per block.

        The timestamp is converted with ``isoformat()``; all other fields are
        passed through unchanged.
        """
        return [
            {
                "index": entry.index,
                "timestamp": entry.timestamp.isoformat(),
                "data": entry.data,
                "previous_hash": entry.previous_hash,
                "hash": entry.hash
            }
            for entry in self.chain
        ]

In [4]:

def get_random_models(n=10):
    """Return up to ``n`` randomly chosen entries from the model listing.

    Args:
        n: Maximum number of entries to sample. If the listing contains fewer
            than ``n`` entries, all of them are returned (in random order)
            instead of raising ``ValueError``.

    Returns:
        A list of entries as decoded from the JSON response of
        ``MODEL_LIST_URL``; an empty list if the request fails, times out, or
        answers with a non-200 status.
    """
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(MODEL_LIST_URL, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch model list: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch model list. Status code: {response.status_code}")
        return []

    models = response.json()
    # random.sample raises ValueError when asked for more items than exist
    # (or for a negative count), so clamp the sample size to a valid range.
    sample_size = max(0, min(n, len(models)))
    return random.sample(models, sample_size)

In [5]:
def load_model_and_tokenizer(model_name):
    """Load the tokenizer and then the model for ``model_name``.

    Both are obtained through ``from_pretrained`` on the transformers
    ``AutoTokenizer`` / ``AutoModel`` classes; the tokenizer is loaded first.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_tokenizer, loaded_model = [
        loader.from_pretrained(model_name)
        for loader in (AutoTokenizer, AutoModel)
    ]
    return loaded_model, loaded_tokenizer

In [6]:
def get_model_config(model_name):
    """Return the configuration object ``AutoConfig.from_pretrained`` yields for ``model_name``."""
    loaded_config = AutoConfig.from_pretrained(model_name)
    return loaded_config

In [7]:
def get_model_parameters(model):
    """Collect ``model.named_parameters()`` into a list of ``(name, param)`` tuples."""
    collected = []
    for param_name, param_value in model.named_parameters():
        collected.append((param_name, param_value))
    return collected

In [8]:
def get_tokenizer_info(tokenizer):
    """Summarise a tokenizer as a two-key dict.

    Returns:
        ``{"vocab_size": tokenizer.vocab_size,
        "special_tokens": tokenizer.special_tokens_map}``.
    """
    return dict(
        vocab_size=tokenizer.vocab_size,
        special_tokens=tokenizer.special_tokens_map,
    )

In [9]:
def fetch_model_metadata(model_name):
    """Fetch the metadata record for ``model_name`` from the Hugging Face API.

    Args:
        model_name: Model identifier appended to
            ``https://huggingface.co/api/models/``.

    Returns:
        The decoded JSON response on HTTP 200; ``None`` if the request fails,
        times out, or answers with any other status. A message is printed in
        every failure case.
    """
    url = f"https://huggingface.co/api/models/{model_name}"
    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch model information for {model_name}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch model information for {model_name}. Status code: {response.status_code}")
        return None

    return response.json()

In [10]:
def classify_model(model_metadata):
    """Map a model's tags to a coarse category name.

    Args:
        model_metadata: Mapping that may carry a ``"tags"`` list, or ``None``
            (the value ``fetch_model_metadata`` returns on failure).

    Returns:
        The first category, in the order listed below, that has at least one
        of its tags among the model's tags; ``"other"`` when nothing matches,
        when the metadata is ``None``/empty, or when ``"tags"`` is missing or
        null.
    """
    # Tolerate missing metadata and an explicit null "tags" value instead of
    # raising AttributeError / TypeError.
    tags = (model_metadata or {}).get("tags") or []

    # Order matters: the first matching category wins.
    categories = {
        "multimodal": ["multimodal"],
        "computer vision": ["image-classification", "image-segmentation", "object-detection", "vision"],
        "natural language processing": ["text-classification", "token-classification", "question-answering", "text-generation", "translation", "nlp"],
        "audio": ["audio-classification", "speech-recognition", "audio"],
        "tabular": ["tabular-classification", "tabular"],
        "reinforcement learning": ["reinforcement-learning"],
        "other": ["graph-machine-learning", "graph"]
    }

    for category, category_tags in categories.items():
        if any(tag in tags for tag in category_tags):
            return category

    return "other"

In [11]:
def create_final_table():
    """Create an empty ``PrettyTable`` whose columns are the seven metadata fields."""
    column_titles = (
        "Model Name",
        "Architecture",
        "Vocabulary Size",
        "Special Tokens",
        "Model Category",
        "Model Type",
        "Last Modified",
    )
    summary_table = PrettyTable()
    summary_table.field_names = list(column_titles)
    return summary_table

In [12]:
def add_model_to_table(table, config, tokenizer_info, model_category, model_name, model_type, last_modified):
    """Append one model's metadata to ``table`` as a single row.

    The row follows the column order name, architecture, vocabulary size,
    special tokens, category, type, last-modified. The architecture is the
    first entry of ``config.architectures``, or ``"N/A"`` when that attribute
    is empty or ``None``.
    """
    declared_architectures = config.architectures
    if declared_architectures:
        architecture = declared_architectures[0]
    else:
        architecture = "N/A"

    row = [model_name, architecture]
    row.append(tokenizer_info["vocab_size"])
    row.append(tokenizer_info["special_tokens"])
    row.extend([model_category, model_type, last_modified])
    table.add_row(row)

In [13]:
def save_table_to_csv(table, file_name):
    """Write the table's header and rows to ``file_name`` as UTF-8 CSV.

    Args:
        table: Object exposing ``field_names`` (the header row) and ``_rows``
            (the data rows).
        file_name: Destination path; an existing file is overwritten.
    """
    # An explicit UTF-8 encoding keeps non-ASCII cell values (e.g. special
    # tokens) from raising UnicodeEncodeError where the platform default
    # encoding is not UTF-8. newline='' lets the csv module control line
    # endings.
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(table.field_names)
        # NOTE(review): `_rows` is an underscore-prefixed attribute of the
        # table object; confirm it stays available across prettytable versions.
        writer.writerows(table._rows)

In [14]:
def save_blockchain_to_file(blockchain, file_name):
    """Dump ``blockchain.serialize_chain()`` to ``file_name`` as JSON indented by 4 spaces.

    The chain is serialised before the file is opened, so a failure in
    ``serialize_chain`` does not touch the destination file.
    """
    chain_records = blockchain.serialize_chain()
    with open(file_name, "w") as json_out:
        json.dump(chain_records, json_out, indent=4)

In [15]:
def main():
    """Sample models, collect their metadata, and record it in a table and a blockchain.

    For each sampled model the function loads the model/tokenizer and config,
    fetches the API metadata, classifies the model, then adds one row to the
    table and one block to the chain. Models that fail at any step are
    reported and skipped. Finally the table is printed and saved to
    ``models_metadata.csv``, and the chain is saved to
    ``blockchain_data.json`` and printed.
    """
    # Get random models
    random_models = get_random_models(10)
    final_table = create_final_table()
    blockchain = Blockchain()

    for model_info in random_models:
        model_name = model_info['modelId']

        try:
            # Load model and tokenizer. The model object itself is not used
            # below; only the tokenizer is.
            _model, tokenizer = load_model_and_tokenizer(model_name)

            # Get model configuration
            config = get_model_config(model_name)

            # Get tokenizer information
            tokenizer_info = get_tokenizer_info(tokenizer)

            # Fetch additional metadata from Hugging Face API
            model_metadata = fetch_model_metadata(model_name)
            if model_metadata is None:
                # fetch_model_metadata signals failure with None; skip
                # explicitly instead of failing later on `None.get(...)`.
                print(f"Skipping model {model_name}: metadata could not be fetched")
                continue

            # Classify the model
            model_category = classify_model(model_metadata)

            # Get model type and last modified date
            model_type = model_metadata.get("pipeline_tag", "N/A")
            last_modified = model_metadata.get("lastModified", "N/A")

            # Add model information to the final table
            add_model_to_table(
                final_table, config, tokenizer_info,
                model_category, model_name, model_type, last_modified
            )

            # Add model information to the blockchain
            architecture = config.architectures[0] if config.architectures else "N/A"
            blockchain.add_block({
                "Model Name": model_name,
                "Architecture": architecture,
                "Vocabulary Size": tokenizer_info["vocab_size"],
                "Special Tokens": tokenizer_info["special_tokens"],
                "Model Category": model_category,
                "Model Type": model_type,
                "Last Modified": last_modified
            })

        except Exception as e:
            # Deliberate best-effort: one unloadable model must not abort the
            # run, so report it and move on to the next one.
            print(f"Error processing model {model_name}: {e}")

    # Print the final table
    print(final_table)

    # Save table to CSV
    save_table_to_csv(final_table, "models_metadata.csv")
    print("Table saved to models_metadata.csv")

    # Save the blockchain to a file
    save_blockchain_to_file(blockchain, "blockchain_data.json")
    print("Blockchain data saved to blockchain_data.json")

    # Print the blockchain
    print("\nBlockchain:")
    blockchain.print_chain()

In [16]:
# Entry point: run the pipeline only when this code executes as the main
# module (a script or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Error processing model Andy1621/uniformer: Andy1621/uniformer does not appear to have a file named config.json. Checkout 'https://huggingface.co/Andy1621/uniformer/main' for available files.
Error processing model Aleksandar/bert-srb-ner-setimes-lr: Aleksandar/bert-srb-ner-setimes-lr does not appear to have a file named config.json. Checkout 'https://huggingface.co/Aleksandar/bert-srb-ner-setimes-lr/main' for available files.
Error processing model AdapterHub/roberta-base-pf-duorc_p: AdapterHub/roberta-base-pf-duorc_p does not appear to have a file named config.json. Checkout 'https://huggingface.co/AdapterHub/roberta-base-pf-duorc_p/main' for available files.
Error processing model AJ/rick-sanchez-bot: AJ/rick-sanchez-bot does not appear to have a file named config.json. Checkout 'https://huggingface.co/AJ/rick-sanchez-bot/main' for available files.


tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/53.9M [00:00<?, ?B/s]

Some weights of AlbertModel were not initialized from the model checkpoint at Akari/albert-base-v2-finetuned-squad and are newly initialized: ['albert.pooler.bias', 'albert.pooler.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error processing model ArcQ/gpt-experiments: ArcQ/gpt-experiments does not appear to have a file named config.json. Checkout 'https://huggingface.co/ArcQ/gpt-experiments/main' for available files.
Error processing model Aviora/news2vec: Aviora/news2vec does not appear to have a file named config.json. Checkout 'https://huggingface.co/Aviora/news2vec/main' for available files.


tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

+------------------------------------------------------------------+-------------------------------+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------