In [28]:
import os
import time
from datetime import datetime, timezone, timedelta
import pandas as pd
from huggingface_hub import HfApi, HfFolder


# Constants
MIN_DOWNLOADS = 100
SINCE_DATE = datetime.now(timezone.utc) - timedelta(days=365*3)  # Collect models created in the last year
OUTPUT_CSV = "hf_models.csv"
OUTPUT_DIR = "model_files"  # Directory to save model files

# Authentication (optional)
ACCESS_TOKEN = os.getenv("HF_ACCESS_TOKEN")  # Set your Hugging Face API token as an environment variable

# Initialize API
api = HfApi()

# If you have an access token, save it
if ACCESS_TOKEN:
    HfFolder.save_token(ACCESS_TOKEN)

# Function to collect model data
def collect_model_data():
    models_data = []
    fetch_limit = 10000  # Maximum number of models to fetch
    sort_order = "downloads"  # You can sort by 'lastModified', 'downloads', 'stars', etc.

    # Note: As of my knowledge cutoff in 2021, the Hugging Face Hub API does not support filtering by creation date directly.
    # We'll fetch models and filter them manually.

    # Search models
    print("Fetching models from Hugging Face Hub...")
    models = api.list_models(
        sort=sort_order,
        direction=-1,  # Descending order
        limit=fetch_limit,
        use_auth_token=ACCESS_TOKEN,
        full=True,  # Fetch full metadata
        
    )
    from tqdm import tqdm
    for model in tqdm(models):
        # Convert timestamp strings to datetime objects
        if hasattr(model, 'lastModified'):
            last_modified = model.lastModified
        else:
            last_modified = datetime.now(tz=model.lastModified.tzinfo)

        # Filter models created since the specified date
        if last_modified >= SINCE_DATE:
            # Get the model's downloads (may require an authenticated request)
            downloads = model.downloads if hasattr(model, 'downloads') else 0

            if downloads >= MIN_DOWNLOADS:
                model_data = {
                    "modelId": model.modelId,
                    "modelName": model.modelId.split("/")[-1],
                    "author": model.modelId.split("/")[0] if "/" in model.modelId else None,
                    "downloads": downloads,
                    "lastModified": model.lastModified,
                    "tags": model.tags,
                    "pipeline_tag": model.pipeline_tag,
                    "sha": model.sha,
                    "private": model.private,
                    "inference": model.inference,

                }
                models_data.append(model_data)

        # Respect rate limits
        time.sleep(0.1)

    print(f"Total models collected: {len(models_data)}")
    return pd.DataFrame(models_data)

# Optional: Function to download model files
# def download_model_files(model_ids):
#     if not os.path.exists(OUTPUT_DIR):
#         os.makedirs(OUTPUT_DIR)

#     for model_id in model_ids:
#         print(f"Downloading files for model: {model_id}")
#         files = api.list_repo_files(repo_id=model_id, use_auth_token=ACCESS_TOKEN)

#         # Create a directory for each model
#         model_dir = os.path.join(OUTPUT_DIR, model_id.replace("/", "_"))
#         if not os.path.exists(model_dir):
#             os.makedirs(model_dir)

#         for file_name in files:
#             # Download each file
#             file_path = os.path.join(model_dir, file_name)
#             api.download_file(
#                 repo_id=model_id,
#                 filename=file_name,
#                 local_dir=model_dir,
#                 use_auth_token=ACCESS_TOKEN,
#             )

#         # Respect rate limits
#         time.sleep(0.5)

df = collect_model_data()

df.to_csv(OUTPUT_CSV, index=False)

Fetching models from Hugging Face Hub...


1427it [02:27,  9.83it/s]

In [26]:
df

Unnamed: 0,modelId,modelName,author,downloads,lastModified,tags,pipeline_tag,sha,private,inference
0,MIT/ast-finetuned-audioset-10-10-0.4593,ast-finetuned-audioset-10-10-0.4593,MIT,189285567,2023-09-06 14:49:15+00:00,"[transformers, pytorch, safetensors, audio-spe...",audio-classification,f826b80d28226b62986cc218e5cec390b1096902,False,cold
1,google-bert/bert-base-uncased,bert-base-uncased,google-bert,61903514,2024-02-19 11:06:12+00:00,"[transformers, pytorch, tf, jax, rust, coreml,...",fill-mask,86b5e0934494bd15c9632b12f734a8a67f723594,False,loading
2,sentence-transformers/all-MiniLM-L6-v2,all-MiniLM-L6-v2,sentence-transformers,46030976,2024-05-29 14:43:28+00:00,"[sentence-transformers, pytorch, tf, rust, onn...",sentence-similarity,8b3219a92973c328a8e22fadcfa821b5dc75636a,False,warm
3,google/vit-base-patch16-224-in21k,vit-base-patch16-224-in21k,google,35397121,2024-02-05 16:37:39+00:00,"[transformers, pytorch, tf, jax, safetensors, ...",image-feature-extraction,b4569560a39a0f1af58e3ddaf17facf20ab919b0,False,explicit-opt-out
4,openai/clip-vit-large-patch14,clip-vit-large-patch14,openai,32383367,2023-09-15 15:49:35+00:00,"[transformers, pytorch, tf, jax, safetensors, ...",zero-shot-image-classification,32bd64288804d66eefd0ccbe215aa642df71cc41,False,cold
5,amazon/chronos-t5-tiny,chronos-t5-tiny,amazon,30094073,2024-05-13 21:09:18+00:00,"[transformers, safetensors, t5, text2text-gene...",time-series-forecasting,d968d90a73cc4e3a3103e262d1d895204e74e415,False,pipeline-library-pair-not-supported
6,openai/clip-vit-base-patch32,clip-vit-base-patch32,openai,26576499,2024-02-29 09:45:55+00:00,"[transformers, pytorch, tf, jax, clip, zero-sh...",zero-shot-image-classification,3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268,False,warm
7,1231czx/llama3_it_ultra_list_and_bold500,llama3_it_ultra_list_and_bold500,1231czx,25414501,2024-09-03 12:58:12+00:00,"[transformers, safetensors, llama, text-classi...",text-classification,31bbdb84c8c535f807fad9013a50b71a924a7fc2,False,cold
8,jonatasgrosman/wav2vec2-large-xlsr-53-english,wav2vec2-large-xlsr-53-english,jonatasgrosman,21263718,2023-03-25 10:56:55+00:00,"[transformers, pytorch, jax, safetensors, wav2...",automatic-speech-recognition,569a6236e92bd5f7652a0420bfe9bb94c5664080,False,cold
9,FacebookAI/xlm-roberta-large,xlm-roberta-large,FacebookAI,21028349,2024-02-19 12:48:30+00:00,"[transformers, pytorch, tf, jax, onnx, safeten...",fill-mask,c23d21b0620b635a76227c604d44e43a9f0ee389,False,warm
