# Mine jsons from HF and paginate. At the moment (may 2024) it's only 6 pages, causing neglectable traffic

In [2]:
import requests
import pandas as pd 
from tqdm import tqdm 
import panel as pn
import json 
tqdm.pandas()

# Base URL for the API
base_url = "https://huggingface.co/models-json?other=feature-extraction&library=transformers.js&sort=trending&numItemsPerPage=50" # attention: https://huggingface.co/posts/do-me/362814004058611

# List to store all models
all_models = []

# Total number of pages to fetch
total_pages = 12 # adding more pages here for the future, should be raised once there are more than 300 models

# Use tqdm to show progress
for page_number in tqdm(range(0, total_pages + 1), desc="Fetching Pages"):
    # Construct the full URL for the current page
    url = f"{base_url}&p={page_number}"
    
    # Send a GET request to the URL
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON data from the response
        json_data = response.json()
        
        # Extract the models from the current page's data
        page_models = json_data['models']
        
        # Append the models to the list
        all_models.extend(page_models)
    else:
        print(f"Failed to fetch data from page {page_number}. Status code: {response.status_code}")

print(all_models.__len__(), "Models mined")

df = pd.DataFrame(all_models)


Fetching Pages: 100%|██████████| 13/13 [00:03<00:00,  3.73it/s]

159 Models mined





# For each of the models have a look at the onnx file sizes. Must request each page once unfortunately as it's not in the model's json
Takes not more than 1.5 mins

In [3]:
from bs4 import BeautifulSoup
import requests

def extract_size_from_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Find all 'a' tags with the specified title attribute
            a_tags = soup.find_all('a', title="Download file")
            model_sizes = []  # Initialize a list to store sizes of model files

            for a_tag in a_tags:
                file_name_tag = a_tag.find_previous_sibling('div').find('span')
                if not file_name_tag:  # Skip if there's no 'span' tag
                    continue
                file_name = file_name_tag.text.strip()
                if file_name.endswith(".onnx"):
                    if file_name.startswith("model"):
                        size = a_tag.text.strip().split("\n")[0]
                        model_sizes.append(size)
                    else: # only if there is no normal model
                        if file_name.startswith(("decoder", "encoder")):
                            size = a_tag.text.strip().split("\n")[0]
                            model_sizes.append(size)
                
            if model_sizes:
                return model_sizes
            else:
                return ""
        else:
            return ""#f"HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        return f"Request error: {e}"
    
# extract_size_from_url("https://huggingface.co/Xenova/instructor-large/tree/main/onnx") # test

def scrape_sizes(model):
    sizes = extract_size_from_url(f"https://huggingface.co/{model}/tree/main/onnx")
    sizes = [i.replace(" ","") for i in sizes]
    sizes = " | ".join(sizes)
    return sizes

# scrape_sizes("mixedbread-ai/mxbai-embed-large-v1")

In [4]:
df["sizes"] = df["id"].progress_apply(scrape_sizes) # 1 min 19s

100%|██████████| 159/159 [01:30<00:00,  1.76it/s]


## Remove models that are not currently working (but have the transformers.js and feature-extraction tag)

In [5]:
print("Removing following models from dataset: \n",df[df["sizes"] == ""].id)
df = df[df["sizes"] != ""]

Removing following models from dataset: 
 131                                       Severian/nomic
143                     ChristianAzinn/uae-large-v1-gguf
144             ChristianAzinn/mxbai-embed-large-v1-gguf
147         ChristianAzinn/snowflake-arctic-embed-l-gguf
148    ChristianAzinn/snowflake-arctic-embed-m-long-GGUF
149         ChristianAzinn/snowflake-arctic-embed-m-gguf
150         ChristianAzinn/snowflake-arctic-embed-s-gguf
151        ChristianAzinn/snowflake-arctic-embed-xs-gguf
152                                 Xenova/mobileclip_s0
153                                 Xenova/mobileclip_s1
154                                 Xenova/mobileclip_s2
155                                  Xenova/mobileclip_b
156                                Xenova/mobileclip_blt
Name: id, dtype: object


# Add min and max onnx file size for sorting. Must be converted from different units

In [6]:
import re

# Conversion dictionary
size_conversion = {'Byt': 1, 'Bytes': 1, 'kB': 1024, 'MB': 1024**2, 'GB': 1024**3}

# Conversion function
def size_to_bytes(size_str):
    # Use regex to find the number and unit
    match = re.search(r'(\d+(\.\d+)?)\s*(Byt|Bytes|kB|MB|GB)', size_str)
    if not match:
        raise ValueError(f"Invalid size format: {size_str}")
    size_value = float(match.group(1))
    size_unit = match.group(3)
    return size_value * size_conversion[size_unit]

# Parsing and conversion function
def parse_and_find_min_max(sizes_str):
    sizes_list = sizes_str.split(' | ')
    sizes_bytes = [size_to_bytes(s) for s in sizes_list]
    return min(sizes_bytes), max(sizes_bytes)

# Apply the function and create new columns
# Assuming 'df' is a pandas DataFrame and 'sizes' is a column in that DataFrame
df['min_size'], df['max_size'] = zip(*df['sizes'].apply(parse_and_find_min_max))

In [7]:
#df.sort_values("min_size", ascending=True).head(20) # sort as you please here

# Removing all sizes below 50kB for the moment to filter 

In [8]:
df = df[df["min_size"] > 50000].reset_index(drop=True)
df = df.reset_index(drop=True)
df["trending"] = df.index +1 # adding the trending column

In [9]:
from datetime import datetime

# Get today's date
today = datetime.today().strftime("%d-%m-%Y")

df["mined_date"] = today # append to df so that one could easily concat dfs of different dates and do a groupby or similar, convenience

In [10]:
df.head(10)

Unnamed: 0,author,authorData,downloads,gated,id,lastModified,likes,pipeline_tag,private,repoType,isLikedByUser,sizes,min_size,max_size,trending,mined_date
0,mixedbread-ai,{'avatarUrl': 'https://cdn-avatars.huggingface...,1755877,False,mixedbread-ai/mxbai-embed-large-v1,2024-04-18T23:20:55.000Z,309,feature-extraction,False,model,False,1.34GB | 669MB | 337MB,353370112.0,1438814000.0,1,08-05-2024
1,nomic-ai,{'avatarUrl': 'https://cdn-avatars.huggingface...,145806,False,nomic-ai/nomic-embed-text-v1.5,2024-05-03T02:21:07.000Z,171,sentence-similarity,False,model,False,548MB | 138MB,144703488.0,574619600.0,2,08-05-2024
2,nomic-ai,{'avatarUrl': 'https://cdn-avatars.huggingface...,153516,False,nomic-ai/nomic-embed-text-v1,2024-05-03T02:21:44.000Z,365,sentence-similarity,False,model,False,548MB | 138MB,144703488.0,574619600.0,3,08-05-2024
3,Alibaba-NLP,{'avatarUrl': 'https://www.gravatar.com/avatar...,75109,False,Alibaba-NLP/gte-large-en-v1.5,2024-04-26T13:51:26.000Z,56,sentence-similarity,False,model,False,1.75GB | 361MB | 873MB | 446MB | 387MB | 446MB...,378535936.0,1879048000.0,4,08-05-2024
4,WhereIsAI,{'avatarUrl': 'https://www.gravatar.com/avatar...,277613,False,WhereIsAI/UAE-Large-V1,2024-05-03T02:31:54.000Z,177,feature-extraction,False,model,False,1.34GB | 669MB | 337MB,353370112.0,1438814000.0,5,08-05-2024
5,Snowflake,{'avatarUrl': 'https://cdn-avatars.huggingface...,19315,False,Snowflake/snowflake-arctic-embed-m,2024-04-18T19:50:37.000Z,63,sentence-similarity,False,model,False,436MB | 144MB | 218MB | 110MB | 149MB | 110MB ...,115343360.0,457179100.0,6,08-05-2024
6,Snowflake,{'avatarUrl': 'https://cdn-avatars.huggingface...,33257,False,Snowflake/snowflake-arctic-embed-l,2024-04-18T19:58:11.000Z,58,sentence-similarity,False,model,False,1.34GB | 299MB | 669MB | 337MB | 318MB | 337MB...,313524224.0,1438814000.0,7,08-05-2024
7,Snowflake,{'avatarUrl': 'https://cdn-avatars.huggingface...,7756,False,Snowflake/snowflake-arctic-embed-s,2024-04-18T19:58:21.000Z,9,sentence-similarity,False,model,False,133MB | 60.1MB | 66.7MB | 34MB | 61.4MB | 34MB...,35651584.0,139460600.0,8,08-05-2024
8,jinaai,{'avatarUrl': 'https://cdn-avatars.huggingface...,15244,False,jinaai/jina-embeddings-v2-base-zh,2024-04-22T14:33:21.000Z,107,feature-extraction,False,model,False,641MB | 321MB | 162MB,169869312.0,672137200.0,9,08-05-2024
9,Snowflake,{'avatarUrl': 'https://cdn-avatars.huggingface...,10108,False,Snowflake/snowflake-arctic-embed-m-long,2024-04-18T19:58:17.000Z,21,sentence-similarity,False,model,False,548MB | 158MB | 274MB | 138MB | 165MB | 138MB ...,144703488.0,574619600.0,10,08-05-2024


# Save files 

In [11]:
df.to_excel(f"data/feature-extraction/transformersjs_{today}.xlsx")
df.to_parquet(f"data/feature-extraction/transformersjs_{today}.parquet")
df.to_csv(f"data/feature-extraction/transformersjs_{today}.csv")
df.to_json(f"data/feature-extraction/transformersjs_{today}.json")

# To html options (ready to be pasted for SemanticFinder) 

In [12]:
# Assuming df is your DataFrame
html_options = []

for index, row in df.iterrows():
    # Extracting relevant information from each row
    author = row['author']
    downloads = row['downloads']
    likes = row['likes']
    sizes = row['sizes']
    id = row['id']

    # Creating the option string
    option_str = f'<option value="{id}">{id} | 💾{sizes} 📥{downloads} ❤️{likes}</option>'
    
    # Adding the option to the list
    html_options.append(option_str)

# Joining all options into a single string
html_options_str = '\n'.join(html_options)

with open(f"data/feature-extraction/transformersjs_html_options_{today}.html", 'w') as file:
    file.write(html_options_str)
    

# To html table with filters/sorting

In [13]:
# Define the editors for your columns
tabulator_editors = {
    'float': {'type': 'number', 'max': 10, 'step': 0.1},
    'bool': {'type': 'tickCross', 'tristate': True, 'indeterminateValue': None},
    'str': {'type': 'list', 'valuesLookup': True},
}

# Create the Tabulator widget with header filters
header_filter_table = pn.widgets.Tabulator(
    df, layout='fit_columns',
    editors=tabulator_editors, header_filters=True
)

# Save the widget to HTML with header filters
header_filter_table.save(f"data/feature-extraction/transformersjs_{today}.html")
#df.to_html(index=False) # pandas has not sorting/filtering option


# Send ntfy notifications

In [14]:
# Format the DataFrame into a list
list_message = f"Trending HuggingFace Embedding Models - {today}\n"
list_message += f"{df.__len__()} available for feature-extraction in transformers.js:\n\n"

for index, row in df.head(10).iterrows():
    list_message += f"{index + 1}. {row['id']}, Likes: {row['likes']}, Downloads: {row['downloads']}\n Sizes: {row['sizes']}\n\n"

list_message += f"Meta data about all {df.__len__()} models can be downloaded on GitHub as csv, xlsx, json, parquet, html. Models can be downloaded from HuggingFace. Originally designed for SemanticFinder, a web app for in-browser semantic search where you can test all models without installing anything."


In [16]:
import requests
import json
from datetime import datetime

# Get the current date and weekday
current_date = datetime.now()
current_day_of_week = current_date.weekday()
current_day_of_month = current_date.day

# Define the base URL for the ntfy.sh server
base_url = "https://ntfy.sh/"

# Prepare the actions as a list of dictionaries
actions = [
    {"action": "view", "label": "GitHub", "url": "https://github.com/do-me/trending-huggingface-models"},
    {"action": "view", "label": "HuggingFace", "url": "https://huggingface.co/models?library=transformers.js&other=feature-extraction&sort=trending"},
    {"action": "view", "label": "SemanticFinder", "url": "https://do-me.github.io/SemanticFinder/"}
]

# Define the channel names
channels = {
    "daily": "feature_extraction_transformers_js_models_daily",
    "weekly": "feature_extraction_transformers_js_models_weekly",
    "monthly": "feature_extraction_transformers_js_models_monthly"
}

# Function to send notification
def send_notification(channel, message):
    payload = {
        "topic": channel,
        "message": list_message,
        "actions": actions
    }
    response = requests.post(base_url, json=payload)
    print(f"Notification sent to {channel}. Status Code: {response.status_code}")

# Send daily notification
send_notification(channels["daily"], "Daily request message")

# Check if today is Monday (0 is Monday, 6 is Sunday) and send weekly notification
if current_day_of_week == 0:
    send_notification(channels["weekly"], "Weekly request message")

# Check if today is the first of the month and send monthly notification
if current_day_of_month == 1:
    send_notification(channels["monthly"], "Monthly request message")

Notification sent to feature_extraction_transformers_js_models_daily. Status Code: 200


In [17]:
print(df.head(10))

          author                                         authorData  \
0  mixedbread-ai  {'avatarUrl': 'https://cdn-avatars.huggingface...   
1       nomic-ai  {'avatarUrl': 'https://cdn-avatars.huggingface...   
2       nomic-ai  {'avatarUrl': 'https://cdn-avatars.huggingface...   
3    Alibaba-NLP  {'avatarUrl': 'https://www.gravatar.com/avatar...   
4      WhereIsAI  {'avatarUrl': 'https://www.gravatar.com/avatar...   
5      Snowflake  {'avatarUrl': 'https://cdn-avatars.huggingface...   
6      Snowflake  {'avatarUrl': 'https://cdn-avatars.huggingface...   
7      Snowflake  {'avatarUrl': 'https://cdn-avatars.huggingface...   
8         jinaai  {'avatarUrl': 'https://cdn-avatars.huggingface...   
9      Snowflake  {'avatarUrl': 'https://cdn-avatars.huggingface...   

   downloads  gated                                       id  \
0    1755877  False       mixedbread-ai/mxbai-embed-large-v1   
1     145806  False           nomic-ai/nomic-embed-text-v1.5   
2     153516  False       