In [1]:
import os
import time
from datetime import datetime, timezone, timedelta
import pandas as pd
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm


# Constants
MIN_DOWNLOADS = 100
SINCE_DATE = datetime.now(timezone.utc) - timedelta(days=365*3)  # Collect models created in the last 3 year
OUTPUT_CSV = "hf_models.csv"
OUTPUT_DIR = "model_files"  # Directory to save model files

#os.environ["GITHUB_ACCESS_TOKEN"] = GITHUB_ACCESS_TOKEN
#os.environ["HUGGINGFACE_ACCESS_TOKEN"] = HUGGINGFACE_ACCESS_TOKEN



keywords = {
  "keywords": {
    "getting-started": {
      "gh-header-keywords": [
        "getting started",
        "installation",
        "quick start",
        "quickstart",
        "setup",
        "usage",
        "example",
        "demo"
      ],
      "hf-header-keywords": [
        "how to use",
        "usage",
        "inference",
        "example usage",
        "sample code",
        "model usage",
        "usage example"
      ],
      "gh-content-keywords": [
        "install",
        "pip",
        "requirements",
        "dependency",
        "clone",
        "git clone",
        "build",
        "run",
        "execute",
        "usage",
        "command line",
        "code snippet",
        "environment",
        "virtualenv",
        "conda",
        "script",
        "examples",
        "usage instructions",
        "dependencies",
        "install instructions"
      ],
      "hf-content-keywords": [
        "import",
        "from transformers import",
        "pipeline",
        "tokenizer",
        "model",
        "generate",
        "inference",
        "code snippet",
        "PyTorch",
        "TensorFlow",
        "example",
        "Hugging Face Hub",
        "task",
        "fine-tune",
        "load model",
        "preprocess",
        "prompt",
        "output",
        "usage example",
        "inference example"
      ]
    },
    "contributing": {
      "gh-header-keywords": [
        "contributing",
        "contribution",
        "contribute",
        "pull request",
        "bug report",
        "issue"
      ],
      "hf-header-keywords": [
        "how to contribute",
        "contribute",
        "report issues",
        "feedback",
        "suggestions",
        "contact",
        "collaborate",
        "acknowledgements"
      ],
      "gh-content-keywords": [
        "fork",
        "pull request",
        "issue tracker",
        "guidelines",
        "code style",
        "testing",
        "documentation",
        "contributing guide",
        "report bugs",
        "feature requests",
        "collaboration",
        "development",
        "submit",
        "branch",
        "merge",
        "code review",
        "community",
        "issues",
        "bug reports",
        "commit"
      ],
      "hf-content-keywords": [
        "issues",
        "contact",
        "suggestions",
        "improvements",
        "collaboration",
        "email",
        "open an issue",
        "feedback",
        "bug report",
        "help",
        "reach out",
        "community",
        "discussion",
        "contribution",
        "pull request",
        "modify",
        "enhance",
        "questions",
        "support",
        "contact author"
      ]
    },
    "license": {
      "gh-header-keywords": [
        "license",
        "licence",
        "copy right"
      ],
      "hf-header-keywords": [
        "license",
        "licence",
        "copyright",
        "terms",
        "usage terms",
        "legal",
        "rights"
      ],
      "gh-content-keywords": [
        "MIT",
        "Apache",
        "GPL",
        "BSD",
        "terms",
        "conditions",
        "distribution",
        "modification",
        "use",
        "commercial use",
        "liability",
        "warranty",
        "limitations",
        "rights",
        "reproduction",
        "software license",
        "license text",
        "open source",
        "copying",
        "proprietary"
      ],
      "hf-content-keywords": [
        "MIT",
        "Apache",
        "BSD",
        "GPL",
        "terms",
        "conditions",
        "use",
        "limitations",
        "copyright",
        "redistribution",
        "open source",
        "commercial use",
        "non-commercial",
        "Creative Commons",
        "CC BY",
        "license text",
        "restrictions",
        "public domain",
        "responsibility",
        "liability"
      ]
    }
  }
}

  from .autonotebook import tqdm as notebook_tqdm


In [None]:



# Authentication (optional)
ACCESS_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")  # Set your Hugging Face API token as an environment variable

# Initialize API
api = HfApi()

# If you have an access token, save it
if ACCESS_TOKEN:
    HfFolder.save_token(ACCESS_TOKEN)

# Function to collect model data
def collect_model_data():
    models_data = []
    fetch_limit = 12000  # Maximum number of models to fetch
    sort_order = "downloads"  # You can sort by 'lastModified', 'downloads', 'stars', etc.

    # Note: As of my knowledge cutoff in 2021, the Hugging Face Hub API does not support filtering by creation date directly.
    # We'll fetch models and filter them manually.

    # Search models
    print("Fetching models from Hugging Face Hub...")
    models = api.list_models(
        sort=sort_order,
        direction=-1,  # Descending order
        limit=fetch_limit,
        use_auth_token=ACCESS_TOKEN,
        full=True,  # Fetch full metadata
        
    )
    
    for model in tqdm(models):
        # Convert timestamp strings to datetime objects
        if hasattr(model, 'lastModified'):
            last_modified = model.lastModified
        else:
            last_modified = datetime.now(tz=model.lastModified.tzinfo)

        # Filter models created since the specified date
        if last_modified >= SINCE_DATE:
            # Get the model's downloads (may require an authenticated request)
            downloads = model.downloads if hasattr(model, 'downloads') else 0

            if downloads >= MIN_DOWNLOADS:
                model_data = {
                    "modelId": model.modelId,
                    "likes": model.likes if hasattr(model, 'likes') else 0,
                    "modelName": model.modelId.split("/")[-1],
                    "author": model.modelId.split("/")[0] if "/" in model.modelId else None,
                    "downloads": downloads,
                    "lastModified": model.lastModified,
                    "tags": model.tags,
                    "pipeline_tag": model.pipeline_tag,
                    "sha": model.sha,
                    "private": model.private,
                    "inference": model.inference,

                }
                #randomdly print downloads
                if downloads%22 == 0:
                    print(downloads)
                models_data.append(model_data)

        # Respect rate limits
        time.sleep(0.1)

    print(f"Total models collected: {len(models_data)}")
    return pd.DataFrame(models_data)



df = collect_model_data()

df.to_csv(OUTPUT_CSV, index=False)

Fetching models from Hugging Face Hub...


45it [00:05,  9.82it/s]

6913698


66it [00:07,  9.75it/s]

4515280


70it [00:08,  9.84it/s]

3900270


74it [00:08,  9.78it/s]

3729660


105it [00:11,  9.78it/s]

2731630


120it [00:13,  9.80it/s]

2303268


146it [00:15,  9.70it/s]

1874246


179it [00:19,  9.74it/s]

1703174


213it [00:22,  9.76it/s]

1312652


247it [00:26,  9.70it/s]

1133858


280it [00:29,  9.83it/s]

920678


295it [00:31,  9.82it/s]

862378


314it [00:33,  9.88it/s]

792704


316it [00:33,  9.89it/s]

788524


326it [00:34,  9.90it/s]

769494


340it [00:35,  9.90it/s]

747824


350it [00:36,  9.89it/s]

715484


371it [00:38,  9.83it/s]

666072


373it [00:38,  9.82it/s]

662464


382it [00:39,  9.83it/s]

646184


388it [00:40,  9.84it/s]

634942


389it [00:40,  9.81it/s]

630498


436it [00:45,  9.76it/s]

499906


459it [00:47,  9.87it/s]

470184


479it [00:49,  9.67it/s]

432806


490it [00:50,  9.67it/s]

425898


555it [00:57,  9.80it/s]

342408


575it [00:59,  9.73it/s]

323048


699it [01:12,  9.70it/s]

231176


740it [01:16,  9.65it/s]

207482


802it [01:23,  9.72it/s]

175516


805it [01:23,  9.73it/s]

175120


812it [01:24,  9.66it/s]

172876


826it [01:25,  9.78it/s]

166452


828it [01:25,  9.84it/s]

164780


844it [01:27,  9.84it/s]

159412


852it [01:28,  9.63it/s]

157366


863it [01:29,  9.74it/s]

153142


886it [01:31,  9.79it/s]

145926


897it [01:32,  9.79it/s]

142758


908it [01:33,  9.61it/s]

139766


912it [01:34,  9.77it/s]

138930


914it [01:34,  9.72it/s]

138138


1008it [01:44,  8.54it/s]

118030


1022it [01:46,  9.79it/s]

114224


1069it [01:51,  9.65it/s]

104720


1154it [01:59,  9.69it/s]

90662


1166it [02:01,  9.71it/s]

89386


1185it [02:03,  9.67it/s]

87186


1210it [02:05,  9.60it/s]

84502


1249it [02:09,  9.71it/s]

79904


1274it [02:12,  9.78it/s]

75416


1292it [02:14,  9.74it/s]

73590


1328it [02:17,  9.62it/s]

69938


1342it [02:19,  9.74it/s]

69212


1345it [02:19,  9.67it/s]

68728


1385it [02:23,  9.66it/s]

65868


1397it [02:24,  9.64it/s]

65428


1415it [02:26,  9.86it/s]

64306


1418it [02:26,  9.80it/s]

64218


1427it [02:27,  9.86it/s]

63624
63580


1441it [02:29,  9.88it/s]

61974


1457it [02:30,  9.86it/s]

60720


1466it [02:31,  9.80it/s]

59884


1481it [02:33,  9.74it/s]

59136


1486it [02:33,  9.85it/s]

58652


1488it [02:34,  9.87it/s]

58520


1567it [02:42,  9.77it/s]

55264


1593it [02:44,  9.86it/s]

53614


1609it [02:46,  9.89it/s]

51700


1624it [02:48,  9.72it/s]

50732


1644it [02:50,  9.72it/s]

49742


1648it [02:50,  9.73it/s]

49632


1650it [02:50,  9.65it/s]

49456


1689it [02:54,  9.67it/s]

47476


1783it [03:04,  9.63it/s]

43032


1840it [03:10,  9.65it/s]

40502


1921it [03:18,  9.75it/s]

36630


1926it [03:19,  9.80it/s]

36432


1951it [03:21,  9.70it/s]

35530


2005it [03:27,  6.61it/s]

33880


2056it [03:33,  9.66it/s]

31856


2074it [03:35,  9.65it/s]

31174
31108


2080it [03:35,  9.57it/s]

30910


2083it [03:35,  9.64it/s]

30800


2094it [03:37,  9.63it/s]

30492


2106it [03:38,  9.78it/s]

30272


2140it [03:41,  9.73it/s]

29414


2198it [03:47,  9.64it/s]

27588


2219it [03:49,  9.67it/s]

27016


2223it [03:50,  9.75it/s]

26950


2299it [03:58,  9.73it/s]

25080


2316it [04:00,  9.70it/s]

24860
24794


2358it [04:04,  9.72it/s]

24024


2441it [04:12,  9.63it/s]

22396


2451it [04:13,  9.79it/s]

22198


2455it [04:14,  9.67it/s]

22154


2464it [04:15,  9.74it/s]

21934


2482it [04:17,  9.72it/s]

21626


2495it [04:18,  9.70it/s]

21406


2498it [04:18,  9.71it/s]

21340


2532it [04:22,  9.68it/s]

20834


2551it [04:24,  9.61it/s]

20394


2562it [04:25,  9.70it/s]

20196


2627it [04:32,  9.73it/s]

19338


2650it [04:34,  9.71it/s]

19030


2667it [04:36,  9.65it/s]

18700


2702it [04:39,  9.73it/s]

18128


2724it [04:42,  9.83it/s]

17974


2752it [04:44,  9.55it/s]

17644


2766it [04:46,  9.68it/s]

17468


2796it [04:49,  9.73it/s]

16984


2831it [04:53,  9.74it/s]

16522


2857it [04:55,  9.83it/s]

16214


2867it [04:56,  9.64it/s]

16038


2881it [04:58,  9.67it/s]

15862
15862


2941it [05:04,  9.75it/s]

15312
15312


2943it [05:04,  9.71it/s]

15312


2956it [05:05,  9.65it/s]

15136


2969it [05:07,  9.70it/s]

15070
15070


2989it [05:09,  9.71it/s]

14850


3005it [05:11,  7.00it/s]

14630


3017it [05:12,  9.64it/s]

14542


3025it [05:13,  9.65it/s]

14410


3033it [05:14,  9.69it/s]

14344


3049it [05:16,  9.61it/s]

14168


3125it [05:23,  9.60it/s]

13552


3132it [05:24,  9.75it/s]

13486


3153it [05:26,  9.78it/s]

13354


3156it [05:27,  9.73it/s]

13332


3157it [05:27,  9.78it/s]

13310


3177it [05:29,  9.77it/s]

13134
13134


3199it [05:31,  9.75it/s]

12958


3206it [05:32,  9.70it/s]

12892
12892


3235it [05:35,  9.71it/s]

12650


3241it [05:35,  9.62it/s]

12628


3291it [05:41,  9.66it/s]

12188


3302it [05:42,  9.68it/s]

12122


3314it [05:43,  9.66it/s]

11990


3375it [05:49,  9.70it/s]

11462


3408it [05:53,  9.71it/s]

11242


3434it [05:55,  9.73it/s]

11044


3457it [05:58,  9.74it/s]

10890


3465it [05:58,  9.63it/s]

10824


3470it [05:59,  9.61it/s]

10802


3506it [06:03,  9.74it/s]

10582


3522it [06:04,  9.59it/s]

10494


3590it [06:11,  9.63it/s]

10230


3606it [06:13,  9.70it/s]

10164
10164


3608it [06:13,  9.65it/s]

10164


3620it [06:14,  9.59it/s]

10098


3704it [06:23,  9.73it/s]

9636


3763it [06:29,  9.68it/s]

9372


3780it [06:31,  9.81it/s]

9284


3785it [06:31,  9.77it/s]

9262


3836it [06:37,  9.73it/s]

9020


3850it [06:38,  9.68it/s]

8976


3878it [06:41,  9.70it/s]

8844


3886it [06:42,  9.78it/s]

8800
8800


3897it [06:43,  9.76it/s]

8734


3945it [06:48,  9.88it/s]

8514
8514


3990it [06:53,  9.53it/s]

8316


4008it [06:55,  8.33it/s]

8250


4023it [06:57,  9.69it/s]

8184
8184


4083it [07:03,  9.61it/s]

7964
7964


4091it [07:04,  9.79it/s]

7942


4123it [07:07,  9.76it/s]

7832


4128it [07:07,  9.74it/s]

7810


4133it [07:08,  9.78it/s]

7788


4139it [07:09,  9.69it/s]

7766


4164it [07:11,  9.65it/s]

7678


4181it [07:13,  9.75it/s]

7634


4198it [07:15,  9.59it/s]

7590


4206it [07:15,  9.67it/s]

7568


4230it [07:18,  9.79it/s]

7480
7480


4240it [07:19,  9.80it/s]

7436
7436


4270it [07:22,  9.62it/s]

7326


4339it [07:29,  9.78it/s]

7084
7084


4341it [07:29,  9.74it/s]

7084


4359it [07:31,  9.59it/s]

7040
7040


4385it [07:34,  9.60it/s]

6974


4390it [07:34,  9.73it/s]

6952


4428it [07:38,  9.66it/s]

6864


4443it [07:40,  9.72it/s]

6820


4459it [07:42,  9.76it/s]

6798


4474it [07:43,  9.62it/s]

6754


4480it [07:44,  9.78it/s]

6732


4483it [07:44,  9.80it/s]

6710
6710


4499it [07:46,  9.74it/s]

6666


4539it [07:50,  9.73it/s]

6556


4561it [07:52,  9.72it/s]

6468
6468


4574it [07:53,  9.87it/s]

6424
6424


4591it [07:55,  9.59it/s]

6380


4651it [08:01,  9.78it/s]

6248


4688it [08:05,  9.59it/s]

6160
6160


4702it [08:07,  9.70it/s]

6138


4722it [08:09,  9.69it/s]

6072


4730it [08:09,  9.55it/s]

6050


4740it [08:11,  9.70it/s]

6028


4765it [08:13,  9.82it/s]

5962


4792it [08:16,  9.78it/s]

5874
5874


4847it [08:21,  9.92it/s]

5786
5786


4861it [08:23,  9.91it/s]

5764


4871it [08:24,  9.90it/s]

5742


4893it [08:26,  9.90it/s]

5698


4917it [08:29,  9.90it/s]

5654


4939it [08:31,  9.91it/s]

5610


4947it [08:32,  9.91it/s]

5588


4963it [08:33,  9.91it/s]

5566


4973it [08:34,  9.90it/s]

5544
5544


4984it [08:35,  9.91it/s]

5522


5036it [08:41,  9.91it/s]

5434


5066it [08:44,  9.90it/s]

5412


5077it [08:45,  9.89it/s]

5390


5129it [08:50,  9.90it/s]

5324


5171it [08:55,  9.90it/s]

5258


5187it [08:56,  9.91it/s]

5236


5254it [09:03,  9.91it/s]

5126


5280it [09:06,  9.92it/s]

5082


5310it [09:09,  9.91it/s]

5038


5326it [09:10,  9.92it/s]

5016


5380it [09:16,  9.75it/s]

4950


5429it [09:21,  9.60it/s]

4884
4884


5432it [09:21,  9.74it/s]

4884


5455it [09:24,  9.59it/s]

4862


5528it [09:31,  9.71it/s]

4774


5544it [09:33,  9.72it/s]

4752


5556it [09:34,  9.71it/s]

4730


5586it [09:37,  9.69it/s]

4686


5614it [09:40,  9.73it/s]

4642


5627it [09:41,  9.67it/s]

4620
4620


5654it [09:44,  9.79it/s]

4576


5670it [09:46,  9.70it/s]

4554
4554


5701it [09:49,  9.73it/s]

4510
4510


5704it [09:49,  9.70it/s]

4510


5784it [09:58,  9.60it/s]

4422
4422


5785it [09:58,  9.59it/s]

4422


5802it [09:59,  9.66it/s]

4400


5826it [10:02,  9.82it/s]

4378
4378


5865it [10:06,  9.67it/s]

4356
4356


5922it [10:12,  9.79it/s]

4334
4334


5925it [10:12,  9.61it/s]

4334


5961it [10:16,  9.68it/s]

4312


5996it [10:19,  9.78it/s]

4290
4290


6071it [10:28,  9.68it/s]

4268
4268


6074it [10:28,  9.66it/s]

4268


6126it [10:33,  9.70it/s]

4224


6170it [10:38,  9.74it/s]

4202
4202


6172it [10:38,  9.74it/s]

4202


6215it [10:43,  9.69it/s]

4180


6229it [10:44,  9.66it/s]

4158
4158


6232it [10:44,  9.60it/s]

4158


6257it [10:47,  9.73it/s]

4136
4136


6259it [10:47,  9.69it/s]

4136
4136


6277it [10:49,  9.70it/s]

4114


6309it [10:52,  9.69it/s]

4070
4070


6321it [10:53,  9.69it/s]

4048
4048


6324it [10:54,  9.74it/s]

4048


6396it [11:01,  9.65it/s]

3938


6419it [11:04,  9.70it/s]

3916
3916


6449it [11:07,  9.60it/s]

3872
3872


6452it [11:07,  9.76it/s]

3872


6506it [11:12,  9.79it/s]

3806


6528it [11:15,  9.62it/s]

3784


6557it [11:18,  9.75it/s]

3762


6609it [11:23,  9.74it/s]

3740
3740


6612it [11:23,  9.67it/s]

3740
3740


6673it [11:30,  9.63it/s]

3718
3718


6676it [11:30,  9.66it/s]

3718


6702it [11:33,  9.57it/s]

3696


6726it [11:35,  9.72it/s]

3674
3674


6772it [11:40,  9.76it/s]

3630
3630


6794it [11:42,  9.64it/s]

3608
3608


6815it [11:44,  9.69it/s]

3586
3586


6816it [11:44,  9.67it/s]

3586


6843it [11:47,  9.69it/s]

3564


6867it [11:50,  9.74it/s]

3542


6893it [11:52,  9.76it/s]

3520
3520


6914it [11:55,  9.75it/s]

3498


6937it [11:57,  9.79it/s]

3476
3476


6955it [11:59,  9.77it/s]

3454
3454


6974it [12:01,  9.63it/s]

3432


6997it [12:03,  9.70it/s]

3410


7020it [12:06,  9.77it/s]

3388


7041it [12:08,  9.70it/s]

3366


7058it [12:10,  9.63it/s]

3344


7078it [12:12,  9.71it/s]

3322
3322


7081it [12:12,  9.67it/s]

3322


7126it [12:17,  9.64it/s]

3278
3278


7153it [12:20,  9.73it/s]

3256
3256


7215it [12:26,  9.66it/s]

3212


7241it [12:29,  9.74it/s]

3190
3190


7270it [12:32,  9.72it/s]

3168
3168


7309it [12:36,  9.59it/s]

3146
3146


7414it [12:47,  9.69it/s]

3080
3080


7451it [12:51,  9.74it/s]

3058


7502it [12:56,  9.63it/s]

3014
3014


7535it [12:59,  9.64it/s]

2992
2992


7538it [13:00,  9.61it/s]

2992


7575it [13:03,  9.57it/s]

2970
2970


7576it [13:04,  9.50it/s]

2970


7623it [13:08,  9.71it/s]

2948


7657it [13:12,  9.62it/s]

2926
2926


7688it [13:15,  9.65it/s]

2904
2904


7690it [13:15,  9.71it/s]

2904
2904


7740it [13:21,  9.69it/s]

2882
2882


7778it [13:24,  9.52it/s]

2860
2860


7781it [13:25,  9.66it/s]

2860
2860


7831it [13:30,  9.72it/s]

2838
2838


7874it [13:34,  9.64it/s]

2816


7907it [13:38,  9.75it/s]

2794
2794


7937it [13:41,  9.62it/s]

2772
2772


7978it [13:45,  9.75it/s]

2750


8043it [13:52,  9.64it/s]

2706


8083it [13:56,  9.61it/s]

2684


8121it [14:00,  9.76it/s]

2662


8168it [14:05,  9.61it/s]

2640
2640


8170it [14:05,  9.62it/s]

2640


8195it [14:08,  9.58it/s]

2618
2618


8196it [14:08,  9.55it/s]

2618
2618


8272it [14:16,  9.77it/s]

2574
2574


8313it [14:20,  9.63it/s]

2552
2552


8358it [14:25,  9.63it/s]

2530


8401it [14:29,  9.68it/s]

2508


8463it [14:36,  9.55it/s]

2486


8507it [14:40,  9.69it/s]

2464


8561it [14:46,  9.65it/s]

2442
2442


8609it [14:51,  9.71it/s]

2420
2420


8646it [14:55,  9.68it/s]

2398
2398


8690it [14:59,  9.61it/s]

2376
2376


8692it [14:59,  9.50it/s]

2376


8719it [15:02,  9.66it/s]

2354
2354


8721it [15:02,  9.60it/s]

2354


8764it [15:07,  9.70it/s]

2332


8784it [15:09,  9.68it/s]

2310
2310


8820it [15:13,  9.78it/s]

2288
2288


8859it [15:17,  9.59it/s]

2266
2266


8860it [15:17,  9.60it/s]

2266


8904it [15:21,  9.70it/s]

2244


8949it [15:26,  9.57it/s]

2222
2222


8952it [15:26,  9.64it/s]

2222


9010it [15:33,  8.83it/s]

2200
2200


9013it [15:33,  9.30it/s]

2200


9048it [15:37,  9.69it/s]

2178
2178


9051it [15:37,  9.66it/s]

2178
2178


9095it [15:42,  9.70it/s]

2156
2156


9180it [15:51,  9.68it/s]

2112


9212it [15:54,  9.70it/s]

2090
2090


9312it [16:04,  9.70it/s]

2046
2046


9314it [16:04,  9.69it/s]

2046


9410it [16:14,  9.69it/s]

2002
2002


9412it [16:15,  9.68it/s]

2002


9446it [16:18,  9.73it/s]

1980


9489it [16:23,  9.58it/s]

1958


9561it [16:30,  9.73it/s]

1936
1936


9719it [16:46,  9.70it/s]

1870
1870


9722it [16:47,  9.56it/s]

1870
1870


9792it [16:54,  9.69it/s]

1848
1848


9794it [16:54,  9.62it/s]

1848
1848


9869it [17:02,  9.78it/s]

1826
1826


9871it [17:02,  9.71it/s]

1826
1826


9983it [17:14,  9.52it/s]

1804
1804


9986it [17:14,  9.66it/s]

1804
1804


9987it [17:14,  9.65it/s]

1804


10166it [17:33,  9.65it/s]

1782
1782


10168it [17:34,  9.63it/s]

1782
1782


10169it [17:34,  9.65it/s]

1782
1782


10171it [17:34,  9.71it/s]

1782


10289it [17:46,  9.73it/s]

1760
1760


10290it [17:46,  9.65it/s]

1760
1760


10351it [17:52,  9.60it/s]

1738
1738


10352it [17:53,  9.59it/s]

1738


10433it [18:01,  9.65it/s]

1716
1716


10435it [18:01,  9.70it/s]

1716


10501it [18:08,  9.55it/s]

1694
1694


10502it [18:08,  9.54it/s]

1694


10574it [18:15,  9.74it/s]

1672
1672


10659it [18:24,  9.49it/s]

1650
1650


10735it [18:32,  9.57it/s]

1628
1628


10737it [18:32,  9.62it/s]

1628


10816it [18:41,  9.61it/s]

1606
1606


10819it [18:41,  9.67it/s]

1606
1606


10821it [18:41,  9.65it/s]

1606


10895it [18:49,  9.60it/s]

1584
1584


10898it [18:49,  9.58it/s]

1584
1584


10899it [18:49,  9.58it/s]

1584
1584


10996it [18:59,  9.79it/s]

1562
1562


10997it [18:59,  9.66it/s]

1562
1562


10999it [18:59,  9.63it/s]

1562
1562


11126it [19:13,  9.57it/s]

1540
1540


11128it [19:14,  9.57it/s]

1540
1540


11131it [19:14,  9.65it/s]

1540
1540


11133it [19:14,  9.60it/s]

1540


11258it [19:27,  9.67it/s]

1518
1518


11259it [19:27,  9.55it/s]

1518
1518


11261it [19:27,  9.61it/s]

1518


11377it [19:39,  9.64it/s]

1496
1496


11379it [19:40,  9.56it/s]

1496
1496


11381it [19:40,  9.55it/s]

1496
1496


11383it [19:40,  9.55it/s]

1496
1496


11385it [19:40,  9.62it/s]

1496
1496


11387it [19:40,  9.59it/s]

1496


11526it [19:55,  9.60it/s]

1474
1474


11527it [19:55,  9.65it/s]

1474
1474


11529it [19:55,  9.64it/s]

1474
1474


11532it [19:55,  9.57it/s]

1474
1474


11773it [20:20,  9.71it/s]

1452
1452


11775it [20:20,  9.77it/s]

1452
1452


11778it [20:21,  9.63it/s]

1452
1452


11779it [20:21,  9.57it/s]

1452
1452


11782it [20:21,  9.66it/s]

1452
1452


11783it [20:21,  9.61it/s]

1452


12000it [20:44,  9.64it/s]


Total models collected: 11649


In [5]:
import os
import time
import shutil
import pandas as pd
OUTPUT_CSV = "hf_models.csv"
OUTPUT_DIR = "model_cards"  # Directory to save model files
ACCESS_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN") 
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm


df = pd.read_csv(OUTPUT_CSV)

# Initialize API
api = HfApi()

# Optional: Function to download model files
def download_model_files(model_ids):
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for model_id in tqdm(model_ids):

        #check if readme already exists
        if os.path.exists(f"{OUTPUT_DIR}/{model_id.replace('/', '_')}.md"):
            #print(f"Skipping {model_id} as README.md already exists")
            continue

        #print(f"Downloading files for model: {model_id}")
        #files = api.list_repo_files(repo_id=model_id, use_auth_token=ACCESS_TOKEN)

        # Create a directory for each model
        model_dir = os.path.join(OUTPUT_DIR, model_id.replace("/", "_"))
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        #downlaod it as {model_id}.md
        try:
            api.hf_hub_download(
                repo_id=model_id,
                filename='README.md',
                local_dir=model_dir,
                use_auth_token=ACCESS_TOKEN,
            )
        except Exception as e:
            #print(f"Error downloading README.md for model {model_id}: {e}")
            continue
        #move the file to OUTPUT_DIR/{model_id}.md
        model_id = model_id.replace("/", "_")
        os.rename(f"{model_dir}/README.md", f"{OUTPUT_DIR}/{model_id}.md")
        #delete folder and contents# folder may still have files
        
        shutil.rmtree(model_dir)


        # Respect rate limits
        time.sleep(0.1)
download_model_files(df.modelId.values.tolist())

100%|██████████| 11649/11649 [17:22<00:00, 11.18it/s]


In [2]:
import pandas as pd
repos_df = pd.read_csv("hf_models.csv")
repos_df.head(20)
readme_text = []

#for every repo in the csv, get the readme
for index,row in repos_df.iterrows():
    repo_full_name = row["modelId"]
    safe_name = repo_full_name.replace("/", "_")

    #open the readme file
    try:
        with open(f"model_cards/{safe_name}.md", "r", encoding="utf-8") as f:
            readme = f.read()
            readme_text.append(readme)
    except:
        readme_text.append(None)
repos_df["readme_text"] = readme_text
repos_df = repos_df.dropna(subset=["readme_text"])

from readmepp import ReadMe
predictor = ReadMe(lang='en')
predictor.model.to('cuda')

NameError: name 'ReadMe' is not defined

# Process Dependant Columns

In [4]:
import re
import numpy as np

#count the occurences of "^- " of each readme
lists_count = repos_df["readme_text"].str.count(r"^- ", flags=re.MULTILINE)
#log then normalize it
lists_count = np.log1p(1 + lists_count)
lists_count = (lists_count - lists_count.min()) / (lists_count.max() - lists_count.min())
repos_df["lists_count"] = lists_count

#count hypperlinks
hyperlinks_count = repos_df["readme_text"].str.count(r"http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", flags=re.MULTILINE)
#log then normalize it
hyperlinks_count = np.log1p(1 + hyperlinks_count)
hyperlinks_count = (hyperlinks_count - hyperlinks_count.min()) / (hyperlinks_count.max() - hyperlinks_count.min())
repos_df["hyperlinks_count"] = hyperlinks_count

#number of images ending in .png, .jpg, .jpeg, .gif (url or local path) in .md ir .rst
#[image] (https://example.com/image.png)
#image:: https://example.com/image.png
#<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>
images_count = repos_df["readme_text"].str.count(r"!\[.*\]\(.*\.(png|jpg|jpeg|gif|svg)\)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"image::.*\.(png|jpg|jpeg|gif|svg)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>", flags=re.MULTILINE)
#log then normalize it
images_count = np.log1p(1 + images_count)
images_count = (images_count - images_count.min()) / (images_count.max() - images_count.min())
repos_df["images_count"] = images_count

#number of code blocks divide by 2 and round down
code_blocks_count = (repos_df["readme_text"].str.count(r"^```", flags=re.MULTILINE) // 2).astype(int)
#log then normalize it
code_blocks_count = np.log1p(1 + code_blocks_count)
code_blocks_count = (code_blocks_count - code_blocks_count.min()) / (code_blocks_count.max() - code_blocks_count.min())
repos_df["code_blocks_count"] = code_blocks_count

#number of new lines with content
content_lines_count = repos_df["readme_text"].str.count(r"^.*[^\s]", flags=re.MULTILINE)
#log then normalize it
content_lines_count = np.log1p(1 + content_lines_count)
content_lines_count = (content_lines_count - content_lines_count.min()) / (content_lines_count.max() - content_lines_count.min())
repos_df["content_lines_count"] = content_lines_count



# Flesch Reading Score
#textstat.flesch_reading_ease(test_data)
import textstat
flesch_reading_scores = repos_df["readme_text"].apply(lambda x: textstat.flesch_reading_ease(x) if x else None)
#normalize
flesch_reading_scores = (flesch_reading_scores - flesch_reading_scores.min()) / (flesch_reading_scores.max() - flesch_reading_scores.min())
repos_df["flesch_reading_scores"] = flesch_reading_scores


# number of headers
headers_count = repos_df["readme_text"].str.count(r"#+", flags=re.MULTILINE)
#log then normalize
headers_count = np.log1p(1 + headers_count)
headers_count = (headers_count - headers_count.min()) / (headers_count.max() - headers_count.min())
repos_df["headers_count"] = headers_count

In [5]:
repos_df["readmepp"] = repos_df["readme_text"].apply(lambda x: predictor.predict(x) if x else None)

In [6]:
import re
import pandas as pd

def parse_markdown_headers(text):
    '''Parse markdown text into a list of (header_level, header_text, content) tuples'''
    lines = text.split('\n')
    headers = []
    current_header = None
    current_content = []
    current_level = None
    for line in lines:
        header_match = re.match(r'^(#{1,3})\s+(.*)', line)
        if header_match:
            # Save the current content if any
            if current_header is not None:
                headers.append((current_level, current_header, '\n'.join(current_content)))
            # Start a new header
            current_level = len(header_match.group(1))
            current_header = header_match.group(2)
            current_content = []
        else:
            if current_header is not None:
                current_content.append(line)
    # Save the last header content
    if current_header is not None:
        headers.append((current_level, current_header, '\n'.join(current_content)))
    return headers

def process_readme(text, keywords_subset,content_subset, levels=(1,2,3)):
    """
    Process a single README text.
    Returns:
        header_found: True if at least one header keyword is found in headers of specified levels.
        percentage: Percentage of keywords found in content under matching headers.
    """
    headers = parse_markdown_headers(text)
    # Initialize
    header_found = False
    content_text = ''
    index = 0
    for level, header_text, content in headers:
        if level in levels:
            # Check if any of the keywords are present in the header text
            if any(kw.lower() in header_text.lower() for kw in keywords_subset):
                header_found = True
                #then collect all header with lower level and stop untill equal or higher level
                content_text += '\n' + content
                for next_level, next_header_text, next_content in headers[index+1:]:
                    if next_level <= level:
                        break
                    content_text += '\n' + next_content

        index += 1
    # If no matching headers, percentage is 0
    if not header_found or not keywords_subset:
        return header_found, 0.0
    # Now compute the percentage of keywords found in content_text
    total_keywords = len(content_subset)
    found_keywords = sum(1 for kw in content_subset if kw.lower() in content_text.lower())
    percentage = found_keywords / total_keywords
    return header_found, percentage

# Modify your main loop
readme_type = "hf"  # "gh" or "hf"
for keyword_category in keywords["keywords"]:
    print(keyword_category)

    # Initialize columns
    header_column = f"header_{keyword_category}"
    percentage_column = f"percentage_{keyword_category}"
    #repos_df[header_column] = False  # This will be the header_found boolean
    #repos_df[percentage_column] = 0.0  # This will be the keyword percentage

    keywords_subset = keywords["keywords"][keyword_category][f"{readme_type}-header-keywords"]
    content_subset = keywords["keywords"][keyword_category][f"{readme_type}-content-keywords"]

    # Apply the function to each row
    def process_row(row):
        text = row['readme_text']
        header_found, percentage = process_readme(text, keywords_subset, content_subset)
        return pd.Series({header_column: header_found, percentage_column: percentage})

    repos_df[[header_column, percentage_column]] = repos_df.apply(process_row, axis=1)

getting-started
contributing
license


# Dependant Extraction

In [64]:
downloads_normalized = np.log1p(1 + repos_df["downloads"])
repos_df["downloads_normalized"] = (downloads_normalized - downloads_normalized.min()) / (downloads_normalized.max() - downloads_normalized.min())

likes_normalized = np.log1p(1 + repos_df["likes"])
repos_df["likes_normalized"] = (likes_normalized - likes_normalized.min()) / (likes_normalized.max() - likes_normalized.min())

heurstics = downloads_normalized/likes_normalized
heurstics = (heurstics - heurstics.min()) / (heurstics.max() - heurstics.min())
repos_df['onboarding_normalized'] = heurstics


repos_df[['readmepp_normalized']] = (repos_df[['readmepp']] - 1)/5

# Get Statistics

In [65]:
#Independant Columns normalized between 0 and 1
count_independent_columns = ["lists_count", "hyperlinks_count", "images_count", "code_blocks_count", "content_lines_count", "headers_count"]
#score Indecpend column
score_independent_columns = ["flesch_reading_scores", "readmepp_normalized"]
#keyword independant columns#binary 0 or 1
keyword_independent_columns = ["header_getting-started", "header_contributing", "header_license"]
#
dependant_columns = ["likes_normalized","downloads_normalized","onboarding_normalized"]
content_indenpendant_columns = ["percentage_contributing", "percentage_getting-started", "percentage_license"]


independent_columns = count_independent_columns + score_independent_columns + keyword_independent_columns



score_dict={"counts":{ column:{} for column in count_independent_columns}
            ,
       "keyword":{ column:{} for column in keyword_independent_columns},
       "content": { column:{} for column in content_indenpendant_columns},
       "scores":{ column:{} for column in score_independent_columns}
       }


repos_df = repos_df.dropna(subset=independent_columns + dependant_columns)
# Pearson correlation
from sklearn.feature_selection import r_regression

for dependant in dependant_columns:
    score = r_regression(repos_df[independent_columns].values, repos_df[dependant].values)

    for i, column in enumerate(independent_columns):
        print(f"Pearson correlation between {column} and {dependant}: {round(score[i],3)}")

        if column in count_independent_columns:
            score_dict["counts"][column][dependant] = round(score[i],3)
        elif column in keyword_independent_columns:
            score_dict["keyword"][column][dependant] = round(score[i],3)
        elif column in score_independent_columns:
            score_dict["scores"][column][dependant] = round(score[i],3)

for dependant in dependant_columns:
    for i in range(len(content_indenpendant_columns)):
        content_col = content_indenpendant_columns[i]
        keyword_col = keyword_independent_columns[i]
        repos_df_sub = repos_df[repos_df[keyword_col] == True]
        
        score = r_regression(repos_df_sub[[content_col]].values, repos_df_sub[dependant].values)
        print(f"Pearson correlation between {content_col} and {dependant}: {round(score[0],3)}")
        score_dict["content"][content_col][dependant] = round(score[0],3)

Pearson correlation between lists_count and likes_normalized: 0.121
Pearson correlation between hyperlinks_count and likes_normalized: 0.278
Pearson correlation between images_count and likes_normalized: 0.155
Pearson correlation between code_blocks_count and likes_normalized: 0.282
Pearson correlation between content_lines_count and likes_normalized: 0.307
Pearson correlation between headers_count and likes_normalized: 0.308
Pearson correlation between flesch_reading_scores and likes_normalized: 0.143
Pearson correlation between readmepp_normalized and likes_normalized: 0.14
Pearson correlation between header_getting-started and likes_normalized: 0.138
Pearson correlation between header_contributing and likes_normalized: 0.117
Pearson correlation between header_license and likes_normalized: 0.175
Pearson correlation between lists_count and downloads_normalized: 0.066
Pearson correlation between hyperlinks_count and downloads_normalized: 0.108
Pearson correlation between images_count a

In [78]:
# convert to pands dataframe with mulit index
# rows is the score type
# columns is the feature (with subcolumns for each method)

score_df = pd.DataFrame()
for score_type, score_data in score_dict.items():
    df_sub = pd.DataFrame(score_data).T
    df_sub = df_sub.reset_index()
    df_sub.insert(0, "score_type", score_type)
    
    score_df = pd.concat([score_df, df_sub], axis=0)

sort_array = [2] * len(content_indenpendant_columns) + [0] * len(count_independent_columns) + [1] * len(keyword_independent_columns) + [3] * len(score_independent_columns)
df_group = score_df.groupby(["score_type",'index']).mean()
df_group["sort"] = sort_array
df_group = df_group.sort_values(by=["sort"]).drop(columns="sort")
df_group

Unnamed: 0_level_0,Unnamed: 1_level_0,likes_normalized,downloads_normalized,onboarding_normalized
score_type,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
counts,code_blocks_count,0.282,0.12,-0.244
counts,content_lines_count,0.307,0.137,-0.265
counts,headers_count,0.308,0.135,-0.253
counts,hyperlinks_count,0.278,0.108,-0.237
counts,images_count,0.155,-0.06,-0.158
counts,lists_count,0.121,0.066,-0.086
keyword,header_contributing,0.117,-0.026,-0.123
keyword,header_getting-started,0.138,0.107,-0.112
keyword,header_license,0.175,0.011,-0.159
content,percentage_contributing,0.117,-0.059,-0.154


In [73]:
sort_array

[2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3, 3]

In [26]:
repos_df

Unnamed: 0,modelId,likes,modelName,author,downloads,lastModified,tags,pipeline_tag,sha,private,...,headers_count,header_getting-started,percentage_getting-started,header_contributing,percentage_contributing,header_license,percentage_license,downloads_normalized,likes_normalized,popularity_heurstic
0,sentence-transformers/all-mpnet-base-v2,891,all-mpnet-base-v2,sentence-transformers,436946085,2024-11-05 15:25:48+00:00,"['sentence-transformers', 'pytorch', 'onnx', '...",sentence-similarity,9a3225965996d404b775526de6dbfe85d3368642,False,...,0.496783,True,0.3,False,0.0,False,0.00,1.000000,0.753362,1.000000
1,sentence-transformers/all-MiniLM-L6-v2,2562,all-MiniLM-L6-v2,sentence-transformers,95520454,2024-11-01 10:26:30+00:00,"['sentence-transformers', 'pytorch', 'tf', 'ru...",sentence-similarity,fa97f6e7cb1a59073dff9e6b13e2715cf7475ac9,False,...,0.496783,True,0.3,False,0.0,False,0.00,0.879567,0.883593,0.975130
2,nesaorg/benchmark_v0,1,benchmark_v0,nesaorg,73440489,2024-08-19 18:24:49+00:00,"['safetensors', 'model_hub_mixin', 'pytorch_mo...",,7e0c8823b673ea012ce4c0f9868802648a12305d,False,...,0.082473,False,0.0,False,0.0,False,0.00,0.858746,0.050064,0.600602
3,FacebookAI/xlm-roberta-large,349,xlm-roberta-large,FacebookAI,68346199,2024-02-19 12:48:30+00:00,"['transformers', 'pytorch', 'tf', 'jax', 'onnx...",fill-mask,c23d21b0620b635a76227c604d44e43a9f0ee389,False,...,0.305935,True,0.3,False,0.0,False,0.00,0.853052,0.638063,0.851065
4,google-bert/bert-base-uncased,1914,bert-base-uncased,google-bert,68302750,2024-02-19 11:06:12+00:00,"['transformers', 'pytorch', 'tf', 'jax', 'rust...",fill-mask,86b5e0934494bd15c9632b12f734a8a67f723594,False,...,0.409839,True,0.4,False,0.0,False,0.00,0.853001,0.847622,0.941663
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11643,Lajonbot/Llama-2-7b-chat-hf-instruct-pl-lora_u...,1,Llama-2-7b-chat-hf-instruct-pl-lora_unload,Lajonbot,1435,2023-07-28 07:31:07+00:00,"['transformers', 'pytorch', 'llama', 'text-gen...",text-generation,f838fda8d2b97effae1e8af4dbb6217eab14fb7e,False,...,0.000000,False,0.0,False,0.0,False,0.00,0.000000,0.050064,0.021652
11644,jondurbin/airoboros-l2-7b-2.1,6,airoboros-l2-7b-2.1,jondurbin,1435,2023-09-08 09:24:46+00:00,"['transformers', 'pytorch', 'llama', 'text-gen...",text-generation,bc5737f0e7636fae2b22c5ccdee96ec5d5e1e1e4,False,...,0.380732,True,0.3,True,0.1,True,0.25,0.000000,0.171170,0.074029
11645,CHIH-HUNG/llama-2-13b-FINETUNE1_17w,0,llama-2-13b-FINETUNE1_17w,CHIH-HUNG,1435,2023-09-13 17:41:38+00:00,"['transformers', 'pytorch', 'llama', 'text-gen...",text-generation,b499f140dc69506035dcb8212c66fcedff933d29,False,...,0.346753,False,0.0,False,0.0,False,0.00,0.000000,0.000000,0.000000
11646,CHIH-HUNG/llama-2-13b-FINETUNE1_17w-r4,0,llama-2-13b-FINETUNE1_17w-r4,CHIH-HUNG,1435,2023-09-12 12:33:46+00:00,"['transformers', 'pytorch', 'llama', 'text-gen...",text-generation,bae66e90d7e30e4d6f8cf7cd460f52b993d49244,False,...,0.346753,False,0.0,False,0.0,False,0.00,0.000000,0.000000,0.000000


In [27]:
print(f"getting-started: {repos_df['header_getting-started'].sum()}")
print(f"contributing: {repos_df['header_contributing'].sum()}")
print(f"license: {repos_df['header_license'].sum()}")

getting-started: 4190
contributing: 1453
license: 1071
