In [None]:
 
import os
os.environ["GITHUB_ACCESS_TOKEN"] = GITHUB_ACCESS_TOKEN
os.environ["HUGGINGFACE_ACCESS_TOKEN"] = HUGGINGFACE_ACCESS_TOKEN

In [4]:
import os
import time
from datetime import datetime, timezone, timedelta
import pandas as pd
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm


# Constants
MIN_DOWNLOADS = 100
SINCE_DATE = datetime.now(timezone.utc) - timedelta(days=365*3)  # Collect models created in the last year
OUTPUT_CSV = "hf_models.csv"
OUTPUT_DIR = "model_files"  # Directory to save model files

# Authentication (optional)
ACCESS_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")  # Set your Hugging Face API token as an environment variable

# Initialize API
api = HfApi()

# If you have an access token, save it
if ACCESS_TOKEN:
    HfFolder.save_token(ACCESS_TOKEN)

# Function to collect model data
def collect_model_data():
    models_data = []
    fetch_limit = 12000  # Maximum number of models to fetch
    sort_order = "downloads"  # You can sort by 'lastModified', 'downloads', 'stars', etc.

    # Note: As of my knowledge cutoff in 2021, the Hugging Face Hub API does not support filtering by creation date directly.
    # We'll fetch models and filter them manually.

    # Search models
    print("Fetching models from Hugging Face Hub...")
    models = api.list_models(
        sort=sort_order,
        direction=-1,  # Descending order
        limit=fetch_limit,
        use_auth_token=ACCESS_TOKEN,
        full=True,  # Fetch full metadata
        
    )
    
    for model in tqdm(models):
        # Convert timestamp strings to datetime objects
        if hasattr(model, 'lastModified'):
            last_modified = model.lastModified
        else:
            last_modified = datetime.now(tz=model.lastModified.tzinfo)

        # Filter models created since the specified date
        if last_modified >= SINCE_DATE:
            # Get the model's downloads (may require an authenticated request)
            downloads = model.downloads if hasattr(model, 'downloads') else 0

            if downloads >= MIN_DOWNLOADS:
                model_data = {
                    "modelId": model.modelId,
                    "modelName": model.modelId.split("/")[-1],
                    "author": model.modelId.split("/")[0] if "/" in model.modelId else None,
                    "downloads": downloads,
                    "lastModified": model.lastModified,
                    "tags": model.tags,
                    "pipeline_tag": model.pipeline_tag,
                    "sha": model.sha,
                    "private": model.private,
                    "inference": model.inference,

                }
                #randomdly print downloads
                if downloads%22 == 0:
                    print(downloads)
                models_data.append(model_data)

        # Respect rate limits
        time.sleep(0.1)

    print(f"Total models collected: {len(models_data)}")
    return pd.DataFrame(models_data)



df = collect_model_data()

df.to_csv(OUTPUT_CSV, index=False)

Fetching models from Hugging Face Hub...


50it [00:05,  9.85it/s]

5326046


65it [00:07,  9.84it/s]

3957492


81it [00:08,  9.66it/s]

3042292


87it [00:09,  9.79it/s]

2837098


98it [00:10,  9.78it/s]

2529142


127it [00:13,  9.73it/s]

1739826


130it [00:13,  9.77it/s]

1647514


133it [00:14,  9.85it/s]

1644148


144it [00:15,  9.82it/s]

1483614


155it [00:16,  9.82it/s]

1339910


177it [00:18,  9.67it/s]

1163536


219it [00:22,  9.69it/s]

865216


220it [00:23,  9.60it/s]

860354


231it [00:24,  9.78it/s]

812856


233it [00:24,  9.77it/s]

796752


254it [00:26,  9.73it/s]

723316


262it [00:27,  9.71it/s]

701162


281it [00:29,  9.82it/s]

646404


291it [00:30,  9.72it/s]

614306


315it [00:32,  9.85it/s]

540364


358it [00:37,  9.86it/s]

447920


372it [00:38,  9.79it/s]

419320


374it [00:38,  9.84it/s]

414414


424it [00:43,  9.72it/s]

361130


451it [00:46,  9.73it/s]

333652
333278


464it [00:48,  9.70it/s]

322344


477it [00:49,  9.77it/s]

313698


538it [00:55,  9.70it/s]

258434


607it [01:02,  9.76it/s]

210364


630it [01:05,  9.71it/s]

200288


635it [01:05,  9.64it/s]

199122


679it [01:10,  9.69it/s]

178266


732it [01:15,  9.58it/s]

157454
156552


748it [01:17,  9.70it/s]

148940


749it [01:17,  9.57it/s]

148544


762it [01:18,  9.61it/s]

145486


769it [01:19,  9.78it/s]

142406


787it [01:21,  9.78it/s]

135872


804it [01:22,  9.78it/s]

132352


872it [01:29,  9.68it/s]

112332


883it [01:31,  9.84it/s]

110858


902it [01:33,  9.89it/s]

106216


952it [01:38,  9.70it/s]

97416


954it [01:38,  9.67it/s]

97152


986it [01:41,  9.87it/s]

92158


1003it [01:43,  6.64it/s]

90222


1028it [01:46,  9.83it/s]

86064


1093it [01:52,  9.79it/s]

76670


1167it [02:00,  9.69it/s]

67738


1172it [02:00,  9.73it/s]

67298


1222it [02:06,  9.70it/s]

64240


1244it [02:08,  9.68it/s]

63206


1257it [02:09,  9.75it/s]

62436


1288it [02:12,  9.71it/s]

61512


1295it [02:13,  9.74it/s]

61490


1354it [02:19,  9.78it/s]

55154


1363it [02:20,  9.67it/s]

54670


1378it [02:22,  9.72it/s]

53746


1394it [02:23,  9.59it/s]

52602


1474it [02:32,  9.73it/s]

45760


1509it [02:35,  9.67it/s]

43626


1538it [02:38,  9.64it/s]

42196


1539it [02:38,  9.54it/s]

42108


1575it [02:42,  9.78it/s]

40414


1592it [02:44,  9.75it/s]

39644


1601it [02:45,  9.64it/s]

39468


1619it [02:47,  9.67it/s]

38742


1657it [02:50,  9.60it/s]

37026


1712it [02:56,  9.63it/s]

34936


1718it [02:57,  9.64it/s]

34804


1792it [03:04,  9.72it/s]

32164


1853it [03:11,  9.64it/s]

30558


1871it [03:13,  9.76it/s]

29810


1880it [03:13,  9.62it/s]

29502


1892it [03:15,  9.72it/s]

29172


1917it [03:17,  9.66it/s]

28688


1969it [03:23,  9.62it/s]

27390


2019it [03:28,  9.66it/s]

26356


2023it [03:29,  9.64it/s]

26268


2029it [03:29,  9.75it/s]

26070


2034it [03:30,  9.74it/s]

25960


2059it [03:32,  9.83it/s]

25366


2121it [03:39,  9.88it/s]

23804


2140it [03:41,  9.80it/s]

23518


2157it [03:42,  9.71it/s]

23122


2174it [03:44,  9.83it/s]

22858


2196it [03:46,  9.73it/s]

22462


2202it [03:47,  9.83it/s]

22352


2234it [03:50,  9.74it/s]

21780


2246it [03:51,  9.75it/s]

21494


2252it [03:52,  9.63it/s]

21274


2257it [03:53,  9.67it/s]

21186


2265it [03:53,  9.69it/s]

20988


2295it [03:57,  9.80it/s]

20460


2378it [04:05,  9.71it/s]

19030


2430it [04:10,  9.70it/s]

18238


2503it [04:18,  9.66it/s]

17072


2511it [04:19,  9.64it/s]

16962


2537it [04:21,  9.68it/s]

16654


2564it [04:24,  9.72it/s]

16368


2593it [04:27,  9.73it/s]

16016


2613it [04:29,  9.62it/s]

15840


2645it [04:33,  9.66it/s]

15444


2692it [04:37,  9.69it/s]

14850


2708it [04:39,  9.72it/s]

14674


2716it [04:40,  9.63it/s]

14564


2720it [04:40,  9.75it/s]

14520


2723it [04:41,  9.69it/s]

14476


2726it [04:41,  9.64it/s]

14454


2740it [04:42,  9.75it/s]

14300


2762it [04:45,  9.67it/s]

14102


2781it [04:47,  9.66it/s]

13882
13882


2791it [04:48,  9.68it/s]

13794


2805it [04:49,  9.71it/s]

13596


2845it [04:53,  9.61it/s]

13112


2916it [05:01,  9.70it/s]

12628


2932it [05:02,  9.60it/s]

12386


2949it [05:04,  9.60it/s]

12232


2957it [05:05,  9.71it/s]

12166


3074it [05:17,  9.72it/s]

11308


3103it [05:20,  9.71it/s]

11110


3143it [05:25,  9.67it/s]

10824


3163it [05:27,  9.59it/s]

10692


3166it [05:27,  9.66it/s]

10670


3195it [05:30,  9.57it/s]

10428
10428


3245it [05:35,  9.68it/s]

10032


3305it [05:41,  9.64it/s]

9680


3322it [05:43,  9.62it/s]

9614


3376it [05:49,  9.71it/s]

9306


3430it [05:54,  9.69it/s]

9064


3452it [05:57,  9.62it/s]

8976


3469it [05:58,  9.62it/s]

8866


3479it [05:59,  9.68it/s]

8822


3484it [06:00,  9.67it/s]

8800


3494it [06:01,  9.68it/s]

8734


3593it [06:11,  9.76it/s]

8272


3606it [06:12,  9.71it/s]

8228
8228


3706it [06:23,  9.79it/s]

7832


3729it [06:25,  9.68it/s]

7722


3855it [06:38,  9.60it/s]

7128


3917it [06:45,  9.78it/s]

6952


3936it [06:46,  9.88it/s]

6886


3948it [06:48,  9.77it/s]

6842


3962it [06:49,  9.78it/s]

6798


4009it [06:54,  9.00it/s]

6666


4028it [06:56,  9.85it/s]

6600


4043it [06:58,  9.63it/s]

6556


4053it [06:59,  9.54it/s]

6512


4108it [07:05,  9.66it/s]

6380


4200it [07:14,  9.56it/s]

6160


4234it [07:18,  9.70it/s]

6094


4243it [07:18,  9.75it/s]

6072


4261it [07:20,  9.59it/s]

6006
6006


4273it [07:22,  9.72it/s]

5984


4300it [07:24,  9.59it/s]

5896
5896


4336it [07:28,  9.68it/s]

5808
5808


4337it [07:28,  9.69it/s]

5808


4347it [07:29,  9.64it/s]

5786


4373it [07:32,  9.68it/s]

5720
5720


4400it [07:35,  9.74it/s]

5654


4471it [07:42,  9.63it/s]

5500
5500


4482it [07:43,  9.74it/s]

5478


4509it [07:46,  9.70it/s]

5412
5412


4528it [07:48,  9.68it/s]

5368


4538it [07:49,  9.61it/s]

5346
5346


4554it [07:51,  9.78it/s]

5324


4571it [07:52,  9.70it/s]

5280
5280


4584it [07:54,  9.65it/s]

5258


4593it [07:55,  9.67it/s]

5236


4625it [07:58,  9.63it/s]

5170


4630it [07:58,  9.66it/s]

5148


4678it [08:03,  9.64it/s]

5060


4741it [08:10,  9.67it/s]

4972
4972


4779it [08:14,  9.65it/s]

4928
4928


4825it [08:19,  9.66it/s]

4862


4852it [08:21,  9.73it/s]

4818


4868it [08:23,  9.71it/s]

4796


4883it [08:24,  9.81it/s]

4752
4752


4884it [08:25,  9.76it/s]

4752


4936it [08:30,  9.63it/s]

4642


4950it [08:31,  9.69it/s]

4620
4620


4971it [08:34,  9.77it/s]

4598


5034it [08:41,  9.71it/s]

4488


5057it [08:43,  9.76it/s]

4466
4466


5125it [08:50,  9.67it/s]

4400
4400


5152it [08:53,  9.69it/s]

4378


5174it [08:55,  9.66it/s]

4356


5223it [09:00,  9.72it/s]

4334


5269it [09:05,  9.68it/s]

4312
4312


5272it [09:05,  9.64it/s]

4312
4312


5352it [09:13,  9.72it/s]

4290
4290


5353it [09:14,  9.60it/s]

4290
4290


5355it [09:14,  9.72it/s]

4290


5452it [09:24,  9.62it/s]

4202
4202


5481it [09:27,  9.75it/s]

4180


5497it [09:28,  9.63it/s]

4158
4158


5498it [09:29,  9.61it/s]

4158
4158


5500it [09:29,  9.75it/s]

4158
4158


5522it [09:31,  9.74it/s]

4136


5541it [09:33,  9.57it/s]

4114
4114


5581it [09:37,  9.67it/s]

4048


5627it [09:42,  9.63it/s]

3982


5653it [09:45,  9.72it/s]

3960


5682it [09:48,  9.76it/s]

3916


5738it [09:53,  9.70it/s]

3850


5842it [10:04,  9.65it/s]

3740
3740


5845it [10:04,  9.78it/s]

3740


5899it [10:10,  9.74it/s]

3718
3718


5901it [10:10,  9.79it/s]

3718


5936it [10:14,  9.68it/s]

3696


5962it [10:16,  9.65it/s]

3674


6022it [10:23,  9.67it/s]

3630
3630


6100it [10:31,  9.65it/s]

3564
3564


6102it [10:31,  9.71it/s]

3564


6129it [10:34,  9.71it/s]

3542
3542


6191it [10:41,  9.76it/s]

3498


6213it [10:43,  9.68it/s]

3476


6280it [10:50,  9.73it/s]

3410


6306it [10:52,  9.66it/s]

3388


6329it [10:55,  9.65it/s]

3366
3366


6353it [10:57,  9.59it/s]

3344


6388it [11:01,  9.72it/s]

3322


6431it [11:05,  9.81it/s]

3278
3278


6550it [11:17,  9.74it/s]

3168


6585it [11:21,  9.71it/s]

3146


6613it [11:24,  9.65it/s]

3124


6640it [11:27,  9.70it/s]

3102
3102


6677it [11:31,  9.70it/s]

3080


6768it [11:40,  9.72it/s]

3014
3014


6804it [11:44,  9.61it/s]

2992


6830it [11:46,  9.70it/s]

2970
2970


6892it [11:53,  9.67it/s]

2926
2926


6930it [11:57,  9.56it/s]

2904


6966it [12:00,  9.60it/s]

2882


7002it [12:05,  4.80it/s]

2860
2860


7004it [12:05,  6.49it/s]

2860


7035it [12:08,  9.60it/s]

2838


7081it [12:13,  9.66it/s]

2816


7138it [12:19,  9.63it/s]

2794


7184it [12:23,  9.62it/s]

2772
2772


7218it [12:27,  9.73it/s]

2750
2750


7220it [12:27,  9.77it/s]

2750


7255it [12:31,  9.64it/s]

2728
2728


7320it [12:38,  9.72it/s]

2684
2684


7321it [12:38,  9.65it/s]

2684
2684


7356it [12:41,  9.69it/s]

2662
2662


7426it [12:48,  9.64it/s]

2618


7481it [12:54,  9.68it/s]

2574
2574


7483it [12:54,  9.65it/s]

2574
2574


7529it [12:59,  9.64it/s]

2552
2552


7532it [12:59,  9.68it/s]

2552
2552


7573it [13:04,  9.70it/s]

2530
2530


7614it [13:08,  9.67it/s]

2508
2508


7658it [13:12,  9.70it/s]

2486
2486


7659it [13:13,  9.73it/s]

2486


7712it [13:18,  9.84it/s]

2464


7749it [13:22,  9.79it/s]

2442


7797it [13:27,  9.78it/s]

2420


7837it [13:31,  9.66it/s]

2398


7879it [13:35,  9.64it/s]

2376
2376


7914it [13:39,  9.63it/s]

2354
2354


7961it [13:44,  9.75it/s]

2332
2332


7963it [13:44,  9.74it/s]

2332


8009it [13:49,  9.00it/s]

2310
2310


8048it [13:53,  9.69it/s]

2288
2288


8097it [13:58,  9.58it/s]

2266


8148it [14:03,  9.67it/s]

2244


8204it [14:09,  9.64it/s]

2222


8253it [14:14,  9.63it/s]

2200
2200


8254it [14:14,  9.56it/s]

2200
2200


8257it [14:15,  9.65it/s]

2200


8301it [14:19,  9.72it/s]

2178
2178


8354it [14:25,  9.68it/s]

2156
2156


8357it [14:25,  9.66it/s]

2156
2156


8359it [14:25,  9.68it/s]

2156
2156


8429it [14:32,  9.66it/s]

2134


8493it [14:39,  9.62it/s]

2112
2112


8593it [14:49,  9.81it/s]

2090
2090


8595it [14:49,  9.79it/s]

2090
2090


8598it [14:50,  9.72it/s]

2090


8660it [14:56,  9.81it/s]

2068
2068


8767it [15:07,  9.61it/s]

2046
2046


8770it [15:07,  9.70it/s]

2046
2046


8772it [15:08,  9.77it/s]

2046
2046


8774it [15:08,  9.73it/s]

2046
2046


8775it [15:08,  9.63it/s]

2046


8926it [15:23,  9.75it/s]

2024
2024


8927it [15:24,  9.64it/s]

2024
2024


8929it [15:24,  9.68it/s]

2024
2024


8932it [15:24,  9.69it/s]

2024
2024


8933it [15:24,  9.59it/s]

2024
2024


8935it [15:24,  9.60it/s]

2024
2024


9060it [15:38,  9.68it/s]

2002
2002


9063it [15:38,  9.69it/s]

2002
2002


9125it [15:45,  9.75it/s]

1980


9195it [15:52,  9.79it/s]

1958
1958


9197it [15:52,  9.75it/s]

1958
1958


9198it [15:52,  9.80it/s]

1958


9295it [16:02,  9.71it/s]

1936
1936


9297it [16:02,  9.75it/s]

1936
1936


9371it [16:10,  9.64it/s]

1914
1914


9456it [16:19,  9.75it/s]

1892
1892


9459it [16:19,  9.68it/s]

1892
1892


9536it [16:27,  9.76it/s]

1870
1870


9537it [16:27,  9.73it/s]

1870
1870


9540it [16:27,  9.61it/s]

1870


9630it [16:37,  9.74it/s]

1848


9693it [16:43,  9.73it/s]

1826
1826


9694it [16:43,  9.68it/s]

1826


9750it [16:49,  9.65it/s]

1804
1804


9751it [16:49,  9.60it/s]

1804
1804


9753it [16:49,  9.76it/s]

1804
1804


9813it [16:55,  9.72it/s]

1782


9867it [17:01,  9.74it/s]

1760
1760


9869it [17:01,  9.76it/s]

1760


9937it [17:08,  9.75it/s]

1738
1738


9939it [17:08,  9.74it/s]

1738
1738


9940it [17:08,  9.64it/s]

1738


10011it [17:16,  9.29it/s]

1716
1716


10014it [17:17,  9.55it/s]

1716
1716


10091it [17:24,  9.62it/s]

1694
1694


10092it [17:25,  9.63it/s]

1694
1694


10094it [17:25,  9.65it/s]

1694


10160it [17:32,  9.68it/s]

1672
1672


10162it [17:32,  9.74it/s]

1672


10249it [17:41,  9.69it/s]

1650
1650


10250it [17:41,  9.65it/s]

1650
1650


10318it [17:48,  9.80it/s]

1628
1628


10320it [17:48,  9.85it/s]

1628
1628


10322it [17:48,  9.88it/s]

1628
1628


10493it [18:06,  9.73it/s]

1584
1584


10495it [18:06,  9.72it/s]

1584
1584


10497it [18:06,  9.75it/s]

1584


10581it [18:15,  9.65it/s]

1562
1562


10583it [18:15,  9.69it/s]

1562
1562


10584it [18:15,  9.66it/s]

1562
1562


10685it [18:26,  9.69it/s]

1540
1540


10783it [18:36,  9.76it/s]

1518
1518


10784it [18:36,  9.69it/s]

1518
1518


10787it [18:36,  9.69it/s]

1518


10922it [18:50,  9.84it/s]

1496
1496


10924it [18:50,  9.84it/s]

1496
1496


11102it [19:09,  9.72it/s]

1474
1474


11103it [19:09,  9.65it/s]

1474
1474


11105it [19:09,  9.73it/s]

1474
1474


11107it [19:09,  9.65it/s]

1474


11224it [19:21,  9.67it/s]

1452
1452


11227it [19:22,  9.77it/s]

1452
1452


11352it [19:35,  9.72it/s]

1430
1430


11355it [19:35,  9.66it/s]

1430
1430


11356it [19:35,  9.56it/s]

1430
1430


11359it [19:35,  9.66it/s]

1430
1430


11361it [19:35,  9.65it/s]

1430
1430


11362it [19:36,  9.66it/s]

1430
1430


11528it [19:53,  9.72it/s]

1408
1408


11529it [19:53,  9.66it/s]

1408
1408


11532it [19:53,  9.70it/s]

1408
1408


11533it [19:53,  9.67it/s]

1408
1408


11735it [20:14,  9.60it/s]

1386
1386


11736it [20:14,  9.59it/s]

1386
1386


11738it [20:14,  9.66it/s]

1386
1386


11741it [20:15,  9.64it/s]

1386
1386


11742it [20:15,  9.61it/s]

1386


11932it [20:34,  9.64it/s]

1364
1364


11933it [20:35,  9.58it/s]

1364
1364


11936it [20:35,  9.56it/s]

1364
1364


11937it [20:35,  9.64it/s]

1364
1364


11940it [20:35,  9.62it/s]

1364
1364


11941it [20:35,  9.56it/s]

1364


12000it [20:42,  9.66it/s]


Total models collected: 11673


In [None]:
import os
import time
import shutil
import pandas as pd
OUTPUT_CSV = "hf_models.csv"
OUTPUT_DIR = "model_cards"  # Directory to save model files
ACCESS_TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN") 
from huggingface_hub import HfApi, HfFolder
from tqdm import tqdm


df = pd.read_csv(OUTPUT_CSV)

# Initialize API
api = HfApi()

# Optional: Function to download model files
def download_model_files(model_ids):
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for model_id in tqdm(model_ids):

        #check if readme already exists
        if os.path.exists(f"{OUTPUT_DIR}/{model_id.replace('/', '_')}.md"):
            #print(f"Skipping {model_id} as README.md already exists")
            continue

        #print(f"Downloading files for model: {model_id}")
        #files = api.list_repo_files(repo_id=model_id, use_auth_token=ACCESS_TOKEN)

        # Create a directory for each model
        model_dir = os.path.join(OUTPUT_DIR, model_id.replace("/", "_"))
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        #downlaod it as {model_id}.md
        try:
            api.hf_hub_download(
                repo_id=model_id,
                filename='README.md',
                local_dir=model_dir,
                use_auth_token=ACCESS_TOKEN,
            )
        except Exception as e:
            #print(f"Error downloading README.md for model {model_id}: {e}")
            continue
        #move the file to OUTPUT_DIR/{model_id}.md
        model_id = model_id.replace("/", "_")
        os.rename(f"{model_dir}/README.md", f"{OUTPUT_DIR}/{model_id}.md")
        #delete folder and contents# folder may still have files
        
        shutil.rmtree(model_dir)


        # Respect rate limits
        time.sleep(0.1)
download_model_files(df.modelId.values.tolist())

In [1]:
import pandas as pd
repos_df = pd.read_csv("hf_models.csv")
repos_df.head(20)
readme_text = []

#for every repo in the csv, get the readme
for index,row in repos_df.iterrows():
    repo_full_name = row["modelId"]
    safe_name = repo_full_name.replace("/", "_")

    #open the readme file
    try:
        with open(f"model_cards/{safe_name}.md", "r", encoding="utf-8") as f:
            readme = f.read()
            readme_text.append(readme)
    except:
        readme_text.append(None)
repos_df["readme_text"] = readme_text
repos_df = repos_df.dropna(subset=["readme_text"])

In [2]:
import re
import numpy as np

#count the occurences of "^- " of each readme
lists_count = repos_df["readme_text"].str.count(r"^- ", flags=re.MULTILINE)
#log then normalize it
lists_count = np.log1p(1 + lists_count)
lists_count = (lists_count - lists_count.min()) / (lists_count.max() - lists_count.min())
repos_df["lists_count"] = lists_count

#count hypperlinks
hyperlinks_count = repos_df["readme_text"].str.count(r"http[s]?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", flags=re.MULTILINE)
#log then normalize it
hyperlinks_count = np.log1p(1 + hyperlinks_count)
hyperlinks_count = (hyperlinks_count - hyperlinks_count.min()) / (hyperlinks_count.max() - hyperlinks_count.min())
repos_df["hyperlinks_count"] = hyperlinks_count

#number of images ending in .png, .jpg, .jpeg, .gif (url or local path) in .md ir .rst
#[image] (https://example.com/image.png)
#image:: https://example.com/image.png
#<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>
images_count = repos_df["readme_text"].str.count(r"!\[.*\]\(.*\.(png|jpg|jpeg|gif|svg)\)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"image::.*\.(png|jpg|jpeg|gif|svg)", flags=re.MULTILINE) + \
            repos_df["readme_text"].str.count(r"<.*src.*=.*\.(png|jpg|jpeg|gif|svg).*>", flags=re.MULTILINE)
#log then normalize it
images_count = np.log1p(1 + images_count)
images_count = (images_count - images_count.min()) / (images_count.max() - images_count.min())
repos_df["images_count"] = images_count

#number of code blocks divide by 2 and round down
code_blocks_count = (repos_df["readme_text"].str.count(r"^```", flags=re.MULTILINE) // 2).astype(int)
#log then normalize it
code_blocks_count = np.log1p(1 + code_blocks_count)
code_blocks_count = (code_blocks_count - code_blocks_count.min()) / (code_blocks_count.max() - code_blocks_count.min())
repos_df["code_blocks_count"] = code_blocks_count

#number of new lines with content
content_lines_count = repos_df["readme_text"].str.count(r"^.*[^\s]", flags=re.MULTILINE)
#log then normalize it
content_lines_count = np.log1p(1 + content_lines_count)
content_lines_count = (content_lines_count - content_lines_count.min()) / (content_lines_count.max() - content_lines_count.min())
repos_df["content_lines_count"] = content_lines_count


# Section keyword search
# start with #
# 1 if contains atleast 1 keyword, 0 otherwise


#Contributing section
contributing_keywords = ["contribut"]
repos_df[f"keyword_contributing"] = False
for keyword in contributing_keywords:
    repos_df[f"keyword_contributing"] = repos_df[f"keyword_contributing"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)

    

# Getting Started section
getting_started_keywords = ["getting started", "installation", "quick start", "quickstart", "setup", "usage", "example", "examples", "demo", "demos"]
repos_df[f"keyword_getting_started"] = False
for keyword in getting_started_keywords:
    repos_df[f"keyword_getting_started"] = repos_df[f"keyword_getting_started"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)

# License section
license_keywords = ["license"]  
repos_df[f"keyword_license"] = False
for keyword in license_keywords:
    repos_df[f"keyword_license"] = repos_df[f"keyword_license"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)


# Flesch Reading Score
#textstat.flesch_reading_ease(test_data)
import textstat
flesch_reading_scores = repos_df["readme_text"].apply(lambda x: textstat.flesch_reading_ease(x) if x else None)
#normalize
flesch_reading_scores = (flesch_reading_scores - flesch_reading_scores.min()) / (flesch_reading_scores.max() - flesch_reading_scores.min())
repos_df["flesch_reading_scores"] = flesch_reading_scores


# number of headers
headers_count = repos_df["readme_text"].str.count(r"#+", flags=re.MULTILINE)
#log then normalize
headers_count = np.log1p(1 + headers_count)
headers_count = (headers_count - headers_count.min()) / (headers_count.max() - headers_count.min())
repos_df["headers_count"] = headers_count

KeyError: 'contributing'

In [3]:

#Contributing section
contributing_keywords = ["contribut"]
repos_df[f"keyword_contributing"] = False
for keyword in contributing_keywords:
    repos_df[f"keyword_contributing"] = repos_df[f"keyword_contributing"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)

    

# Getting Started section
getting_started_keywords = ["getting started", "installation", "quick start", "quickstart", "setup", "usage", "example", "examples", "demo", "demos"]
repos_df[f"keyword_getting_started"] = False
for keyword in getting_started_keywords:
    repos_df[f"keyword_getting_started"] = repos_df[f"keyword_getting_started"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)

# License section
license_keywords = ["license"]  
repos_df[f"keyword_license"] = False
for keyword in license_keywords:
    repos_df[f"keyword_license"] = repos_df[f"keyword_license"] | repos_df["readme_text"].str.contains(f"#+ .*{keyword}", flags=re.IGNORECASE)


# Flesch Reading Score
#textstat.flesch_reading_ease(test_data)
import textstat
flesch_reading_scores = repos_df["readme_text"].apply(lambda x: textstat.flesch_reading_ease(x) if x else None)
#normalize
flesch_reading_scores = (flesch_reading_scores - flesch_reading_scores.min()) / (flesch_reading_scores.max() - flesch_reading_scores.min())
repos_df["flesch_reading_scores"] = flesch_reading_scores


# number of headers
headers_count = repos_df["readme_text"].str.count(r"#+", flags=re.MULTILINE)
#log then normalize
headers_count = np.log1p(1 + headers_count)
headers_count = (headers_count - headers_count.min()) / (headers_count.max() - headers_count.min())
repos_df["headers_count"] = headers_count

# Dependant Extraction

In [87]:
repos_df["headers_count"]

0        0.747774
1        0.496783
2        0.409839
3        0.082473
4        0.496783
           ...   
11668    0.346753
11669    0.346753
11670    0.254817
11671    0.364451
11672    0.395806
Name: headers_count, Length: 10608, dtype: float64

In [35]:
downloads_normalized = np.log1p(1 + repos_df["downloads"])
repos_df["downloads_normalized"] = (downloads_normalized - downloads_normalized.min()) / (downloads_normalized.max() - downloads_normalized.min())


# Get Statistics

In [89]:
#Independant Columns normalized between 0 and 1
count_independent_columns = ["lists_count", "hyperlinks_count", "images_count", "code_blocks_count", "content_lines_count", "headers_count"]
#score Indecpend column
score_independent_columns = ["flesch_reading_scores"]
#keyword independant columns#binary 0 or 1
keyword_independent_columns = ["keyword_contributing", "keyword_getting_started", "keyword_license"]
#
dependant_columns = ["downloads_normalized"]


independent_columns = count_independent_columns + score_independent_columns + keyword_independent_columns


score_dict={"counts":{ column:{} for column in count_independent_columns}
            ,
       "keyword":{ column:{} for column in keyword_independent_columns
       },
       "scores":{ column:{} for column in score_independent_columns
       }}


repos_df = repos_df.dropna(subset=independent_columns + dependant_columns)
# Pearson correlation
from sklearn.feature_selection import r_regression

for dependant in dependant_columns[-1:]:
    score = r_regression(repos_df[independent_columns].values, repos_df[dependant].values)

    for i, column in enumerate(independent_columns):
        print(f"Pearson correlation between {column} and {dependant}: {round(score[i],3)}")

        if column in count_independent_columns:
            score_dict["counts"][column]['Pearsons'] = round(score[i],3)
        elif column in keyword_independent_columns:
            score_dict["keyword"][column]['Pearsons'] = round(score[i],3)
        elif column in score_independent_columns:
            score_dict["scores"][column]['Pearsons'] = round(score[i],3)
        


# Cliffs Delta


#Fishers Exact Test
# import scipy.stats as stats

# odd_ratio, p_value = stats.fisher_exact([repos_df["content_lines_count"], repos_df["stargazers_count_normalized"]])


#logistic regression feature important extraction
from sklearn.linear_model import LinearRegression

X = repos_df[independent_columns].values
y = repos_df["downloads_normalized"].values

clf = LinearRegression().fit(X, y)
clf.coef_  # logistic regression feature importance extraction

for i, column in enumerate(independent_columns):
    print(f"Linear regression coefficient for {column}: {round(clf.coef_[i],3)}")

    if column in count_independent_columns:
        score_dict["counts"][column]['LinearRegression'] = round(clf.coef_[i],3)
    elif column in keyword_independent_columns:
        score_dict["keyword"][column]['LinearRegression'] = round(clf.coef_[i],3)
    elif column in score_independent_columns:
        score_dict["scores"][column]['LinearRegression'] = round(clf.coef_[i],3)


# random forest feature importance extraction

from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor().fit(X, y)
clf.feature_importances_  # random forest feature importance extraction

for i, column in enumerate(independent_columns):
    print(f"Random forest feature importance for {column}: {round(clf.feature_importances_[i],3)}")

    if column in count_independent_columns:
        score_dict["counts"][column]["Random Forest"] = round(clf.feature_importances_[i],3)
    elif column in keyword_independent_columns:
        score_dict["keyword"][column]["Random Forest"] = round(clf.feature_importances_[i],3)
    elif column in score_independent_columns:
        score_dict["scores"][column]["Random Forest"] = round(clf.feature_importances_[i],3)


Pearson correlation between lists_count and downloads_normalized: 0.06
Pearson correlation between hyperlinks_count and downloads_normalized: 0.056
Pearson correlation between images_count and downloads_normalized: -0.034
Pearson correlation between code_blocks_count and downloads_normalized: 0.166
Pearson correlation between content_lines_count and downloads_normalized: 0.129
Pearson correlation between headers_count and downloads_normalized: 0.088
Pearson correlation between flesch_reading_scores and downloads_normalized: 0.108
Pearson correlation between keyword_contributing and downloads_normalized: -0.038
Pearson correlation between keyword_getting_started and downloads_normalized: 0.063
Pearson correlation between keyword_license and downloads_normalized: 0.064
Linear regression coefficient for lists_count: 0.016
Linear regression coefficient for hyperlinks_count: 0.001
Linear regression coefficient for images_count: -0.055
Linear regression coefficient for code_blocks_count: 0.1

In [90]:
# convert to pands dataframe with mulit index
# rows is the score type
# columns is the feature (with subcolumns for each method)

score_df = pd.DataFrame()
for score_type, score_data in score_dict.items():
    df_sub = pd.DataFrame(score_data).T
    df_sub = df_sub.reset_index()
    df_sub.insert(0, "score_type", score_type)
    
    score_df = pd.concat([score_df, df_sub], axis=0)
score_df.groupby(["score_type",'index']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Pearsons,LinearRegression,Random Forest
score_type,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
counts,code_blocks_count,0.166,0.132,0.086
counts,content_lines_count,0.129,0.075,0.212
counts,headers_count,0.088,-0.047,0.114
counts,hyperlinks_count,0.056,0.001,0.133
counts,images_count,-0.034,-0.055,0.032
counts,lists_count,0.06,0.016,0.134
keyword,keyword_contributing,-0.038,-0.063,0.006
keyword,keyword_getting_started,0.063,0.005,0.019
keyword,keyword_license,0.064,0.013,0.012
scores,flesch_reading_scores,0.108,0.2,0.252


Unnamed: 0_level_0,Unnamed: 1_level_0,Pearsons,LinearRegression,Random Forest
score_type,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
counts,code_blocks_count,0.166,0.132,0.087
counts,content_lines_count,0.129,0.075,0.215
counts,hyperlinks_count,0.056,0.001,0.133
counts,images_count,-0.034,-0.055,0.031
counts,lists_count,0.06,0.016,0.133
keyword,keyword_contributing,-0.038,-0.063,0.007
keyword,keyword_getting_started,0.063,0.005,0.019
keyword,keyword_license,0.064,0.013,0.011
scores,flesch_reading_scores,0.108,0.2,0.251


In [62]:
score_type

'scores'