In [57]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from datetime import datetime

# Find likes and donwloads

In [3]:
huggingface_languages_link = 'https://huggingface.co/languages'

In [151]:
def extract_huggingface_languages(url):
    html_text = requests.get(url).text
    soup = BeautifulSoup(html_text, 'html.parser')
    # Initialize lists to store extracted data
    languages = []
    iso_codes = []
    datasets_links = []
    models_links = []
    datasets_numbers = []
    model_numbers = []

    # Extract data from each row of the table
    for row in soup.find_all('tr'):
        cols = row.find_all('td')
        if len(cols) > 0:  # Ensure it's not the header row
            language = cols[0].get_text(strip=True)
            iso_code = cols[1].find('code').get_text(strip=True)
            datasets_link = "https://huggingface.co" + cols[2].find('a')['href']
            models_link = "https://huggingface.co" + cols[3].find('a')['href']
            datasets_number = int(cols[2].find('a').get_text(strip=True).replace(',', ''))
            model_number = int(cols[3].find('a').get_text(strip=True).replace(',', ''))
            
            # Append data to lists
            languages.append(language)
            iso_codes.append(iso_code)
            datasets_links.append(datasets_link)
            models_links.append(models_link)
            datasets_numbers.append(datasets_number)
            model_numbers.append(model_number)

    # Create DataFrame
    df = pd.DataFrame({
        'Language': languages,
        'ISO Code': iso_codes,
        'Datasets Link': datasets_links,
        'Models Link': models_links,
        'Datasets Number': datasets_numbers,
        'Model Number': model_numbers
    })
    return df

def process_huggingface_datasets(dataset_url):
    dataset_request = requests.get(dataset_url).text
    dataset_soup = BeautifulSoup(dataset_request, 'html.parser')
    # Extract text and href attributes for all articles
    articles_data = []
    articles = dataset_soup.find_all('article')
    for article in articles:
        text = article.h4.text.strip()
        href = article.a['href']
        articles_data.append({'dataset': text, 'dataset_url': href})
        

    # Convert to DataFrame
    df = pd.DataFrame(articles_data)
    return df 

In [152]:
df = extract_huggingface_languages(huggingface_languages_link)

In [153]:
df

Unnamed: 0,Language,ISO Code,Datasets Link,Models Link,Datasets Number,Model Number
0,EnglishEnglish,en,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=en,7816,34499
1,Chinese中文,zh,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=zh,924,3496
2,FrenchFrançais,fr,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=fr,824,3240
3,SpanishEspañol,es,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=es,646,2525
4,RussianРусский,ru,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=ru,619,1752
...,...,...,...,...,...,...
2041,Southern Luri,luz,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=luz,1,14
2042,Argentine Sign Language,aed,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=aed,1,4
2043,Simte,smt,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=smt,1,15
2044,Chilean Sign Language,csg,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=csg,1,3


### Tests on finding datasets for one language

In [154]:
dataset_stats = process_huggingface_datasets('https://huggingface.co/datasets?language=language:ofs')

In [155]:
dataset_stats

Unnamed: 0,dataset,dataset_url
0,tatoeba,/datasets/tatoeba
1,lbourdois/language_tags,/datasets/lbourdois/language_tags
2,lbourdois/panlex,/datasets/lbourdois/panlex


# Working with more datasets and more languages

In [156]:
def find_huggingface_datasets(dataset_url, df = None):
    '''
    This function recursively retrieves dataset URLs from the Hugging Face website, starting from a given URL. 
    It iterates through all pages, extracts dataset names and URLs, and returns a list of dataset URLs.
    '''
    if df is None:
        df = pd.DataFrame(columns=['dataset', 'dataset_url'])

    dataset_request = requests.get(dataset_url).text
    dataset_soup = BeautifulSoup(dataset_request, 'html.parser')
    # Extract text and href attributes for all articles
    articles_data = []
    articles = dataset_soup.find_all('article')
    for article in articles:
        text = article.h4.text.strip()
        href = article.a['href']
        articles_data.append({'dataset': text, 'dataset_url': href})
        

    # Convert to DataFrame
    temp_df = pd.DataFrame(articles_data)
    print('first_df!')
    df = pd.concat([df, temp_df], ignore_index=True)

    sections = dataset_soup.find_all('section')[-1]
    next_page_link = sections.find('nav').find_all('a')[-1].get('href')
    if next_page_link: 
        next_page_url = f'https://huggingface.co/datasets{next_page_link}'
        return find_huggingface_datasets(next_page_url, df)
    else: 
        print('language_done!')
        return df['dataset_url'].tolist()
    

def find_earliest_commit(model_name):
    '''
    This function retrieves the commit history of a model from the Hugging Face website. 
    It identifies the earliest commit timestamp if available.
    '''
    soup = BeautifulSoup(requests.get(f'https://huggingface.co{model_name}').text, 'html.parser')
    hrefs = [a['href'] for a in soup.find_all('a', class_ = 'tab-alternate')]
    commits_href = ['https://huggingface.co' + url for url in hrefs if '/tree/main' in url]
    if commits_href:
        commits_href_history = commits_href[0].replace('tree', 'commits')
        commits_soup = BeautifulSoup(requests.get(commits_href_history).text, 'html.parser')
        time_elements = commits_soup.find_all('time')
        datetimes = [time['datetime'] for time in time_elements]
        processed_datetime = [datetime.fromisoformat(dt_str) for dt_str in datetimes]
        if processed_datetime:
            return min(processed_datetime)
    else:
        print('commits_not_found')
        return 'nothing'
    

def find_earliest_datetime_pls(lst):
    datetimes = [item for item in lst if isinstance(item, datetime)]
    earliest_datetime = min(datetimes) if datetimes else None
    return earliest_datetime


# Find the tab alternate with tree main - from that one get the oldest of all dates. 

# Not storing anything for now! 

## What if at the beginning they were all uploaded together?

## Try on small dataset for testing

In [81]:
df_small = df[100:105]

In [82]:
df_small

Unnamed: 0,Language,ISO Code,Datasets Link,Models Link,Datasets Number,Model Number
100,LuxembourgishLëtzebuergesch,lb,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=lb,56,273
101,Yue Chinese,yue,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=yue,55,134
102,Central Kurdish,ckb,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=ckb,55,72
103,Asturian,ast,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=ast,55,83
104,Scottish GaelicGàidhlig,gd,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=gd,54,233


In [109]:
df_small['all_datasets'] = df_small['Datasets Link'].apply(lambda url: find_huggingface_datasets(url))

first_df!
first_df!
language_done!
first_df!
first_df!
language_done!
first_df!
first_df!
language_done!
first_df!
first_df!
language_done!
first_df!
first_df!
language_done!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['all_datasets'] = df_small['Datasets Link'].apply(lambda url: find_huggingface_datasets(url))


In [110]:
df_small

Unnamed: 0,Language,ISO Code,Datasets Link,Models Link,Datasets Number,Model Number,all_datasets
100,LuxembourgishLëtzebuergesch,lb,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=lb,56,273,"[/datasets/CohereForAI/xP3x, /datasets/wikimed..."
101,Yue Chinese,yue,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=yue,55,134,"[/datasets/CohereForAI/xP3x, /datasets/wikimed..."
102,Central Kurdish,ckb,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=ckb,55,72,"[/datasets/CohereForAI/xP3x, /datasets/wikimed..."
103,Asturian,ast,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=ast,55,83,"[/datasets/CohereForAI/xP3x, /datasets/wikimed..."
104,Scottish GaelicGàidhlig,gd,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=gd,54,233,"[/datasets/CohereForAI/xP3x, /datasets/wikimed..."


# Try on even smaller data for date finding

In [118]:
df_sample = df_small[0:1]

In [157]:
df_sample['earliest_update'] = df_sample['all_datasets'].apply(lambda x: [find_earliest_commit(item) for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['earliest_update'] = df_sample['all_datasets'].apply(lambda x: [find_earliest_commit(item) for item in x])


In [158]:
df_sample

Unnamed: 0,Language,ISO Code,Datasets Link,Models Link,Datasets Number,Model Number,all_datasets,earliest_update,earliest_datetime
100,LuxembourgishLëtzebuergesch,lb,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=lb,56,273,"[/datasets/CohereForAI/xP3x, /datasets/wikimed...","[2023-05-25 21:13:23, 2023-11-21 22:11:34, 202...",2021-03-08 20:57:23


In [159]:
df_sample['earliest_datetime'] = df_sample['earliest_update'].apply(find_earliest_datetime_pls)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample['earliest_datetime'] = df_sample['earliest_update'].apply(find_earliest_datetime_pls)


In [160]:
df_sample

Unnamed: 0,Language,ISO Code,Datasets Link,Models Link,Datasets Number,Model Number,all_datasets,earliest_update,earliest_datetime
100,LuxembourgishLëtzebuergesch,lb,https://huggingface.co/datasets?language=langu...,https://huggingface.co/models?language=lb,56,273,"[/datasets/CohereForAI/xP3x, /datasets/wikimed...","[2023-05-25 21:13:23, 2023-11-21 22:11:34, 202...",2021-03-08 20:57:23


In [150]:
        #div = article.find('div').get_text(strip=True)
        #this part does not work very well yet...(the downloads and likes one) - also: they are already sorted by trending....
        #likes = re.findall(r'\b\d+\b', div)[0]
        #downloads = re.findall(r'\b\d+\b', div)[1]

In [106]:
def extract_next_page_url(soup):
    sections = soup.find_all('section')[-1]
    if sections:
        next_page_link = last_section.find('nav').find_all('a')[-1].get('href')
    return next_page_link

In [115]:
def find_data_per_language(language_url):
    language_file = requests.get(language_url)
    if language_file.status_code == 200:
        soup = BeautifulSoup(language_file.text, 'html.parser')
        next_page_url = extract_next_page_url(soup) 
        while next_page_url:
            next_file = requests.get("https://huggingface.co" + next_page_url)
            if language_file.status_code == 200:
                new_soup = BeautifulSoup(language_file.text, 'html.parser')
                return new_soup
            else:
                print('Error: Unable to fetch language URL:', language_url)
                break