In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

## ELAR

In [3]:
# URL of the ELAR archive page that the scraping cell downloads and parses.
# Adjacent string literals are concatenated at compile time, so the value is
# one unbroken URL.
ELAR_archive = (
    'https://www.elararchive.org/uncategorized/'
    'SO_5f038640-311d-4296-a3e9-502e8a18f5b7/'
)

In [4]:
# Download the ELAR page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of letting an error page be parsed as data.
response = requests.get(ELAR_archive, timeout=30)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

# Find all <div class="facet-group"> elements and keep those whose
# "facet-name" attribute contains 'language'.
facet_titles = soup.find_all('div', class_='facet-group')
language_facets = [facet for facet in facet_titles if 'language' in facet.get('facet-name', '')]

input_ids = []

# Loop through each language facet group to find its associated <fieldset>
for facet_title in language_facets:
    # find_next() searches forward in document order, so this is the first
    # <fieldset> that appears after the facet group's opening tag.
    fieldset_tag = facet_title.find_next('fieldset')

    if fieldset_tag:
        # Find all <input> tags within this <fieldset> tag
        input_tags = fieldset_tag.find_all('input')
        # Extract and store ids; inputs without an id attribute are skipped
        # so the result never contains None.
        ids = [
            input_tag.get('id')
            for input_tag in input_tags
            if input_tag.get('id') is not None
        ]
        input_ids.extend(ids)

In [5]:
# Bare last expression: the notebook renders the collected list of <input> ids.
input_ids

['!Xun',
 "'Olekha",
 "=X'ao-||'aen",
 'Abom',
 'Abui',
 'Adyumba',
 'Ahamb',
 'A Hou',
 'Ainu',
 'Aisi',
 'Ajumbu',
 'Akan',
 'Akha',
 'Akuntsú',
 'Alipur Village Sign Language (AVSL)',
 'Allang',
 'Ambel',
 'Amurdak',
 'Anal',
 'Aneityum, Sie, Raga (Hano), Namakura, Southwest Tanna, North Tanna',
 'Animere',
 'Anindilyakwa',
 'Antia Whistling Language',
 'Apurinã',
 'Arammba',
 'Arandic',
 'Arandic      ',
 'Araona',
 'Arapaho',
 'Arawak',
 'Archaic Akha',
 'Archi',
 'Aren',
 'Aro',
 'Arta',
 'Asheninka Perene',
 'Asimjeeg Datooga',
 'Asur',
 'Atchin (Uripiv-Wala-Rano-Atchin)',
 'Auslan',
 'Australian Irish Sign Language',
 'Avatime',
 'Awiakay',
 'Awu Alaya',
 'Ayere',
 'Ayoreo',
 'Ayuru',
 'Ayutla Mixe',
 'Baa',
 "Baba'1",
 'Babanki Ritual Speech',
 'Badaga',
 'Bafia',
 'Bafut',
 'Baga Mandori',
 'Bainouk',
 'Bajjika',
 'Balochi',
 'Baluchi',
 'Banam Bay Area Language',
 'Baram',
 'Brahui',
 'Buu',
 "Cha'palaa",
 'Chatino Sign Language, San Juan Quiahije',
 'Cofán',
 'Dalabon',
 'D

## ELAN

In [23]:
# Read the whole of elan.html into a single UTF-8 decoded string.
with open("elan.html", mode="r", encoding="utf-8") as elan_file:
    elan_ul = elan_file.read()

In [25]:
# Parse the HTML content read from elan.html
soup = BeautifulSoup(elan_ul, "html.parser")

# Find all <li> tags
li_tags = soup.find_all("li")

# Parallel lists that become the DataFrame columns
names = []
hrefs = []
counts = []

# Extract name, href, and count from each <li> tag
for li in li_tags:
    a_tag = li.find("a")
    count_tag = li.find("span", class_="count")
    # find() returns None when nothing matches. Skip items that lack a link,
    # an href, or a count span rather than crashing with AttributeError /
    # KeyError part-way through the list.
    if a_tag is None or count_tag is None or a_tag.get("href") is None:
        continue
    name = a_tag.get_text(strip=True)
    href = "https://archive.mpi.nl" + a_tag["href"]
    # The count text is wrapped in parentheses, e.g. "(1409)"
    count = int(count_tag.get_text(strip=True).strip("()"))
    names.append(name)
    hrefs.append(href)
    counts.append(count)

# Create a DataFrame. `elan_df` keeps this result under its own name so that
# a later cell reassigning `df` does not discard it; `df` is still bound for
# any code that relies on the original name.
elan_df = pd.DataFrame({"Name": names, "Href": hrefs, "Count": counts})
df = elan_df

print(df)

                                         Name  \
0                          Jakarta Indonesian   
1           Jakarta Indonesian Child Language   
2                       Turkish Sign Language   
3    Jakarta Indonesian Child-Directed Speech   
4                             infant babbling   
..                                        ...   
785           Zapotec, Santa Maria Albarradas   
786              Zapotec, Santiago Lachiguiri   
787                      Zapotec, Suchixtepec   
788                     Zapotec, Tabaa Elodia   
789                            Zapotec, Talea   

                                                  Href  Count  
0    https://archive.mpi.nl/tla/islandora/object/la...   1409  
1    https://archive.mpi.nl/tla/islandora/object/la...   1338  
2    https://archive.mpi.nl/tla/islandora/object/la...   1232  
3    https://archive.mpi.nl/tla/islandora/object/la...   1183  
4    https://archive.mpi.nl/tla/islandora/object/la...   1157  
..                         

## DOBES

In [28]:
# Read the whole of dobes.html into a single UTF-8 decoded string.
with open("dobes.html", mode="r", encoding="utf-8") as dobes_file:
    dobes_ul = dobes_file.read()

In [29]:
# Parse the HTML content read from dobes.html
dobes_soup = BeautifulSoup(dobes_ul, "html.parser")

# Find all <li> tags
li_tags = dobes_soup.find_all("li")

# Parallel lists that become the DataFrame columns
names = []
hrefs = []
counts = []

# Extract name, href, and count from each <li> tag
for li in li_tags:
    a_tag = li.find("a")
    count_tag = li.find("span", class_="count")
    # find() returns None when nothing matches. Skip items that lack a link,
    # an href, or a count span rather than crashing with AttributeError /
    # KeyError part-way through the list.
    if a_tag is None or count_tag is None or a_tag.get("href") is None:
        continue
    name = a_tag.get_text(strip=True)
    href = "https://archive.mpi.nl" + a_tag["href"]
    # The count text is wrapped in parentheses, e.g. "(2883)"
    count = int(count_tag.get_text(strip=True).strip("()"))
    names.append(name)
    hrefs.append(href)
    counts.append(count)

# Create a DataFrame. `dobes_df` keeps this result under its own name so it
# is not lost if `df` is reassigned elsewhere; `df` is still bound for any
# code that relies on the original name.
dobes_df = pd.DataFrame({"Name": names, "Href": hrefs, "Count": counts})
df = dobes_df

print(df)

            Name                                               Href  Count
0        English  https://archive.mpi.nl/tla/islandora/object/tl...   2883
1        Spanish  https://archive.mpi.nl/tla/islandora/object/tl...   2738
2       Yurakaré  https://archive.mpi.nl/tla/islandora/object/tl...   1645
3         Beaver  https://archive.mpi.nl/tla/islandora/object/tl...   1042
4     Portuguese  https://archive.mpi.nl/tla/islandora/object/tl...    925
..           ...                                                ...    ...
301      Tzeltal  https://archive.mpi.nl/tla/islandora/object/tl...      1
302        Waurá  https://archive.mpi.nl/tla/islandora/object/tl...      1
303  Wára (Wära)  https://archive.mpi.nl/tla/islandora/object/tl...      1
304   Yawalapití  https://archive.mpi.nl/tla/islandora/object/tl...      1
305         |Gwi  https://archive.mpi.nl/tla/islandora/object/tl...      1

[306 rows x 3 columns]
