# Original Scraper File
Before realizing that Glottolog provides a link to the relevant Wikipedia page for each language, I began writing this web scraper to obtain information from the language pages listed on Wikipedia's https://en.wikipedia.org/wiki/List_of_language_names page. That page lists commonly known languages and links to their articles, but it does not cover the complete set of languages.

In [3]:
import pandas as pd 
import requests
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

In [5]:
# Links to the pages for each language, scraped from Wikipedia:
    # def get_links(): 
    #     response = requests.get('https://en.wikipedia.org/wiki/List_of_language_names')
    #     soup = BeautifulSoup(response.content, 'html.parser')
    #     links = soup.find_all('a')
    #     # links = [link.get('href') for link in links if (link.get('title') is not None) and ('language' in link.get('title'))]
    #     links = [link.get('href') for link in links if (link.get('title') is not None)]
    #     # links = [link for link in links if link.endswith('_language')]
    #     links = ["https://en.wikipedia.org"+link for link in links]
    #     return links

# Updated solution: the links to each language's Wikipedia page come from the
# Glottolog export instead of being scraped from Wikipedia's list page.
glottolog_info = pd.read_csv('../csv_files/glottolog_data.csv')

In [6]:
# The following function finds the index of the headers in the infobox of the Wikipedia page for a given language, allowing 
# us to extract the information we need from the page using the headers as keys.

def find_index_of_headers(url):
    """Map each infobox row label on a page to its position in the infobox.

    Downloads ``url``, locates the first ``<table class="infobox">`` and
    numbers its ``<th class="infobox-label">`` cells in document order.

    Args:
        url: Address of the page to download.

    Returns:
        A dict mapping the NFKD-normalized text of each label cell to its
        zero-based position among the label cells. If the same label occurs
        more than once, the last position wins. The dict is empty when the
        page contains no infobox table.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    infobox = soup.find('table', {'class': 'infobox'})

    # Pages without an infobox have nothing to index; report "no headers"
    # rather than failing on a None table.
    if infobox is None:
        return {}

    # Collect the label cells once instead of re-querying the table on every
    # loop iteration.
    labels = infobox.find_all('th', {'class': 'infobox-label'})

    # NFKD normalization folds compatibility characters (e.g. non-breaking
    # spaces) so the labels can be matched against plain string keys.
    return {
        unicodedata.normalize('NFKD', label.getText()): position
        for position, label in enumerate(labels)
    }


# Every Wikipedia link from the Glottolog table, cast to str.
# NOTE(review): presumably missing links become the literal string 'nan' after
# this cast, which is the value the scraping loop skips - confirm.
urls = glottolog_info['Wikipedia_Url'].astype(str)

In [None]:
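# The following code block scrapes the Wikipedia page for each language in the list of links, extracting the infobox
# fields (name, family, dialects, language codes, speakers, regions, official status) and saving them to a CSV file.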

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

data = []    # one tuple of extracted infobox fields per successfully scraped page
errors = []  # (exception, url) pairs for pages that could not be downloaded

# One session for the whole crawl: connections are reused and connection
# failures are retried with backoff, instead of rebuilding the session and
# its adapters for every URL.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

for url in tqdm(urls):
    url = str(url)

    # Entries equal to 'nan' carry no link and are skipped.
    if url == 'nan':
        continue
    # Site-relative links are resolved against English Wikipedia.
    if url.startswith('/wiki/'):
        url = 'https://en.wikipedia.org' + url

    # Download each page exactly once. A network-level failure is recorded and
    # the crawl moves on, so one bad URL cannot discard all collected rows.
    try:
        response = session.get(url, timeout=30)
    except requests.RequestException as e:
        errors.append((e, url))
        continue
    if response.status_code != 200:
        continue

    parser = BeautifulSoup(response.content, 'html.parser')
    table = parser.find('table', {'class': 'infobox'})

    if table is None:
        print("No table found ", url)
        continue

    # Position of every row label within the infobox, keyed by its
    # NFKD-normalized text. Built from the table that is already parsed, so
    # the page is not fetched a second time just to index its headers.
    header_indices = {
        unicodedata.normalize('NFKD', th.getText()): i
        for i, th in enumerate(table.find_all('th', {'class': 'infobox-label'}))
    }

    # Data cells of the infobox; the label positions above are used as
    # indices into this list.
    infobox_data = table.find_all('td', {'class': 'infobox-data'})

    # Language name: the infobox title cell when present, otherwise the last
    # path segment of the URL (never a stale value from a previous page).
    title_cell = table.find('th', {'class': 'infobox-above above'})
    lang = title_cell.get_text('title') if title_cell is not None else url.split('/')[-1]

    off_lang = (infobox_data[header_indices['Official language in']].text
                if 'Official language in' in header_indices else None)
    # The label text has no space between "minority" and "language".
    rec_min_lang = (infobox_data[header_indices['Recognised minoritylanguage in']].text
                    if 'Recognised minoritylanguage in' in header_indices else None)

    # get_text('title') joins the text fragments of a cell with the literal
    # separator 'title'.
    # NOTE(review): for these two code fields the separator stays embedded
    # whenever a cell has several fragments; kept as-is so the CSV output
    # format does not change.
    iso6393 = (infobox_data[header_indices['ISO 639-3']].get_text('title')
               if 'ISO 639-3' in header_indices else None)
    glottocode = (infobox_data[header_indices['Glottolog']].get_text('title')
                  if 'Glottolog' in header_indices else None)

    if 'Speakers' in header_indices:
        speakers = infobox_data[header_indices['Speakers']].text
    elif 'Native speakers' in header_indices:
        speakers = infobox_data[header_indices['Native speakers']].text
    else:
        speakers = None

    if 'Region' in header_indices:
        regions = infobox_data[header_indices['Region']].text
    elif 'Native Region' in header_indices:
        regions = infobox_data[header_indices['Native Region']].text
    else:
        regions = None

    # For list-like fields the 'title' separator is split back out to recover
    # the individual text fragments; bare newline fragments are dropped.
    family = ([x for x in infobox_data[header_indices['Language family']].get_text('title').split('title')
               if x != "\n"]
              if 'Language family' in header_indices else None)

    dialects = ([x for x in infobox_data[header_indices['Dialects']].get_text('title').split('title')
                 if x != "\n"]
                if 'Dialects' in header_indices else None)

    data.append((lang, family, dialects, iso6393, glottocode, speakers, regions, off_lang, rec_min_lang, url))

df = pd.DataFrame(data, columns=['lang', 'family', 'dialects', 'iso6393', 'glottocode', 'speakers', 'regions', 'off_lang', 'rec_min_lang', 'Wikipedia_Url'])
df.to_csv('../csv_files/wiki_languages_most_recent.csv')

In [11]:
# Build the results table from the scraped rows in `data`, one column per
# extracted infobox field, and write it to the CSV output file.
df = pd.DataFrame(
    data,
    columns=[
        'lang',
        'family',
        'dialects',
        'iso6393',
        'glottocode',
        'speakers',
        'regions',
        'off_lang',
        'rec_min_lang',
        'Wikipedia_Url',
    ],
)
df.to_csv('../csv_files/wiki_languages_most_recent.csv')