# Original Scraper File
&nbsp; Before realizing that Glottolog provides a link to the relevant Wikipedia page for a given language, I began writing my web scraper to obtain information from the language pages listed in Wikipedia's https://en.wikipedia.org/wiki/List_of_language_names page. This page provides a list of commonly known languages and their links, but does not provide pages for a complete list of languages. 

In [3]:
import pandas as pd 
import requests
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

In [5]:
# Links to the pages for each language, scraped from Wikipedia:
    # def get_links(): 
    #     response = requests.get('https://en.wikipedia.org/wiki/List_of_language_names')
    #     soup = BeautifulSoup(response.content, 'html.parser')
    #     links = soup.find_all('a')
    #     # links = [link.get('href') for link in links if (link.get('title') is not None) and ('language' in link.get('title'))]
    #     links = [link.get('href') for link in links if (link.get('title') is not None)]
    #     # links = [link for link in links if link.endswith('_language')]
    #     links = ["https://en.wikipedia.org"+link for link in links]
    #     return links

#Updated Solution: Links to the pages for each language, scraped from Glottolog: 
glottolog_info = pd.read_csv('../ipynb_files/glottolog_data.csv')

In [8]:
# countries_and_territories = requests.get('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages')
# soup = BeautifulSoup(countries_and_territories.content, 'html.parser')

# table = soup.find('table', {'class': ['wikitable', 'sortable', 'jquery-tablesorter']})
# regions = []
# languages = []

# table_rows = table.find_all('tr')
# print(len(table_rows))


In [7]:
# unesco_languages = pd.read_csv('../Extinct languages - DATA SUMMARY.csv')
# unesco_languages

In [15]:
glottolog_info

Unnamed: 0,iso6393,glottocode,aes_status,Wikipedia_Url
0,aaa,ghot1243,not endangered,https://en.wikipedia.org/wiki/Ghotuo_language
1,aab,alum1246,not endangered,https://en.wikipedia.org/wiki/Alumu_language
2,aac,arii1243,moribund,https://en.wikipedia.org/wiki/Ari_language_(Ne...
3,aad,amal1242,shifting,https://en.wikipedia.org/wiki/Amal_language
4,aae,arbe1236,threatened,https://en.wikipedia.org/wiki/Arb%C3%ABresh_la...
...,...,...,...,...
7585,zyg,yang1286,not endangered,https://en.wikipedia.org/wiki/Yang_Zhuang_lang...
7586,zyj,youj1238,not endangered,https://en.wikipedia.org/wiki/Youjiang_Zhuang
7587,zyn,yong1275,not endangered,https://en.wikipedia.org/wiki/Yongnan_languages
7588,zyp,zyph1238,not endangered,https://en.wikipedia.org/wiki/Zyphe_language


In [18]:
# The following function finds the index of the headers in the infobox of the Wikipedia page for a given language, allowing 
# us to extract the information we need from the page using the headers as keys.

def find_index_of_headers(url):
    response        = requests.get(url)
    content         = response.content
    parser          = BeautifulSoup(content, 'html.parser')
    table           = parser.find('table', {'class': 'infobox'})
    included_headers = {}
    
    for i in range(0, len(table.find_all('th', {'class': 'infobox-label'}))):
        included_headers[unicodedata.normalize('NFKD', table.find_all('th', {'class': 'infobox-label'}).__getitem__(i).getText())] = i
    return included_headers


urls = glottolog_info['Wikipedia_Url'].astype(str)

In [19]:
# The following code block scrapes the Wikipedia pages for each language in the list of links, extracting the information

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

data = []
errors = []

for url in tqdm(urls): 
    # try: 
        # lang = url.split('/')[-1]

        # print(requests.get(url).status_code)
        # print(url)

        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        if url == 'nan':
             continue
        if str(url).startswith('/wiki/'):
             url = 'https://en.wikipedia.org' + str(url)
        if requests.get(str(url)).status_code != 200:
             continue
        response = session.get(str(url))

        content         = response.content
        parser          = BeautifulSoup(content, 'html.parser')
        table           = parser.find('table', {'class': 'infobox'})

        if table == None:
             print("No table found ", url)
             continue

        header_indices  = find_index_of_headers(url)

        # print(header_indices)

        infobox_data = table.find_all('td', {'class': 'infobox-data'})

        # lang            = table.find('th', {'class': 'infobox-above above'}).get_text('title') if table.find('th', {'class': 'infobox-above above'}) else lang

        off_lang        = infobox_data[header_indices['Official language in']].text if 'Official language in' in header_indices else None
        rec_min_lang    = infobox_data[header_indices['Recognised minoritylanguage in']].text if 'Recognised minoritylanguage in' in header_indices else None
        # speakers        = infobox_data[header_indices['Speakers']].text             if 'Speakers' in header_indices else None
        iso3code        = infobox_data[header_indices['ISO 639-3']].get_text('title')            if 'ISO 639-3' in header_indices else None
        glottocode      = infobox_data[header_indices['Glottolog']].get_text('title')            if 'Glottolog' in header_indices else None

        if 'Speakers' in header_indices.keys():
            speakers = infobox_data[header_indices['Speakers']].text
        elif 'Native speakers' in header_indices.keys():
            speakers = infobox_data[header_indices['Native speakers']].text
        else:
            speakers = None

        if 'Region' in header_indices.keys():
            regions = infobox_data[header_indices['Region']].text
        elif 'Native Region' in header_indices.keys():
            regions = infobox_data[header_indices['Native Region']].text
        else:
            regions = None

        family          = [x for x in infobox_data[header_indices['Language family']].get_text('title').split('title') if x != "\n"]\
                            if 'Language family' in header_indices else None
        
        dialects        = [x for x in infobox_data[header_indices['Dialects']].get_text('title').split('title') if x != "\n"] \
                            if 'Dialects' in header_indices else None

        url             = str(url)
        
        data.append((family, dialects, iso3code, glottocode, speakers, regions, off_lang, rec_min_lang, url))
    # except Exception as e: 
    #     errors.append((e, url))
    #     continue

df = pd.DataFrame(data, columns=['family', 'dialects', 'iso3code', 'goglottocodettolog', 'speakers', 'regions', 'off_lang', 'rec_min_lang', 'url'])

  2%|▏         | 187/7590 [01:32<44:39,  2.76it/s]  

No table found  https://en.wikipedia.org/wiki/South_Levantine_Arabic


  4%|▍         | 311/7590 [02:40<1:13:37,  1.65it/s]

No table found  https://en.wikipedia.org/wiki/North_Levantine_Arabic


  6%|▋         | 490/7590 [04:33<57:54,  2.04it/s]  

No table found  https://en.wikipedia.org/wiki/Sorsogon_Ayta_language


  9%|▉         | 677/7590 [06:17<50:58,  2.26it/s]  

No table found  https://en.wikipedia.org/wiki/Bhadarwahi


 24%|██▍       | 1852/7590 [17:33<1:06:23,  1.44it/s]

No table found  https://en.wikipedia.org/wiki/Northwestern_Fars_language


 28%|██▊       | 2128/7590 [20:07<47:21,  1.92it/s]  

No table found  https://en.wikipedia.org/wiki/Gowlan_language


 31%|███       | 2324/7590 [21:59<40:46,  2.15it/s]  

No table found  https://en.wikipedia.org/wiki/Duan_language


 42%|████▏     | 3156/7590 [30:12<40:59,  1.80it/s]  

No table found  https://en.wikipedia.org/wiki/Koro%E2%80%93Olrat_language


 43%|████▎     | 3240/7590 [30:59<43:02,  1.68it/s]  

No table found  https://en.wikipedia.org/wiki/Karipuna_language_(Rond%C3%B4nia)


 44%|████▍     | 3333/7590 [31:55<43:26,  1.63it/s]

No table found  https://en.wikipedia.org/wiki/Kpatili_language


 75%|███████▌  | 5722/7590 [56:16<24:55,  1.25it/s]  

No table found  https://en.wikipedia.org/wiki/Salchuq_language


 76%|███████▌  | 5744/7590 [56:31<14:47,  2.08it/s]

No table found  https://en.wikipedia.org/wiki/Inari_Sami


 90%|█████████ | 6835/7590 [1:07:27<07:32,  1.67it/s]

No table found  https://en.wikipedia.org/wiki/Mawa_language_(Nigeria)


 96%|█████████▌| 7277/7590 [1:11:32<02:14,  2.32it/s]

No table found  https://en.wikipedia.org/wiki/Yindjilandji


 99%|█████████▉| 7501/7590 [1:13:29<00:31,  2.79it/s]

No table found  https://en.wikipedia.org/wiki/Marri_Tjevin


100%|██████████| 7590/7590 [1:14:22<00:00,  1.70it/s]


AttributeError: 'list' object has no attribute 'to_csv'

In [20]:
df.to_csv('wiki_languages_most_recent.csv')

## CSV Generated, Edge Cases Observed: 

In [None]:
# edge case observed in punjabi_language page: 
    # https://en.wikipedia.org/wiki/Punjabi_language
    # for the family column:
        # 'Indo-European, Indo-Iranian, Indo-Aryan, Northwestern zone, Punjabi'
        # these values are originally in a list. after running the following code:  
            cols = languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns[1:]
            for col in cols: 
                languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
            languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]
        # they are turned into an object, but appear as a nested list:
            # [['Indo-European', 'Indo-Iranian', 'Indo-Aryan...
        # when adding a line to the code: 
            cols = languages.columns[1:]
            for col in cols: 
                languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
                languages[col] = languages[col].apply(lambda values: [x[:x.find('[')] if '[' in x else x for x in values])
            languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]
        # they are still turned into an object, but appear as an empty list:
            # []
    # for the dialects column:
        # Object: [['See', ' ', 'Punjabi dialects']]
        # The above output does not change after running the first code block: 
            cols = languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns[1:]
            for col in cols: 
                languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
            languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]
        # After runningt the second code block, the output changes to:
            # Object: []

## Modifications to Previously Generated CSV for Data Cleaning:

In [61]:
df = pd.read_csv('wiki_languages.csv')
languages = df

In [58]:
pd.isnull(languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns)

array([False, False, False, False, False, False, False, False])

In [63]:
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...","['See', ' ', 'Punjabi dialects']",Either:pan – Panjabipnb – Western Panjabi,,Punjab,Pakistan\n Punjab (provincial)[c][11]\n India...,


In [60]:
cols = languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns[1:]
for col in cols: 
    languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,"[['Indo-European', 'Indo-Iranian', 'Indo-Aryan...","[['See', ' ', 'Punjabi dialects']]",[Either:pan – Panjabipnb – Western Panjabi],[],[Punjab],[ Pakistan\n Punjab (provincial)[c][11]\n Indi...,[]


In [67]:
languages[~languages['dialects'].isnull()]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
6,Nahuatl_language,"['Uto-Aztecan', 'Southern Uto-Aztecan', 'Nahua...","['Western Peripheral Nahuatl', 'Eastern Periph...","nhe Huasteca NahuatlFor other varieties, see N...",,Mexico: Puebla Veracruz Hidalgo Guerrero San L...,Mexico (through the General Law of Linguistic...,
21,Odia_language,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...","['Northern', ', Central, ', 'Southern', ', ', ...",ori – inclusive codeIndividual codes:ory – Odi...,,Odisha[a],India\nOdisha\nJharkhand[3] (additional)\nWes...,
23,Kendeje_language,"['Nilo-Saharan', '?\n', 'Maban', 'Kenjeje']","['Yaali', 'Faranga']",klf,,Ouaddaï,,
24,Hindko,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...","['Peshawari', 'Kohati', 'Awankari', ', ', 'Ghe...",Either:hnd – Southern Hindkohno – Northern Hindko,,"Hazara Division, Peshawar, Kohat, Pothohar",,
26,Fon_language,"['Niger–Congo', '?\n', 'Atlantic–Congo', 'Volt...","['Agbome', 'Arohun', 'Gbekon', 'Kpase']",fon,,,Benin,
...,...,...,...,...,...,...,...,...
1297,Coptic_language,"['Afro-Asiatic', 'Egyptian', 'Coptic']","['Bohairic', 'Sahidic', 'Akhmimic', 'Lycopolit...",cop,,,,
1303,Sundanese_language,"['Austronesian', 'Malayo-Polynesian', 'dispute...","['Baduy', ' (considered a separate language)',...",Variously:sun – Sundanesebac – Baduy Sundanese...,,"West Java, Banten, Jakarta, small parts of wes...",,
1307,Chhattisgarhi,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...",['Surgujia'],Either:hne – Chhattisgarhisgj – Surgujia,,Chhattisgarh and a minority of speakers in Odi...,India\nChhattisgarh (additional)[2]\n,
1310,Herero_language,"['Niger–Congo', '?\n', 'Atlantic–Congo', 'Volt...","['Himba', 'Kuvale', 'Zemba', ' (Thimba, ', 'Ot...",her Herero,,"Kunene, Omaheke Region and Otjozondjupa Region...",,


In [56]:
cols = languages.columns[1:]
for col in cols: 
    languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
    languages[col] = languages[col].apply(lambda values: [x[:x.find('[')] if '[' in x else x for x in values])
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,[],[],[Either:pan – Panjabipnb – Western Panjabi],[],[Punjab],[ Pakistan\n Punjab (provincial)],[]


In [22]:
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,[],[],[Either:pan – Panjabipnb – Western Panjabi],[],[Punjab],[ Pakistan\n Punjab (provincial)],[]


In [21]:
languages

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
0,Hong_Kong_Special_Administration_Region,[],[],[],[],[],[],[]
1,Adai_language,[],[],[xad],[],[Louisiana],[],[]
2,Northern_Ireland,[],[],[],[],[],[],[]
3,Kapampangan_language,[],[],[pam],[],"[Central Luzon (entirety of Pampanga, southern...",[Angeles City],[Regional language of the Philippines]
4,Vlax_Romani_language,[],[],[rmy],[],[],[],"[Hungary, Romania, Poland, and Serbia]"
...,...,...,...,...,...,...,...,...
1325,Maguindanao,[],[],[],[],[],[],[]
1326,Aas%C3%A1x,[],[],[aas],[],[Tanzania],[],[]
1327,Yapese_language,[],[],[yap],[],[Island of Yap],[],[]
1328,Yanesha%27_language,[],[],[ame],[],[Department of Pasco],[],[]


In [2]:
# unesco_languages

In [1]:
# languages = languages.merge(unesco_languages, left_on='lang', right_on='Name in English')
# print(languages.columns)
# languages