# Scraper

In [1]:
import pandas as pd 
import requests
import unicodedata
from tqdm import tqdm
from bs4 import BeautifulSoup

def get_links(): 
    response = requests.get('https://en.wikipedia.org/wiki/List_of_language_names')
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all('a')
    # links = [link.get('href') for link in links if (link.get('title') is not None) and ('language' in link.get('title'))]
    links = [link.get('href') for link in links if (link.get('title') is not None)]
    # links = [link for link in links if link.endswith('_language')]
    links = ["https://en.wikipedia.org"+link for link in links]
    return links

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
countries_and_territories = requests.get('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages')
soup = BeautifulSoup(countries_and_territories.content, 'html.parser')

table = soup.find('table', {'class': ['wikitable', 'sortable', 'jquery-tablesorter']})
regions = []
languages = []

table_rows = table.find_all('tr')
print(len(table_rows))

16


In [3]:
unesco_languages = pd.read_csv('../Extinct languages - DATA SUMMARY.csv')
unesco_languages

Unnamed: 0,Name in English,Number of speakers,Degree of endangerment
0,South Italian,7500000,Vulnerable
1,Sicilian,5000000,Vulnerable
2,Low Saxon,4800000,Vulnerable
3,Belarusian,4000000,Vulnerable
4,Lombard,3500000,Definitely endangered
...,...,...,...
2534,|'Auni,0,Extinct
2535,|Xam,0,Extinct
2536,ǁKu ǁ'e,0,Extinct
2537,ǁKx'au,0,Extinct


In [4]:
def find_index_of_headers(url):
    response        = requests.get(url)
    content         = response.content
    parser          = BeautifulSoup(content, 'html.parser')
    table           = parser.find('table', {'class': 'infobox'})
    included_headers = {}
    
    for i in range(0, len(table.find_all('th', {'class': 'infobox-label'}))):
        included_headers[unicodedata.normalize('NFKD', table.find_all('th', {'class': 'infobox-label'}).__getitem__(i).getText())] = i
    return included_headers


urls = get_links()
print(len(urls))
print(urls[:10])

urls = list(set(urls))

urls = [url for url in urls if url not in ['Official_language', 'Minority_language']]
urls = [url for url in urls if url.startswith('List_of_countries') == False]
urls = [url for url in urls if url.count(':') == 1]
urls = [url for url in urls if '?' not in url]

3540
['https://en.wikipedia.org/wiki/Main_Page', 'https://en.wikipedia.org/wiki/Wikipedia:Contents', 'https://en.wikipedia.org/wiki/Portal:Current_events', 'https://en.wikipedia.org/wiki/Special:Random', 'https://en.wikipedia.org/wiki/Wikipedia:About', 'https://en.wikipedia.org//en.wikipedia.org/wiki/Wikipedia:Contact_us', 'https://en.wikipedia.orghttps://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en', 'https://en.wikipedia.org/wiki/Help:Contents', 'https://en.wikipedia.org/wiki/Help:Introduction', 'https://en.wikipedia.org/wiki/Wikipedia:Community_portal']


In [5]:
data = []
errors = []

for url in tqdm(urls): 
    # try: 
        lang = url.split('/')[-1]

        # print(requests.get(url).status_code)
        # print(url)
        response        = requests.get(url)
        content         = response.content
        parser          = BeautifulSoup(content, 'html.parser')
        table           = parser.find('table', {'class': 'infobox'})

        if table == None:
             print("No table found ", url)
             continue

        header_indices  = find_index_of_headers(url)

        # print(header_indices)

        infobox_data = table.find_all('td', {'class': 'infobox-data'})

        lang            = table.find('th', {'class': 'infobox-above above'}).get_text('title') if table.find('th', {'class': 'infobox-above above'}) else lang

        off_lang        = infobox_data[header_indices['Official language in']].text if 'Official language in' in header_indices else None
        rec_min_lang    = infobox_data[header_indices['Recognised minoritylanguage in']].text if 'Recognised minoritylanguage in' in header_indices else None
        # speakers        = infobox_data[header_indices['Speakers']].text             if 'Speakers' in header_indices else None
        iso3code        = infobox_data[header_indices['ISO 639-3']].get_text('title')            if 'ISO 639-3' in header_indices else None
        glottocode      = infobox_data[header_indices['Glottolog']].get_text('title')            if 'Glottolog' in header_indices else None

        if 'Speakers' in header_indices.keys():
            speakers = infobox_data[header_indices['Speakers']].text
        elif 'Native speakers' in header_indices.keys():
            speakers = infobox_data[header_indices['Native speakers']].text
        else:
            speakers = None

        if 'Region' in header_indices.keys():
            regions = infobox_data[header_indices['Region']].text
        elif 'Native Region' in header_indices.keys():
            regions = infobox_data[header_indices['Native Region']].text
        else:
            regions = None

        family          = [x for x in infobox_data[header_indices['Language family']].get_text('title').split('title') if x != "\n"]\
                            if 'Language family' in header_indices else None
        
        dialects        = [x for x in infobox_data[header_indices['Dialects']].get_text('title').split('title') if x != "\n"] \
                            if 'Dialects' in header_indices else None

        url             = str(url)
        
        # print('Regions: ', regions)
        data.append((lang, family, dialects, iso3code, glottocode, speakers, regions, off_lang, rec_min_lang, url))
    # except Exception as e: 
    #     errors.append((e, url))
    #     continue

df = pd.DataFrame(data, columns=['lang', 'family', 'dialects', 'iso3code', 'goglottocodettolog', 'speakers', 'regions', 'off_lang', 'rec_min_lang', 'url'])


  1%|          | 8/1389 [00:16<34:57,  1.52s/it]  

No table found  https://en.wikipedia.org/wiki/Liturgical_language


  1%|          | 12/1389 [00:19<22:03,  1.04it/s]

No table found  https://en.wikipedia.org/wiki/List_of_countries_where_Spanish_is_an_official_language


  6%|▌         | 81/1389 [02:10<25:48,  1.18s/it]

No table found  https://en.wikipedia.org/wiki/Court


  7%|▋         | 101/1389 [02:41<32:24,  1.51s/it]

No table found  https://en.wikipedia.org/wiki/Karelia


 10%|█         | 145/1389 [03:49<35:09,  1.70s/it]

No table found  https://en.wikipedia.org/wiki/Zonal_auxiliary_language


 12%|█▏        | 169/1389 [04:27<22:35,  1.11s/it]

No table found  https://en.wikipedia.org/wiki/Autonomous_counties_of_the_People%27s_Republic_of_China


 13%|█▎        | 176/1389 [04:35<20:41,  1.02s/it]

No table found  https://en.wikipedia.org/wiki/List_of_countries_where_Arabic_is_an_official_language


 13%|█▎        | 187/1389 [04:47<16:12,  1.24it/s]

No table found  https://en.wikipedia.org/wiki/Exonym_and_endonym


 14%|█▍        | 191/1389 [04:54<28:07,  1.41s/it]

No table found  https://en.wikipedia.org/wiki/List_of_countries_where_English_is_an_official_language


 14%|█▍        | 196/1389 [05:03<34:10,  1.72s/it]

No table found  https://en.wikipedia.org/wiki/Official_languages_of_the_United_Nations


 16%|█▋        | 228/1389 [05:53<24:47,  1.28s/it]

No table found  https://en.wikipedia.org/wiki/Administrative_divisions_of_Uzbekistan


 17%|█▋        | 241/1389 [06:08<19:04,  1.00it/s]

No table found  https://en.wikipedia.org/wiki/Communities,_regions_and_language_areas_of_Belgium


 18%|█▊        | 245/1389 [06:11<14:55,  1.28it/s]

No table found  https://en.wikipedia.org/wiki/Official_language


 18%|█▊        | 250/1389 [06:17<16:51,  1.13it/s]

No table found  https://en.wikipedia.org/wiki/Middle_East_and_North_Africa


 19%|█▊        | 258/1389 [06:28<27:48,  1.48s/it]

No table found  https://en.wikipedia.org/wiki/Mesopotamia


 19%|█▉        | 267/1389 [06:39<16:52,  1.11it/s]

No table found  https://en.wikipedia.org/wiki/Provinces_of_Mongolia


 21%|██        | 286/1389 [07:08<24:06,  1.31s/it]

No table found  https://en.wikipedia.org/wiki/Pidgin


 26%|██▌       | 363/1389 [10:10<20:12,  1.18s/it]  

No table found  https://en.wikipedia.org/wiki/Main_Page


 27%|██▋       | 370/1389 [10:19<17:34,  1.03s/it]

No table found  https://en.wikipedia.org/wiki/Southeastern_Europe


 28%|██▊       | 393/1389 [10:55<17:45,  1.07s/it]

No table found  https://en.wikipedia.org/wiki/Akkad_(city)


 30%|██▉       | 414/1389 [11:23<16:16,  1.00s/it]

No table found  https://en.wikipedia.org/wiki/Regional_language


 32%|███▏      | 444/1389 [11:54<09:23,  1.68it/s]

No table found  https://en.wikipedia.org/wiki/Cities_of_Brazil


 33%|███▎      | 463/1389 [12:19<15:06,  1.02it/s]

No table found  https://en.wikipedia.org/wiki/Archipelago


 34%|███▎      | 468/1389 [12:25<15:54,  1.04s/it]

No table found  https://en.wikipedia.org/wiki/List_of_language_names


 34%|███▍      | 471/1389 [12:26<10:57,  1.40it/s]

No table found  https://en.wikipedia.org/wiki/Administrative_divisions_of_Serbia


 36%|███▌      | 498/1389 [13:11<21:01,  1.42s/it]

No table found  https://en.wikipedia.org/wiki/Autonomous_area


 40%|███▉      | 555/1389 [14:25<10:24,  1.34it/s]

No table found  https://en.wikipedia.org/wiki/Ronald_Kingsley_Read


 40%|████      | 559/1389 [14:29<10:44,  1.29it/s]

No table found  https://en.wikipedia.org/wiki/National_language


 40%|████      | 562/1389 [14:32<11:42,  1.18it/s]

No table found  https://en.wikipedia.org/wiki/States_of_Mexico


 44%|████▎     | 606/1389 [15:30<18:04,  1.38s/it]

No table found  https://en.wikipedia.org/wiki/International_auxiliary_language


 47%|████▋     | 653/1389 [16:34<12:00,  1.02it/s]

No table found  https://en.wikipedia.org/wiki/Minority_language


 47%|████▋     | 654/1389 [16:34<09:33,  1.28it/s]

No table found  https://en.wikipedia.org/wiki/Communities_of_Belgium


 48%|████▊     | 662/1389 [16:43<11:20,  1.07it/s]

No table found  https://en.wikipedia.org/wiki/Fief


 51%|█████     | 702/1389 [17:26<08:17,  1.38it/s]

No table found  https://en.wikipedia.org/wiki/Language


 53%|█████▎    | 743/1389 [18:15<09:10,  1.17it/s]

No table found  https://en.wikipedia.org/wiki/Karamoja


 54%|█████▍    | 750/1389 [18:23<09:14,  1.15it/s]

No table found  https://en.wikipedia.org/wiki/List_of_countries_where_Russian_is_an_official_language


 57%|█████▋    | 790/1389 [19:11<08:35,  1.16it/s]

No table found  https://en.wikipedia.org/wiki/States_and_federal_territories_of_Malaysia


 57%|█████▋    | 798/1389 [19:19<08:13,  1.20it/s]

No table found  https://en.wikipedia.org/wiki/Near_East


 59%|█████▉    | 825/1389 [19:53<08:34,  1.10it/s]

No table found  https://en.wikipedia.org/wiki/Eastern_Europe


 62%|██████▏   | 858/1389 [20:30<09:01,  1.02s/it]

No table found  https://en.wikipedia.org/wiki/Lingua_franca


 63%|██████▎   | 879/1389 [20:57<12:41,  1.49s/it]

No table found  https://en.wikipedia.org/wiki/Al-Andalus


 64%|██████▍   | 890/1389 [21:10<08:24,  1.01s/it]

No table found  https://en.wikipedia.org/wiki/Emirau_Island


 66%|██████▌   | 913/1389 [21:37<07:27,  1.06it/s]

No table found  https://en.wikipedia.org/wiki/Germanic_tribes


 71%|███████▏  | 991/1389 [23:05<07:08,  1.08s/it]

No table found  https://en.wikipedia.org/wiki/Municipalities_of_Slovakia


 72%|███████▏  | 998/1389 [23:12<06:37,  1.02s/it]

No table found  https://en.wikipedia.org/wiki/National_Language


 72%|███████▏  | 1003/1389 [23:18<05:43,  1.12it/s]

No table found  https://en.wikipedia.org/wiki/Provinces_of_Costa_Rica


 74%|███████▎  | 1021/1389 [23:37<05:21,  1.15it/s]

No table found  https://en.wikipedia.org/wiki/French_Basque_Country


 76%|███████▌  | 1054/1389 [24:19<05:48,  1.04s/it]

No table found  https://en.wikipedia.org/wiki/List_of_countries_where_French_is_an_official_language


 77%|███████▋  | 1065/1389 [24:31<04:59,  1.08it/s]

No table found  https://en.wikipedia.org/wiki/Armenian_diaspora


 79%|███████▉  | 1101/1389 [25:13<05:02,  1.05s/it]

No table found  https://en.wikipedia.org/wiki/Municipality


 80%|████████  | 1112/1389 [25:24<04:01,  1.15it/s]

No table found  https://en.wikipedia.org/wiki/Abkhaz_alphabet


 82%|████████▏ | 1135/1389 [25:52<03:04,  1.38it/s]

No table found  https://en.wikipedia.org/wiki/St_Matthias_Islands


 83%|████████▎ | 1148/1389 [26:09<05:06,  1.27s/it]

No table found  https://en.wikipedia.org/wiki/Kawthoolei


 84%|████████▎ | 1160/1389 [26:21<03:32,  1.08it/s]

No table found  https://en.wikipedia.org/wiki/Lists_of_languages


 85%|████████▍ | 1174/1389 [26:36<03:45,  1.05s/it]

No table found  https://en.wikipedia.org/wiki/Jewish_Diaspora


 89%|████████▊ | 1230/1389 [36:14<02:30,  1.06it/s]   

No table found  https://en.wikipedia.org/wiki/Comunes_of_Italy


 89%|████████▉ | 1242/1389 [36:30<03:27,  1.41s/it]

No table found  https://en.wikipedia.org/wiki/Middle_Ages


 91%|█████████ | 1258/1389 [36:50<02:09,  1.01it/s]

No table found  https://en.wikipedia.org/wiki/Mussau_Island


 91%|█████████▏| 1270/1389 [37:03<01:44,  1.14it/s]

No table found  https://en.wikipedia.org/wiki/Sub-provincial_divisions_in_the_People%27s_Republic_of_China


100%|██████████| 1389/1389 [39:25<00:00,  1.70s/it]


In [7]:
df.to_csv('wiki_languages_most_recent.csv')

## CSV Generated, Edge Cases Observed: 

In [None]:
# edge case observed in punjabi_language page: 
    # https://en.wikipedia.org/wiki/Punjabi_language
    # for the family column:
        # 'Indo-European, Indo-Iranian, Indo-Aryan, Northwestern zone, Punjabi'
        # these values are originally in a list. after running the following code:  
            cols = languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns[1:]
            for col in cols: 
                languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
            languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]
        # they are turned into an object, but appear as a nested list:
            # [['Indo-European', 'Indo-Iranian', 'Indo-Aryan...
        # when adding a line to the code: 
            cols = languages.columns[1:]
            for col in cols: 
                languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
                languages[col] = languages[col].apply(lambda values: [x[:x.find('[')] if '[' in x else x for x in values])
            languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]
        # they are still turned into an object, but appear as an empty list:
            # []
    # for the dialects column:
        # Object: [['See', ' ', 'Punjabi dialects']]
        # The above output does not change after running the first code block: 
            cols = languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns[1:]
            for col in cols: 
                languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
            languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]
        # After runningt the second code block, the output changes to:
            # Object: []

## Modifications to Previously Generated CSV for Data Cleaning:

In [61]:
df = pd.read_csv('wiki_languages.csv')
languages = df

In [58]:
pd.isnull(languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns)

array([False, False, False, False, False, False, False, False])

In [63]:
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...","['See', ' ', 'Punjabi dialects']",Either:pan – Panjabipnb – Western Panjabi,,Punjab,Pakistan\n Punjab (provincial)[c][11]\n India...,


In [60]:
cols = languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())].columns[1:]
for col in cols: 
    languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,"[['Indo-European', 'Indo-Iranian', 'Indo-Aryan...","[['See', ' ', 'Punjabi dialects']]",[Either:pan – Panjabipnb – Western Panjabi],[],[Punjab],[ Pakistan\n Punjab (provincial)[c][11]\n Indi...,[]


In [67]:
languages[~languages['dialects'].isnull()]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
6,Nahuatl_language,"['Uto-Aztecan', 'Southern Uto-Aztecan', 'Nahua...","['Western Peripheral Nahuatl', 'Eastern Periph...","nhe Huasteca NahuatlFor other varieties, see N...",,Mexico: Puebla Veracruz Hidalgo Guerrero San L...,Mexico (through the General Law of Linguistic...,
21,Odia_language,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...","['Northern', ', Central, ', 'Southern', ', ', ...",ori – inclusive codeIndividual codes:ory – Odi...,,Odisha[a],India\nOdisha\nJharkhand[3] (additional)\nWes...,
23,Kendeje_language,"['Nilo-Saharan', '?\n', 'Maban', 'Kenjeje']","['Yaali', 'Faranga']",klf,,Ouaddaï,,
24,Hindko,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...","['Peshawari', 'Kohati', 'Awankari', ', ', 'Ghe...",Either:hnd – Southern Hindkohno – Northern Hindko,,"Hazara Division, Peshawar, Kohat, Pothohar",,
26,Fon_language,"['Niger–Congo', '?\n', 'Atlantic–Congo', 'Volt...","['Agbome', 'Arohun', 'Gbekon', 'Kpase']",fon,,,Benin,
...,...,...,...,...,...,...,...,...
1297,Coptic_language,"['Afro-Asiatic', 'Egyptian', 'Coptic']","['Bohairic', 'Sahidic', 'Akhmimic', 'Lycopolit...",cop,,,,
1303,Sundanese_language,"['Austronesian', 'Malayo-Polynesian', 'dispute...","['Baduy', ' (considered a separate language)',...",Variously:sun – Sundanesebac – Baduy Sundanese...,,"West Java, Banten, Jakarta, small parts of wes...",,
1307,Chhattisgarhi,"['Indo-European', 'Indo-Iranian', 'Indo-Aryan'...",['Surgujia'],Either:hne – Chhattisgarhisgj – Surgujia,,Chhattisgarh and a minority of speakers in Odi...,India\nChhattisgarh (additional)[2]\n,
1310,Herero_language,"['Niger–Congo', '?\n', 'Atlantic–Congo', 'Volt...","['Himba', 'Kuvale', 'Zemba', ' (Thimba, ', 'Ot...",her Herero,,"Kunene, Omaheke Region and Otjozondjupa Region...",,


In [56]:
cols = languages.columns[1:]
for col in cols: 
    languages[col] = languages[col].apply(lambda x: [] if pd.isnull(x) else [x])
    languages[col] = languages[col].apply(lambda values: [x[:x.find('[')] if '[' in x else x for x in values])
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,[],[],[Either:pan – Panjabipnb – Western Panjabi],[],[Punjab],[ Pakistan\n Punjab (provincial)],[]


In [22]:
languages[languages['lang'].apply(lambda x: 'punjabi' in x.lower())]

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
196,Punjabi_language,[],[],[Either:pan – Panjabipnb – Western Panjabi],[],[Punjab],[ Pakistan\n Punjab (provincial)],[]


In [21]:
languages

Unnamed: 0,lang,family,dialects,iso3code,speakers,regions,off_lang,rec_min_lang
0,Hong_Kong_Special_Administration_Region,[],[],[],[],[],[],[]
1,Adai_language,[],[],[xad],[],[Louisiana],[],[]
2,Northern_Ireland,[],[],[],[],[],[],[]
3,Kapampangan_language,[],[],[pam],[],"[Central Luzon (entirety of Pampanga, southern...",[Angeles City],[Regional language of the Philippines]
4,Vlax_Romani_language,[],[],[rmy],[],[],[],"[Hungary, Romania, Poland, and Serbia]"
...,...,...,...,...,...,...,...,...
1325,Maguindanao,[],[],[],[],[],[],[]
1326,Aas%C3%A1x,[],[],[aas],[],[Tanzania],[],[]
1327,Yapese_language,[],[],[yap],[],[Island of Yap],[],[]
1328,Yanesha%27_language,[],[],[ame],[],[Department of Pasco],[],[]


In [2]:
# unesco_languages

In [1]:
# languages = languages.merge(unesco_languages, left_on='lang', right_on='Name in English')
# print(languages.columns)
# languages