The languages spoken in each country or territory, together with their legal status (i.e. officially or unofficially recognized), are listed on the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory 

This scraper pulls this information and places it in a dataframe. 

In [35]:
import pandas as pd 
import requests
import unicodedata
import numpy as np 
from tqdm import tqdm
from bs4 import BeautifulSoup


In [36]:
# Download the Wikipedia page that lists official languages by country and territory.
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
countries_and_territories = requests.get(
    'https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory',
    timeout=30,
)
# Fail fast on an HTTP error instead of silently parsing an error page.
countries_and_territories.raise_for_status()
# Another Wikipedia list URL, kept for reference only (it is not used below):
# 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages'

soup = BeautifulSoup(countries_and_territories.content, 'html.parser')
# Take the first <table> matched by this class filter.
table = soup.find('table', {'class': ['wikitable', 'sortable', 'jquery-tablesorter']})
if table is None:
    # Without this guard the failure would only show up later as an
    # AttributeError on None, far from the actual cause.
    raise RuntimeError('Could not find the languages table on the Wikipedia page; '
                       'the page layout may have changed.')
# Pre-allocate one (initially all-NaN) row per table row, excluding the header row.
languages_by_region = pd.DataFrame(
    columns=['Country/Region', 'Official Language', 'Regional Language',
             'Minority Language', 'National Language', 'Widely Spoken'],
    index=range(0, len(table.find_all('tr')) - 1),
)
# Separator used when the country names are joined back into a string further down.
delimeter = ','

# Walk the data rows (the first <tr> is the header) and copy each cell into
# the frame as a list of its text lines.
for i, region in enumerate(table.find('tbody').find_all('tr')[1:]):

    columns = region.find_all('td')

    if len(columns) != len(languages_by_region.columns):
        # The row does not have exactly one <td> per output column, so its
        # cells cannot be mapped reliably; keep it as an explicit NaN row.
        # (np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.)
        languages_by_region.iloc[i] = np.nan
        continue

    for column_name, cell in zip(languages_by_region.columns, columns):
        # .at sets exactly one cell, so the list is stored as a single object
        # instead of being interpreted as several values to spread out.
        languages_by_region.at[i, column_name] = cell.text.strip().split('\n')

# Hand-written replacements for specific rows, keyed by row position.
# Presumably these are rows the scraper could not parse and left as NaN -- TODO confirm.
# NOTE(review): the positions are hard-coded, so they must be re-checked
# whenever the Wikipedia table gains or loses rows.
manual_rows = {
    21: [['Bolivia'], ['Castilian (Spanish)', 'Aymara', 'Araona', 'Baure', 'Bésiro (Chiquitano)', 'Canichana',
                       'Cavineña', 'Cayubaba', 'Chácobo', 'Chimán', 'Ese Ejja', 'Guaraní', 'Guarasu\'we', 'Guarayu',
                       'Itonama', 'Leco', 'Machajuyai-Kallawaya', 'Machineri', 'Maropa', 'Mojeño-Ignaciano',
                       'Mojeño-Trinitario', 'Moré', 'Mosetén', 'Movima', 'Pacawara', 'Puquina', 'Quechua', 'Sirionó',
                       'Tacana', 'Tapieté', 'Toromona', 'Uru-Chipaya', 'Weenhayek', 'Yaminawa', 'Yuki', 'Yuracaré',
                       'Zamuco'],
         [''], [''], [''], ['']],
    # Every cell is a list, matching the rest of the frame (the last two
    # cells used to be bare strings).
    # NOTE(review): 'English' sits in the 'National Language' slot -- confirm
    # it was not meant for 'Widely Spoken'.
    180: [['Sweden'], ['Swedish'], [' '],
          ['Finnish (Gällivare, Haparanda, Kiruna, Pajala, Övertorneå)',
           'Meänkieli (Gällivare, Haparanda, Kiruna, Pajala, Övertorneå)',
           'Sami (Arjeplog, Gällivare, Jokkmokk, Kiruna)', 'Yiddish', 'Romani'],
          ['English'], [' ']],
    195: [['Tuvalu'], ['Tuvaluan', 'English'], [''], [''], ['Tuvaluan', 'English'], ['']],
    208: [['Zambia'], ['English'], [''], [''], [''], ['']],
}

for row_position, row_values in manual_rows.items():
    languages_by_region.iloc[row_position] = row_values

def _strip_footnotes(names):
    """Turn a list of country-name lines into one string without footnote markers.

    The lines are joined with ``delimeter`` and split on ``'['``. Fragments
    that start with a digit or end with ``']'`` (e.g. the ``1]`` left over
    from ``Afghanistan[1]``) are dropped, and the remaining fragments are
    joined with ``delimeter`` again. Values that are not lists (such as the
    NaN placeholders) are returned unchanged.
    """
    if not isinstance(names, list):
        return names

    fragments = delimeter.join(names).split('[')
    kept = [
        fragment
        for fragment in fragments
        # Empty fragments occur for an empty cell or a name starting with
        # '['; skipping them avoids an IndexError on fragment[0].
        if fragment and not fragment[0].isdigit() and not fragment.endswith(']')
    ]
    return delimeter.join(kept)


languages_by_region['Country/Region'] = languages_by_region['Country/Region'].apply(_strip_footnotes)

# Bare expression: displays the frame as the notebook cell's output.
languages_by_region

Unnamed: 0,Country/Region,Official Language,Regional Language,Minority Language,National Language,Widely Spoken
0,[Abkhazia[a]],"[Abkhaz, Russian]",[],[Georgian],[Abkhaz],[]
1,[Afghanistan[1]],"[Persian (Dari), Pashto]","[Uzbek[b], Turkmen[b], Pashayi[b], Nuristani[b...",[],"[Persian (Dari), Pashto]",[Persian (Dari)]
2,[Albania[2]],[Albanian],[],"[Greek, Macedonian, Aromanian]",[],[Italian]
3,[Algeria[3]],"[Arabic, Berber]",[],[],"[Arabic, Berber]",[French]
4,[Andorra],[Catalan[4]],[],"[Spanish, French, Portuguese]",[],[]
...,...,...,...,...,...,...
205,[Venezuela],"[Spanish, Venezuelan Sign Language]",[],[Native languages are official for indigenous ...,[],[]
206,[Vietnam],[Vietnamese],[],"[Cantonese, Cham, Hmong, Khmer, Lao, Muong, Ra...",[Vietnamese],[]
207,[Yemen],[Arabic],[],"[Mehri, Soqotri]",[],[]
208,[Zambia],[English],[],[],[],[]


In [42]:
# Save the scraped table as CSV; index=False leaves the row index out of the file.
output_csv_path = '../csv_files/languages_by_region.csv'
languages_by_region.to_csv(output_csv_path, index=False)