In [100]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pycountry

# Acquiring Most Common Languages per Country

In [101]:
# url of a website containg most common languages per country
language_url = "https://www.infoplease.com/world/countries/languages-spoken-in-each-country-of-the-world"

In [102]:
# get html page
page = requests.get(language_url)

# transform to soup object
soup = BeautifulSoup(page.content, 'html.parser')
    
# find the table with all information (right, top of wikipedia article)
table = soup.find('table')

# get all the rows of the table, discard the header line
rows = table.find_all("tr")[1:]

In [107]:

country_languages = {}

# regular expressions
find_country_regex = r">.*<"
remove_bracket_regex = r'\([^\]]*\)'

# extract countries and corresponding languages
for row in rows:
    # get the country names
    country = str(re.findall(find_country_regex, str(row.findChild("a")))[0][1:-1])
    # remove hexadecimals
    country = country.encode('ascii', errors='ignore').decode("UTF-8")
    # remove brackets
    country = re.sub(remove_bracket_regex, "", country)
    # replace st. by sant
    country = country.replace("St.", "Saint")
    #country = country.decode('utf8').encode('ascii', errors='ignore')
    # contains the whole row of one country
    temp_list = str(row.findAll('p')[1])[3:-4].split(",")
    # extract only the language names
    languages = []
    for language in temp_list:
        # only get 3 languages
        if len(languages) < 3:
            languages.append(re.findall(r"([^\s]+)", language)[0])
    # add to dictionary
    country_languages[country] = languages



In [108]:
# fix wrong words
remove_words = ["other", "and", "widely", "tribal", "24", "60", "declining", "Qazaq)", 
                "multilingual", "indigenous", "which", "small", "unspecified", "but", "universal",
               "regular", "120", "the", "only", "one", "taught", "used", "local", "some", "numerous", 
                "a", "Standard", "based"]

for country, language in country_languages.items():
    for illegal_word in remove_words:
        if illegal_word in country_languages[country]:
            country_languages[country].remove(illegal_word)
################################################################################################
# fix country names for pycountry
replace_country_names_dic = {"Congo, Democratic Republic of the":"Congo, The Democratic Republic of the",     
                        "Congo, Republic of": "Congo",
                        "Cape Verde":"Cabo Verde",
                        "Palestinian State ":"Palestine, State of",
                        "East Timor":"Timor-Leste",
                        "Korea, North":"Korea, Democratic People's Republic of",
                        "Korea, South":"Korea, Republic of",
                        "Laos":"Lao People's Democratic Republic",
                        "So Tom and Prncipe":"Sao Tome and Principe",
                        "Cte d'Ivoire":"Côte d'Ivoire"}
# since we cannot change the size of the dictionary withing a loop, save the ones that are supposed
# to be deleted and their replacements
to_delete = []
to_add = {}
# depending on replace_country_names_dic, look at what to replace and what to replace with
for country, language_list in country_languages.items():
    if country in replace_country_names_dic.keys():
        to_add[replace_country_names_dic[country]] = language_list
        to_delete.append(country)
        
# delete old names
for country in to_delete:
    del country_languages[country]
    
# add new ones
country_languages = {**country_languages, **to_add}

################################################################################################
# manual fixes
country_languages["China"].extend(["Mandarin", "Chinese"])
country_languages["Western Sahara"].extend(["Arabic"])
country_languages["Philippines"].extend(["English"])
del country_languages["Kosovo"] # not included in pycountry

In [109]:
country_languages

{'Afghanistan': ['Afghan', 'Pashto', 'Uzbek'],
 'Albania': ['Albanian', 'Greek'],
 'Algeria': ['Arabic', 'French', 'Berber'],
 'Andorra': ['Catalan', 'French', 'Castilian'],
 'Angola': ['Portuguese', 'Umbundu', 'Kikongo'],
 'Antigua and Barbuda': ['English', 'Antiguan'],
 'Argentina': ['Spanish', 'Italian', 'English'],
 'Armenia': ['Armenian', 'Kurdish'],
 'Australia': ['English', 'Mandarin', 'Arabic'],
 'Austria': ['German', 'Turkish', 'Serbian'],
 'Azerbaijan': ['Azerbaijani', 'Russian', 'Armenian'],
 'Bahamas': ['Azerbaijani', 'Russian', 'Armenian'],
 'Bahrain': ['Arabic', 'English', 'Farsi'],
 'Bangladesh': ['Bangla', 'also'],
 'Barbados': ['English', 'Bajan'],
 'Belarus': ['Russian', 'Belarusian'],
 'Belgium': ['Dutch', 'French', 'German'],
 'Belize': ['English', 'Spanish', 'Creole'],
 'Benin': ['French', 'Fon'],
 'Bhutan': ['Sharchhopka', 'Dzongkha', 'Lhotshamkha'],
 'Bolivia': ['Spanish', 'Quechua', 'Aymara'],
 'Bosnia and Herzegovina': ['Bosnian', 'Serbian', 'Croatian'],
 'Bots

# Acquiring Most Common Religions per Country

In [115]:
# url of a website containg most common languages per country
language_url = "https://www.infoplease.com/world/social-statistics/world-religions"
religions_permitted = ["Islam", "Roman Catholic", "Jewish", "Buddhist", "Protestant", "Christian", 
                       "Orthodox", "Buddhism", "Hindu", "Anglican", "Catholic", "Evangelical", "Protestant",
                      "Church of Tuvalu"]

In [116]:
# get html page
page = requests.get(language_url)

# transform to soup object
soup = BeautifulSoup(page.content, 'html.parser')
    
# find the table with all information (right, top of wikipedia article)
table = soup.find('table')

# get all the rows of the table, discard the header line
rows = table.find_all("tr")

In [117]:
country_religions = {}

# regular expressions
find_country_regex = r">.*<"
find_country_regex_alt = r"<td>(.*?)</td>"
remove_bracket_regex = r"\([^\]]*\)"
get_percentage_regex = r"/d{,3}\%"
# extract countries and corresponding languages
for row in rows:
    # get the country names depending how the text is embedded in the html
    if row.findChild("a"):

        country = str(re.findall(find_country_regex, str(row.findChild("a")))[0][1:-1])
    else:
        country = str(re.findall(find_country_regex_alt, str(row))[0])
    
    # remove hexadecimals
    country = country.encode('ascii', errors='ignore').decode("UTF-8")
    # remove brackets
    country = re.sub(remove_bracket_regex, "", country)
    # replace st. by sant
    country = country.replace("St.", "Saint")
    # removing trailing whitespace
    country = country.rstrip()
    
    # fill dictionary
    country_religions[country] = []
    for r in religions_permitted:
        if r in str(row):
            country_religions[country].append(r)

In [118]:
# fix country names for pycountry
replace_country_names_dic = {"Congo, Democratic Republic of the":"Congo, The Democratic Republic of the",     
                        "Congo, Republic of": "Congo",
                        "Cape Verde":"Cabo Verde",
                        "Palestinian State":"Palestine, State of",
                        "East Timor":"Timor-Leste",
                        "Korea, North":"Korea, Democratic People's Republic of",
                        "Korea, South":"Korea, Republic of",
                        "Laos":"Lao People's Democratic Republic",
                        "So Tom and Prncipe":"Sao Tome and Principe",
                        "Cte d'Ivoire":"Côte d'Ivoire",
                        "Macedonia":"North Macedonia",
                        "Swaziland":"Eswatini"}
# since we cannot change the size of the dictionary withing a loop, save the ones that are supposed
# to be deleted and their replacements
to_delete = []
to_add = {}

# depending on replace_country_names_dic, look at what to replace and what to replace with
for country, religion_list in country_religions.items():
    if country in replace_country_names_dic.keys():
        to_add[replace_country_names_dic[country]] = religion_list
        to_delete.append(country)
        
# delete old names
for country in to_delete:
    del country_religions[country]
    
# add new ones
country_religions = {**country_religions, **to_add}

# manual addition
country_religions["South Sudan"] = ["Christian", "Islam"]

# Adding Languages and Religions to Dataframe

In [134]:
# for creating a df
country_name_list = []
country_alpha_2_list = []
country_alpha_3_list = []
languages_list = []
religion_list = []

for country, language_list in country_languages.items():
    # use the pycountry package to obtain the common names, and the alpha2 and alpha3 abbreviations
    pycountry_object = pycountry.countries.search_fuzzy(country)[0]
    country_name_list.append(pycountry_object.name)
    country_alpha_2_list.append(pycountry_object.alpha_2)
    country_alpha_3_list.append(pycountry_object.alpha_3)
    languages_list.append(language_list)
    religion_list.append(country_religions[country])
    
# create it
country_df = df = pd.DataFrame(list(zip(country_name_list, country_alpha_2_list, country_alpha_3_list, 
                                             languages_list, religion_list)), 
                                   columns =["name", "alpha_2", "alpha_3", "languages", "religions"]) 

# save it
country_df.to_pickle("countries.pkl")

# display it
country_df

Unnamed: 0,name,alpha_2,alpha_3,languages,religions
0,Afghanistan,AF,AFG,"[Afghan, Pashto, Uzbek]",[Islam]
1,Albania,AL,ALB,"[Albanian, Greek]","[Islam, Roman Catholic, Orthodox, Catholic]"
2,Algeria,DZ,DZA,"[Arabic, French, Berber]","[Islam, Jewish, Christian]"
3,Andorra,AD,AND,"[Catalan, French, Castilian]","[Roman Catholic, Catholic]"
4,Angola,AO,AGO,"[Portuguese, Umbundu, Kikongo]","[Roman Catholic, Protestant, Catholic, Protest..."
5,Antigua and Barbuda,AG,ATG,"[English, Antiguan]","[Roman Catholic, Protestant, Christian, Anglic..."
6,Argentina,AR,ARG,"[Spanish, Italian, English]","[Roman Catholic, Jewish, Protestant, Catholic,..."
7,Armenia,AM,ARM,"[Armenian, Kurdish]",[Christian]
8,Australia,AU,AUS,"[English, Mandarin, Arabic]","[Islam, Roman Catholic, Buddhist, Christian, A..."
9,Austria,AT,AUT,"[German, Turkish, Serbian]","[Islam, Roman Catholic, Protestant, Catholic, ..."
