In [88]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pycountry
import itertools

# Acquiring Most Common Languages per Country

In [89]:
# url of a website containg most common languages per country
language_url = "https://www.infoplease.com/world/countries/languages-spoken-in-each-country-of-the-world"

In [90]:
# get html page
page = requests.get(language_url)

# transform to soup object
soup = BeautifulSoup(page.content, 'html.parser')
    
# find the table with all information (right, top of wikipedia article)
table = soup.find('table')

# get all the rows of the table, discard the header line
rows = table.find_all("tr")[1:]

In [91]:

country_languages = {}

# regular expressions
find_country_regex = r">.*<"
remove_bracket_regex = r'\([^\]]*\)'

# extract countries and corresponding languages
for row in rows:
    # get the country names
    country = str(re.findall(find_country_regex, str(row.findChild("a")))[0][1:-1])
    # remove hexadecimals
    country = country.encode('ascii', errors='ignore').decode("UTF-8")
    # remove brackets
    country = re.sub(remove_bracket_regex, "", country)
    # replace st. by sant
    country = country.replace("St.", "Saint")
    #country = country.decode('utf8').encode('ascii', errors='ignore')
    # contains the whole row of one country
    temp_list = str(row.findAll('p')[1])[3:-4].split(",")
    # extract only the language names
    languages = []
    for language in temp_list:
        # only get 3 languages
        if len(languages) < 3:
            languages.append(re.findall(r"([^\s]+)", language)[0])
    # add to dictionary
    country_languages[country] = languages



In [92]:
# fix wrong words
remove_words = ["other", "and", "widely", "tribal", "24", "60", "declining", "Qazaq)", 
                "multilingual", "indigenous", "which", "small", "unspecified", "but", "universal",
               "regular", "120", "the", "only", "one", "taught", "used", "local", "some", "numerous", 
                "a", "Standard", "based", "junior", "modified", "according", "also", "native", "only",
               "regional", "script", "second", "dialect"]

for country, language in country_languages.items():
    for illegal_word in remove_words:
        if illegal_word in country_languages[country]:
            country_languages[country].remove(illegal_word)
################################################################################################
# fix country names for pycountry
replace_country_names_dic = {"Congo, Democratic Republic of the":"Congo, The Democratic Republic of the",     
                        "Congo, Republic of": "Congo",
                        "Cape Verde":"Cabo Verde",
                        "Palestinian State ":"Palestine, State of",
                        "East Timor":"Timor-Leste",
                        "Korea, North":"Korea, Democratic People's Republic of",
                        "Korea, South":"Korea, Republic of",
                        "Laos":"Lao People's Democratic Republic",
                        "So Tom and Prncipe":"Sao Tome and Principe",
                        "Cte d'Ivoire":"Côte d'Ivoire"}
# since we cannot change the size of the dictionary withing a loop, save the ones that are supposed
# to be deleted and their replacements
to_delete = []
to_add = {}
# depending on replace_country_names_dic, look at what to replace and what to replace with
for country, language_list in country_languages.items():
    if country in replace_country_names_dic.keys():
        to_add[replace_country_names_dic[country]] = language_list
        to_delete.append(country)
        
# delete old names
for country in to_delete:
    del country_languages[country]
    
# add new ones
country_languages = {**country_languages, **to_add}

################################################################################################
# manual fixes
country_languages["China"].extend(["Mandarin", "Chinese"])
country_languages["Western Sahara"].extend(["Arabic"])
country_languages["Philippines"].extend(["English"])
del country_languages["Kosovo"] # not included in pycountry

################################################################################################
# split "/" and only get first language
for key in country_languages.keys():
    languages = country_languages[key]

    new_languages = []
    for l in languages:
        if "/" in l:
            new_languages.extend(l.split("/"))
        else:
            new_languages.append(l)
    languages = new_languages

for key in country_languages.keys():
    languages = country_languages[key]
    print("\n")
    print(languages)
    country_languages[key] = languages[0]
    print(languages[0])



['Afghan', 'Pashto', 'Uzbek']
Afghan


['Albanian', 'Greek']
Albanian


['Arabic', 'French', 'Berber']
Arabic


['Catalan', 'French', 'Castilian']
Catalan


['Portuguese', 'Umbundu', 'Kikongo']
Portuguese


['English', 'Antiguan']
English


['Spanish', 'Italian', 'English']
Spanish


['Armenian', 'Kurdish']
Armenian


['English', 'Mandarin', 'Arabic']
English


['German', 'Turkish', 'Serbian']
German


['Azerbaijani', 'Russian', 'Armenian']
Azerbaijani


['Azerbaijani', 'Russian', 'Armenian']
Azerbaijani


['Arabic', 'English', 'Farsi']
Arabic


['Bangla']
Bangla


['English', 'Bajan']
English


['Russian', 'Belarusian']
Russian


['Dutch', 'French', 'German']
Dutch


['English', 'Spanish', 'Creole']
English


['French', 'Fon']
French


['Sharchhopka', 'Dzongkha', 'Lhotshamkha']
Sharchhopka


['Spanish', 'Quechua', 'Aymara']
Spanish


['Bosnian', 'Serbian', 'Croatian']
Bosnian


['Setswana', 'Sekalanga', 'Shekgalagadi']
Setswana


['Portuguese']
Portuguese


['Malay', 'English', 'Chi

In [93]:
set(country_languages.values())

{'Afghan',
 'Albanian',
 'Arabic',
 'Armenian',
 'Asante',
 'Azerbaijani',
 'Bahasa',
 'Bangla',
 'Bemba',
 'Bislama',
 'Bokmal',
 'Bosnian',
 'Bulgarian',
 'Castilian',
 'Catalan',
 'Cebuano',
 'Creole',
 'Crioulo',
 'Croatian',
 'Danish',
 'Dhivehi',
 'Dutch',
 'English',
 'Estonian',
 'Finnish',
 'French',
 'Georgian',
 'German',
 'Greek',
 'Hassaniya',
 'Hebrew',
 'Hindi',
 'Hungarian',
 'I-Kiribati',
 'Icelandic',
 'Italian',
 'Japanese',
 'Kazakh',
 'Khmer',
 'Kinyarwanda',
 'Kirundi',
 'Kiswahili',
 'Korean',
 'Kyrgyz',
 'Lao',
 'Latvian',
 'Lithuanian',
 'Luxembourgish',
 'Macedonian',
 'Makhuwa',
 'Malay',
 'Maltese',
 'Mandarin',
 'Marshallese',
 'Melanesian',
 'Moldovan/Romanian',
 'Mongolian',
 'Nauruan',
 'Nepali',
 'Oromo',
 'Oshiwambo',
 'Palauan',
 'Persian',
 'Polish',
 'Portuguese',
 'Punjabi',
 'Romanian',
 'Russian',
 'Samoan',
 'Serbian',
 'Sesotho',
 'Setswana',
 'Seychellois',
 'Sharchhopka',
 'Shona',
 'Sinhala',
 'Slovak',
 'Slovene',
 'Somali',
 'Spanish',
 'S

In [94]:
country_languages

{'Afghanistan': 'Afghan',
 'Albania': 'Albanian',
 'Algeria': 'Arabic',
 'Andorra': 'Catalan',
 'Angola': 'Portuguese',
 'Antigua and Barbuda': 'English',
 'Argentina': 'Spanish',
 'Armenia': 'Armenian',
 'Australia': 'English',
 'Austria': 'German',
 'Azerbaijan': 'Azerbaijani',
 'Bahamas': 'Azerbaijani',
 'Bahrain': 'Arabic',
 'Bangladesh': 'Bangla',
 'Barbados': 'English',
 'Belarus': 'Russian',
 'Belgium': 'Dutch',
 'Belize': 'English',
 'Benin': 'French',
 'Bhutan': 'Sharchhopka',
 'Bolivia': 'Spanish',
 'Bosnia and Herzegovina': 'Bosnian',
 'Botswana': 'Setswana',
 'Brazil': 'Portuguese',
 'Brunei': 'Malay',
 'Bulgaria': 'Bulgarian',
 'Burkina Faso': 'French',
 'Burundi': 'Kirundi',
 'Cambodia': 'Khmer',
 'Cameroon': 'English',
 'Canada': 'English',
 'Central African Republic': 'French',
 'Chad': 'French',
 'Chile': 'Spanish',
 'China': 'Yue',
 'Colombia': 'Spanish',
 'Comoros': 'Arabic',
 'Costa Rica': 'Spanish',
 'Croatia': 'Croatian',
 'Cuba': 'Spanish',
 'Cyprus': 'Greek',
 '

# Acquiring Most Common Religions per Country

In [108]:
# url of a website containg most common languages per country
language_url = "https://www.infoplease.com/world/social-statistics/world-religions"
religions_permitted = ["Islam", "Roman Catholic", "Jewish", "Buddhist", "Protestant", "Christian", 
                       "Orthodox", "Buddhism", "Hindu", "Anglican", "Catholic", "Evangelical",
                      "Church of Tuvalu"]

In [109]:
# get html page
page = requests.get(language_url)

# transform to soup object
soup = BeautifulSoup(page.content, 'html.parser')
    
# find the table with all information (right, top of wikipedia article)
table = soup.find('table')

# get all the rows of the table, discard the header line
rows = table.find_all("tr")

In [110]:
country_religions = {}

# regular expressions
find_country_regex = r">.*<"
find_country_regex_alt = r"<td>(.*?)</td>"
remove_bracket_regex = r"\([^\]]*\)"
get_percentage_regex = r"/d{,3}\%"
get_first_country_regex = r"(.+?)\d" 
# extract countries and corresponding languages
for row in rows:
    # get the country names depending how the text is embedded in the html
    if row.findChild("a"):

        country = str(re.findall(find_country_regex, str(row.findChild("a")))[0][1:-1])
    else:
        country = str(re.findall(find_country_regex_alt, str(row))[0])
    
    # remove hexadecimals
    country = country.encode('ascii', errors='ignore').decode("UTF-8")
    # remove brackets
    country = re.sub(remove_bracket_regex, "", country)
    # replace st. by sant
    country = country.replace("St.", "Saint")
    # removing trailing whitespace
    country = country.rstrip()
    
    # fill dictionary
    country_religions[country] = []
    print("="*10)
    # HORRIFICX 
    # string of all languages with percentages
    languages_text = str(str(re.findall(find_country_regex_alt, str(row))[1]))
    print(languages_text)
    if not re.findall(get_first_country_regex, languages_text):
        first_language = languages_text
        found_one = False
        for r in religions_permitted:
                        if found_one:
                            break
                        if r in str(first_language):
                            found_one = True
                            if not r in country_religions[country]:
                                country_religions[country] = r
                                print(r)
        
        
    else:
                
        found_one = False
        languages = re.findall(get_first_country_regex, languages_text)
        
        for first_language in languages:
            # remove possible brackets
            if "(" in first_language:
                brckt_indx = first_language.find("(")
                first_language = first_language[:brckt_indx]

    
            for r in religions_permitted:
                if found_one:
                    break
                if r in str(first_language):
                    found_one = True
                    # merge branches of one religion
                    if r == "Buddhist":
                        r = "Buddhism"
                        print
                    elif r == "Hindu":
                        r = "Hinduism"

                    if not r in country_religions[country]:
                        country_religions[country] = r
                        print(r)
            if found_one:
                break

Islam (Sunni 80%, Shiite 19%), other 1%
Islam
Islam 70%, Albanian Orthodox 20%, Roman Catholic 10% (est.)
Islam
Islam (Sunni) 99% (state religion), Christian and Jewish 1%
Islam
Roman Catholic (predominant)
Roman Catholic
Indigenous 47%, Roman Catholic 38%, Protestant 15% (1998 est.)
Roman Catholic
Christian (predominantly Anglican and other Protestant; some Roman Catholic)
Roman Catholic
Roman Catholic 92%, Protestant 2%, Jewish 2%, other 4%
Roman Catholic
Armenian Apostolic 95%, other Christian 4%, Yezidi 1%
Christian
Roman Catholic 26%, Anglican 21%, other Christian 21%, Buddhist 2%, Islam 2%, other 1%, none 15% (2001)
Roman Catholic
Roman Catholic 74%, Protestant 5%, Islam 4%, none 12% (2001)
Roman Catholic
Islam 93%, Russian Orthodox 3%, Armenian Orthodox 2%, other 2% (1995 est.)
Islam
Baptist 35%, Anglican 15%, Roman Catholic 14%, Pentecostal 8%, Church of God 5%, Methodist 4%, other Christian 15% (2000)
Anglican
Islam (Shiite and Sunni) 81%, Christian 9%
Islam
Islam 83%, Hindu 1

In [111]:
# fix country names for pycountry
replace_country_names_dic = {"Congo, Democratic Republic of the":"Congo, The Democratic Republic of the",     
                        "Congo, Republic of": "Congo",
                        "Cape Verde":"Cabo Verde",
                        "Palestinian State":"Palestine, State of",
                        "East Timor":"Timor-Leste",
                        "Korea, North":"Korea, Democratic People's Republic of",
                        "Korea, South":"Korea, Republic of",
                        "Laos":"Lao People's Democratic Republic",
                        "So Tom and Prncipe":"Sao Tome and Principe",
                        "Cte d'Ivoire":"Côte d'Ivoire",
                        "Macedonia":"North Macedonia",
                        "Swaziland":"Eswatini"}
# since we cannot change the size of the dictionary withing a loop, save the ones that are supposed
# to be deleted and their replacements
to_delete = []
to_add = {}

# depending on replace_country_names_dic, look at what to replace and what to replace with
for country, religion_list in country_religions.items():
    if country in replace_country_names_dic.keys():
        to_add[replace_country_names_dic[country]] = religion_list
        to_delete.append(country)
        
# delete old names
for country in to_delete:
    del country_religions[country]
    
# add new ones
country_religions = {**country_religions, **to_add}

# manual addition
country_religions["South Sudan"] = "Christianity"
country_religions["China"] = "Buddhism"
country_religions["Sweden"] = "Christianity"
country_religions["Israel"] = "Christianity"

# Adding Languages and Religions to Dataframe

In [120]:
# for creating a df
country_name_list = []
country_alpha_2_list = []
country_alpha_3_list = []
languages_list = []
religion_list = []

for country, language_list in country_languages.items():
    # use the pycountry package to obtain the common names, and the alpha2 and alpha3 abbreviations
    pycountry_object = pycountry.countries.search_fuzzy(country)[0]
    country_name_list.append(pycountry_object.name)
    country_alpha_2_list.append(pycountry_object.alpha_2)
    country_alpha_3_list.append(pycountry_object.alpha_3)
    languages_list.append(language_list)
    religion_list.append(country_religions[country])
    
# create it
country_df = pd.DataFrame(list(zip(country_name_list, country_alpha_2_list, country_alpha_3_list, 
                                             languages_list, religion_list)), 
                                   columns =["name", "alpha_2", "alpha_3", "languages", "main_religion"]) 

# last fixes
country_df.loc[country_df.main_religion == "Christian", "main_religion"] = "Christianity"
country_df.loc[country_df.main_religion == "Buddhist", "main_religion"] = "Buddhism"

country_df.loc[country_df.languages == "Castilian", "languages"] = "Spanish"

# save it
country_df.to_pickle("data/countries.pkl")

# display it
country_df

Unnamed: 0,name,alpha_2,alpha_3,languages,main_religion
0,Afghanistan,AF,AFG,Afghan,Islam
1,Albania,AL,ALB,Albanian,Islam
2,Algeria,DZ,DZA,Arabic,Islam
3,Andorra,AD,AND,Catalan,Roman Catholic
4,Angola,AO,AGO,Portuguese,Roman Catholic
...,...,...,...,...,...
192,"Korea, Democratic People's Republic of",KP,PRK,Korean,Buddhism
193,"Korea, Republic of",KR,KOR,Korean,Christianity
194,Lao People's Democratic Republic,LA,LAO,Lao,Buddhism
195,"Palestine, State of",PS,PSE,Palauan,Islam
