In [None]:
# Importing necessary libraries. You will most likely need to install: JUPYTER, PANDAS, NUMPY, ISO3166
import pandas as pd
import numpy as np
from dictionaries.country_dictionary import country_dictionary
from dictionaries.iso_mapping import ISO_mapping
from dictionaries.sub_continents_dictionary import sub_continents_dictionary
from dictionaries.continents_dictionary import continents_dictionary
from dictionaries.country_avg_income import income_dictionary
from iso3166 import countries
import unicodedata
from collections import defaultdict
import re
# Let pandas display up to 500 rows before truncating a DataFrame's output.
pd.options.display.max_rows = 500

In [None]:
# Load the raw responses into a DataFrame (pandas' tabular data structure).
# If this raises an error, make sure the file is a CSV; if the file is comma-separated,
# change sep=';' to sep=','.
df_country = pd.read_csv(filepath_or_buffer='country_data.csv', sep=';')

In [None]:
# Normalize a raw response so it can be looked up in the country dictionary: upper-case it,
# replace punctuation with spaces and, for Latin-script text, strip the accents.
# Blindly converting to ASCII would erase ALL special characters, including Chinese characters,
# Arabic script etc. Regular accented letters tend to have a code point below 900, so that
# threshold is used to tell accents apart from other writing systems.

# Characters replaced by a space. "|" is part of the set on purpose: the pattern has always
# listed it inside the character class, so it is treated as punctuation.
_PUNCTUATION_PATTERN = re.compile(r"""[-|_:·.,"“”/+()?'’&!*%]""")

# Code points at or above this value are treated as belonging to a non-Latin script.
_NON_LATIN_THRESHOLD = 900


def normalize_data(country):
    """Return the upper-cased, punctuation-free form of a response.

    Args:
        country: the raw response. Non-string values are converted with ``str()`` first
            so a stray number in the column does not abort the whole run.

    Returns:
        The normalized string with surrounding whitespace removed. If the text contains any
        character with a code point >= 900 it is returned with its characters intact;
        otherwise accents are stripped and the result is pure ASCII.
    """
    text = country if isinstance(country, str) else str(country)
    cleaned = _PUNCTUATION_PATTERN.sub(" ", text.upper().strip())

    if any(ord(char) >= _NON_LATIN_THRESHOLD for char in cleaned):
        # Non-Latin script: keep the characters, but still trim the spaces that replaced
        # leading/trailing punctuation so both return paths are consistent.
        return cleaned.strip()

    # NFD splits an accented letter into base letter + combining mark; the ASCII encode with
    # errors='ignore' then drops the combining marks.
    without_accents = unicodedata.normalize('NFD', cleaned).encode('ascii', 'ignore').decode("utf-8")
    return without_accents.strip()

In [None]:
# Build a reverse lookup table from a {key: [values]} dictionary to speed up matching: looking
# a response up as a key is cheaper than looping through every list of values. The reversed
# table only lives in memory for the duration of the run.
def reverse_dict(dict_to_reverse):
    """Map every value, and every key itself, of ``dict_to_reverse`` back to its key.

    Entries are processed in dictionary order, so when the same value appears under several
    keys the last one processed wins.

    Returns:
        A ``defaultdict(list)`` mapping value -> key and key -> key.
    """
    lookup = defaultdict(list)
    for canonical, aliases in dict_to_reverse.items():
        lookup.update(dict.fromkeys(aliases, canonical))
        lookup[canonical] = canonical
    return lookup

# The four reverse lookup tables used by the rest of the notebook.
country_rev, sub_continents_rev, continents_rev, income_rev = map(
    reverse_dict,
    (country_dictionary, sub_continents_dictionary, continents_dictionary, income_dictionary),
)

In [None]:
# Some countries get misclassified because their names overlap. For example, "republic of china"
# would be classified as China and not as Taiwan. The same problem exists with congo & democratic
# republic of congo, northern ireland & ireland, guinea & papua new guinea & equatorial guinea etc.

# Words that mark a "... REPUBLIC OF CHINA" response as the People's Republic (i.e. China).
_PEOPLES_REPUBLIC_FLAGS = frozenset({"PEOPLE", "PEOPLES"})
_EQUATORIAL_GUINEA_FLAGS = frozenset({"EQUATORIAL", "ECUATORIAL", "EQUATORIALE"})
_PAPUA_NEW_GUINEA_FLAGS = frozenset({"PAPUA", "NEW"})
_NORTHERN_IRELAND_FLAGS = frozenset({"NORTHERN", "NORTH"})
# "REPUBLIC" alone is deliberately NOT a flag: "Republic of the Congo" is a different country.
_DEMOCRATIC_REPUBLIC_CONGO_FLAGS = frozenset({"DEMOCRATIC", "DEMOCRATIQUE", "DR"})


def check_overlapping_country_names(country, key):
    """Correct a matched country name when the response points at a similarly named country.

    Args:
        country: the normalized response (upper-case, punctuation replaced by spaces).
        key: the country name matched so far; any value that is not one of the ambiguous
            names handled here is passed through unchanged.

    Returns:
        The corrected country name, ``None`` when ``key`` is "MISSING", otherwise ``key``.
    """
    words = set(country.split())

    if key == "CHINA":
        # "REPUBLIC OF CHINA" is Taiwan, but "PEOPLE S REPUBLIC OF CHINA" is China itself.
        republic_of_china = "REPUBLIC" in words and words.isdisjoint(_PEOPLES_REPUBLIC_FLAGS)
        if "TAIWAN" in words or republic_of_china:
            return "TAIWAN"
    elif key == "GUINEA":
        if not words.isdisjoint(_EQUATORIAL_GUINEA_FLAGS):
            return "EQUATORIAL_GUINEA"
        if not words.isdisjoint(_PAPUA_NEW_GUINEA_FLAGS):
            return "PAPUA_NEW_GUINEA"
    elif key == "IRELAND":
        if not words.isdisjoint(_NORTHERN_IRELAND_FLAGS):
            return "UNITED_KINGDOM"
    elif key == "CONGO":
        if not words.isdisjoint(_DEMOCRATIC_REPUBLIC_CONGO_FLAGS):
            return "DEMOCRATIC_REPUBLIC_OF_CONGO"
    elif key == "MISSING":
        return None
    return key

In [None]:
# Look the whole normalized response up in the reversed country dictionary.
def find_country(country):
    """Return the country name stored for ``country`` in ``country_rev``, or None if absent."""
    canonical_name = country_rev.get(country)
    return canonical_name

In [None]:
# A response made of several words (ex: "I am from Canada") is not found by a whole-string
# lookup, so split it into its words and look each word up individually.
def find_country_by_value(country_normalized, country_matched):
    """Return the existing match, or the match for the first known word of the response.

    Args:
        country_normalized: the normalized response text.
        country_matched: result of the whole-string lookup; it counts as "no match" when
            its string form is "None" (covers both ``None`` and the literal string).

    Returns:
        ``country_matched`` when it is a match; otherwise the ``country_rev`` entry of the
        first word that is a key of ``country_rev``, or None when no word matches.
    """
    already_matched = str(country_matched) != "None"
    if already_matched:
        return country_matched

    word_matches = (
        country_rev.get(word)
        for word in country_normalized.split()
        if word in country_rev.keys()
    )
    return next(word_matches, None)

In [None]:
# Get the ISO code for the detected country from the hand-made ISO_mapping table.
def get_iso(country):
    """Return ``ISO_mapping``'s entry for ``country`` (the ``.get`` default, None, if absent)."""
    iso_code = ISO_mapping.get(country)
    return iso_code

In [None]:
# Get the sub-continent the country is located in.
def classify_country_as_sub_continent(iso_code):
    """Return the ``sub_continents_rev`` entry for ``iso_code``, or None when it is not a key."""
    sub_continent = sub_continents_rev.get(iso_code)
    return sub_continent

In [None]:
# Get the continent the country is located in.
def classify_country_as_continent(iso_code):
    """Return the ``continents_rev`` entry for ``iso_code``, or None when it is not a key.

    Note: despite the parameter name, the notebook calls this with the values of the
    ``sub_continent`` column.
    """
    continent = continents_rev.get(iso_code)
    return continent

In [None]:
# Get the average income of the country (scale is 1 - 4, where 1 is low and 4 is high income).
def get_avg_income(iso):
    """Return the ``income_rev`` entry for ``iso``, or None when it is not a key."""
    income_level = income_rev.get(iso)
    return income_level

In [None]:
# Keep only the rows that have a response, so blank answers are never looked up.
df_country = df_country.loc[df_country['country'].notna()]

In [None]:
# Normalizing the responses and saving them in a new column called country_normalized.
# otypes is given explicitly because np.vectorize cannot infer an output type from an empty
# input and would raise if no rows were left after the null filter.
df_country['country_normalized'] = np.vectorize(normalize_data, otypes=[str])(df_country['country'])

In [None]:
# First attempt: match the response by looking up the whole normalized response,
# saving the result in column country_matched_by_key.
# Without otypes, np.vectorize infers the output type from the first row only, so unmatched
# rows came out as the string "None" or as a real None depending on row order. Forcing a
# string output makes unmatched rows always the string "None" and also supports an empty frame.
df_country['country_matched_by_key'] = np.vectorize(find_country, otypes=[str])(df_country['country_normalized'])

In [None]:
# Second attempt: match the response by splitting it and looking up each individual word,
# saving the result in column country_matched_value.
# otypes=[str] keeps unmatched rows as the string "None" regardless of which row comes first
# (np.vectorize otherwise infers the output type from the first row) and supports an empty frame.
df_country['country_matched_value'] = np.vectorize(find_country_by_value, otypes=[str])(
    df_country['country_normalized'], df_country['country_matched_by_key'])

In [None]:
# Making slight adjustments for possible misclassifications by calling check_overlapping_country_names,
# saving the results in column country_overlapping_names.
# otypes=[str] keeps unmatched rows as the string "None" regardless of which row comes first
# (np.vectorize otherwise infers the output type from the first row) and supports an empty frame.
df_country['country_overlapping_names'] = np.vectorize(check_overlapping_country_names, otypes=[str])(
    df_country['country_normalized'], df_country['country_matched_value'])

In [None]:
# Getting the ISO code of each country and saving it in the ISO column.
# otypes=[str] keeps rows without a code as the string "None" regardless of which row comes
# first (np.vectorize otherwise infers the output type from the first row) and supports an
# empty frame.
df_country['ISO'] = np.vectorize(get_iso, otypes=[str])(df_country['country_overlapping_names'])

In [None]:
# Getting the sub_continent of each country and saving it in the sub_continent column.
# otypes=[str] keeps rows without a sub-continent as the string "None" regardless of which row
# comes first (np.vectorize otherwise infers the output type from the first row) and supports
# an empty frame.
df_country['sub_continent'] = np.vectorize(classify_country_as_sub_continent, otypes=[str])(df_country['ISO'])

In [None]:
# Getting the continent of each country (looked up from its sub_continent) and saving it in
# the continent column.
# otypes=[str] keeps rows without a continent as the string "None" regardless of which row
# comes first (np.vectorize otherwise infers the output type from the first row) and supports
# an empty frame.
df_country['continent'] = np.vectorize(classify_country_as_continent, otypes=[str])(df_country['sub_continent'])

In [None]:
# Getting the average income of each country and saving it in the avg_income column.
# NOTE(review): np.vectorize infers the output type from the first row; confirm the type of
# the values in income_rev before relying on how rows without an income level are represented.
df_country = df_country.assign(avg_income=np.vectorize(get_avg_income)(df_country['ISO']))

In [None]:
# Dropping the intermediate columns we created that are no longer needed for the final dataset.
df_country = df_country.drop(
    columns=['country_normalized', 'country_matched_by_key', 'country_matched_value']
)

In [None]:
# Renaming the column holding the final normalized country name.
df_country = df_country.rename({'country_overlapping_names': 'country_normalized'}, axis='columns')

In [None]:
# Printing the list of responses that were not categorized. Usually gibberish.
# If there is a name that should have been detected, add it to the dictionary under the right country.
# An uncategorized row holds the string "None" when the column was produced as strings, but a
# real None/NaN when it was produced as objects, so both representations are checked.
unclassified_countries = df_country.loc[
    df_country['country_normalized'].isna() | (df_country['country_normalized'] == "None")
]
unclassified_countries_list = unclassified_countries['country'].tolist()
# The first figure counts distinct responses, the second counts every unclassified row.
print("There are a total of", len(set(unclassified_countries_list)), " different unclassified Responses and ",
      len(unclassified_countries_list), " total unclassified responses...")
print(unclassified_countries_list)

In [None]:
# Saving our dataframe to a csv file (other formats such as SAS or Excel are possible as well).
# The file is written with a relative path, i.e. into the current working directory
# (normally the directory holding this notebook and the initial dataset).
df_country.to_csv(path_or_buf='country_sorted.csv')