In [5]:
'''Install pre-req modules'''
!pip install lxml pandas rapidfuzz ipython plotly



## Import Modules

In [6]:
import pandas as pd
import plotly.express as px
from rapidfuzz import process, fuzz
from IPython.display import display, HTML

## Custom Functions
### Display Functions

In [7]:
def write(text,colour='black',style='normal',header=0):
    htag=dict()
    if header > 0:
        htag['open'] = f"<h{header}>"
        htag['close'] = f"</h{header}>"
    else:
        htag['open'] =  htag['close'] = ""
    display(HTML(f'<span style="color: {colour};font-style:{style}"><br>{htag['open']}{text}{htag['close']}</span>'))

def visual_check_dataframe(input_df,dataset_name,rows=3,sample_only=False):
    '''Run some visual checks on dataframe so we can see how it looks'''
    write(f"Starting Visual Check : {dataset_name}",'green',header=4)
    if not sample_only:
        write(f'Check top {rows} row(s), followed by a sample of {rows} row(s) and then the bottom {rows} row(s)')
        display(input_df.head(rows))
    if sample_only: write(f'Checking random sample of {rows} rows')
    display(input_df.sample(rows))
    if not sample_only:
        display(input_df.tail(rows))
    write(f"Ending Visual Check : {dataset_name}",'red',style='italic',header=5)

### Data Import Functions
#### Functions for importing supplementary Datasets providing additional geoographic information

In [8]:
def get_us_state_abbreviation_dataset_from_web():
    output_df = pd.read_html('https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States')[1]
    return output_df

def get_tld_country_dataset():
    '''Retrieve all TLD country data from Wikipedia'''
    wiki_page = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2"
    output_df = pd.read_html(wiki_page)[4]
    return output_df

def get_m49_country_dataset():
    #https://unstats.un.org/unsd/methodology/m49/overview/#
    output_df = pd.read_csv('imported-datasets/UNSD — Methodology.csv', sep=';')
    return output_df

#### Functions for importing supplementary Datasets providing location detail for Cloud provider Datacentres

In [9]:
def get_aws_region_dataset():
    output_df = pd.read_html('https://docs.aws.amazon.com/global-infrastructure/latest/regions/aws-regions.html')[0]
    return output_df

def azure_region_dataset():
    output_df = pd.read_html("https://learn.microsoft.com/en-us/azure/reliability/regions-list")
    return output_df[1]

def get_google_region_dataset():
    output_df = pd.read_html('https://en.wikipedia.org/wiki/Google_data_centers')[1]
    return output_df

### Data Cleansing/Sanitizing Functions
#### Functions for cleansing of supplementary Datasets providing additional geographic information

In [24]:
def clean_us_state_abbreviation_dataset(input_df):
    output_df = input_df.copy()
    '''Drop everything after first two columns'''
    output_df.drop(columns=[output_df.columns[i] for i in range(2,len(output_df.columns))], inplace=True)
    '''Set columns to something easier'''
    output_df.columns=['County','County Code']
    '''Some entries like Virginia have a [B] on the end due to weblinks from being imported from webpage'''
    output_df['County'] = output_df['County'].str.replace(r'\[[A-Z]+\]$', '', regex=True)
    return output_df

def clean_tld_country_data(input_df):
    output_df = input_df.copy()
    columns_to_drop = ['Year','Notes']
    columns_to_rename = {'Code': 'A2 Code', 'ccTLD':'tld'}
    output_df.drop(columns=columns_to_drop, inplace=True)
    output_df.rename(columns=columns_to_rename, inplace=True)
    return output_df

def clean_m49_country_data(input_df):
    output_df = input_df.copy()
    columns_to_drop = [
        'Global Code',
        'Global Name',
        'Land Locked Developing Countries (LLDC)',
        'Least Developed Countries (LDC)',
        'Small Island Developing States (SIDS)',
        'Intermediate Region Code',
        'Intermediate Region Name',]
    output_df.drop(columns=columns_to_drop,inplace=True)
    return output_df



#### Functions for cleansing of supplementary Datasets providing location detail for Cloud provider Datacentres

In [11]:
def clean_aws_region_dataset(input_df):
    columns_to_rename = { 'Code': 'Cloud Region' }
    output_df = input_df.copy().rename(columns=columns_to_rename)
    return output_df

def clean_azure_region_dataset(input_df):
    columns_to_drop = ['Availability zone support','Paired region']
    columns_to_rename = {'Region':'cloud-region'}
    '''Drop columns we don't need'''
    output_df = input_df.copy().drop(columns_to_drop, axis=1)
    '''Set cloud region column name to match our master datasets
       then remove all spaces from the entries, and convert to
       lowercase, so the content matches too'''
    output_df.rename(columns=columns_to_rename, inplace=True)
    output_df['cloud-region'] = output_df.apply(
        lambda row: row['cloud-region'].lower().replace(" ", ""), axis=1)
    output_df['Locality'] = output_df['country'] = output_df['Subdivision'] = ""
    return output_df

def clean_google_region_dataset(input_df,us_states_df):
    columns_to_drop = ['Geo','Timeline','Description','Products Location']
    output_df = input_df.copy().drop(columns=columns_to_drop)

    '''Some entries in Location have the reference number link from the webpage applied to
   the end of the name - e.g. Germany[61].   Use a regex to remove this;
   replace any substring that contains left & right square brackets, surrounding
   one or more numbers'''
    output_df['Location'] = output_df['Location'].str.replace(r'\[\d+\]$', '', regex=True)

    '''Extract the country from the end of the location cells'''
    output_df['country'] = output_df.apply(lambda row: row['Location'].split(',')[-1], axis=1)

    '''Remove rows where cloud location is null or a hyphen as there is no DC there'''
    output_df = output_df[output_df['Cloud Location'].notnull()]
    output_df = output_df[output_df['Cloud Location'] != "-"]

    '''Extract the Cloud Region from Cloud Location'''
    output_df['Cloud Region'] = output_df.apply(
        lambda row: row['Cloud Location'].split('(')[-1].split(')')[0], axis=1)

    '''Extract Locality from Location'''
    output_df['Locality'] = output_df.apply(
        lambda row: row['Location'].split(',')[0], axis=1)

    '''Extract subdivision code (county/province) from Locality'''
    output_df['Subdivision Code'] = output_df.apply(
        lambda row: row['Locality'].split('(')[-1].strip(')') if '(' in str(row['Locality']) else None, axis=1)

    '''Extract subdivision name (county/province) from Location'''
    output_df['Subdivision'] = output_df.apply(
        lambda row: row['Location'].split(',')[1].strip() if str(row['Location']).count(',') >= 2 else None, axis=1)

    '''Remove the subdivision code from locality now we have it in our own column
       - use regex to find a space, then left bracket, followed by 2 instances of letters,
       followed by a right bracket and the end of the string - then remove this'''
    output_df['Locality'] = output_df['Locality'].str.replace(r' \([A-Z]{2}\)$', '', regex=True)

    '''After creating new columns based on the values of others, we can now drop the
       original source columns as we don't need them'''
    output_df.drop(columns=['Location','Cloud Location'], inplace=True)

    '''Match the Subdivision Code from the list of US state abbreviations'''
    output_df = output_df.merge(
        us_states_df,
        left_on='Subdivision Code',
        right_on='County Code',
        how='left')

    output_df['Subdivision'] = output_df['Subdivision'].combine_first(output_df['County'])

    return output_df

### Data Merging Functions
#### Merging Dataframes

In [12]:
def merge_two_datasets(input_df1, input_df2, left,right,method='left'):
    output_df = pd.merge(input_df1, input_df2, left_on=left, right_on=right, how=method)
    output_df.fillna(0, inplace=True)
    return output_df

def merge_tld_and_m49_datasets(input_df_left, input_df_right):
    output_df = merge_two_datasets(input_df_left, input_df_right, 'A2 Code', 'ISO-alpha2 Code')
    output_df = output_df.astype(
        {'M49 Code': int,
         'Sub-region Code': int,
         'Region Code': int})
    return output_df

#### Functions for cleansing of merged supplementary Datasets

In [13]:
def cleanup_combined_tld_m49_dataset(input_df):
    output_df = input_df.copy()
    '''Check that every entry has a 'Country or Area' entry as this comes from the m49 dataset'''
    display(print("Country or area that is null:"))
    display(output_df[output_df['Country or Area'].isnull()])
    display("Country or area that is set to zero:")
    display(output_df[output_df['Country or Area'] == 0])
    display("Country or Area cell which is set to int type;")
    display(output_df[
        output_df['Country or Area'].apply(lambda x: isinstance(x, int))
    ])
    write("Taiwan is the only Country without an entry in the M49 - there are political reasons for this.  For now, we will correct this by assigning the Country name to the Country and Area name")

    '''Use conditional to find any rows that have Country or Area set to 0'''

    output_df.loc[
        output_df['Country or Area'] == 0,
        'Country or Area'
        ] = output_df.loc[
                output_df['Country or Area'] == 0,
                'Country name (using title case)'
                ]
    display("Country or area that is set to zero:")
    display(output_df[output_df['Country or Area'] == 0])
    display("Check Taiwan specifically :")
    display(output_df[output_df['A2 Code'] == 'TW'])

## Load Datasets
We have a number of datasets to load in here, some containing the actual data we want, and some used for sanitizing our data or adding additional context

### Supplementary Datasets

* Top Level Domain (TLD) listing by Country, from wikipedia
* M49 "Standard country or area codes for statistical use" dataset from the United Nations Statistics Division (UNSD)
* American States abbreviation codes to name mapping from wikipedia
* AWS Cloud Region names to location mapping from Amazons online documentation
* Google cloud Region names to location mapping from wikipedia

### Main Datasets

In [62]:

aws_region_df = get_aws_region_dataset()

azure_df = azure_region_dataset()
azure_df = clean_azure_region_dataset(azure_df)
google_region_df = get_google_region_dataset()
google_region_df = clean_google_region_dataset(google_region_df,usa_states_df)

## Supplementary Datasets

### USA States Abbreviation Code Dataset
#### Load in Data

In [16]:
usa_states_df = get_us_state_abbreviation_dataset_from_web()
visual_check_dataframe(usa_states_df,"USA State Code Dataset", rows=1)

Unnamed: 0_level_0,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8]",Cities,Cities,Ratification or admission[A],Population (2020)[10],Total area[11],Total area[11],Reps.
Unnamed: 0_level_1,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8].1",Capital,Largest[12],Ratification or admission[A],Population (2020)[10],mi2,km2,Reps.
0,Alabama,AL,Montgomery,Huntsville,"Dec 14, 1819",5024279,52420,135767,7


Unnamed: 0_level_0,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8]",Cities,Cities,Ratification or admission[A],Population (2020)[10],Total area[11],Total area[11],Reps.
Unnamed: 0_level_1,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8].1",Capital,Largest[12],Ratification or admission[A],Population (2020)[10],mi2,km2,Reps.
29,New Jersey,NJ,Trenton,Newark,"Dec 18, 1787",9288994,8723,22591,12


Unnamed: 0_level_0,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8]",Cities,Cities,Ratification or admission[A],Population (2020)[10],Total area[11],Total area[11],Reps.
Unnamed: 0_level_1,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8].1",Capital,Largest[12],Ratification or admission[A],Population (2020)[10],mi2,km2,Reps.
49,Wyoming,WY,Cheyenne,Cheyenne,"Jul 10, 1890",576851,97813,253335,1


#### Pre-processing of USA State Code dataset

In [17]:
usa_states_df = clean_us_state_abbreviation_dataset(usa_states_df)
visual_check_dataframe(usa_states_df,"USA State Code Dataset", rows=1)

Unnamed: 0,County,County Code
0,Alabama,AL


Unnamed: 0,County,County Code
24,Missouri,MO


Unnamed: 0,County,County Code
49,Wyoming,WY


### TLD Dataset
#### Load in Data

In [20]:
tld_df = get_tld_country_dataset()
visual_check_dataframe(tld_df,"TLD pre-cleanup", rows=1)

Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
0,AD,Andorra,1974,.ad,


Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
118,KM,Comoros,1974,.km,Code taken from name in Comorian: Komori Previ...


Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
248,ZW,Zimbabwe,1980,.zw,Name changed from Southern Rhodesia (RH)


### Pre-processing of TLD dataset

In [21]:
tld_df = clean_tld_country_data(tld_df)
write("Re-check TLD data once it's been cleaned")
visual_check_dataframe(tld_df, "TLD post-cleanup",rows=1,sample_only=True)


Unnamed: 0,A2 Code,Country name (using title case),tld
73,FO,Faroe Islands,.fo


### M49 Dataset
#### Load in Data

In [22]:
m49_df = get_m49_country_dataset()
visual_check_dataframe(m49_df,dataset_name="M49 - pre-cleanup", rows=1)

Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,


Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
17,1,World,2.0,Africa,202.0,Sub-Saharan Africa,14.0,Eastern Africa,Mauritius,480,MU,MUS,,,x


Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
247,1,World,9.0,Oceania,61.0,Polynesia,,,Wallis and Futuna Islands,876,WF,WLF,,,


### Pre-processing of M49 Dataset

In [25]:
m49_df = clean_m49_country_data(m49_df)
write("Re-check M49 data once it's been cleaned")
visual_check_dataframe(m49_df, dataset_name="M49 - post-cleanup", rows=1, sample_only=True)

Unnamed: 0,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
169,150.0,Europe,151.0,Eastern Europe,Bulgaria,100,BG,BGR


### Combine TLD and M49 Country Datasets

We have two supplementary datasets representing Geographic location that we will now combine

In [97]:
country_df = merge_tld_and_m49_datasets(tld_df, m49_df)
visual_check_dataframe(country_df, "Combined TLD & M49 Dataset",rows=1)

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,AD,Andorra,.ad,150,Europe,39,Southern Europe,Andorra,20,AD,AND


Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
151,MS,Montserrat,.ms,19,Americas,419,Latin America and the Caribbean,Montserrat,500,MS,MSR


Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
248,ZW,Zimbabwe,.zw,2,Africa,202,Sub-Saharan Africa,Zimbabwe,716,ZW,ZWE


We should confirm that every row in our tld dataframe was able to merge with one from the m49 dataset

In [98]:


cleanup_combined_tld_m49_dataset(country_df)



Country or area that is null:


None

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


'Country or area that is set to zero:'

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,0,0,0,0


'Country or Area cell which is set to int type;'

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,0,0,0,0


'Country or area that is set to zero:'

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


'Check Taiwan specifically :'

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,"Taiwan, Province of China",0,0,0


## Pre-processing of AWS region dataset

In [30]:
visual_check_dataframe(aws_region_df)
aws_region_df = clean_aws_region_dataset(aws_region_df)
write("Re-check AWS Region data once it's been cleaned")
visual_check_dataframe(aws_region_df, sample_only=True)

Unnamed: 0,Code,Name,AZs,Geography,Opt-in status
0,us-east-1,US East (N. Virginia),6,United States of America,Not required
1,us-east-2,US East (Ohio),3,United States of America,Not required
2,us-west-1,US West (N. California),3 †,United States of America,Not required


Unnamed: 0,Code,Name,AZs,Geography,Opt-in status
14,ap-southeast-2,Asia Pacific (Sydney),3,Australia,Not required
4,af-south-1,Africa (Cape Town),3,South Africa,Required
7,ap-southeast-3,Asia Pacific (Jakarta),3,Indonesia,Required


Unnamed: 0,Code,Name,AZs,Geography,Opt-in status
30,me-south-1,Middle East (Bahrain),3,Bahrain,Required
31,me-central-1,Middle East (UAE),3,United Arab Emirates,Required
32,sa-east-1,South America (São Paulo),3,Brazil,Not required


Unnamed: 0,Cloud Region,Name,AZs,Geography,Opt-in status
1,us-east-2,US East (Ohio),3,United States of America,Not required
2,us-west-1,US West (N. California),3 †,United States of America,Not required
26,eu-north-1,Europe (Stockholm),3,Sweden,Not required


## Combine AWS region dataframe with countries dataframe
The AWS region dataframe doesn't have full details of where the cloud dc is based.  So, we will combine it with the more comprehensive list compiled earlier

In [11]:
def combine_cloud_region_details_with_location_info(
        input_cloud_region_df,
        left='Geography',
        right='Country or Area',
        input_country_df = country_df,
):
    return pd.merge(
        input_cloud_region_df,
        input_country_df,
        left_on=left,
        right_on=right,
        how='left')

aws_combined_df = combine_cloud_region_details_with_location_info(aws_regions_from_html)
aws_combined_df.head(10)

Unnamed: 0,Cloud Region,Name,AZs,Geography,Opt-in status,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,us-east-1,US East (N. Virginia),6,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
1,us-east-2,US East (Ohio),3,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
2,us-west-1,US West (N. California),3 †,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
3,us-west-2,US West (Oregon),4,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
4,af-south-1,Africa (Cape Town),3,South Africa,Required,ZA,South Africa,.za,2.0,Africa,202.0,Sub-Saharan Africa,18.0,Southern Africa,South Africa,710.0,ZA,ZAF
5,ap-east-1,Asia Pacific (Hong Kong),3,Hong Kong,Required,,,,,,,,,,,,,
6,ap-south-2,Asia Pacific (Hyderabad),3,India,Required,IN,India,.in,142.0,Asia,34.0,Southern Asia,0.0,0,India,356.0,IN,IND
7,ap-southeast-3,Asia Pacific (Jakarta),3,Indonesia,Required,ID,Indonesia,.id,142.0,Asia,35.0,South-eastern Asia,0.0,0,Indonesia,360.0,ID,IDN
8,ap-southeast-5,Asia Pacific (Malaysia),3,Malaysia,Required,MY,Malaysia,.my,142.0,Asia,35.0,South-eastern Asia,0.0,0,Malaysia,458.0,MY,MYS
9,ap-southeast-4,Asia Pacific (Melbourne),3,Australia,Required,AU,Australia,.au,9.0,Oceania,53.0,Australia and New Zealand,0.0,0,Australia,36.0,AU,AUS


Check for Naan Values

In [17]:
'''Look for any records that didnt merge correctly'''
def identify_cloud_regions_with_incomplete_country_info(input_df):
    '''create a list of any rows with null values for Country or Area'''
    unmerged_cloud_regions = input_df[input_df['Country or Area'].isnull()]['Cloud Region'].to_list()
    '''Create a DataFrame collating details of the unmerged regions'''
    unmerged_details_df = input_df[input_df['Cloud Region'].isin(unmerged_cloud_regions)][['Cloud Region', 'Geography','Country name (using title case)','Country or Area','ccTLD','Region Code','Region Name']]
    return {'list': unmerged_cloud_regions, 'df': unmerged_details_df}

'''Report AWS regions missing or incomplete country info'''
unmerged_aws_cloud_regions = identify_cloud_regions_with_incomplete_country_info(aws_combined_df)
display("Unmerged Cloud Regions for AWS:")
display(unmerged_aws_cloud_regions['df'])

'Unmerged Cloud Regions for AWS:'

Unnamed: 0,Cloud Region,Geography,Country name (using title case),Country or Area,ccTLD,Region Code,Region Name
5,ap-east-1,Hong Kong,,,,,
12,ap-northeast-2,South Korea,,,,,
15,ap-east-2,Taiwan,,,,,
22,eu-west-2,United Kingdom,,,,,


Get a list of all country-or-area names from the M49 list

In [13]:
country_or_area_list = country_df['Country or Area'].tolist()
country_or_area_list

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina',
 'American Samoa',
 'Austria',
 'Australia',
 'Aruba',
 'Åland Islands',
 'Azerbaijan',
 'Bosnia and Herzegovina',
 'Barbados',
 'Bangladesh',
 'Belgium',
 'Burkina Faso',
 'Bulgaria',
 'Bahrain',
 'Burundi',
 'Benin',
 'Saint Barthélemy',
 'Bermuda',
 'Brunei Darussalam',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Brazil',
 'Bahamas',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Belarus',
 'Belize',
 'Canada',
 'Cocos (Keeling) Islands',
 'Democratic Republic of the Congo',
 'Central African Republic',
 'Congo',
 'Switzerland',
 'Côte d’Ivoire',
 'Cook Islands',
 'Chile',
 'Cameroon',
 'China',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Cabo Verde',
 'Curaçao',
 'Christmas Island',
 'Cyprus',
 'Czechia',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominica',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Esto

Try and do a fuzzy filter to match the country names

In [37]:
!pip install rapidfuzz
from rapidfuzz import process, fuzz

def do_fuzzy_match_on_countries(
        match_input_df,
        match_on='Geography',
        country_list=country_or_area_list):
    fuzzy_df = match_input_df.copy()
    fuzzy_df[['fuzzy_country_match','fuzzy_similarity','fuzzy_index']]= fuzzy_df[match_on].apply(lambda x: process.extractOne(x, country_list, scorer=fuzz.partial_ratio)).apply(pd.Series)
    '''drop the fuzzy index column as this is only the place the name was in the list of counties we searched through'''
    fuzzy_df.drop(columns=['fuzzy_index'], inplace=True)
    return fuzzy_df

def get_possible_country_matches(input_value,country_list=country_or_area_list):
    '''take in a string and return a list of possible matches
       each possible match is returned as a tuple, containing the match string,
       the similarity score and the index from the list'''
    possible_country_matches = process.extract(input_value, country_list, scorer=fuzz.partial_ratio)
    '''iterate through our list of tuples to form a dictionary, with each matched string
       as the key, and the similarity score as the value'''
    my_dict = {matched_string: similarity for matched_string, similarity, _ in possible_country_matches
            }
    return pd.Series(my_dict).to_frame().rename(columns={0:'similarity'})

def check_fuzzy_search_results_against_previously_unmerged_rows(input_df, unmerged_region_list):
    fuzzy_search_df = do_fuzzy_match_on_countries(input_df)
    '''Check the rows where we had failed to merge previously to see how it looks now'''
    fuzzy_search_filtered_df = fuzzy_search_df[fuzzy_search_df['Cloud Region'].isin(unmerged_region_list)][["Cloud Region","Geography","fuzzy_country_match",'fuzzy_similarity']]
    possible_matches = dict()
    for _, df_row in fuzzy_search_filtered_df.iterrows():
        cloud_region = str(df_row['Cloud Region'])
        geography = str(df_row['Geography'])

        possible_matches[geography] = get_possible_country_matches(geography)
    return {
        'top_matches' : fuzzy_search_filtered_df,
        'possible_matches' : possible_matches
    }


results = check_fuzzy_search_results_against_previously_unmerged_rows(aws_combined_df, unmerged_aws_cloud_regions['list'])

'''Display results of match'''
display(results['top_matches'])





Unnamed: 0,Cloud Region,Geography,fuzzy_country_match,fuzzy_similarity
5,ap-east-1,Hong Kong,"China, Hong Kong Special Administrative Region",100.0
12,ap-northeast-2,South Korea,South Georgia and the South Sandwich Islands,76.190476
15,ap-east-2,Taiwan,"Taiwan, Province of China",100.0
22,eu-west-2,United Kingdom,United Kingdom of Great Britain and Northern I...,100.0


The matches above all look ok, except for South Korea, which we can see only matched with a similarity of 76%.
Lets see if there are other possibilities;

In [38]:

display(results['possible_matches']['South Korea'])

Unnamed: 0,similarity
South Georgia and the South Sandwich Islands,76.190476
Democratic People's Republic of Korea,73.684211
Republic of Korea,73.684211
South Sudan,70.588235
South Africa,70.588235
