In [9]:
'''Install pre-req modules'''
!pip install lxml pandas rapidfuzz ipython plotly



## Import Modules

In [10]:
import pandas as pd
import plotly.express as px
from rapidfuzz import process, fuzz
from IPython.display import display, HTML

## Custom Functions
### Helper / Utility Functions

In [11]:
def write(text,colour='black',style='normal',header=0):
    '''Render `text` in the notebook output as a styled HTML span.

       text   : content placed inside the span (inserted as-is, not escaped)
       colour : CSS colour applied to the text
       style  : CSS font-style value, e.g. 'normal' or 'italic'
       header : when greater than 0 the text is wrapped in <h{header}>...</h{header}>;
                0 (the default) emits no heading tag'''
    if header > 0:
        open_tag, close_tag = f"<h{header}>", f"</h{header}>"
    else:
        open_tag = close_tag = ""
    # The tags live in plain locals instead of a dict: subscripting a dict with
    # single quotes inside a single-quoted f-string only parses on Python 3.12+.
    display(HTML(f'<span style="color: {colour};font-style:{style}"><br>{open_tag}{text}{close_tag}</span>'))

def check_if_variable_is_defined_globally(variable_name):
    '''Return True when `variable_name` is a key of this module's globals(), else False'''
    return variable_name in globals()


### Validation Functions

In [12]:
def visual_check_dataframe(input_df,dataset_name,rows=3,sample_only=False):
    '''Run some visual checks on a dataframe so we can see how it looks.

       input_df     : dataframe to inspect
       dataset_name : label used in the start/end banners
       rows         : number of rows shown by each of head / sample / tail
       sample_only  : when True only the random sample is shown; otherwise the
                      head, a random sample and the tail are shown in that order'''
    write(f"Starting Visual Check : {dataset_name}",'green',header=4)
    # DataFrame.sample raises ValueError when asked for more rows than the frame
    # holds, so cap the sample at the number of rows available.
    sample_size = min(rows, len(input_df))
    if sample_only:
        write(f'Checking random sample of {rows} rows')
        display(input_df.sample(sample_size))
    else:
        write(f'Check top {rows} row(s), followed by a sample of {rows} row(s) and then the bottom {rows} row(s)')
        display(input_df.head(rows))
        display(input_df.sample(sample_size))
        display(input_df.tail(rows))
    write(f"Ending Visual Check : {dataset_name}",'red',style='italic',header=5)

### Data Import Functions
#### Functions for importing supplementary Datasets providing additional geographic information

In [13]:
def get_us_state_abbreviation_dataset_from_web():
    '''Read the tables on the Wikipedia list of US states and territories and
       return the one at index 1'''
    source_url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
    page_tables = pd.read_html(source_url)
    return page_tables[1]

def get_tld_country_dataset():
    '''Retrieve the TLD country data from Wikipedia: the table at index 4 of the
       ISO 3166-1 alpha-2 page'''
    page_tables = pd.read_html("https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2")
    return page_tables[4]

def get_m49_country_dataset():
    '''Load the locally stored, semicolon-separated UNSD methodology CSV'''
    # Source page: https://unstats.un.org/unsd/methodology/m49/overview/#
    csv_path = 'imported-datasets/UNSD — Methodology.csv'
    return pd.read_csv(csv_path, sep=';')

#### Functions for importing supplementary Datasets providing location detail for Cloud provider Datacentres

In [14]:
def get_aws_region_dataset():
    '''Read the AWS regions documentation page and return its first table'''
    source_url = 'https://docs.aws.amazon.com/global-infrastructure/latest/regions/aws-regions.html'
    page_tables = pd.read_html(source_url)
    return page_tables[0]

def azure_region_dataset():
    '''Read the Azure regions list page and return the table at index 1'''
    return pd.read_html("https://learn.microsoft.com/en-us/azure/reliability/regions-list")[1]

def get_google_region_dataset():
    '''Read the Wikipedia page on Google data centers and return the table at index 1'''
    source_url = 'https://en.wikipedia.org/wiki/Google_data_centers'
    page_tables = pd.read_html(source_url)
    return page_tables[1]

### Data Cleansing/Sanitizing Functions
#### Functions for cleansing of supplementary Datasets providing additional geographic information

In [15]:
def clean_us_state_abbreviation_dataset(input_df):
    '''Return a two-column copy of the state table: name ('County') and
       abbreviation ('County Code'). The input dataframe is left untouched.'''
    # Everything from the third column onwards is surplus to requirements
    surplus_columns = list(input_df.columns[2:])
    output_df = input_df.copy().drop(columns=surplus_columns)
    # Give the two remaining columns names that are easier to work with
    output_df.columns=['County','County Code']
    # Some names (e.g. Virginia) carry a trailing footnote marker such as [B]
    # picked up from the web page; strip it off
    footnote_pattern = r'\[[A-Z]+\]$'
    output_df['County'] = output_df['County'].str.replace(footnote_pattern, '', regex=True)
    return output_df

def clean_tld_country_data(input_df):
    '''Return a copy of the TLD table without the 'Year' and 'Notes' columns,
       with 'Code' renamed to 'A2 Code' and 'ccTLD' renamed to 'tld' '''
    return (
        input_df.copy()
        .drop(columns=['Year', 'Notes'])
        .rename(columns={'Code': 'A2 Code', 'ccTLD': 'tld'})
    )

def clean_m49_country_data(input_df):
    '''Return a copy of the M49 table without the global, intermediate-region
       and LDC / LLDC / SIDS columns'''
    unwanted_columns = [
        'Global Code', 'Global Name',
        'Intermediate Region Code', 'Intermediate Region Name',
        'Least Developed Countries (LDC)',
        'Land Locked Developing Countries (LLDC)',
        'Small Island Developing States (SIDS)',
    ]
    return input_df.copy().drop(columns=unwanted_columns)



#### Functions for cleansing of supplementary Datasets providing location detail for Cloud provider Datacentres

In [16]:
def clean_aws_region_dataset(input_df):
    '''Return a copy of the AWS table with 'Code' renamed to 'cloud-region'
       and the 'Opt-in status' column removed'''
    renamed_df = input_df.copy().rename(columns={'Code': 'cloud-region'})
    return renamed_df.drop(columns=['Opt-in status'])

def clean_azure_region_dataset(input_df):
    '''Return a cleaned copy of the Azure region table.

       - drops 'Availability zone support' and 'Paired region'
       - renames 'Region' -> 'cloud-region', 'Physical location' -> 'Locality',
         'Geography' -> 'country'
       - lower-cases 'cloud-region' and removes its spaces
       - adds an empty-string 'Region Name' column

       The input dataframe is not modified.'''
    columns_to_drop = ['Availability zone support','Paired region']
    columns_to_rename = {'Region':'cloud-region','Physical location':'Locality','Geography':'country'}
    output_df = input_df.copy().drop(columns=columns_to_drop).rename(columns=columns_to_rename)
    # Vectorised string ops instead of a row-wise apply: same result for text
    # values, but a missing region name stays missing rather than raising.
    output_df['cloud-region'] = (
        output_df['cloud-region'].str.lower().str.replace(" ", "", regex=False))
    output_df['Region Name'] = ""
    return output_df

def clean_google_region_dataset(input_df,us_states_df=None):
    '''Return a cleaned copy of the Google data-centre table.

       input_df     : raw table with (at least) the columns 'Location',
                      'Cloud Location', 'Geo', 'Timeline', 'Description' and
                      'Products Location'
       us_states_df : optional frame with 'County' and 'County Code' columns used
                      to turn a subdivision code into a name. When None, a copy of
                      the module-level `usa_states_df` is used if one is defined.

       Rows with no cloud location (null or "-") are removed. The result gains
       'country', 'cloud-region', 'Locality', 'Subdivision Code' and 'Subdivision';
       'Location' and 'Cloud Location' are dropped. When a state table is
       available, any subdivision value still missing is set to the text "None".'''
    # Fall back to the notebook-level state table only when nothing was passed in
    if us_states_df is None:
        fallback_states_df = globals().get("usa_states_df")
        if fallback_states_df is not None:
            us_states_df = fallback_states_df.copy()

    columns_to_drop = ['Geo','Timeline','Description','Products Location']
    output_df = input_df.copy().drop(columns=columns_to_drop)

    # Some locations end with one or more reference markers from the web page,
    # e.g. Germany[61] - strip every trailing [number] group
    output_df['Location'] = output_df['Location'].str.replace(r'(?:\[\d+\])+$', '', regex=True)

    # The country is the text after the last comma; strip the space that follows
    # the comma so values read 'USA' rather than ' USA'
    output_df['country'] = output_df['Location'].str.split(',').str[-1].str.strip()

    # Remove rows where the cloud location is null or a hyphen as there is no DC
    # there. The explicit copy means the new columns below are written to an
    # independent frame rather than to a slice of the previous one.
    cloud_location = output_df['Cloud Location']
    output_df = output_df[cloud_location.notnull() & (cloud_location != "-")].copy()

    # The cloud region is the text inside the last pair of brackets
    output_df['cloud-region'] = (
        output_df['Cloud Location'].str.split('(').str[-1].str.split(')').str[0])

    # The locality is everything before the first comma
    output_df['Locality'] = output_df['Location'].str.split(',').str[0]

    # Subdivision code (county/province) is the bracketed suffix of the locality, if any
    output_df['Subdivision Code'] = output_df['Locality'].apply(
        lambda locality: locality.split('(')[-1].strip(')') if '(' in str(locality) else None)

    # Subdivision name is the middle part of a "locality, subdivision, country" location
    output_df['Subdivision'] = output_df['Location'].apply(
        lambda location: location.split(',')[1].strip() if str(location).count(',') >= 2 else None)

    # Now the code has its own column, remove a trailing " (XX)" (two capital
    # letters in brackets) from the locality
    output_df['Locality'] = output_df['Locality'].str.replace(r' \([A-Z]{2}\)$', '', regex=True)

    # The source columns are no longer needed
    output_df = output_df.drop(columns=['Location','Cloud Location'])

    # Match the subdivision code against the list of US state abbreviations
    if us_states_df is not None:
        output_df = output_df.merge(
            us_states_df,
            left_on='Subdivision Code',
            right_on='County Code',
            how='left')

        # combine_first keeps existing subdivision values and only fills the
        # gaps from the matched state name
        output_df['Subdivision'] = output_df['Subdivision'].combine_first(output_df['County'])
        output_df = output_df.drop(columns=['County','County Code'])
        # Fill any remaining gaps with the text "None"
        output_df[['Subdivision','Subdivision Code']] = output_df[['Subdivision','Subdivision Code']].fillna('None')

    return output_df

### Data Merging Functions
#### Merging Dataframes

In [17]:
def merge_two_datasets(input_df1, input_df2, left,right,method='left'):
    '''Merge input_df1 with input_df2 (keys `left` / `right`, join type `method`)
       and replace every missing value in the result with 0'''
    merged_df = input_df1.merge(input_df2, left_on=left, right_on=right, how=method)
    return merged_df.fillna(0)

def merge_tld_and_m49_datasets(input_df_left, input_df_right):
    '''Merge the TLD table (key 'A2 Code') with the M49 table (key
       'ISO-alpha2 Code') via merge_two_datasets, then cast the three numeric
       code columns to int'''
    integer_columns = ('M49 Code', 'Sub-region Code', 'Region Code')
    combined_df = merge_two_datasets(
        input_df_left, input_df_right, left='A2 Code', right='ISO-alpha2 Code')
    return combined_df.astype({column: int for column in integer_columns})

def combine_cloud_region_details_with_location_info(
        input_cloud_region_df,
        left='Geography',
        right='short-name',
        input_locations_df = None,
):
    '''Left-merge a cloud-region table with the locations table.

       input_cloud_region_df : cloud provider region dataframe
       left                  : key column in the cloud-region dataframe
       right                 : key column in the locations dataframe
       input_locations_df    : locations dataframe; when None, a copy of the
                               module-level `locations_df` is used if defined

       Returns the merged dataframe, or None when no locations dataframe was
       passed and no module-level `locations_df` exists.'''
    # Compare against None explicitly: `not some_dataframe` raises ValueError
    # because the truth value of a DataFrame is ambiguous.
    if input_locations_df is None:
        fallback_locations_df = globals().get("locations_df")
        if fallback_locations_df is None:
            return None
        input_locations_df = fallback_locations_df.copy()

    return pd.merge(
        input_cloud_region_df,
        input_locations_df,
        left_on=left,
        right_on=right,
        how='left')

#### Functions for cleansing of merged supplementary location related Datasets

In [18]:
def cleanup_combined_tld_m49_dataset(input_df):
    '''Tidy the merged TLD + M49 table.

       Derives 'short-name' and 'long-name' from the title-case country name,
       copies 'A2 Code' into 'ISO-alpha2 Code' wherever the latter is 0, drops
       the columns that are then redundant and returns the remaining columns in
       a fixed order. The input dataframe is not modified.'''

    def derive_short_name(source_name):
        # "Korea, Republic of" -> "Korea"; "A of B" -> "A"; anything else unchanged
        as_text = str(source_name)
        if as_text.count(',') >= 1:
            return source_name.split(',')[0].strip()
        if ' of ' in as_text:
            return source_name.split(' of ')[0].strip()
        return source_name

    def derive_long_name(source_name):
        # "Korea, Republic of" -> "Republic of Korea"; names without a comma unchanged
        if str(source_name).count(',') >= 1:
            pieces = source_name.split(',')
            return f"{pieces[1].strip()} {pieces[0].strip()}"
        return source_name

    # Work on a copy, with the country name column renamed to something shorter
    output_df = input_df.copy().rename(
        columns={'Country name (using title case)':'source-name'})

    output_df['short-name'] = output_df.apply(
        lambda row: derive_short_name(row['source-name']), axis=1)
    output_df['long-name'] = output_df.apply(
        lambda row: derive_long_name(row['source-name']), axis=1)

    # Rows whose ISO-alpha2 code is 0 take the value of their A2 code instead
    write("Assigning 'ISO-alpha2' to same value as 'A2 Code' for any entries that are set to 0")
    alpha2_is_zero = output_df['ISO-alpha2 Code'] == 0
    output_df.loc[alpha2_is_zero, 'ISO-alpha2 Code'] = output_df.loc[alpha2_is_zero, 'A2 Code']

    # Drop the redundant columns, then put the most important info first
    output_df = output_df.drop(columns=['source-name','Country or Area','A2 Code'])
    new_column_order = [
        'short-name', 'long-name',
        'Sub-region Name', 'Region Name',
        'tld',
        'Sub-region Code', 'Region Code', 'M49 Code',
        'ISO-alpha2 Code', 'ISO-alpha3 Code',
    ]
    return output_df[new_column_order]

def manually_fix_location_short_names(input_df):
    '''Return a copy of input_df with hand-picked short-names applied to the
       rows whose ISO-alpha2 code is CD, KP or KR'''
    corrected_short_names = {
        'CD': 'DR Congo',
        'KP': 'North Korea',
        'KR': 'South Korea',
    }
    output_df = input_df.copy()
    for alpha2_code, corrected_name in corrected_short_names.items():
        output_df.loc[output_df['ISO-alpha2 Code'] == alpha2_code, 'short-name'] = corrected_name
    return output_df

#### Functions for checking consistency of supplementary location related datasets

In [19]:
def check_consistency_tld_m49_dataset_pre_cleanup(input_df):
    '''Display the rows where both ISO codes are null, then the rows where both
       ISO codes are 0, followed by a note about Taiwan'''
    alpha2_codes = input_df['ISO-alpha2 Code']
    alpha3_codes = input_df['ISO-alpha3 Code']

    both_null = alpha2_codes.isnull() & alpha3_codes.isnull()
    write("ISO-alpha2 and ISO-alpha3 codes that are null:")
    display(input_df[both_null])

    both_zero = (alpha2_codes == 0) & (alpha3_codes == 0)
    write("ISO-alpha2 & 3 Codes that are both set to zero:")
    display(input_df[both_zero])

    write("Taiwan is the only Country without an entry in the M49 - there are political reasons for this.  For now, this can be corrected by assigning the ISO-alpha2 Code to the A2 Code")

def check_consistency_tld_m49_dataset_post_cleanup(input_df):
    '''Display post-cleanup consistency checks for the combined locations table:
       rows with both ISO codes set to 0, the Taiwan ('TW') row, and the
       occurrence count of every short-name'''
    both_zero = (input_df['ISO-alpha2 Code'] == 0) & (input_df['ISO-alpha3 Code'] == 0)
    write("ISO-alpha2 & 3 Codes that are both set to zero:")
    display(input_df[both_zero])
    write("Check Taiwan specifically :")
    display(input_df[input_df['ISO-alpha2 Code'] == 'TW'])
    write("Look for duplicate short country names :")
    # A bare expression inside a function is discarded rather than rendered,
    # so the counts must be passed to display() to show up under the heading.
    display(input_df['short-name'].value_counts().to_frame())


'''Look for any records that didnt merge correctly'''
def identify_cloud_regions_with_incomplete_country_info(input_df,dataset_name):
    '''Display and return the location details of every cloud region that has
       at least one row with a null short-name'''
    report_columns = ['cloud-region', 'Geography','short-name','long-name','tld','Region Code','Region Name']
    # Cloud regions of the rows that have no short-name
    missing_short_name = input_df['short-name'].isnull()
    unmerged_cloud_regions = input_df.loc[missing_short_name, 'cloud-region'].to_list()
    # Collate the details of every row belonging to one of those regions
    in_unmerged_region = input_df['cloud-region'].isin(unmerged_cloud_regions)
    unmerged_details_df = input_df.loc[in_unmerged_region, report_columns]
    display(f"Unmerged Cloud Regions for {dataset_name}:")
    display(unmerged_details_df)
    return unmerged_details_df

def look_for_duplicate_locations(input_df):
    '''Display the short-names that occur two or more times, followed by the
       full rows that carry those short-names. Returns nothing.'''
    short_names = input_df['short-name']
    occurrences = short_names.value_counts()
    # Filter the counts while they are still a Series and name the column
    # explicitly: the label given to the counts by value_counts().to_frame()
    # is 'count' only on pandas 2.x, so selecting count_df['count'] is not portable.
    count_df = occurrences[occurrences >= 2].rename('count').to_frame()
    write("Duplicate Short Names:")
    display(count_df)
    write("Full rows with duplicates")
    display(input_df[short_names.duplicated(keep=False)])

def show_potential_matches_for_countries(input_df=None,column_name='Geography',results=None):
    '''Display the table of potential matches for each distinct value in
       input_df[column_name].

       input_df    : dataframe holding the names to report on; nothing is shown when None
       column_name : column of input_df that holds those names
       results     : mapping with a 'possible_matches' entry keyed by name. When
                     None, the module-level `results` variable is used, which is
                     what this function previously relied on implicitly.

       Raises NameError when names need reporting but no results are available.'''
    if input_df is None:
        return
    country_list = input_df[column_name].drop_duplicates().to_list()
    if not country_list:
        return

    if results is None:
        results = globals().get('results')
    if results is None:
        raise NameError(
            "no fuzzy search results available: pass results=... or define a global 'results'")

    for country_name in country_list:
        write(f"Potential matches for : {country_name}",header=3,colour="blue")
        display(results['possible_matches'][country_name])


#### Functions for doing fuzzy search on unmerged country names, when merging cloud-region and location datasets

In [20]:
def do_fuzzy_match_on_countries(
        match_input_df,
        match_on='Geography',
        country_list=None):
    '''Return a copy of match_input_df with the best fuzzy match from
       country_list for each value of the `match_on` column, stored in
       'fuzzy_country_match' and 'fuzzy_similarity'.
       Returns None when country_list is None.'''
    if country_list is None:
        return None

    def best_match(candidate):
        return process.extractOne(candidate, country_list, scorer=fuzz.partial_ratio)

    match_columns = ['fuzzy_country_match','fuzzy_similarity','fuzzy_index']
    fuzzy_df = match_input_df.copy()
    fuzzy_df[match_columns] = fuzzy_df[match_on].apply(best_match).apply(pd.Series)
    # The index is only the position of the name within country_list, so it is not kept
    return fuzzy_df.drop(columns=['fuzzy_index'])

def get_possible_country_matches(input_value,country_list=None):
    '''Return a one-column ('similarity') dataframe of fuzzy matches for
       input_value, indexed by the matched string.
       Returns None when country_list is None.'''
    if country_list is None:
        return None
    # Each match comes back as a tuple of (matched string, similarity score,
    # index within country_list)
    scored_matches = process.extract(input_value, country_list, scorer=fuzz.partial_ratio)
    # Keep the matched string as key and its similarity score as value
    similarity_by_name = {}
    for matched_string, similarity, _list_index in scored_matches:
        similarity_by_name[matched_string] = similarity
    return pd.Series(similarity_by_name).to_frame().rename(columns={0:'similarity'})

def run_fuzzy_search_against_previously_unmerged_rows(input_df, unmerged_regions, match_on='Geography', country_list=None):
    '''Run a fuzzy country search for the cloud regions that failed to merge.

       input_df         : cloud-region dataframe to search
       unmerged_regions : dataframe whose 'cloud-region' column lists the regions to report on
       match_on         : column of input_df holding the name to match
       country_list     : candidate names to match against

       Returns None when country_list is None, otherwise a dict with
         'top_matches'      : best match per unmerged row
         'possible_matches' : dict mapping each name to its frame of candidate matches'''
    if country_list is None:
        return None
    # match_on is forwarded and used for the column selection below; it was
    # previously accepted but ignored in favour of a hard-coded 'Geography'.
    fuzzy_search_df = do_fuzzy_match_on_countries(
        match_input_df=input_df,
        match_on=match_on,
        country_list=country_list)
    unmerged_region_list = unmerged_regions['cloud-region'].tolist()
    # Look only at the rows that failed to merge previously
    previously_unmerged = fuzzy_search_df['cloud-region'].isin(unmerged_region_list)
    fuzzy_search_filtered_df = fuzzy_search_df.loc[
        previously_unmerged,
        ['cloud-region', match_on, 'fuzzy_country_match', 'fuzzy_similarity']]
    # One candidate table per distinct name
    possible_matches = {
        str(name): get_possible_country_matches(str(name), country_list)
        for name in fuzzy_search_filtered_df[match_on].drop_duplicates()
    }
    return {
        'top_matches' : fuzzy_search_filtered_df,
        'possible_matches' : possible_matches
    }

def merge_rows_using_fuzzy_search_results(
    input_df1, # dataframe containing cloud region and location merged data
    input_df2, # dataframe containing results of fuzzy search for cloud-regions missing location info
    input_df3 # dataframe containing combined location info.  Used as source when doing the 1st pass of merging cloud-region and location info.
    ):
    '''Fill the location details missing from input_df1 using fuzzy search results.

       Rows of input_df1 with no 'short-name' take the 'fuzzy_country_match' that
       input_df2 holds for the same 'cloud-region'; the location columns of
       input_df3 are then merged in on 'short-name' and used to fill any gaps.
       None of the inputs is modified; a new dataframe is returned.'''
    # Attach the fuzzy match to each cloud region, then use it wherever the
    # short-name is missing
    fuzzy_matches = input_df2[['cloud-region','fuzzy_country_match']]
    output_df = pd.merge(input_df1, fuzzy_matches, on='cloud-region', how='left')
    output_df['short-name'] = output_df['short-name'].combine_first(output_df['fuzzy_country_match'])

    # Merge the location table back on using the (now filled) short-name.
    # Columns that already exist come in with a _temporary suffix so they can be
    # told apart from the current ones.
    output_df = pd.merge(output_df, input_df3, how='left', on='short-name', suffixes=('', '_temporary'))

    columns_to_drop = ['fuzzy_country_match']
    for column_name in input_df3.columns:
        temporary_column = f"{column_name}_temporary"
        # Only columns present on both sides of the merge gain a _temporary twin.
        # A location column that is new to the main dataframe arrives unsuffixed
        # and needs no combining; looking up its twin would raise KeyError.
        if column_name == 'short-name' or temporary_column not in output_df.columns:
            continue
        # Fill the gaps in the original column from its merged-in twin
        output_df[column_name] = output_df[column_name].combine_first(output_df[temporary_column])
        columns_to_drop.append(temporary_column)

    # Remove the helper columns left over from both merges
    return output_df.drop(columns=columns_to_drop)

## Load Datasets
We have a number of datasets to load in here, some containing the actual data we want, and some used for sanitizing our data or adding additional context

### Supplementary Datasets

* Top Level Domain (TLD) listing by country, from Wikipedia
* M49 "Standard country or area codes for statistical use" dataset from the United Nations Statistics Division (UNSD)
* American state abbreviation code to name mapping, from Wikipedia
* AWS cloud region name to location mapping, from Amazon's online documentation
* Azure cloud region name to location mapping, from Microsoft's online documentation
* Google cloud region name to location mapping, from Wikipedia

### Main Datasets

## Supplementary Datasets

### USA States Abbreviation Code Dataset
#### Load in Data

In [21]:
# Fetch the raw US state table, then show its first row, one random row and its last row.
usa_states_df = get_us_state_abbreviation_dataset_from_web()
visual_check_dataframe(usa_states_df,"USA State Code Dataset", rows=1)

Unnamed: 0_level_0,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8]",Cities,Cities,Ratification or admission[A],Population (2020)[10],Total area[11],Total area[11],Reps.
Unnamed: 0_level_1,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8].1",Capital,Largest[12],Ratification or admission[A],Population (2020)[10],mi2,km2,Reps.
0,Alabama,AL,Montgomery,Huntsville,"Dec 14, 1819",5024279,52420,135767,7


Unnamed: 0_level_0,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8]",Cities,Cities,Ratification or admission[A],Population (2020)[10],Total area[11],Total area[11],Reps.
Unnamed: 0_level_1,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8].1",Capital,Largest[12],Ratification or admission[A],Population (2020)[10],mi2,km2,Reps.
9,Georgia,GA,Atlanta,Atlanta,"Jan 2, 1788",10711908,59425,153910,14


Unnamed: 0_level_0,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8]",Cities,Cities,Ratification or admission[A],Population (2020)[10],Total area[11],Total area[11],Reps.
Unnamed: 0_level_1,"Flag, name and postal abbreviation[8]","Flag, name and postal abbreviation[8].1",Capital,Largest[12],Ratification or admission[A],Population (2020)[10],mi2,km2,Reps.
49,Wyoming,WY,Cheyenne,Cheyenne,"Jul 10, 1890",576851,97813,253335,1


#### Pre-processing of USA State Code dataset

In [22]:
# Rebind usa_states_df to its cleaned two-column form ('County', 'County Code'), then re-inspect it.
usa_states_df = clean_us_state_abbreviation_dataset(usa_states_df)
visual_check_dataframe(usa_states_df,"USA State Code Dataset", rows=1)

Unnamed: 0,County,County Code
0,Alabama,AL


Unnamed: 0,County,County Code
9,Georgia,GA


Unnamed: 0,County,County Code
49,Wyoming,WY


### TLD Dataset
#### Load in Data

In [23]:
# Fetch the raw TLD table, then show its first row, one random row and its last row.
tld_df = get_tld_country_dataset()
visual_check_dataframe(tld_df,"TLD pre-cleanup", rows=1)

Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
0,AD,Andorra,1974,.ad,


Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
130,LR,Liberia,1974,.lr,


Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
248,ZW,Zimbabwe,1980,.zw,Name changed from Southern Rhodesia (RH)


#### Pre-processing of TLD dataset

In [24]:
# Rebind tld_df to its cleaned form.
# NOTE(review): this cell is not re-runnable on its own - clean_tld_country_data drops
# 'Year' and 'Notes', which no longer exist once tld_df has been cleaned; re-run the
# load cell first.
tld_df = clean_tld_country_data(tld_df)
write("Re-check TLD data once it's been cleaned")
# sample_only=True: show a single random row rather than head / sample / tail
visual_check_dataframe(tld_df, "TLD post-cleanup",rows=1,sample_only=True)


Unnamed: 0,A2 Code,Country name (using title case),tld
64,EG,Egypt,.eg


### M49 Dataset
#### Load in Data

In [25]:
# Load the M49 CSV from disk, then show its first row, one random row and its last row.
m49_df = get_m49_country_dataset()
visual_check_dataframe(m49_df,dataset_name="M49 - pre-cleanup", rows=1)

Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
0,1,World,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA,,,


Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
162,1,World,142.0,Asia,145.0,Western Asia,,,Saudi Arabia,682,SA,SAU,,,


Unnamed: 0,Global Code,Global Name,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code,Least Developed Countries (LDC),Land Locked Developing Countries (LLDC),Small Island Developing States (SIDS)
247,1,World,9.0,Oceania,61.0,Polynesia,,,Wallis and Futuna Islands,876,WF,WLF,,,


#### Pre-processing of M49 Dataset

In [26]:
# Rebind m49_df to its cleaned form.
# NOTE(review): not re-runnable on its own - clean_m49_country_data drops columns that
# are already gone once m49_df has been cleaned; re-run the load cell first.
m49_df = clean_m49_country_data(m49_df)
write("Re-check M49 data once it's been cleaned")
# sample_only=True: show a single random row rather than head / sample / tail
visual_check_dataframe(m49_df, dataset_name="M49 - post-cleanup", rows=1, sample_only=True)

Unnamed: 0,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
49,2.0,Africa,202.0,Sub-Saharan Africa,Guinea,324,GN,GIN


#### Combine TLD and M49 Location Datasets
We have two supplementary datasets representing Geographic location that we will now combine
##### Merge the TLD and M49 Location Datasets

In [27]:
# Left-merge the TLD table with the M49 table ('A2 Code' against 'ISO-alpha2 Code');
# values missing after the merge are filled with 0 by merge_two_datasets.
locations_df = merge_tld_and_m49_datasets(tld_df, m49_df)
visual_check_dataframe(locations_df, "Combined Locations Dataset",rows=1)
# Show the rows whose ISO codes are both null, then those where both are 0
check_consistency_tld_m49_dataset_pre_cleanup(locations_df)

Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,AD,Andorra,.ad,150,Europe,39,Southern Europe,Andorra,20,AD,AND


Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
157,MY,Malaysia,.my,142,Asia,35,South-eastern Asia,Malaysia,458,MY,MYS


Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
248,ZW,Zimbabwe,.zw,2,Africa,202,Sub-Saharan Africa,Zimbabwe,716,ZW,ZWE


Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


Unnamed: 0,A2 Code,Country name (using title case),tld,Region Code,Region Name,Sub-region Code,Sub-region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,0,0,0,0


##### Cleanup of Combined TLD & M49 Location Dataset

In [28]:
# Rebind locations_df to its tidied form (short-name / long-name added, 0 alpha-2 codes
# replaced by the A2 code, redundant columns dropped).
# NOTE(review): not re-runnable on its own - the cleanup reads 'A2 Code' and
# 'Country or Area', which it also drops; re-run the merge cell first.
locations_df = cleanup_combined_tld_m49_dataset(locations_df)
check_consistency_tld_m49_dataset_post_cleanup(locations_df)

Unnamed: 0,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


Unnamed: 0,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,Taiwan,Province of China Taiwan,0,0,.tw,0,0,0,TW,0


In [29]:
# Step 1: report the short-names that occur more than once, with their full rows
write("Identifying duplicate rows:",header=4,colour="blue")
look_for_duplicate_locations(locations_df)

# Step 2: apply the hand-written short-name corrections (CD, KP and KR rows)
write("Manually correcting duplicates ...",header=5,colour="red")
'''Do some manual correction of the short-names'''
locations_df = manually_fix_location_short_names(locations_df)
write("... completed",header=5,colour="red")
# Step 3: run the same report again; both displayed tables should now be empty
write("Confirming there are no longer duplicate rows:",header=4,colour="blue")
'''Look for duplicates again'''
look_for_duplicate_locations(locations_df)


Unnamed: 0_level_0,count
short-name,Unnamed: 1_level_1
Congo,2
Korea,2


Unnamed: 0,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
39,Congo,Democratic Republic of the Congo,Sub-Saharan Africa,Africa,.cd,202,2,180,CD,COD
41,Congo,Congo,Sub-Saharan Africa,Africa,.cg,202,2,178,CG,COG
120,Korea,Democratic People's Republic of Korea,Eastern Asia,Asia,.kp,30,142,408,KP,PRK
121,Korea,Republic of Korea,Eastern Asia,Asia,.kr,30,142,410,KR,KOR


Unnamed: 0_level_0,count
short-name,Unnamed: 1_level_1


Unnamed: 0,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


### Azure Region Dataset
#### Load Data

In [30]:
# Fetch the raw Azure region table, then show its first row, one random row and its last row.
azure_region_df = azure_region_dataset()
visual_check_dataframe(azure_region_df, "Azure Region Dataset - pre-cleanup", rows=1)

Unnamed: 0,Region,Availability zone support,Paired region,Physical location,Geography
0,Australia Central,,Australia Central 2,Canberra,Australia


Unnamed: 0,Region,Availability zone support,Paired region,Physical location,Geography
44,UAE Central,,UAE North,Abu Dhabi,UAE


Unnamed: 0,Region,Availability zone support,Paired region,Physical location,Geography
53,West US 3,,East US,Phoenix,United States


#### Pre-processing of Azure region dataset

In [31]:
# Rebind azure_region_df to its cleaned form.
# NOTE(review): not re-runnable on its own - clean_azure_region_dataset drops and renames
# columns that no longer exist once azure_region_df has been cleaned; re-run the load
# cell first.
azure_region_df = clean_azure_region_dataset(azure_region_df)
# rows is left at its default of 3, so three random rows are shown
visual_check_dataframe(azure_region_df, "Azure Region Dataset - post cleanup", sample_only=True)

Unnamed: 0,cloud-region,Locality,country,Region Name
9,centralindia,Pune,India,
12,eastasia,Hong Kong SAR,Asia Pacific,
10,centralus,Iowa,United States,


### Google Region Dataset
#### Load Data

In [32]:
# Fetch the raw Google data-centre table, then show its first row, one random row and its last row.
google_region_df = get_google_region_dataset()
visual_check_dataframe(google_region_df, "Google Region Dataset - pre-cleanup", rows=1)

Unnamed: 0,Continent,Location,Geo,Products Location,Cloud Location,Timeline,Description
0,North America,"Arcola (VA), USA",38°56′35.99″N 77°31′27.61″W﻿ / ﻿38.9433306°N 7...,Loudoun County,N. Virginia (us-east4),2017 - announced[4][5],


Unnamed: 0,Continent,Location,Geo,Products Location,Cloud Location,Timeline,Description
31,North America,"Moncks Corner (SC), USA",33°03′50.8″N 80°02′36.1″W﻿ / ﻿33.064111°N 80.0...,Berkeley County,South Carolina (us-east1),2007 - launched 2013 - expanded,150 employees


Unnamed: 0,Continent,Location,Geo,Products Location,Cloud Location,Timeline,Description
71,South America,"Canelones, Uruguay",34°48′56″S 55°59′44″W﻿ / ﻿34.815481°S 55.995683°W,,,2024 - construction started[75] 2026 - inaugur...,


#### Pre-processing of Google region dataset

In [33]:
# Pass the cleaned state table explicitly instead of relying on the function's
# fallback lookup of the global usa_states_df, so the cell's inputs are visible here.
google_region_df = clean_google_region_dataset(google_region_df, us_states_df=usa_states_df)
# sample_only=True: show five random rows rather than head / sample / tail
visual_check_dataframe(google_region_df,"Google Region Dataset - post cleanup", rows=5,sample_only=True)

Unnamed: 0,Continent,country,cloud-region,Locality,Subdivision Code,Subdivision
18,North America,USA,us-west2,Los Angeles,CA,California
2,Asia,Taiwan,asia-east1,Changhua County,,
35,North America,Canada,northamerica-northeast2,Toronto,,
44,Asia,Taiwan,asia-east1,Yunlin County,,
32,Pacific,Australia,australia-southeast1,Sydney,,


### AWS Region Dataset
#### Load Data

In [34]:
# Fetch the raw AWS region table, then show its first row, one random row and its last row.
aws_region_df = get_aws_region_dataset()
visual_check_dataframe(aws_region_df,"AWS Region Dataset - pre-cleanup", rows=1)

Unnamed: 0,Code,Name,AZs,Geography,Opt-in status
0,us-east-1,US East (N. Virginia),6,United States of America,Not required


Unnamed: 0,Code,Name,AZs,Geography,Opt-in status
6,ap-south-2,Asia Pacific (Hyderabad),3,India,Required


Unnamed: 0,Code,Name,AZs,Geography,Opt-in status
32,sa-east-1,South America (São Paulo),3,Brazil,Not required


#### Pre-processing of AWS region dataset

In [35]:
# Rebind aws_region_df to its cleaned form ('Code' -> 'cloud-region', 'Opt-in status' dropped).
# NOTE(review): not re-runnable on its own - 'Opt-in status' no longer exists once
# aws_region_df has been cleaned; re-run the load cell first.
aws_region_df = clean_aws_region_dataset(aws_region_df)
write("Re-check AWS Region data once it's been cleaned")
# sample_only=True: show five random rows rather than head / sample / tail
visual_check_dataframe(aws_region_df, "AWS Region Dataset : post-cleanup",rows=5,sample_only=True)

Unnamed: 0,cloud-region,Name,AZs,Geography
12,ap-northeast-2,Asia Pacific (Seoul),4,South Korea
13,ap-southeast-1,Asia Pacific (Singapore),3,Singapore
5,ap-east-1,Asia Pacific (Hong Kong),3,Hong Kong
17,ap-northeast-1,Asia Pacific (Tokyo),4,Japan
10,ap-south-1,Asia Pacific (Mumbai),3,India


#### Combine AWS region dataframe with locations dataframe
The AWS region dataframe doesn't have full details of where each cloud data centre is based, so we will combine it with the more comprehensive list compiled earlier.
##### Merge Data

In [36]:
# Combine the cleaned AWS region frame with the location information
# (helper defined elsewhere in this notebook), then preview head / sample / tail.
aws_combined_df = combine_cloud_region_details_with_location_info(aws_region_df)
visual_check_dataframe(aws_combined_df, "AWS Regions combined with location", rows=5)

# Report AWS regions missing or incomplete country info.
# The returned value is kept because the fuzzy-matching cells below take it as input.
unmerged_aws_cloud_regions = identify_cloud_regions_with_incomplete_country_info(
    aws_combined_df,
    "AWS region combined with location dataset",
)


Unnamed: 0,cloud-region,Name,AZs,Geography,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,us-east-1,US East (N. Virginia),6,United States of America,,,,,,,,,,
1,us-east-2,US East (Ohio),3,United States of America,,,,,,,,,,
2,us-west-1,US West (N. California),3 †,United States of America,,,,,,,,,,
3,us-west-2,US West (Oregon),4,United States of America,,,,,,,,,,
4,af-south-1,Africa (Cape Town),3,South Africa,South Africa,South Africa,Sub-Saharan Africa,Africa,.za,202.0,2.0,710.0,ZA,ZAF


Unnamed: 0,cloud-region,Name,AZs,Geography,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
25,eu-south-2,Europe (Spain),3,Spain,Spain,Spain,Southern Europe,Europe,.es,39.0,150.0,724.0,ES,ESP
12,ap-northeast-2,Asia Pacific (Seoul),4,South Korea,South Korea,Republic of Korea,Eastern Asia,Asia,.kr,30.0,142.0,410.0,KR,KOR
28,il-central-1,Israel (Tel Aviv),3,Israel,Israel,Israel,Western Asia,Asia,.il,145.0,142.0,376.0,IL,ISR
8,ap-southeast-5,Asia Pacific (Malaysia),3,Malaysia,Malaysia,Malaysia,South-eastern Asia,Asia,.my,35.0,142.0,458.0,MY,MYS
23,eu-south-1,Europe (Milan),3,Italy,Italy,Italy,Southern Europe,Europe,.it,39.0,150.0,380.0,IT,ITA


Unnamed: 0,cloud-region,Name,AZs,Geography,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
28,il-central-1,Israel (Tel Aviv),3,Israel,Israel,Israel,Western Asia,Asia,.il,145.0,142.0,376.0,IL,ISR
29,mx-central-1,Mexico (Central),3,Mexico,Mexico,Mexico,Latin America and the Caribbean,Americas,.mx,419.0,19.0,484.0,MX,MEX
30,me-south-1,Middle East (Bahrain),3,Bahrain,Bahrain,Bahrain,Western Asia,Asia,.bh,145.0,142.0,48.0,BH,BHR
31,me-central-1,Middle East (UAE),3,United Arab Emirates,United Arab Emirates,United Arab Emirates,Western Asia,Asia,.ae,145.0,142.0,784.0,AE,ARE
32,sa-east-1,South America (São Paulo),3,Brazil,Brazil,Brazil,Latin America and the Caribbean,Americas,.br,419.0,19.0,76.0,BR,BRA


'Unmerged Cloud Regions for AWS region combined with location dataset:'

Unnamed: 0,cloud-region,Geography,short-name,long-name,tld,Region Code,Region Name
0,us-east-1,United States of America,,,,,
1,us-east-2,United States of America,,,,,
2,us-west-1,United States of America,,,,,
3,us-west-2,United States of America,,,,,


#### Resolve unmerged records for combined AWS region & locations dataset
Get a list of all country-or-area names from the locations dataset

In [37]:
# Candidate names for the fuzzy match: every `short-name` value in the locations dataset.
country_list = locations_df['short-name'].tolist()
write("Checking first 5 entries of the country list", header=4, colour="blue")
# Bare last expression so the notebook renders the slice as the cell output.
country_list[:5]

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla']

Try a fuzzy search to match the country names.

In [38]:
# Fuzzy-match the `Geography` value of each previously unmerged AWS region
# against the country names collected in `country_list`.
results = run_fuzzy_search_against_previously_unmerged_rows(
    input_df=aws_combined_df,
    unmerged_regions=unmerged_aws_cloud_regions,
    match_on='Geography',
    country_list=country_list,
)

# Display results of match
display(results['top_matches'])

Unnamed: 0,cloud-region,Geography,fuzzy_country_match,fuzzy_similarity
0,us-east-1,United States of America,United States,100.0
1,us-east-2,United States of America,United States,100.0
2,us-west-1,United States of America,United States,100.0
3,us-west-2,United States of America,United States,100.0


The matches above all look OK, with 100% similarity.
We can confirm this by looking at the other potential matches:

In [39]:
# Sanity-check the top fuzzy match by listing the other candidate country names
# for the unmerged AWS regions (helper defined elsewhere in this notebook).
show_potential_matches_for_countries(unmerged_aws_cloud_regions)

Unnamed: 0,similarity
United States,100.0
United States Minor Outlying Islands,75.0
Serbia,72.727273
Armenia,71.428571
Algeria,71.428571


Above we can see that while the 2nd-5th choices do match somewhat, if only by a few letters, the most accurate country selection by far is the first one.

In [40]:
# Apply the top fuzzy matches to the combined AWS frame, using locations_df as the
# source of the country details (helper defined elsewhere in this notebook).
# NOTE(review): aws_combined_df is reassigned from itself, so re-running this cell
# feeds the already-merged frame back into the helper — confirm it tolerates that.
aws_combined_df = merge_rows_using_fuzzy_search_results(aws_combined_df, results['top_matches'], locations_df)
# Bare last expression: the notebook renders the first rows as the cell output.
aws_combined_df.head()

Unnamed: 0,cloud-region,Name,AZs,Geography,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,us-east-1,US East (N. Virginia),6,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
1,us-east-2,US East (Ohio),3,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
2,us-west-1,US West (N. California),3 †,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
3,us-west-2,US West (Oregon),4,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
4,af-south-1,Africa (Cape Town),3,South Africa,South Africa,South Africa,Sub-Saharan Africa,Africa,.za,202.0,2.0,710.0,ZA,ZAF


In [41]:
# Preview the first rows of each provider's frame under its own coloured heading.
# Order (aws, azure, google) and heading colours match the provider list below.
for _heading, _colour, _frame in (
    ("aws:", 'orange', aws_combined_df),
    ("azure:", 'blue', azure_region_df),
    ("google:", 'green', google_region_df),
):
    write(_heading, header=3, colour=_colour)
    display(_frame.head())

Unnamed: 0,cloud-region,Name,AZs,Geography,short-name,long-name,Sub-region Name,Region Name,tld,Sub-region Code,Region Code,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,us-east-1,US East (N. Virginia),6,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
1,us-east-2,US East (Ohio),3,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
2,us-west-1,US West (N. California),3 †,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
3,us-west-2,US West (Oregon),4,United States of America,United States,United States of America,Northern America,Americas,.us,21.0,19.0,840.0,US,USA
4,af-south-1,Africa (Cape Town),3,South Africa,South Africa,South Africa,Sub-Saharan Africa,Africa,.za,202.0,2.0,710.0,ZA,ZAF


Unnamed: 0,cloud-region,Locality,country,Region Name
0,australiacentral,Canberra,Australia,
1,australiacentral2,Canberra,Australia,
2,australiaeast,New South Wales,Australia,
3,australiasoutheast,Victoria,Australia,
4,austriaeast,Vienna,Austria,


Unnamed: 0,Continent,country,cloud-region,Locality,Subdivision Code,Subdivision
0,North America,USA,us-east4,Arcola,VA,Virginia
1,South America,Chile,southamerica-west1,Cerrillos,,Santiago
2,Asia,Taiwan,asia-east1,Changhua County,,
3,North America,USA,us-east5,Columbus,OH,Ohio
4,North America,USA,us-central1,Council Bluffs,IA,Iowa


In [42]:
output_df = azure_region_df.copy()
output_df.rename(columns={'Physical location':'Locality','Geography':'country'},inplace=True)