In [2]:
!pip install lxml
import pandas as pd
# Wikipedia's ISO 3166-1 alpha-2 article supplies the two-letter codes and ccTLDs.
wiki_page = "https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2"
# read_html returns one DataFrame per HTML table on the page; index 4 is the table kept here.
# NOTE(review): a positional index silently selects a different table if the article
# layout changes - confirm it still returns the Code / ccTLD table.
tld_df = pd.read_html(wiki_page)[4]




In [3]:
# Preview the first ten rows of the scraped code table.
tld_df.head(10)

Unnamed: 0,Code,Country name (using title case),Year,ccTLD,Notes
0,AD,Andorra,1974,.ad,
1,AE,United Arab Emirates,1974,.ae,
2,AF,Afghanistan,1974,.af,
3,AG,Antigua and Barbuda,1974,.ag,
4,AI,Anguilla,1985,.ai,AI previously represented French Afars and Issas
5,AL,Albania,1974,.al,
6,AM,Armenia,1992,.am,
7,AO,Angola,1974,.ao,
8,AQ,Antarctica,1974,.aq,Covers the territories south of 60° south lati...
9,AR,Argentina,1974,.ar,


In [4]:
# Keep only the code, country name and ccTLD columns.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# makes this cell safe to re-run: once 'Year' and 'Notes' are gone, an in-place
# drop of the same labels would raise KeyError.
tld_df = tld_df.drop(columns=['Year','Notes'], errors='ignore')
tld_df.head(10)

Unnamed: 0,Code,Country name (using title case),ccTLD
0,AD,Andorra,.ad
1,AE,United Arab Emirates,.ae
2,AF,Afghanistan,.af
3,AG,Antigua and Barbuda,.ag
4,AI,Anguilla,.ai
5,AL,Albania,.al
6,AM,Armenia,.am
7,AO,Angola,.ao
8,AQ,Antarctica,.aq
9,AR,Argentina,.ar


In [5]:
# Source: unstats.org (UN Statistics Division, M49 standard).
# https://unstats.un.org/unsd/methodology/m49/overview/#
# The export is semicolon-delimited.
# NOTE(review): pandas treats the literal text "NA" as a missing value by default,
# and "NA" is also Namibia's ISO alpha-2 code - verify that row keeps its code.
m49_df = (
    pd.read_csv('imported-datasets/UNSD — Methodology.csv', sep=';')
    .drop(columns=[
        # Global Name/Code are dropped because they don't change from row to row.
        'Global Code',
        'Global Name',
        # The developing countries/states flag columns are not needed here.
        'Land Locked Developing Countries (LLDC)',
        'Least Developed Countries (LDC)',
        'Small Island Developing States (SIDS)',
    ])
)

m49_df.head(10)

Unnamed: 0,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,2.0,Africa,15.0,Northern Africa,,,Algeria,12,DZ,DZA
1,2.0,Africa,15.0,Northern Africa,,,Egypt,818,EG,EGY
2,2.0,Africa,15.0,Northern Africa,,,Libya,434,LY,LBY
3,2.0,Africa,15.0,Northern Africa,,,Morocco,504,MA,MAR
4,2.0,Africa,15.0,Northern Africa,,,Sudan,729,SD,SDN
5,2.0,Africa,15.0,Northern Africa,,,Tunisia,788,TN,TUN
6,2.0,Africa,15.0,Northern Africa,,,Western Sahara,732,EH,ESH
7,2.0,Africa,202.0,Sub-Saharan Africa,14.0,Eastern Africa,British Indian Ocean Territory,86,IO,IOT
8,2.0,Africa,202.0,Sub-Saharan Africa,14.0,Eastern Africa,Burundi,108,BI,BDI
9,2.0,Africa,202.0,Sub-Saharan Africa,14.0,Eastern Africa,Comoros,174,KM,COM


In [6]:
# Left-join so every ISO alpha-2 row from the Wikipedia table is kept, matched
# to its M49 record through the two-letter code.
# fillna(0) writes 0 into every unmatched cell - text columns included - which
# is what allows the four code columns to be cast to int. The checks that
# follow look for that 0 sentinel in 'Country or Area'.
country_df = (
    pd.merge(
        tld_df,
        m49_df,
        how='left',
        left_on='Code',
        right_on='ISO-alpha2 Code',
    )
    .fillna(0)
    .astype({
        'M49 Code': int,
        'Sub-region Code': int,
        'Region Code': int,
        'Intermediate Region Code': int,
    })
)
country_df.head(10)

Unnamed: 0,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,AD,Andorra,.ad,150,Europe,39,Southern Europe,0,0,Andorra,20,AD,AND
1,AE,United Arab Emirates,.ae,142,Asia,145,Western Asia,0,0,United Arab Emirates,784,AE,ARE
2,AF,Afghanistan,.af,142,Asia,34,Southern Asia,0,0,Afghanistan,4,AF,AFG
3,AG,Antigua and Barbuda,.ag,19,Americas,419,Latin America and the Caribbean,29,Caribbean,Antigua and Barbuda,28,AG,ATG
4,AI,Anguilla,.ai,19,Americas,419,Latin America and the Caribbean,29,Caribbean,Anguilla,660,AI,AIA
5,AL,Albania,.al,150,Europe,39,Southern Europe,0,0,Albania,8,AL,ALB
6,AM,Armenia,.am,142,Asia,145,Western Asia,0,0,Armenia,51,AM,ARM
7,AO,Angola,.ao,2,Africa,202,Sub-Saharan Africa,17,Middle Africa,Angola,24,AO,AGO
8,AQ,Antarctica,.aq,0,0,0,0,0,0,Antarctica,10,AQ,ATA
9,AR,Argentina,.ar,19,Americas,419,Latin America and the Caribbean,5,South America,Argentina,32,AR,ARG


We should confirm that every row in the TLD dataframe merged with a row from the M49 dataset.

In [8]:
'''Check that every entry has a 'Country or Area' entry as this comes from the m49 dataset'''
# Pass the label straight to display(). Wrapping it in print() made the cell
# print the text and then render print()'s return value, None, as a stray output.
display("Country or area that is null:")
display(country_df[country_df['Country or Area'].isnull()])
# The merge cell ran fillna(0), so a row with no M49 match holds the integer 0
# in this column rather than NaN; the next two checks look for that sentinel.
display("Country or area that is set to zero:")
display(country_df[country_df['Country or Area'] == 0])
display("Country or Area cell which is set to int type;")
country_df[
    country_df['Country or Area'].apply(lambda x: isinstance(x, int))
]

Country or area that is null:


None

Unnamed: 0,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


'Country or area that is set to zero:'

Unnamed: 0,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,0,0,0,0,0,0


'Country or Area cell which is set to int type;'

Unnamed: 0,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,0,0,0,0,0,0


Taiwan is the only country without an entry in the M49 dataset - there are political reasons for this.
For now, we will correct this by copying the country name into the 'Country or Area' column (the other M49 columns stay at 0).

In [9]:
# Rows that found no M49 match carry the 0 sentinel in 'Country or Area'.
# Build the selector once and copy the Wikipedia country name into those rows.
zero_country_rows = country_df['Country or Area'] == 0
country_df.loc[zero_country_rows, 'Country or Area'] = country_df.loc[
    zero_country_rows, 'Country name (using title case)'
]

# Re-run the zero check (expected to be empty now), then look at Taiwan's row.
display("Country or area that is set to zero:")
display(country_df[country_df['Country or Area'] == 0])
display("Check Taiwan specifically :")
display(country_df[country_df['Code'] == 'TW'])

'Country or area that is set to zero:'

Unnamed: 0,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code


'Check Taiwan specifically :'

Unnamed: 0,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
227,TW,"Taiwan, Province of China",.tw,0,0,0,0,0,0,"Taiwan, Province of China",0,0,0


## Get AWS Regions from Web URL
Read the AWS regions from the aws docs webpage and convert to dataframe

In [10]:
# Take the first table on the AWS regions documentation page.
# Its 'Code' column holds the region identifier; it is renamed to 'Cloud Region'
# so it is not confused with the ISO 'Code' column already present in country_df.
aws_regions_from_html = (
    pd.read_html('https://docs.aws.amazon.com/global-infrastructure/latest/regions/aws-regions.html')[0]
    .rename(columns={'Code': 'Cloud Region'})
)
aws_regions_from_html.head(10)

Unnamed: 0,Cloud Region,Name,AZs,Geography,Opt-in status
0,us-east-1,US East (N. Virginia),6,United States of America,Not required
1,us-east-2,US East (Ohio),3,United States of America,Not required
2,us-west-1,US West (N. California),3 †,United States of America,Not required
3,us-west-2,US West (Oregon),4,United States of America,Not required
4,af-south-1,Africa (Cape Town),3,South Africa,Required
5,ap-east-1,Asia Pacific (Hong Kong),3,Hong Kong,Required
6,ap-south-2,Asia Pacific (Hyderabad),3,India,Required
7,ap-southeast-3,Asia Pacific (Jakarta),3,Indonesia,Required
8,ap-southeast-5,Asia Pacific (Malaysia),3,Malaysia,Required
9,ap-southeast-4,Asia Pacific (Melbourne),3,Australia,Required


## Combine AWS region dataframe with countries dataframe
The AWS region dataframe doesn't have full details of where each cloud data centre is based, so we will combine it with the more comprehensive country list compiled earlier.

In [11]:
def combine_cloud_region_details_with_location_info(
        input_cloud_region_df,
        left='Geography',
        right='Country or Area',
        input_country_df=None,
):
    '''Left-join a cloud-region DataFrame onto the country reference DataFrame.

    Every row of ``input_cloud_region_df`` is kept; rows whose ``left`` value has
    no exact match in the ``right`` column of the country frame come back with
    missing values in all country columns.

    Parameters
    ----------
    input_cloud_region_df : pandas.DataFrame
        One row per cloud region; must contain the ``left`` column.
    left : str
        Join column in ``input_cloud_region_df``.
    right : str
        Join column in the country frame.
    input_country_df : pandas.DataFrame, optional
        Country reference frame. When omitted, the notebook-level ``country_df``
        is used.

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    '''
    if input_country_df is None:
        # Look the global up at call time. A default of ``country_df`` in the
        # signature is evaluated once, when the def runs, so re-running the
        # cell that rebuilds country_df would leave this function joining
        # against the old frame.
        input_country_df = country_df
    return pd.merge(
        input_cloud_region_df,
        input_country_df,
        left_on=left,
        right_on=right,
        how='left')

# Join each AWS region to its country record using the function defaults
# ('Geography' matched against 'Country or Area').
aws_combined_df = combine_cloud_region_details_with_location_info(aws_regions_from_html)
aws_combined_df.head(10)

Unnamed: 0,Cloud Region,Name,AZs,Geography,Opt-in status,Code,Country name (using title case),ccTLD,Region Code,Region Name,Sub-region Code,Sub-region Name,Intermediate Region Code,Intermediate Region Name,Country or Area,M49 Code,ISO-alpha2 Code,ISO-alpha3 Code
0,us-east-1,US East (N. Virginia),6,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
1,us-east-2,US East (Ohio),3,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
2,us-west-1,US West (N. California),3 †,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
3,us-west-2,US West (Oregon),4,United States of America,Not required,US,United States of America,.us,19.0,Americas,21.0,Northern America,0.0,0,United States of America,840.0,US,USA
4,af-south-1,Africa (Cape Town),3,South Africa,Required,ZA,South Africa,.za,2.0,Africa,202.0,Sub-Saharan Africa,18.0,Southern Africa,South Africa,710.0,ZA,ZAF
5,ap-east-1,Asia Pacific (Hong Kong),3,Hong Kong,Required,,,,,,,,,,,,,
6,ap-south-2,Asia Pacific (Hyderabad),3,India,Required,IN,India,.in,142.0,Asia,34.0,Southern Asia,0.0,0,India,356.0,IN,IND
7,ap-southeast-3,Asia Pacific (Jakarta),3,Indonesia,Required,ID,Indonesia,.id,142.0,Asia,35.0,South-eastern Asia,0.0,0,Indonesia,360.0,ID,IDN
8,ap-southeast-5,Asia Pacific (Malaysia),3,Malaysia,Required,MY,Malaysia,.my,142.0,Asia,35.0,South-eastern Asia,0.0,0,Malaysia,458.0,MY,MYS
9,ap-southeast-4,Asia Pacific (Melbourne),3,Australia,Required,AU,Australia,.au,9.0,Oceania,53.0,Australia and New Zealand,0.0,0,Australia,36.0,AU,AUS


Check for NaN values

In [17]:
'''Look for any records that didnt merge correctly'''
def identify_cloud_regions_with_incomplete_country_info(input_df):
    '''Report the cloud regions whose country details failed to merge.

    A region counts as unmerged when its 'Country or Area' value is null.

    Returns a dict with two entries:
      'list' - the 'Cloud Region' values of the unmerged rows
      'df'   - a DataFrame of every row whose 'Cloud Region' is in that list,
               restricted to the columns useful for diagnosing the miss
    '''
    report_columns = [
        'Cloud Region',
        'Geography',
        'Country name (using title case)',
        'Country or Area',
        'ccTLD',
        'Region Code',
        'Region Name',
    ]
    has_no_country = input_df['Country or Area'].isnull()
    region_codes = input_df.loc[has_no_country, 'Cloud Region'].to_list()
    # Select by region code rather than by the null mask, so every row sharing
    # an affected region code is included.
    in_unmerged_regions = input_df['Cloud Region'].isin(region_codes)
    details = input_df.loc[in_unmerged_regions, report_columns]
    return {'list': region_codes, 'df': details}

'''Report AWS regions missing or incomplete country info'''
# The result is a dict: 'list' holds the affected region codes and 'df' the
# matching detail rows; only the frame is rendered here.
unmerged_aws_cloud_regions = identify_cloud_regions_with_incomplete_country_info(aws_combined_df)
display("Unmerged Cloud Regions for AWS:")
display(unmerged_aws_cloud_regions['df'])

'Unmerged Cloud Regions for AWS:'

Unnamed: 0,Cloud Region,Geography,Country name (using title case),Country or Area,ccTLD,Region Code,Region Name
5,ap-east-1,Hong Kong,,,,,
12,ap-northeast-2,South Korea,,,,,
15,ap-east-2,Taiwan,,,,,
22,eu-west-2,United Kingdom,,,,,


Get a list of all country-or-area names from the M49 list

In [13]:
# Candidate names for the fuzzy matcher: one entry per country_df row, in row order.
country_or_area_list = country_df['Country or Area'].tolist()
# Echoes the complete list as the cell output.
country_or_area_list

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina',
 'American Samoa',
 'Austria',
 'Australia',
 'Aruba',
 'Åland Islands',
 'Azerbaijan',
 'Bosnia and Herzegovina',
 'Barbados',
 'Bangladesh',
 'Belgium',
 'Burkina Faso',
 'Bulgaria',
 'Bahrain',
 'Burundi',
 'Benin',
 'Saint Barthélemy',
 'Bermuda',
 'Brunei Darussalam',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Brazil',
 'Bahamas',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Belarus',
 'Belize',
 'Canada',
 'Cocos (Keeling) Islands',
 'Democratic Republic of the Congo',
 'Central African Republic',
 'Congo',
 'Switzerland',
 'Côte d’Ivoire',
 'Cook Islands',
 'Chile',
 'Cameroon',
 'China',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Cabo Verde',
 'Curaçao',
 'Christmas Island',
 'Cyprus',
 'Czechia',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominica',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Esto

Try a fuzzy match to pair the unmatched geography names with the country names

In [37]:
!pip install rapidfuzz
from rapidfuzz import process, fuzz

def do_fuzzy_match_on_countries(
        match_input_df,
        match_on='Geography',
        country_list=None):
    '''Attach the best fuzzy country-name match to every row of a DataFrame.

    Works on a copy of ``match_input_df``; the caller's frame is not modified.

    Parameters
    ----------
    match_input_df : pandas.DataFrame
        Frame containing the ``match_on`` column.
    match_on : str
        Column whose values are looked up in ``country_list``.
    country_list : list of str, optional
        Candidate country names. When omitted, the notebook-level
        ``country_or_area_list`` is used.

    Returns
    -------
    pandas.DataFrame
        The copy, with two added columns: 'fuzzy_country_match' (the best
        candidate) and 'fuzzy_similarity' (its partial_ratio score).
    '''
    if country_list is None:
        # Resolve the global at call time. A default written in the signature
        # is captured once, when the def runs, and keeps pointing at the old
        # list if country_or_area_list is rebuilt afterwards.
        country_list = country_or_area_list
    fuzzy_df = match_input_df.copy()
    # extractOne yields a (match, similarity, index) tuple per value; expanding
    # each tuple with pd.Series turns it into three columns.
    best_matches = fuzzy_df[match_on].apply(
        lambda x: process.extractOne(x, country_list, scorer=fuzz.partial_ratio)
    )
    fuzzy_df[['fuzzy_country_match','fuzzy_similarity','fuzzy_index']] = best_matches.apply(pd.Series)
    # fuzzy_index is only the position of the match within the list of
    # countries searched, so it is not returned.
    return fuzzy_df.drop(columns=['fuzzy_index'])

def get_possible_country_matches(input_value,country_list=None):
    '''Return the candidate country names for one string, with their scores.

    Parameters
    ----------
    input_value : str
        The name to look up.
    country_list : list of str, optional
        Candidate country names. When omitted, the notebook-level
        ``country_or_area_list`` is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by matched country name, with a single 'similarity' column
        holding the partial_ratio score. A name returned more than once by the
        matcher appears once only, because the names are used as dict keys.
    '''
    if country_list is None:
        # Resolve the global at call time rather than freezing whichever list
        # existed when this def was executed.
        country_list = country_or_area_list
    # Each candidate comes back as a (matched string, similarity, list index) tuple.
    candidates = process.extract(input_value, country_list, scorer=fuzz.partial_ratio)
    # Keep name -> score; the list index is not needed.
    similarity_by_name = {
        matched_string: similarity for matched_string, similarity, _ in candidates
    }
    return pd.Series(similarity_by_name).to_frame().rename(columns={0:'similarity'})

def check_fuzzy_search_results_against_previously_unmerged_rows(input_df, unmerged_region_list):
    '''Fuzzy-match every row, then report on the regions that failed the exact merge.

    Returns a dict with two entries:
      'top_matches'      - one row per previously unmerged region, showing its
                           geography, the best fuzzy match and that match's score
      'possible_matches' - dict keyed by geography string, each value being the
                           candidate table from get_possible_country_matches
    '''
    report_columns = ["Cloud Region","Geography","fuzzy_country_match",'fuzzy_similarity']
    matched_df = do_fuzzy_match_on_countries(input_df)
    # Narrow to the rows where the merge failed previously, to see how they look now.
    was_unmerged = matched_df['Cloud Region'].isin(unmerged_region_list)
    top_matches_df = matched_df.loc[was_unmerged, report_columns]
    # A geography shared by several regions is looked up again for each one and
    # the later result overwrites the earlier under the same key.
    candidates_by_geography = {
        str(geography): get_possible_country_matches(str(geography))
        for geography in top_matches_df['Geography']
    }
    return {
        'top_matches' : top_matches_df,
        'possible_matches' : candidates_by_geography
    }


# Fuzzy-match the AWS regions and keep the report for the regions that failed the exact merge.
results = check_fuzzy_search_results_against_previously_unmerged_rows(aws_combined_df, unmerged_aws_cloud_regions['list'])

'''Display results of match'''
display(results['top_matches'])





Unnamed: 0,Cloud Region,Geography,fuzzy_country_match,fuzzy_similarity
5,ap-east-1,Hong Kong,"China, Hong Kong Special Administrative Region",100.0
12,ap-northeast-2,South Korea,South Georgia and the South Sandwich Islands,76.190476
15,ap-east-2,Taiwan,"Taiwan, Province of China",100.0
22,eu-west-2,United Kingdom,United Kingdom of Great Britain and Northern I...,100.0


The matches above all look OK, except for South Korea, which only matched with a similarity of about 76% (and against the wrong country).
Let's see if there are other possibilities:

In [38]:

# Show the full candidate table stored for the 'South Korea' geography.
display(results['possible_matches']['South Korea'])

Unnamed: 0,similarity
South Georgia and the South Sandwich Islands,76.190476
Democratic People's Republic of Korea,73.684211
Republic of Korea,73.684211
South Sudan,70.588235
South Africa,70.588235
