In [None]:
import pandas as pd


def getregions(country_codes):
    """Translate a '<'-separated string of region codes into area names.

    Each piece of ``country_codes`` is stripped of surrounding whitespace and
    looked up in the module-level ``codes`` dict; pieces that are not keys of
    ``codes`` are skipped. Names are returned in the order the codes appear
    in the string.

    Parameters
    ----------
    country_codes : str
        Codes separated by '<'. A non-string value (for example the NaN that
        pandas passes for a missing cell) is treated as "no codes".

    Returns
    -------
    list
        The matched area names. When exactly one code matches, that name is
        repeated so the list has two entries; when nothing matches the list
        is empty.
    """
    # A missing cell reaches us as NaN (a float), which has no .split();
    # report "no regions" instead of raising AttributeError mid-apply.
    if not isinstance(country_codes, str):
        return []

    regions = []
    for raw_code in country_codes.split('<'):
        code = raw_code.strip()
        if code in codes:
            regions.append(codes[code])

    # correct for South America's lack of regions: reuse the single match
    # so callers always get a (sub-region, region) pair
    if len(regions) == 1:
        regions.append(regions[0])

    return regions



In [None]:
# Build `codes`, a dict mapping region code -> area name, from the first
# table on the Wikipedia UN M49 page.
un_codes = pd.read_html('https://en.wikipedia.org/wiki/UN_M49')[0]

# assign column names
un_codes.columns = ['Code', 'Area']

# drop any rows that are NA or not numeric in Code column; copy so the
# column assignment below writes to this frame and not to a slice of the
# raw table (avoids SettingWithCopyWarning)
un_codes = un_codes[pd.to_numeric(un_codes['Code'], errors='coerce').notnull()].copy()

# Remove bracketed text, then parenthetical text, then trim whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as
# a literal string by default, which would leave the text untouched.
un_codes['Area'] = (
    un_codes['Area']
    .str.replace(r'(\[.*)', '', regex=True)
    .str.replace(r'(\(.*)', '', regex=True)
    .str.strip()
)

# remove duplicate rows
un_codes = un_codes.drop_duplicates()

# remove non-helpful codes world, americas, subsarahan africa, LAC, channel islands
del_codes = ['001', '019', '202', '419', '830']
un_codes = un_codes[~un_codes['Code'].isin(del_codes)]

# sort by code
un_codes = un_codes.sort_values(by='Code')

# convert to dict keyed by Code
un_codes = un_codes.set_index('Code')
codes = un_codes['Area'].to_dict()

In [None]:
# Build the country -> (sub-region, region) table and write it to CSV.
# first table in page gives countries and codes
country_codes = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_United_Nations_geoscheme')[0]

# simplify column names: each label is a tuple, keep its second element
country_codes.columns = [tup[1] for tup in country_codes.columns]

# drop unused columns
country_codes = country_codes.drop(columns=['Capital', 'Alpha-3[1][2]', 'Numeric[1][2]'])

# rename the three remaining columns
country_codes.columns = ['Country', 'Short_Name', 'Code']

# apply getregions to return subregion/region for every code string
regions = country_codes.Code.apply(getregions)

# create df from series of lists
regions_df = pd.DataFrame(regions.to_list(), columns=['Sub_Region', 'Region'])

# append to country_codes dataset (both frames share the same row index)
country_codes['Sub_Region'] = regions_df['Sub_Region'].astype(str)
country_codes['Region'] = regions_df['Region'].astype(str)

# drop code column
country_codes = country_codes.drop(columns=['Code'])

# not sure why it doesn't parse this correctly, so we're manually writing it.
# Use .loc rather than chained indexing (country_codes['Short_Name'][0] = ...),
# which can assign to a temporary copy and leave the frame unchanged.
country_codes.loc[0, 'Short_Name'] = 'DZ'
# NOTE(review): if Short_Name holds two-letter country codes, read_html's
# default NA handling may turn the literal string 'NA' into NaN - verify.

# adding Taiwan manually because the dataset calls it out, but the UN doesn't recognize it
taiwan = {'Country': 'Taiwan', 'Short_Name': 'TW', 'Sub_Region': 'Eastern Asia', 'Region': 'Asia'}
# DataFrame.append was removed in pandas 2.0; concat with a one-row frame
# gives the same result
country_codes = pd.concat([country_codes, pd.DataFrame([taiwan])], ignore_index=True)

# Write to csv
country_codes.to_csv('Cleaned_Data/country_regions.csv', index=False)