# Converts the JHU Cumulative Case Count Data to Daily Case Count Data

# TO DO:

- [X] Figure out how to fix the control measure obs that are recorded as '.'
- [X] Need to explore the missingness of the Oxford data. Sort the countries by GDP and examine what the missingness matrix looks like. **If you could run imputation on this data then you would have a major leg up on the other paper working on the similar topic. (on to of the other benefits to your paper)**
- [X] Write the code that merges in the time series data for the diffent control measures
- [X] Write the code merges in the Country Safety Index data
- [X] Continue to update this **[file](https://1drv.ms/x/s!AjWX5HOdYY23kf9x5S7g8LKLGlseVg?e=992nsi)** of data source locations 
- [X] Write the code that lets you convert the US data to long
- [X] Write the code that converts the column names in the Oxford data set to match the column names in the JHU dat
- [X] Write the code that merges the countries to their offical alpha 3 code in the JHU: **[Link to Codes](https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes)**
- [X] **ALTERNATIVE TO ABOVE** use python-Levenshtein [Docs](https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html) distance to match similar country names 
> would still need to pair the high probability matches 
- [X] Write the code transposes the combined data with the control measures included

In [9]:
import pandas as pd
import os
import sys
import shutil
import re

In [10]:
def stats_col_renamer(dframe):
    """
    Tips: This fuction will remove special characters from column headers, replace spaces with columns, 
    and make all heading lower case

    Parameters
    ----------
    dframe : TYPE Pandas dataframe
        DESCRIPTION.

    Returns
    -------
    None.

    """
    
    dframe.columns = dframe.columns.str.lower()
    dframe.columns = dframe.columns.str.replace('\s{2,}',' ',regex=True).str.replace('-',' ').str.replace(' ','_').str.replace('[^A-Za-z0-9_]+','',regex=True)
    return dframe
data_path = r'..\csse_covid_19_data\csse_covid_19_time_series'
out_data_path = r'..\Modified Data Sets'
control_data_path = '..\Control Data'
case_pre = pd.read_csv(f'{data_path}/time_series_covid19_confirmed_global.csv')
death_pre = pd.read_csv(f'{data_path}/time_series_covid19_deaths_global.csv')

## Importing the JHU COVID Case Data

In [11]:
# case_pre_us = pd.read_csv(f'{data_path}/time_series_covid19_confirmed_US.csv')
# death_pre_us = pd.read_csv(f'{data_path}/time_series_covid19_deaths_US.csv')


file_list = os.listdir(out_data_path)
for files in file_list:
    if files.find('.xlsx') >= 0:
        shutil.move(f'{out_data_path}/{files}',f'{out_data_path}/ARCHIVE/{files}')
def DF_Transform(df, outcome):
    global data_path
    global out_data_path
    
    # Data Cleaning
    df.drop(labels={'Lat','Long'},axis=1, inplace = True)
#     df.loc[df['Country/Region'].str.contains('Congo'), 'Country/Region'] ='Congo'
    df.loc[df['Country/Region'].str.contains('Korea, South',flags= re.IGNORECASE), 'Country/Region']= 'South Korea'
    df.loc[df['Country/Region'] == ('US'), 'Country/Region']= 'United States'
    df.loc[df['Country/Region'].str.contains('taiwan',flags= re.IGNORECASE), 'Country/Region']= 'taiwan'
    # Data Manipulation
    df = df.groupby(by='Country/Region').sum().T.apply(lambda x: x-x.shift(1),axis=0)
    df.rename(columns={'Country/Region':'Date'},inplace=True)
    df.columns = df.columns.str.lower()
    df = df.reset_index()
    df.rename(columns={'index':'date'},inplace= True)
    df.date = pd.to_datetime(df.date).dt.date
#     df['var'] = outcome[16:]
    
    filename = f'{out_data_path}/{outcome} (Through {df.date.max()}).xlsx'
    df.to_excel(filename, index=False)
    return df
case = DF_Transform(case_pre, 'Global COVID-19 Case Count')
death = DF_Transform(death_pre, 'Global COVID-19 Death Count')
# case = DF_Transform(case_pre_us, 'US COVID-19 Case Count')
# death = DF_Transform(death_pre_us, 'US COVID-19 Death Count')

In [12]:
def long_maker(dset, var):
    name_list = dset.columns.to_list()[1:]
    long_df = pd.DataFrame(columns = {'date', var, 'country'})
    for name in name_list:
        df = dset.filter(items={name, 'date'})
        df['country'] = name
        df.rename(columns={name:var},inplace = True)
        long_df = pd.concat([long_df, df],axis=0)
    long_df.date = pd.to_datetime(long_df.date)
    return long_df.reset_index(drop=True)

In [13]:
long_case = long_maker(case, 'case_count')
long_death = long_maker(death, 'death_count')

## Working on the Oxford Data Set

In [14]:
def Correcting_Col_Names(infile_path, dset):
    infile = pd.read_csv(infile_path)
    # og_col_list = infile.columns.to_list()
    df = pd.DataFrame(infile.columns.to_list())
    df['col'] = df[2:].apply(lambda x: pd.to_datetime(x).dt.strftime('X%m/X%d/%Y').str.replace('X0','').str.replace('X',''))
    df['col'][0] = 'country'
    df['col'][1] = 'country_code'
    cols = df.col.to_list()
    control = pd.read_csv(infile_path, names = cols, skiprows={0:1})
    indexNames = control[ control['country_code'].isna()].index
    control.drop(indexNames, inplace=True)
    df = controls_transpose(control, dset)
    return df

In [15]:
def controls_transpose(dset, var):
    import re
    df = dset.T
    df.columns = df.iloc[0]
    df.columns =df.columns.str.lower()
    df.drop(axis=0, index = {'country', 'country_code'},inplace = True)
    df = df.reset_index().rename(columns={'index':'date'})
    df = long_maker(df, var)
    df.loc[df['country'].str.contains('cape verde',flags= re.IGNORECASE), 'country']= 'cabo verde'
    df.loc[df['country'].str.contains('taiwan',flags= re.IGNORECASE), 'country']= 'taiwan'
    return df

In [16]:
c_measures_path = r'../../../covid-policy-tracker/data'
school = Correcting_Col_Names(f'{c_measures_path}/timeseries/c1_schoolclosing.csv', 'school_close')
work = Correcting_Col_Names(f'{c_measures_path}/timeseries/c2_workplaceclosing.csv', 'work_close')
pub_events = Correcting_Col_Names(f'{c_measures_path}/timeseries/c3_cancelpublicevents.csv', 'public_events')
gatherings = Correcting_Col_Names(f'{c_measures_path}/timeseries/c4_restrictionsongatherings.csv', 'large_gather')
pub_transpo = Correcting_Col_Names(f'{c_measures_path}/timeseries/c5_closepublictransport.csv', 'public_transpo')
stay_home = Correcting_Col_Names(f'{c_measures_path}/timeseries/c6_stayathomerequirements.csv' ,'stay_home')
domestic_travel = Correcting_Col_Names(f'{c_measures_path}/timeseries/c7_domestictravel.csv' ,'domestic_travel')
int_travel = Correcting_Col_Names(f'{c_measures_path}/timeseries/c8_internationaltravel.csv' ,'internat_travel')

In [17]:
school.date.max()

Timestamp('2020-09-22 00:00:00')

In [18]:
long_df = long_case.merge(long_death, on=['date', 'country'])
long_df = long_df.merge(school, on=['date', 'country'],how='outer')
long_df = long_df.merge(work, on=['date', 'country'],how='outer')
long_df = long_df.merge(pub_events, on=['date', 'country'],how='outer')
long_df = long_df.merge(gatherings, on=['date', 'country'],how='outer')
long_df = long_df.merge(pub_transpo, on=['date', 'country'],how='outer')
long_df = long_df.merge(stay_home, on=['date', 'country'],how='outer')
long_df = long_df.merge(domestic_travel, on=['date', 'country'],how='outer')
long_df = long_df.merge(int_travel, on=['date', 'country'],how='outer')
long_df[['school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel']] = long_df[['school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel']].apply(lambda x: x.str.replace('.','999'))
long_df[['school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel']] = long_df[['school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel']].fillna(999)
long_df[['school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel']] = long_df[['school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel']].astype(int)

In [19]:
case_summary = long_df.groupby('country').case_count.agg(['mean','median','std', 'max'])
death_summary = long_df.groupby('country').case_count.agg(['mean','median','std', 'max'])

# Importing Control Variables

In [22]:
control_var_path = '../Control Data'

In [23]:
def country_name_clean(df):
    import re
#     df.loc[(df['country'].str.contains('Congo',re.IGNORECASE)) & (~df['country'].str.contains('dem',re.IGNORECASE)) , 'country'] ='congo'
    df.loc[(df['country'].str.contains('korea, s',flags= re.IGNORECASE)) & (df['country'].str.contains('south',flags= re.IGNORECASE)), 'country']= 'south korea'
    df.loc[df['country'].str.contains('rep')& (df['country'].str.contains('congo',flags= re.IGNORECASE)) ,'country']= 'democratic republic of congo'
    df.loc[df['country'] == ('US'), 'country']= 'united states'
    df.loc[df['country'].str.contains('ivoire|ivory coast',flags= re.IGNORECASE), 'country']= 'cote d\'ivoire'
    df.loc[df['country'].str.contains('venezuela',flags= re.IGNORECASE), 'country']= 'venezuela'
    df.loc[df['country'].str.contains('and principe',flags= re.IGNORECASE), 'country']= 'sao tome and principe'
    df.loc[df['country'].str.contains('and the grenadines',flags= re.IGNORECASE), 'country']= 'saint vincent and the grenadines'
    df.loc[df['country'].str.contains('kitts and nevis',flags= re.IGNORECASE), 'country']= 'saint kitts and nevis'
    df.loc[df['country'].str.contains('bahamas',flags= re.IGNORECASE), 'country']= 'bahamas'
    df.loc[df['country'].str.contains('yemen',flags= re.IGNORECASE), 'country']= 'yemen'
    df.loc[df['country'].str.contains('gambia',flags= re.IGNORECASE), 'country']= 'gambia'
    df.loc[df['country'].str.contains('hong kong',flags= re.IGNORECASE), 'country']= 'hong kong'
    df.loc[df['country'].str.contains('macao',flags= re.IGNORECASE), 'country']= 'macao'
    df.loc[df['country'].str.contains('iran',flags= re.IGNORECASE), 'country']= 'iran'
    df.loc[df['country'].str.contains('lucia',flags= re.IGNORECASE), 'country']= 'saint lucia'
    df.loc[df['country'].str.contains('lao pdr',flags= re.IGNORECASE), 'country']= 'laos'
    df.loc[df['country'].str.contains('egypt',flags= re.IGNORECASE), 'country']= 'egypt'
    df.loc[df['country'].str.contains('korea, rep.',flags= re.IGNORECASE), 'country']= 'south korea'
    df.loc[df['country'].str.contains('states of america',flags= re.IGNORECASE), 'country']= 'united states'
    df.loc[df['country'].str.contains('east timor',flags= re.IGNORECASE), 'country']= 'timor-leste'
    df.loc[df['country'].str.contains('russia',flags= re.IGNORECASE), 'country']= 'russia'
    df.loc[df['country'].str.contains('brunei',flags= re.IGNORECASE), 'country']= 'brunei'
    df.loc[df['country'].str.contains('korea, dem. people\'s rep',flags= re.IGNORECASE), 'country']= 'north korea'
    dset = df.copy()
    return dset


In [24]:
#COVID Regional Safety Assessment Data
safety = pd.read_excel(f'{control_var_path}/COVID-19_Regional_Safety_Assessment.xlsx')
safety.columns = safety.columns.str.lower()
safety.rename(columns={'country/ region':'country'}, inplace=True)
safety.country = safety.country.str.lower()
safety = country_name_clean(safety)

#World Bank GDP Data
gdp = pd.read_excel(f'{control_var_path}/Global GDP.xlsx')
gdp.drop(columns='Unnamed: 4', inplace=True)
gdp.columns = gdp.columns.str.lower()
gdp.rename(columns={'economy':'country'}, inplace=True)
gdp.country = gdp.country.str.lower()

og_data = pd.read_excel('..\..\Country Response Paper\Original Documents\Country Responses-selected\Country Responses Dataset 7.28.20.xlsx',sheet_name = 'Country Responses')
og_data.index = og_data['Country/Region']
og_data = og_data.iloc[0:,19:]
og_data = og_data.reset_index(drop=False).rename(columns={'Country/Region':'country'})
og_data.columns = og_data.columns.str.lower()
og_data = stats_col_renamer(og_data)
og_data.dropna(how='all',inplace=True)
og_data.country = og_data.country.str.lower()

safety = country_name_clean(safety)
gdp = country_name_clean(gdp)
og_data = country_name_clean(og_data)

In [25]:
og_data.columns

Index(['country', 'population', 'ages_65_and_above__of_total_population',
       'female__of_total', 'diabetes_prevalence__of_population_ages_20_to_79',
       'obese__of_adult_population', 'htn_prevalence',
       'smoking_prevalence_ages_15', 'cancer_prevalence_',
       'hiv_prevalence__of_population_ages_15_49', 'copd_dalys_per_100000',
       'sars_experience_0_no_1_yes'],
      dtype='object')

# Importing and evaluating the smoking dataset from Our World Data

In [26]:
def our_world_importer(filename,sheet, year=2017):
    df = pd.read_excel(f'{control_data_path}\\{filename}.xlsx', sheet_name = sheet)
    df = stats_col_renamer(df)
    df = df.loc[df.year == year]
    df.drop(columns=['entity','year'],inplace=True)
    df.dropna(how='any',inplace=True)
    return df

In [27]:
#Smoking Data
smk_die_rate = our_world_importer('share-deaths-smoking', 'death-rate-smoking')
smk_die_prop = our_world_importer('share-deaths-smoking', 'share-deaths-smoking')
#HIV Data
hiv_prev = our_world_importer('hiv-data','share-of-population-infected-wi')
hiv_death = our_world_importer('hiv-data','hiv-death-rates')
#Obesity Data
obese = our_world_importer('share-of-deaths-obesity','share-of-deaths-obesity')
#Population Data
pop = our_world_importer('projected-population-by-country', 'projected-population-by-country',year=2020)
#Age Data
age = our_world_importer('median-age','median-age',year=2020)
#Diabetes Data
diabet = our_world_importer('diabetes-prevalence', 'diabetes-prevalence')
mers = pd.read_excel(f'{control_data_path}\MERS-SARS.xlsx',sheet_name='MERS')
sars = pd.read_excel(f'{control_data_path}\MERS-SARS.xlsx',sheet_name='SARS')
mers = stats_col_renamer(mers)
sars = stats_col_renamer(sars)
sars.drop(columns=['number_of_hcw_affected_', 'date_onset_first_probable_case',
       'date_onset_last_probable_case','case_fatality_ratio_','female', 'male','areas','median_age_range'],inplace=True)
mers.drop(columns='country',inplace=True)

### How to Merge a lot of DataFrames
> Method 1

In [28]:
from functools import partial, reduce

In [29]:
dfs = [smk_die_rate, smk_die_prop,hiv_prev, hiv_death, obese, pop, age, diabet, mers, sars]
merge = partial(pd.merge,on='code', how='outer')
control = reduce(merge,dfs)
control[['mers_case_count',
       'sars_case_count', 'number_of_deathsa', 'number_of_imported_cases',
       'percent_of_imported_cases']] = control[['mers_case_count',
       'sars_case_count', 'number_of_deathsa', 'number_of_imported_cases',
       'percent_of_imported_cases']].fillna(0)

> Method 2

In [30]:
# control = smk_die_rate.copy()
# control = control.merge(smk_die_prop, on='code', how='outer')
# control = control.merge(hiv_prev, on='code', how='outer')
# control = control.merge(hiv_death, on='code', how='outer')
# control = control.merge(obese, on='code', how='outer')
# control = control.merge(pop, on='code', how='outer')
# control = control.merge(age, on='code', how='outer')
# control = control.merge(diabet, on='code', how='outer')
# control = control.merge(mers, on='code', how='outer')
# control = control.merge(sars, on='code', how='outer')

# Final Merges

In [66]:
long_df_newcontrol = control.merge(gdp,left_on='code',right_on='countrycode',how='outer')
long_df_newcontrol = long_df_newcontrol.merge(long_df, on=['country'],how='outer')
long_df_newcontrol = long_df_newcontrol.merge(safety, on=['country'],how='outer')
long_df_newcontrol = long_df_newcontrol.merge(og_data, on=['country'],how='outer')
long_df_newcontrol.rename(columns={'us_dollars_in_mil':'gdp_in_mil_us', 'ranking':'gdp_rank'},inplace=True)
long_df_newcontrol = stats_col_renamer(long_df_newcontrol)
long_df_newcontrol.drop(columns={'global_region','code'},inplace=True)
long_df_newcontrol.columns = long_df_newcontrol.columns.str.replace('__','_')

In [32]:
long_df_order = ['date',  'country','case_count', 'death_count', 'school_close',
       'work_close', 'public_events', 'large_gather', 'public_transpo',
       'stay_home', 'domestic_travel', 'internat_travel', 'population',
       'ages_65_and_above_of_total_population', 'female_of_total',
       'diabetes_prevalence_of_population_ages_20_to_79',
       'obese_of_adult_population', 'htn_prevalence',
       'smoking_prevalence_ages_15', 'cancer_prevalence_',
       'hiv_prevalence_of_population_ages_1549', 'copd_dalys_per_100000',
       'sars_experience_0_no_1_yes', 'quarantine_efficiency', 'gov_efficiency',
       'monitoring_and_detection', 'healthcare_readiness',
       'country_vulnerability', 'emergency_preparedness', 'total_score',
       'countrycode', 'gdp_rank', 'gdp_in_mil_us'
]

In [33]:
l = ['country','countrycode','date','case_count','death_count',
'school_close', 'domestic_travel','internat_travel','large_gather', 'public_events',
'stay_home', 'work_close' 'public_transpo',
'gdp_rank', 'us dollars in mil','smoking_ihme_2019', 'population_by_country_and_region_historic_and_projections_gapminder_hyde__un',
'prevalence_hivaids_sex_both_age_15_49_years_percent', 'deaths_hivaids_sex_both_age_age_standardized_rate', 'deaths_smoking_sex_both_age_age_standardized_rate',
'ages_65_and_above_of_total_population','htn_prevalence', 'copd_dalys_per_100000', 'obesity_ihme_2019',
'country_vulnerability','emergency_preparedness','gov_efficiency','healthcare readiness',
'monitoring and detection', 'quarantine_efficiency', 'total_score',
'mers_case_count', 'sars_case_count', 'number_of_deathsa', 'number_of_imported_cases', 'percent_of_imported_cases']

In [58]:
long_df_newcontrol.countrycode = long_df_newcontrol.countrycode.str.upper()

In [61]:
long_df_newcontrol.shape

(58262, 30)

In [67]:
long_df_newcontrol.columns

Index(['deaths_smoking_sex_both_age_age_standardized_rate',
       'smoking_ihme_2019',
       'prevalence_hivaids_sex_both_age_15_49_years_percent',
       'deaths_hivaids_sex_both_age_age_standardized_rate',
       'obesity_ihme_2019',
       'population_by_country_and_region_historic_and_projections_gapminder_hyde_un',
       'un_population_division_median_age_2017',
       'diabetes_prevalence_of_population_ages_20_to_79_x', 'mers_case_count',
       'sars_case_count', 'number_of_deathsa', 'number_of_imported_cases',
       'percent_of_imported_cases', 'countrycode', 'gdp_rank', 'country',
       'us_dollars_in_mil', 'case_count', 'date', 'death_count',
       'school_close', 'work_close', 'public_events', 'large_gather',
       'public_transpo', 'stay_home', 'domestic_travel', 'internat_travel',
       'quarantine_efficiency', 'gov_efficiency', 'monitoring_and_detection',
       'healthcare_readiness', 'country_vulnerability',
       'emergency_preparedness', 'total_score', 'popul

In [68]:
long_df_newcontrol = long_df_newcontrol.filter(items=['country','countrycode','date','case_count','death_count',
'school_close', 'domestic_travel','internat_travel','large_gather', 'public_events',
'stay_home', 'work_close' 'public_transpo',
'gdp_rank', 'us dollars in mil','smoking_ihme_2019', 'population_by_country_and_region_historic_and_projections_gapminder_hyde_un',
'un_population_division_median_age_2017','ages_65_and_above_of_total_population',
'prevalence_hivaids_sex_both_age_15_49_years_percent',
'deaths_hivaids_sex_both_age_age_standardized_rate', 
'diabetes_prevalence_of_population_ages_20_to_79_x',
'deaths_smoking_sex_both_age_age_standardized_rate','cancer_prevalence_',
'htn_prevalence', 'copd_dalys_per_100000', 'obesity_ihme_2019',
'country_vulnerability','emergency_preparedness','gov_efficiency','healthcare readiness',
'monitoring and detection', 'quarantine_efficiency', 'total_score',
'mers_case_count', 'sars_case_count', 'number_of_deathsa', 'number_of_imported_cases', 'percent_of_imported_cases'])

In [72]:
long_df_newcontrol.rename(columns={'population_by_country_and_region_historic_and_projections_gapminder_hyde_un':'pop_2020','deaths_hivaids_sex_both_age_age_standardized_rate':'deaths_hivaids_sex_both_age',
                                  'diabetes_prevalence_of_population_ages_20_to_79_x':'diabetes_prev_ages_20_to_79',
                                  'cancer_prevalence_':'cancer_prevalence'},inplace=True)

In [69]:
date = long_df_newcontrol.date.max().date()

In [73]:
long_df_newcontrol.to_excel(f'{out_data_path}\\Final COVID Data Set (Through {date}) (ver2).xlsx',index=False)

## trying to work out country linking stuff

In [38]:
def similar_check(q,l):
    from fuzzywuzzy import fuzz,process
    from Levenshtein import distance,ratio
    import pandas as pd
    jhu = []
    similar = []
    query = []
    for countries in q:
        results = process.extractOne(countries,list(jhu_country)) 
        if results[1] < 100:
            jhu.append(results[0])
            similar.append(results[1])
            query.append(countries)
    output = pd.DataFrame({'query_country':query, 
                          'similarity':similar,
                           'jhu_country':jhu                       
                          })
    return output

In [39]:
# a list of the country names from each data set
jhu_country = long_case.country.unique()
ox_country = school.country.unique()
gdp_country = gdp.country
safe_country = safety.country
og_country = og_data.country

### Country Name Cleaner

## Oxford Data Country Names

In [40]:
ox_check = similar_check(ox_country ,jhu_country)
ox_check.sort_values(by='similarity', ascending=False)

Unnamed: 0,query_country,similarity,jhu_country
4,congo,90,congo (brazzaville)
12,kyrgyz republic,86,central african republic
3,democratic republic of congo,86,congo (brazzaville)
20,slovak republic,86,central african republic
6,czech republic,86,central african republic
10,guam,77,nicaragua
9,greenland,75,grenada
22,turkmenistan,75,turkey
13,macao,73,monaco
0,aruba,72,barbados


## GDP Data Country Names

In [41]:
gdp_check = similar_check(gdp_country ,jhu_country)
gdp_check.sort_values(by='similarity', ascending=False)

Unnamed: 0,query_country,similarity,jhu_country
40,syrian arab republic,90,syria
6,democratic republic of congo,86,congo (brazzaville)
2,slovak republic,86,central african republic
1,czech republic,86,central african republic
9,kyrgyz republic,86,central african republic
8,democratic republic of congo,86,congo (brazzaville)
36,north korea,82,south korea
11,guam,77,nicaragua
7,turkmenistan,75,turkey
16,greenland,75,grenada


## Safety Data Country Names

In [42]:
safe_check = similar_check(safe_country ,jhu_country)
safe_check.sort_values(by='similarity', ascending=False).head(50)

Unnamed: 0,query_country,similarity,jhu_country
69,somaliland,90,mali
64,northern cyprus,90,cyprus
44,aland,90,new zealand
58,south georgia and the islands,90,georgia
28,united states virgin islands,90,united states
62,indian ocean territories,90,india
61,british indian ocean territory,90,india
31,caribbean netherlands,90,netherlands
4,republic of serbia,90,serbia
59,heard island and mcdonald islands,86,antigua and barbuda


## OG Data Country Names

In [43]:
og_check = similar_check(og_country ,jhu_country)
og_check.sort_values(by='similarity', ascending=False)

Unnamed: 0,query_country,similarity,jhu_country
0,us,90,australia


# Code Grave Yard

### Importing the Oxford Control Measures Data Set (Ordinal)

In [44]:

# from fuzzywuzzy import fuzz,process
# from Levenshtein import distance,ratio
# import pandas as pd
# jhu = []
# similar = []
# ox = []
# for countries in ox_country:
#     results = process.extractOne(countries,list(jhu_country)) 
#     if results[1] < 100:
#         jhu.append(results[0])
#         similar.append(results[1])
#         ox.append(countries)
# output = pd.DataFrame({'oxford_country':ox, 
#                       'similarity':similar,
#                        'jhu_country':jhu                       
#                       })