In [8]:

import pandas as pd

# Fetch the fixed-width CIA World Factbook rank-order file and index the result by country name
fertility_2016_df = pd.read_fwf(
    'https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2127.txt',
    index_col=0,
    header=None,
    names=['Country', '2016'],
).set_index(keys='Country')

In [9]:

# Preview five randomly chosen rows of the 2016 table
fertility_2016_df.sample(n=5)

Unnamed: 0_level_0,2016
Country,Unnamed: 1_level_1
Brazil,1.76
Germany,1.44
Argentina,2.28
Puerto Rico,1.64
Antigua and Barbuda,2.01


In [10]:

from difflib import SequenceMatcher
import time


def similar(a, b):
    """Return the difflib similarity ratio between the string forms of ``a`` and ``b``.

    Both arguments are converted with ``str`` before comparison, so non-string
    values are accepted. The result is a float in the range [0.0, 1.0].
    """
    matcher = SequenceMatcher(None, str(a), str(b))
    return matcher.ratio()

def check_for_typos(first_list, second_list):
    """Find, for every item of ``first_list``, its closest match in ``second_list``.

    Closeness is measured with ``similar``. Ties keep the earliest candidate
    because only a strictly higher score replaces the current best.

    Parameters
    ----------
    first_list : iterable
        Items to look up.
    second_list : iterable
        Candidates to compare against; it is iterated once per item of
        ``first_list``.

    Returns
    -------
    pandas.DataFrame
        One row per item of ``first_list`` with the columns ``first_item``,
        ``second_item`` and ``max_similarity``. If no candidate scores above
        0.0, ``second_item`` is the item itself and ``max_similarity`` is 0.0.
    """
    records = []
    for candidate in first_list:
        # Start from "no match": the item paired with itself at score 0.0
        best_score, best_match = 0.0, candidate
        for other in second_list:
            score = similar(candidate, other)
            if score <= best_score:
                continue
            best_score, best_match = score, other

        records.append({
            'first_item': candidate,
            'second_item': best_match,
            'max_similarity': best_score,
        })

    return pd.DataFrame(records, columns=['first_item', 'second_item', 'max_similarity'])

In [26]:

# Load the fertility table from disk and index it by country name
encoding = ['latin1', 'iso8859-1', 'utf-8'][1]  # index selects which candidate encoding is used
fertility_df = pd.read_csv(
    './data/csv/fertility_df.csv',
    encoding=encoding,
).set_index(keys='Country')

In [28]:

import os

# Closest fertility 2016 countries to the countries in the fertility dataframe that are not paired up
t0 = time.time()
file_path = './data/csv/fertility_2016_df.csv'
# Prefer the cached CSV when it exists; otherwise the frame already in memory is used as-is
if os.path.isfile(file_path):
    fertility_2016_df = pd.read_csv(file_path, encoding=encoding)
    fertility_2016_df.set_index(keys='Country', inplace=True)
# Match every 2016 country name against the fertility_df names missing from the 2016 index
name_similarities_df = check_for_typos(fertility_2016_df.index, fertility_df.index.difference(fertility_2016_df.index))
t1 = time.time()
print(t1-t0, time.ctime(t1))
# Write with the same encoding used for reading above. to_csv defaults to utf-8, so omitting it
# would garble non-ASCII country names the next time this file is read back with `encoding`.
fertility_2016_df.to_csv(file_path, index=True, encoding=encoding)
name_similarities_df.sort_values(['max_similarity'], ascending=False).head(10)

0.42205214500427246 Sun Jul 23 21:03:29 2017


Unnamed: 0,first_item,second_item,max_similarity
47,Cook Is.,Cocos Is.,0.823529
75,Germany,East Germany,0.736842
158,Poland,Åland,0.727273
142,Netherlands,Netherlands Antilles,0.709677
183,South Africa,South Korea,0.695652
21,Benin,Reunion,0.666667
54,Czechia,Czechoslovakia,0.666667
91,Iceland,Åland,0.666667
68,Finland,Åland,0.666667
38,Cayman Is.,Channel Is.,0.666667


In [16]:

# Copy the 2016 values into the main fertility table as a new '2016' column.
# NOTE(review): pandas aligns the assigned Series on index labels, so this assumes both frames
# are indexed by 'Country' at this point — confirm the cells above ran first.
fertility_df['2016'] = fertility_2016_df['2016']

In [17]:

# Persist the updated table with the same encoding used when this file is read back;
# to_csv defaults to utf-8, which would garble non-ASCII country names on the next load.
fertility_df.to_csv('./data/csv/fertility_df.csv', index=True, encoding=encoding)

In [19]:

# Display the column labels of the 2016 table
fertility_2016_df.columns

Index(['Country', '2016'], dtype='object')