In [None]:
import pandas as pd
import pycountry
import numpy as np

# GOAL

At the end, I want one dataframe that contains:
- the index of the clue
- the clue itself
- the ISO 3166-1 alpha-2 (ISO2) codes of the countries the clue applies to
- a clue-type flag: zero for all guidebook clues and one for all travel guide clues.

### GUIDEBOOK

The guidebook contains clues and, for each clue, the list of countries it applies to.

In [None]:
# Load the guidebook clues. Each row is one clue; its `geoparsed` cell is a
# list of dicts that carry (at least) the keys 'Country' and 'ISO3'.
guidebook = pd.read_json("~/Downloads/guidebook.json")

# The original clue ids are discarded here.
guidebook = guidebook.drop('CLUE_ID', axis=1)

# Row labels removed by hand.
# NOTE(review): the reason for excluding these rows is not recorded in this
# notebook - presumably unusable clues; please confirm and document.
remove_idx = [395, 779, 805, 3499, 3500, 3502, 3503, 3504, 3505, 3506, 3507, 3508, 3509, 3510, 3512]
guidebook = guidebook.drop(remove_idx)

# Non-standard codes found in the geoparsed data -> the code used from here on.
iso3_fixes = {'UAE': 'ARE'}
# Codes stripped from every clue's country list.
# NOTE(review): the rationale for dropping CHN and IND is not documented - confirm.
excluded_iso3 = {'CHN', 'IND'}

guidebook['ISO3'] = None
gb_countries = []
iso_mapping = {}  # a mapping from country name (slug) to iso3

for idx, clue in guidebook['geoparsed'].items():
    country_slugs = [parsed['Country'].lower().replace(' ', '-') for parsed in clue]
    # Correct the codes BEFORE de-duplicating and sorting, so that a clue tagged
    # with both 'UAE' and 'ARE' does not end up with 'ARE' twice and the list
    # stays sorted (patching the sorted list in place broke both properties).
    fixed_codes = [iso3_fixes.get(parsed['ISO3'], parsed['ISO3']) for parsed in clue]

    gb_countries.extend(country_slugs)
    # Later clues overwrite earlier ones for the same country slug. The
    # corrected code is stored so this mapping agrees with the ISO3 column.
    iso_mapping.update(zip(country_slugs, fixed_codes))

    iso_list = sorted(set(fixed_codes) - excluded_iso3)
    guidebook.at[idx, 'ISO3'] = iso_list

    # Diagnostic: show the countries of any clue carrying the bogus code 'D'.
    if 'D' in iso_list:
        print(country_slugs)

# Whole-cell replacement of the scalar 'UAE'; presumably a no-op for the
# list-valued ISO3 cells, which are already corrected in the loop above.
guidebook = guidebook.replace('UAE', 'ARE')

guidebook = guidebook.drop('geoparsed', axis=1)
guidebook['clue_type'] = 0  # 0 marks guidebook clues

In [None]:
# Collapse the collected guidebook country slugs to a sorted list of unique values.
gb_countries = sorted(set(gb_countries))

In [None]:
# Sanity check: list the columns remaining in the guidebook frame.
print(guidebook.columns)

### TRAVEL GUIDE

The travel guide contains a description for each country.

In [None]:
# Load the per-country travel-guide texts, expose the 'Weather and Geography'
# column as `text`, discard the continent, drop rows with missing values and
# tag every remaining row as a travel-guide clue (clue_type == 1).
travel = (
    pd.read_csv("~/Downloads/country_info_filtered.csv")
    .rename(columns={'Weather and Geography': 'text'})
    .drop(columns='Continent')
    .dropna()
    .assign(clue_type=1)
)

# Sorted list of the distinct country names that survived the cleaning.
tr_countries = sorted(set(travel['Country']))

In [None]:
# Sanity check: list the columns remaining in the travel-guide frame.
print(travel.columns)

In [None]:
# Travel-guide country slugs that need renaming before they can be looked up
# in the tables below.
# NOTE(review): 'cape_verde' uses an underscore while country names are
# slugified with hyphens; it only matches a source name that literally
# contains the underscore. 'cape-verde' is covered by `manual_mapping`.
country_mapping = {
    'united-states-of-america': 'united-states',
    'cape_verde': 'cabo-verde',
    'french-overseas-possessions': 'french-southern-territories',
    'palestinian-national-authority': 'palestinian-territory',
    'syrian-arab-republic': 'syria',
    'us-virgin-islands': 'u.s.-virgin-islands',
    'vatican-city': 'vatican'
}

# Slugified pycountry short names / official names -> ISO 3166-1 alpha-3 code.
pycountry_ios3_mapping = {}
official_pycountry_ios3_mapping = {}
for country in pycountry.countries:
    pycountry_ios3_mapping[country.name.lower().replace(' ', '-')] = country.alpha_3

    # Not every pycountry entry carries an official name; skip those that do
    # not instead of swallowing every possible exception.
    official_name = getattr(country, 'official_name', None)
    if official_name is not None:
        official_pycountry_ios3_mapping[official_name.lower().replace(' ', '-')] = country.alpha_3

# Last-resort table for travel-guide slugs that none of the other tables know.
manual_mapping = {
    'republic-of-congo': official_pycountry_ios3_mapping['republic-of-the-congo'],
    # The Democratic Republic of the Congo is COD. It used to be mapped to the
    # Republic of the Congo's entry (COG), which mislabelled the country.
    'democratic-republic-of-congo': pycountry.countries.get(alpha_3='COD').alpha_3,
    'cape-verde': official_pycountry_ios3_mapping['republic-of-cabo-verde'],
    'sao-tome-e-principe': official_pycountry_ios3_mapping['democratic-republic-of-sao-tome-and-principe'],
    'tanzania': official_pycountry_ios3_mapping['united-republic-of-tanzania'],
    'bonaire': official_pycountry_ios3_mapping['bonaire,-sint-eustatius-and-saba'],
    'saba': official_pycountry_ios3_mapping['bonaire,-sint-eustatius-and-saba'],
    'st-eustatius': official_pycountry_ios3_mapping['bonaire,-sint-eustatius-and-saba'],
    'pacific-islands-of-micronesia': official_pycountry_ios3_mapping['federated-states-of-micronesia'],
    'iran': official_pycountry_ios3_mapping['islamic-republic-of-iran'],
    'surinam': official_pycountry_ios3_mapping['republic-of-suriname'],
    'st-maarten': official_pycountry_ios3_mapping['sint-maarten-(dutch-part)']
}



In [None]:
# Resolve every travel-guide country to a one-element ISO3 list. The lookup
# tables are tried in priority order: pycountry short names, pycountry
# official names, then the guidebook-derived mapping. `manual_mapping` is the
# last resort; countries found nowhere keep ISO3 = None.
travel['ISO3'] = None
primary_tables = (pycountry_ios3_mapping, official_pycountry_ios3_mapping, iso_mapping)

for row_label, raw_name in travel['Country'].items():
    slug = raw_name.lower().replace(' ', '-')
    # Rename slugs that are spelled differently in the lookup tables.
    slug = country_mapping.get(slug, slug)

    for table in primary_tables:
        if slug in table:
            travel.at[row_label, 'ISO3'] = [table[slug]]
            break
    else:
        # No primary table knew the slug: fall back to the manual overrides.
        fallback = manual_mapping.get(slug)
        if fallback is not None:
            travel.at[row_label, 'ISO3'] = [fallback]

In [None]:
# Total number of missing cells in the travel frame ...
print(int(travel.isna().to_numpy().sum()))
# ... and the names of the countries that could not be resolved to an ISO3 code.
countries_with_null_iso3 = travel.loc[travel['ISO3'].isna(), 'Country']
countries_with_null_iso3.tolist()

In [None]:
# Travel-guide countries whose resolved ISO3 list contains the bogus code 'D'.
# Each ISO3 cell is either None or a one-element list, so comparing the cell
# itself with 'D' can never match; the check has to look inside the list.
t = travel.loc[
    travel['ISO3'].apply(lambda codes: isinstance(codes, list) and 'D' in codes).astype(bool),
    'Country',
]
t

In [None]:
# Discard rows with any missing value (this includes countries whose ISO3 code
# could not be resolved), then remove the country-name column.
travel = travel.dropna().drop(columns='Country')

### CLUES

In [None]:
# Empty frame whose only job is to fix the leading column order of the
# combined clue table.
clues = pd.DataFrame(
    columns=[
        'text',       # the clue / description itself
        'ISO2',       # alpha-2 country codes, filled in further down
        'clue_type',  # 0 = guidebook clue, 1 = travel-guide clue
    ]
)

In [None]:
# Stack guidebook and travel-guide clues under the column layout of `clues`.
# Only the (empty) column skeleton of `clues` is used, so re-running this cell
# does not append the same rows a second time to an already-filled frame.
clues = pd.concat([clues.iloc[0:0], guidebook, travel], ignore_index=True)

In [None]:
# Display the combined clue table (guidebook rows followed by travel-guide rows).
clues 

In [None]:
# Convert every clue's ISO3 list into the matching list of ISO2 codes and
# collect all ISO2 codes seen along the way in `all_isos`.

# ISO3 values handled by hand instead of through pycountry
# (e.g. 'XKX' - presumably Kosovo - and the non-standard 'UAE').
special_iso2 = {'XKX': 'XK', 'UAE': 'AE'}

all_isos = []
clues['ISO2'] = None
for idx, iso3_list in clues['ISO3'].items():
    iso2_list = []
    for iso3 in iso3_list:
        if iso3 in special_iso2:
            iso2_list.append(special_iso2[iso3])
            continue

        # Depending on the pycountry version an unknown code either returns
        # None or raises a lookup error; report both cases the same way.
        try:
            match = pycountry.countries.get(alpha_3=iso3)
        except LookupError:
            match = None
        if match is None:
            # Fail with the offending clue and code instead of an opaque
            # "'NoneType' object has no attribute 'alpha_2'".
            raise ValueError(f"clue {idx}: cannot convert ISO3 code {iso3!r} to an ISO2 code")
        iso2_list.append(match.alpha_2)

    all_isos.extend(iso2_list)
    clues.at[idx, 'ISO2'] = iso2_list

# Expose the running row number as an explicit CLUE_ID column.
clues = clues.reset_index().rename(columns={'index': 'CLUE_ID'})

In [None]:
# Display the clue table again, now with the CLUE_ID and ISO2 columns filled in.
clues

In [None]:
# Persist the combined clue table next to the notebook (row index not written).
# Note: the list-valued ISO3 / ISO2 cells are written as their string
# representation and have to be parsed again when the file is read back.
clues.to_csv('./all_clues.csv', index=False)

In [None]:
# Reduce the collected ISO2 codes to a sorted list of unique values and give
# each one a one-hot vector: row i of the identity matrix belongs to all_isos[i].
all_isos = sorted(set(all_isos))
encodings = np.eye(len(all_isos), dtype='int').tolist()

print(len(all_isos))
print(all_isos)

In [None]:
# Two-column lookup table: ISO2 code -> its one-hot encoding.
df = pd.DataFrame({'ISO2': all_isos, 'encoding': encodings})

In [None]:
# Display the ISO2 -> one-hot encoding table.
df

In [None]:
# Persist the encoding table next to the notebook (row index not written).
df.to_csv("./encodings.csv", index=False)