In [None]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')

In [None]:
from src.util import sample_from_dict, load_saved_data

In [None]:
import json
import random
from rich import print
import geopandas as gpd
import pandas as pd

## Load data

In [None]:
# Radio stations as returned by the project helper `load_saved_data`.
stations = pd.DataFrame(load_saved_data())
# Datamaps records; joined on their "id" column further down.
datamaps = pd.read_json("../data/datamaps.json")
# Natural Earth country table (ADMIN / ADM0_TLC / ISO_A2 are used below).
ne = gpd.read_file("../data/ne_50m_admin_0_countries.geojson")
# NOTE(review): this reads "centers.geojson" from the working directory, i.e.
# the file written by the "Prepare center" section below, not
# "../data/centers.geojson". That section has to have been run once before.
centers = gpd.read_file("centers.geojson")

In [None]:
# Distinct country names occurring in the station data, and how many there are.
radio_countries = set(stations.country)
len(radio_countries)

In [None]:
# Keep only the Natural Earth name and code columns used by the joins below.
df = ne.loc[:, ["ADMIN", "ADM0_TLC", "ISO_A2"]]

In [None]:
# only include countries that appear in the radio dataset
df = df.loc[df["ADMIN"].isin(radio_countries)]
df.shape

In [None]:
# join the datamaps records via the three-letter code
df = df.merge(datamaps, left_on="ADM0_TLC", right_on="id")
df.shape

In [None]:
# join the centre points via the two-letter code
df = df.merge(centers, left_on="ISO_A2", right_on="AFF_ISO")
df.shape

In [None]:
# Reduce to the three exported fields and give them their output names.
df = df[["ADMIN", "AFF_ISO", "id"]].rename(
    columns={"ADMIN": "name", "AFF_ISO": "two_code", "id": "three_code"}
)

In [None]:
# Renumber the rows; the previous index is discarded (drop=True).
df = df.reset_index(drop=True)

In [None]:
# Export the country table, one JSON object per row (orient="records").
df.to_json("../data/out/source.json", orient="records")

### Prepare centers

In [None]:
# Source centre points, filtered below into the derived "centers.geojson".
# NOTE: rebinds `centers`, which the "Load data" cell assigned from the
# derived file.
centers = gpd.read_file("../data/centers.geojson")

In [None]:
# Keep the rows whose COUNTRY equals their COUNTRYAFF
# (presumably the main territory of each country; verify against the data).
centers_main = centers.loc[centers["COUNTRY"].eq(centers["COUNTRYAFF"])]

In [None]:
# Keep only the three columns that are written to the derived file.
centers_main = centers_main[["COUNTRYAFF", "AFF_ISO", "geometry"]]

In [None]:
# Written to the working directory (no "../data/" prefix); the "Load data"
# cell reads this same relative path.
centers_main.to_file("centers.geojson", driver='GeoJSON')

In [None]:
# ADM0_ISO, ISO_A2
# list(ne.columns)

In [None]:
# Number of distinct ISO_A2 values (a missing value counts as one).
ne["ISO_A2"].nunique(dropna=False)

In [None]:
# Display the `centers` frame for inspection.
centers

## Match data

### Natural Earth

In [None]:
# Geometry of the row labelled 0, kept for ad-hoc inspection; no later cell
# visible in this notebook reads `p`.
p = ne["geometry"][0]

In [None]:
# (rows, columns) of the Natural Earth table.
ne.shape

In [None]:
# seems to be the best unique key?
# (compare this distinct-value count with the row count of `ne`)
ne["ADM0_TLC"].nunique(dropna=False)

In [None]:
# Bookkeeping for the matching below: `MATCH` flags ne rows that a station
# country has been matched to; `ne_idx` holds the matched ne index label per
# station, with -1 meaning "no match".
ne["MATCH"] = False
stations["ne_idx"] = -1

In [None]:
# radio_countries = list(set(stations.country))

In [None]:
def find_idx(country):
    """Find the `ne` index label that corresponds to a country name.

    The name is compared literally (whole string, case-sensitive) against the
    NAME, ADMIN and NAME_LONG columns of the module-level `ne` frame, in that
    order. The first matching row is flagged by setting its `MATCH` cell to
    True.

    Args:
        country: Country name to look up.

    Returns:
        The index label of the first matching `ne` row, or -1 when none of
        the three columns contains the name.
    """
    cols = ["NAME", "ADMIN", "NAME_LONG"]
    for col in cols:
        # Plain equality rather than `str.fullmatch`: the name is data, not a
        # regular expression (parentheses or dots in a name would otherwise
        # change the match), and missing values simply compare unequal.
        match_idxs = ne.index[ne[col] == country].tolist()
        if match_idxs:
            idx = match_idxs[0]
            ne.loc[idx, "MATCH"] = True
            return idx
    return -1

In [None]:
# Look up every station's country in Natural Earth. `match` collects
# (country, ne index) pairs, `no_match` the country names that were not found
# (one entry per station, so names can repeat).
no_match = []
match = []

# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for i, c in stations.country.items():
    idx = find_idx(c)
    stations.loc[i, "ne_idx"] = idx

    if idx > -1:
        match.append((c, idx))
    else:
        no_match.append(c)

In [None]:
# # --- OLD WAY ----

# no_match = []
# match = []

# for c in radio_countries:
#     idx = find(c)
#     if idx:
#         match.append((c, idx))
#     else:
#         no_match.append(c)

In [None]:
# Stations whose country was found in Natural Earth.
matched = stations.loc[stations["ne_idx"].gt(-1)]

In [None]:
# Display the matched stations for inspection.
matched

### Finish up

In [None]:
# Natural Earth column whose value is exported as each station's "code".
code_col = "ADM0_TLC"

In [None]:
# Independent copy, so adding "code" cannot touch `matched` or `stations`.
data = matched.copy()
# `ne_idx` stores index *labels* of `ne` (they are taken from `ne.index`), so
# they must be resolved with label-based `.loc`, not positional `.iloc`.
data["code"] = ne.loc[matched["ne_idx"], code_col].tolist()

In [None]:
# Export the matched stations, one JSON object per row (orient='records').
data.to_json("../data/out/radio.json", orient='records')

In [None]:
# Stations that still carry the "no match" marker.
missing = stations.loc[stations["ne_idx"].eq(-1)]

In [None]:
# missing

### Fuzzy Matching

In [None]:
from thefuzz import fuzz
from thefuzz import process

In [None]:
# Stations without an exact match (ne_idx == -1); the same selection is
# already computed at the end of the "Finish up" section.
missing = stations[stations["ne_idx"] == -1]

In [None]:
# Natural Earth rows that no station country has been matched to yet.
ne_still = ne[~ne["MATCH"]]

In [None]:
# (rows, columns) of the still-unmatched Natural Earth rows.
ne_still.shape

In [None]:
def fuzzy_find(country, col="ADMIN"):
    """Fuzzy-match a country name against the still-unmatched `ne` rows.

    The candidates are the values of ``ne_still[col]`` keyed by index label;
    the three best ``fuzz.partial_ratio`` hits are requested from
    ``process.extract``.

    Args:
        country: Name to search for.
        col: Column of `ne_still` that supplies the candidate names.

    Returns:
        The first of those hits whose score (element 1) is above 80, exactly
        as returned by ``process.extract``, or None if there is none. The
        cells below read a hit as (name, score, index label).
    """
    candidates = ne_still[col].to_dict()
    top_hits = process.extract(
        country, candidates, limit=3, scorer=fuzz.partial_ratio)
    return next((hit for hit in top_hits if hit[1] > 80), None)

In [None]:
# Spot check: shows the accepted hit for this name, or None.
fuzzy_find('Guadeloupe')

In [None]:
# now_found = []
# still_no_match = []

# for c in no_match:
#     match = fuzzy_find(c, "ADMIN")
#     if match:
#         now_found.append((c, match[-1]) + (match[0], match[1]))
#     else:
#         still_no_match.append(c)

In [None]:
# Fuzzy-match every station that has no exact match. Hits are only collected
# for review as (radio name, ne index, ne name, score); they are not written
# back to stations["ne_idx"].
now_found = []
still_no_match = []

# `Series.items()` replaces `iteritems()`, which was removed in pandas 2.0.
for i, c in missing.country.items():
    match = fuzzy_find(c, "ADMIN")
    if match:
        now_found.append((c, match[-1], match[0], match[1]))
    else:
        still_no_match.append(c)

In [None]:
# Spot check: shows the accepted hit for this name, or None.
fuzzy_find("Zimbabwe")

In [None]:
# Tabulate the fuzzy hits for manual review; the column order follows the
# tuples appended to `now_found`.
pd.DataFrame(now_found, columns=["radio", "ne_idx", "ne_admin", "score"])

All fuzzy matches in the table above look correct except:

- Iraq
- Guyana

In [None]:
# Show, per unmatched country, the fuzzy hit from the ADMIN column next to
# the one from NAME_LONG; countries without any hit are not printed.
for c in no_match:
    match1, match2 = (fuzzy_find(c, col) for col in ("ADMIN", "NAME_LONG"))

    if not (match1 or match2):
        continue

    print("-------")
    print(c)
    if match1:
        print("ADMIN", match1)
    if match2:
        print("NAME_LONG", match2)
    

In [None]:
cols = ["NAME", "ADMIN", "NAME_LONG"]
# Pool of every distinct value found in any of the three name columns.
ne_full = set().union(*(ne[col] for col in cols))

In [None]:
# Three best candidates from the pooled names for every unmatched country.
# Alternative scorer kept for reference:
# found = process.extract(c, ne_full, limit=3, scorer=fuzz.partial_ratio)
fuzzy = {c: process.extract(c, ne_full, limit=3) for c in no_match}

In [None]:
# Print each country with its candidates ordered by descending score.
for country, candidates in fuzzy.items():
    ranked = sorted(candidates, key=lambda hit: hit[1], reverse=True)
    print(country, ranked)

In [None]:
# NOTE(review): `datamaps_countries` is only assigned in the "Load datamaps
# data" section further down, so on a fresh top-to-bottom run this cell raises
# NameError. It is also identical to the first cell under "Compare".
# For every datamaps name, record the radio names equal to it ignoring case;
# names without any partner go to `no_match_datamaps_countries`.
matches = []

no_match_datamaps_countries = []

for c1 in datamaps_countries:
    match = False
    for c2 in radio_countries:
        if c1.lower() == c2.lower():
            matches.append((c1, c2))
            match = True
    if not match:
        no_match_datamaps_countries.append(c1)

### Load datamaps data

In [None]:
# Load the raw datamaps JSON. NOTE: this rebinds `datamaps`, which the
# "Load data" cell assigned as a DataFrame read from the same file.
# The path uses the "../data/" prefix like every other data file in this
# notebook; the encoding is explicit so decoding does not depend on the
# platform default.
with open("../data/datamaps.json", encoding="utf-8") as f:
    datamaps = json.load(f)

In [None]:
# Country name of every datamaps record, in file order.
datamaps_countries = []
for feature in datamaps:
    datamaps_countries.append(feature["properties"]["name"])

In [None]:
# Number of names extracted from datamaps.
len(datamaps_countries)

## Compare

In [None]:
matches = []

no_match_datamaps_countries = []

# Case-insensitive comparison of every datamaps name with every radio name;
# datamaps names without a partner are collected separately.
for c1 in datamaps_countries:
    wanted = c1.lower()
    hits = [(c1, c2) for c2 in radio_countries if c2.lower() == wanted]
    matches.extend(hits)
    match = bool(hits)
    if not match:
        no_match_datamaps_countries.append(c1)

In [None]:
# Number of case-insensitive (datamaps, radio) name pairs found.
len(matches)

In [None]:
# datamaps names without a radio counterpart.
no_match_datamaps_countries

In [None]:
# Same comparison in the other direction. NOTE: `matches` is reset here, so
# the pairs from the previous comparison are discarded.
matches = []

no_match_radio_countries = []

for c1 in radio_countries:
    wanted = c1.lower()
    hits = [(c1, c2) for c2 in datamaps_countries if c2.lower() == wanted]
    matches.extend(hits)
    match = bool(hits)
    if not match:
        no_match_radio_countries.append(c1)

In [None]:
# Radio country names without a datamaps counterpart.
no_match_radio_countries