# **Geoparsing: Disambiguation Evaluation**
---
**Prepared by**: Feyi Adesanya

**Submission Date**: April 30, 2024

In [1]:
import pandas as pd

In [2]:
# Put the two pandas display options used for table rendering back to their
# library defaults, so output does not depend on options changed earlier in
# this kernel session.
pd.reset_option('display.max_columns')
pd.reset_option('display.width')

In [3]:
from Pre.Preprocess import Preprocess
from Gaz.Gazetteer import Gazetteer
from Dis.Disambiguate_Manager import Disambiguate_Manager

In [4]:
# Build the gazetteer that is handed to both the preprocessing step and the
# disambiguation manager below. The recorded output shows it loading a saved
# locations array and a saved BK tree rather than rebuilding them.
gaz = Gazetteer()

Retrieving Locations Array from Saved Data
Corpus has 133639 Locations
Retrieving BK Tree from Saved Data


In [5]:
# Create the preprocessor on top of the gazetteer, then load the training
# data. The recorded output shows the corpus being retrieved from saved data.
preprocess = Preprocess(gaz)
preprocess.extract_train_data()

  from .autonotebook import tqdm as notebook_tqdm


Retrieving Corpus from Saved Data
Corpus has 588 documents


In [6]:
# Disambiguation entry point used by every evaluation and example cell below;
# it is given both the gazetteer and the preprocessor that holds the corpus.
dis = Disambiguate_Manager(gaz, preprocess)

# Disambiguation Results Analysis

### Disambiguate by Population

In [7]:
# Label for this run.
# NOTE(review): "@60km" is only printed, it is not passed to the call below.
# Presumably it is the distance threshold used when scoring a prediction as
# correct — confirm where that value is actually configured.
print("@60km")
# Evaluate the "population" strategy; the returned pair is kept as
# (true, predicted) for later inspection.
true_population, predicted_population = dis.disambiguate_corpus("population")

@60km
|████████████████████████████████████████| 588/588 [100%] in 6:32.9 (1.49/s)    ▅▇▇ 8/588 [1%] in 5s (~5:49, 1.6/s)  ▆█▆ 8/588 [1%] in 5s (~5:55, 1.6/s)  ▇▅▃ 9/588 [2%] in 6s (~5:59, 1.6/s)  ▄▂▂ 9/588 [2%] in 6s (~6:16, 1.5/s)  ▃▁▃ 9/588 [2%] in 6s (~6:25, 1.5/s)  ▂▄▆ 9/588 [2%] in 7s (~6:55, 1.4/s)  ▇▇▅ 10/588 [2%] in 8s (~7:37, 1.3/s)  ▃▅▇ 15/588 [3%] in 10s (~6:40, 1.5/s) ▄▆█ 15/588 [3%] in 10s (~6:36, 1.5/s) ▅▇▇ 15/588 [3%] in 10s (~6:34, 1.5/s) ▆▄▂ 16/588 [3%] in 11s (~6:36, 1.4/s) ▅▃▁ 16/588 [3%] in 12s (~6:39, 1.4/s) ▁▃▅ 16/588 [3%] in 12s (~6:59, 1.3/s) ▆█▆ 18/588 [3%] in 13s (~7:15, 1.3/s) █▆▄ 19/588 [3%] in 14s (~7:05, 1.4/s) ▇▅▃ 19/588 [3%] in 14s (~7:03, 1.4/s) ▆█▆ 21/588 [4%] in 16s (~7:04, 1.3/s) ▇▇▅ 21/588 [4%] in 16s (~7:07, 1.3/s) ▆▄▂ 21/588 [4%] in 17s (~7:21, 1.3/s) ▅▃▁ 21/588 [4%] in 17s (~7:26, 1.3/s) ▂▄▆ 21/588 [4%] in 18s (~7:53, 1.2/s) █▆▄ 341/588 [58%] in 5:08 (~3:44, 1.1 ▃▁▃ 348/588 [59%] in 5:19 (~3:40, 1.1 ▂▄▆ 349/588 [59%] in 5:20 (~3:39, 1.1 ▄▂▂ 353/

### Disambiguate by Distance

In [8]:
# Label for this run.
# NOTE(review): "@60km" is only printed, it is not passed to the call below.
# Presumably it is the distance threshold used when scoring a prediction as
# correct — confirm where that value is actually configured.
print("@60km")
# Evaluate the "distance" strategy; the returned pair is kept as
# (true, predicted) for later inspection.
true_distance, predicted_distance = dis.disambiguate_corpus("distance")

@60km
|                                        | ▁▃▅ 0/588 [0%] in 0s (~0s, 0.0/s) 

|████████████████████████████████████████| 588/588 [100%] in 4:36.1 (2.13/s)    
Accuracy: 77.06%
Mean Squared Error (MSE): 737.38
Root Mean Squared Error (RMSE): 27.15
Mean Absolute Error (MAE): 4.85


### Combination Disambiguation

In [9]:
# Label for this run.
# NOTE(review): "@60km" is only printed, it is not passed to the call below.
# Presumably it is the distance threshold used when scoring a prediction as
# correct — confirm where that value is actually configured.
print("@60km")
# Evaluate with no strategy argument (the manager's default; per the section
# heading this is presumably the combined population + distance strategy).
# The results get their own names so that the population-only results
# (true_population / predicted_population) are not silently overwritten and
# all three runs remain available for comparison.
true_combined, predicted_combined = dis.disambiguate_corpus()

@60km
                                                                                

        normalized_distances = np.array(distances) / max(distances)
        normalized_distances = np.array(distances) / max(distances)


|████████████████████████████████████████| 588/588 [100%] in 4:30.7 (2.17/s)    ▅▇▇ 7/588 [1%] in 5s (~5:29, 1.7/s)  ▇▇▅ 19/588 [3%] in 13s (~6:24, 1.5/s) ▅▃▁ 21/588 [4%] in 16s (~7:06, 1.3/s)
Accuracy: 82.78%
Mean Squared Error (MSE): 618.51
Root Mean Squared Error (RMSE): 24.87
Mean Absolute Error (MAE): 4.05


In [10]:
# # examples for error = [9, 19, 16, 24, 25]
# # Upping distance examples = [21]
# # Some places refer to the exact same area, but the lat/lon is off by 1 degree (about 111 km). Maybe add a check that counts a prediction as correct if the GeoNames id is the same or the two points are within a certain distance.
# from alive_progress import alive_bar
# correct_count = 0
# total_count = 0
# baseline = False
# MDE = 0
# wrong = []

# on = 24
# with alive_bar(len(dis.preprocess.corpus[on:on+1]), force_tty=True) as bar:
#     for book in dis.preprocess.corpus[on:on+1]:
#         print(book["text"])
#         locs = []
#         for loc in book["toponyms"]:
#             if loc.get("geonameid") is not None and loc.get("fcode") in dis.relevant_fcodes:
#                 if len(dis.all_pop[dis.all_pop["geonameid"] == loc.get("geonameid")]["population"]) == 1 and dis.all_pop[dis.all_pop["geonameid"] == loc.get("geonameid")]["population"].values[0] >= 1000:
#                     locs.append(loc)
#         loc_list = [loc["phrase"].lower() for loc in locs]
#         final_predictions = dis.disambiguate(book["text"],loc_list)
#         for item in locs:
#             for prediction in final_predictions:
#                 if item.get("start") == str(prediction[-1]):
#                     total_count += 1
#                     distance_apart = dis.get_distance(prediction[1][0], prediction[1][1], item["lat"], item["lon"])
#                     MDE += distance_apart**2
#                     if int(item.get("geonameid")) == int(prediction[2]) or distance_apart <= 25:
#                         correct_count += 1
#                     else:
#                         wrong.append(item.get("name"))
#                         print(item.get("name"))
#                         print(item)
#                         print(distance_apart)
#                         print(prediction)
#                         print("-"*50)
#         bar()

## Examples

In [11]:
# Example 1: repeated mentions ("calgary" appears twice) plus a name
# ("lethbridge") that is listed in `locs` but never appears in `text`.
# NOTE(review): the recorded output reports both "lethbridge" entries as
# 'Location Unknown' with coordinates (0.0, 0.0) — confirm that this is the
# intended demonstration and not a leftover from an earlier version of `text`.
locs = ["lethbridge","calgary", "red deer","calgary", "edmonton", "manitoba", "lethbridge"]
text = "I'm making my way to Calgary, Red Deer, Calgary, and Edmonton this summer. Don't forget about Manitoba"
dis.map_locations(text, locs)

--------------------------------------------------
[('Location Unknown: lethbridge', (0.0, 0.0), -1), ('Location Unknown: lethbridge', (0.0, 0.0), -1), ('calgary', (51.05011, -114.08529), 5913490, 21), ('red deer', (52.26682, -113.802), 6118158, 30), ('calgary', (51.05011, -114.08529), 5913490, 40), ('edmonton', (53.55014, -113.46871), 5946768, 53), ('manitoba', (55.00019, -97.00049), 6065171, 94)]
--------------------------------------------------


In [12]:
# Example 2: the same toponym mentioned twice in one sentence.
# NOTE(review): the recorded output of this cell is followed by a warning that
# quotes `normalized_distances = np.array(distances) / max(distances)`.
# Presumably max(distances) is 0 here because both mentions resolve to the
# same point — confirm inside Disambiguate_Manager and guard the division.
locs = ["Toronto", "Toronto"]
text = "Help I'm making my way down Toronto then Toronto"
dis.map_locations(text, locs)

--------------------------------------------------
[('toronto', (43.70643, -79.39864), 6167865, 28), ('toronto', (43.70643, -79.39864), 6167865, 41)]
--------------------------------------------------


  normalized_distances = np.array(distances) / max(distances)
