In [1]:
import json
import pickle
import re
from pathlib import Path

import pandas as pd
from spacy import displacy
from tqdm.notebook import tqdm

In [2]:
# rivers_nlp is the module that implements the river matching.
# NOTE(review): the wildcard import hides which names come from rivers_nlp;
# RiverMatch (used in the next cell) is presumably one of them. Consider
# `from rivers_nlp import RiverMatch` once it is confirmed nothing else is needed.
from rivers_nlp import *

In [3]:
# Create the river matcher object.
# With no arguments it tries to load the river list from a default file, so
# the object is ready to start matching text straight away.
# A filename can be provided to load rivers from a different file; that file
# should be one previously created with rm.save_vocab().
rm = RiverMatch()

Loaded rivers from rivers_matcher.pkl


In [4]:
# Optional step, currently disabled by wrapping the code in a string literal:
# it reads the wikidata CSV and builds `rivers`, a list of river names with
# stop words and bare wikidata IDs filtered out. Remove the surrounding triple
# quotes to run it.
# NOTE(review): the CSV path is an absolute path on the author's machine.
# NOTE(review): "^Q[0-9]+" only anchors the start of the name, so a name such
# as "Q12 Creek" would be dropped as well; re.fullmatch may be what is
# intended. TODO confirm.
"""
# Read from the csv and store river names
df = pd.read_csv("H:\\Courses\\RA\\wikidata-water-features\\data\\wikidata_rivers.csv")
df.set_index('ID', inplace=True)
# there are other river names as well which are common/stop words.
stop_words = ["is"]
# some river just have IDs instead of names. Ignore those:
rivers = [river for river in df["Name"] if not (river.lower() in stop_words or re.search("^Q[0-9]+", river))]
"""

'\n# Read from the csv and store river names\ndf = pd.read_csv("H:\\Courses\\RA\\wikidata-water-features\\data\\wikidata_rivers.csv")\ndf.set_index(\'ID\', inplace=True)\n# there are other river names as well which are common/stop words.\nstop_words = ["is"]\n# some river just have IDs instead of names. Ignore those:\nrivers = [river for river in df["Name"] if not (river.lower() in stop_words or re.search("^Q[0-9]+", river))]\n'

In [5]:
# This can be done to completely reload from a new list of rivers
#rm.reload_rivers(rivers)

# This can be done to add new rivers to already loaded list
#rm.add_rivers(rivers)

# This can be done to save loaded rivers into a file. Filename can be provided
#rm.save_vocab()

In [6]:
# Any piece of text can be matched against the loaded list of rivers.
sample_text = "The Amazon is amazing and so is the Nile."
matches, entities = rm.match(sample_text)

# Render the returned entities inline with displaCy.
displacy.render(entities, style="ent", manual=True, jupyter=True)

In [5]:
# Optional batch run, currently disabled by wrapping the code in a string
# literal: it runs rm.match over the title and the abstract of every record in
# the *CORE.json files. Remove the surrounding triple quotes to run it.
# Each file is read line by line, with one JSON document parsed per line.
# Title and abstract results are appended to the same two lists, so a list
# position does not correspond one-to-one to a record.
# NOTE(review): the abstracts path is an absolute path on the author's machine.

"""
abstracts_path = Path("H:\\Courses\\RA\\hydrology_abstracts\\core_abstracts")
files = abstracts_path.glob("*CORE.json")
results = {"matches": [], "entities": []}
for file in files:
    print (f"Processing {file.name}")
    with open(file) as f:
        lines = f.readlines()
    for line in tqdm(lines):
        data = json.loads(line)
        if data["title"]:
            matches, entities = rm.match(data["title"])
            results["matches"].append(matches)
            results["entities"].append(entities)
        if data["abstract"]:
            matches, entities = rm.match(data["abstract"])
            results["matches"].append(matches)
            results["entities"].append(entities)
"""

In [7]:
# The search was already run on one of the abstract files and its results were
# stored; load them here instead of repeating the match.
# `pickle` is imported explicitly in the notebook's import cell rather than
# relying on a wildcard import to provide it.
# SECURITY: unpickling can execute arbitrary code, so only load pickle files
# that you created yourself or otherwise trust.
with open("awr_1991_CORE.pickle", "rb") as results_file:
    # Presumably the {"matches": [...], "entities": [...]} dict built by the
    # batch-matching cell; verify against how the file was written.
    results = pickle.load(results_file)

In [8]:
# Example of a good/mixed result, taken from the stored entities.
mixed_result = results["entities"][3309]
displacy.render(mixed_result, style="ent", manual=True, jupyter=True)

In [9]:
# Example of a bad result, taken from the stored entities.
# According to the wikidata csv there really is a river named "One".
bad_result = results["entities"][75]
displacy.render(bad_result, style="ent", manual=True, jupyter=True)