# 3. Identify and filter locations
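Look for location candidates in each article's title and extracted text using NLTK (tokenization) and the Stanford NER tagger, then filter the candidates with the rule-based filter in `pysci.geoparse`. The results are exported as TSV files, once per article and once per location.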

In [None]:
import os
import re
import pandas as pd

import nltk
import nltk.data
from nltk.tag import StanfordNERTagger
# nltk customization: let NLTK also look for its resources in the local 'data_NLTK' folder
nltk.data.path.append('data_NLTK')
# Optional directory that contains 'data_NER'; leave empty to resolve the
# folder relative to the current working directory.
StanfordBaseDir = ''
# Build the Stanford NER locations with os.path.join instead of concatenating
# hard-coded '\\' separators: the result is the same on Windows, but it also
# works on other platforms and when StanfordBaseDir has no trailing separator.
_stanford_ner_dir = os.path.join(StanfordBaseDir, 'data_NER', 'stanford-ner-2020-11-17')
# the trailing '' keeps the trailing path separator of the original value
os.environ['CLASSPATH'] = os.path.join(_stanford_ner_dir, '')
os.environ['STANFORD_MODELS'] = os.path.join(_stanford_ner_dir, 'classifiers')
# NOTE(review): machine-specific location of the Java executable - adjust it
# to the local Java installation before running this notebook elsewhere.
os.environ['JAVAHOME'] = 'C:\\Program Files\\Java\\jre1.8.0_351\\bin\\java.exe'
# nltk initialization of Stanford NER tagger (3-class English model)
tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')

# own files
from pysci import docutils as du
from pysci import geoparse as gp

In [None]:
# input: pickle file with the serialized ScienceDoc instances loaded by this notebook
path_to_pickle = 'science_articles.pkl'
# output: if needed, re-serialize in addition to any CSV export
path_to_repickle = 'science_articles_geoparsed.pkl'

In [None]:
# load the serialized ScienceDocs; the cells below iterate over them one article at a time
science_docs = du.load_data(path_to_pickle)

### Process articles

In [None]:
# Paragraphs are separated by runs of two or more newlines; the pattern is
# compiled once instead of being re-evaluated for every article.
_PARAGRAPH_BREAK = re.compile('[\n]{2,}')


def _extract_locations(text, found_header, kept_header):
    """Find and filter location candidates in one title or sentence.

    The text is word-tokenized, tagged with the Stanford NER tagger and passed
    to gp.extract_chunks_from_sentence; the resulting chunks are then reduced
    with gp.filter_chunk_candidates. Progress is printed along the way.

    Args:
        text: the (already cleaned) title or sentence to analyse.
        found_header: %-format string printed with the number of candidates.
        kept_header: %-format string printed with the number of kept chunks.

    Returns:
        A tuple (candidates, kept) of two lists of strings: every candidate
        chunk, and the chunks that survived filtering. Both lists are empty
        when no candidate was found.
    """
    tokens = nltk.word_tokenize(text)
    # NOTE(review): one tagger call per sentence; batching sentences through
    # tagger.tag_sents might be faster - confirm before changing.
    ner_tagged = tagger.tag(tokens)
    # customizable extract method
    chunks = gp.extract_chunks_from_sentence(
        ner_tagged, include_cardinal=True, include_other_spatial=True, include_types=True)
    candidates = []
    kept = []
    if not chunks:
        return candidates, kept

    print(found_header % len(chunks))
    for chunk in chunks:
        chunk_str = gp.tuple_list_to_string(chunk)
        candidates.append(chunk_str)
        print("\t%s" % chunk_str)

    kept_chunks = gp.filter_chunk_candidates(tokens, chunks, verbose=True)
    print(kept_header % len(kept_chunks))
    for chunk in kept_chunks:
        chunk_str = gp.tuple_list_to_string(chunk)
        kept.append(chunk_str)
        print("\t%s" % chunk_str)
    return candidates, kept


# start from the ScienceDoc instances
for scidoc in science_docs:
    print("\n### Processing article %s..." % scidoc.file_name)

    ### PARSE TITLE FROM XML ###
    # title_locations is either a marker string (no XML / no title) or the
    # list of kept title locations - an empty list means we had none
    if not scidoc.has_xml:
        scidoc.title_locations = gp.NO_XML_STRING
    elif not scidoc.title:
        scidoc.title_locations = gp.NO_TITLE_STRING
    else:
        scidoc.title_locations = []
        _, kept_in_title = _extract_locations(
            gp.multireplace(scidoc.title),
            "### Found %s location candidates in title:",
            "### Kept %s location chunks in title:")
        scidoc.title_locations.extend(kept_in_title)

    ### Process article contents
    content_locations = []
    content_locations_filtered = []
    location_sentences = []
    for par in _PARAGRAPH_BREAK.split(scidoc.relevant_text):
        for sent in nltk.sent_tokenize(gp.multireplace(par)):
            candidates, kept = _extract_locations(
                sent,
                "### Found %s location candidates in sentence:",
                "### Kept %s location chunks:")
            content_locations.extend(candidates)
            content_locations_filtered.extend(kept)
            # remember each sentence once, no matter how many locations it holds
            if kept:
                location_sentences.append(sent.replace('\n', ' '))

    scidoc.content_locations = content_locations
    scidoc.content_locations_filtered = content_locations_filtered
    scidoc.location_sentences = location_sentences

print("\n### Done.")
            

In [None]:
# optionally repickle article data with locations etc.
# (writes science_docs, including the location attributes set above, to path_to_repickle)
du.pickle_data(science_docs, path_to_repickle)

### Prepare per-article results

In [None]:
# One table row per article: list-valued location attributes are collapsed
# into a single '; '-separated string, marker strings are passed through.
def _flatten_content_locations(locations):
    """Return '' if empty, the NO_METHODS marker unchanged, else a '; '-joined string."""
    if not locations:
        return ''
    if locations == gp.NO_METHODS_STRING:
        return locations
    return '; '.join(locations)


filenames_for_df = []
use_xml_for_df = []
methods_for_df = []
titles_for_df = []
title_locations_for_df = []
content_locations_for_df = []
content_locations_filtered_for_df = []
location_sentences_for_df = []
for doc in science_docs:
    filenames_for_df.append(doc.file_name)
    use_xml_for_df.append(doc.use_xml)
    # docs without a title attribute get the 'no title' marker
    titles_for_df.append(getattr(doc, 'title', gp.NO_TITLE_STRING))

    title_locs = doc.title_locations
    if title_locs in (gp.NO_XML_STRING, gp.NO_TITLE_STRING):
        # marker string: keep it as the cell value
        title_locations_for_df.append(title_locs)
    else:
        title_locations_for_df.append('; '.join(title_locs))

    # docs without a methods_sections attribute get an empty cell
    methods_for_df.append(getattr(doc, 'methods_sections', ''))

    content_locations_for_df.append(
        _flatten_content_locations(doc.content_locations))
    content_locations_filtered_for_df.append(
        _flatten_content_locations(doc.content_locations_filtered))
    # NOTE(review): unlike the location columns, the sentences stay a list
    # (or '' when there are none) - confirm this is the intended cell format
    location_sentences_for_df.append(doc.location_sentences or '')

geoparsed_columns = {
    'filename_only': filenames_for_df,
    'use_xml': use_xml_for_df,
    'title': titles_for_df,
    'title_locations': title_locations_for_df,
    'methods_sections': methods_for_df,
    'content_locations': content_locations_for_df,
    'content_locations_filtered': content_locations_filtered_for_df,
    'location_sentences': location_sentences_for_df,
}
df_geoparsed = pd.DataFrame(geoparsed_columns)

# increase the column width display of pandas tables to view full cells
#pd.options.display.max_colwidth = 500

df_geoparsed.head()

In [None]:
# Write the per-article table as a tab-separated file. The output folder is
# created first because to_csv fails when the target directory is missing.
os.makedirs('results', exist_ok=True)
df_geoparsed.to_csv(os.path.join('results', 'articles_geoparsed.tsv'), sep='\t', index=False, quotechar='"', encoding='utf-8')

### Prepare per-location results

In [None]:
# Each final (filtered) content location becomes its own row. Articles with no
# kept location, or whose location value is the NO_METHODS marker, still get
# exactly one placeholder row so that no article disappears from the table.
flat_rows = []  # tuples of (filename, location, use_xml, sentence)
for doc in science_docs:
    kept_locations = doc.content_locations_filtered
    if not kept_locations:
        # store the 'no location' case!
        flat_rows.append(
            (doc.file_name, gp.NO_LOCATIONS_STRING, doc.use_xml, gp.NO_LOCATIONS_STRING))
    elif kept_locations == gp.NO_METHODS_STRING:
        # original author's note on use_xml here: "we store 'N/A' already"
        flat_rows.append(
            (doc.file_name, gp.NO_METHODS_STRING, doc.use_xml, gp.NO_METHODS_STRING))
    else:
        for location in kept_locations:
            # first stored sentence that contains the location text, if any
            matching_sentence = next(
                (sentence for sentence in doc.location_sentences if location in sentence),
                'no exact sentence match')
            flat_rows.append((doc.file_name, location, doc.use_xml, matching_sentence))

# split the rows into the index-aligned column lists used for the table
filenames_flat = [row[0] for row in flat_rows]
content_locations_filtered_flat = [row[1] for row in flat_rows]
use_xml_flat = [row[2] for row in flat_rows]
location_sentences_flat = [row[3] for row in flat_rows]

In [None]:
# assemble the per-location table from the index-aligned column lists
flat_columns = {
    'filename': filenames_flat,
    'content_locations': content_locations_filtered_flat,
    'use_xml': use_xml_flat,
    'location_sentences': location_sentences_flat,
}
df_flat = pd.DataFrame(flat_columns)

df_flat.head()

In [None]:
# Write the per-location table as a tab-separated file. The output folder is
# created first because to_csv fails when the target directory is missing.
os.makedirs('results', exist_ok=True)
df_flat.to_csv(os.path.join('results', 'locations.tsv'), sep='\t', index=False, quotechar='"', encoding='utf-8')