# This notebook explores extracting candidates from sentences returned by GDD.  It identifies sentences that contain the word dam and extracts the proper nouns in them.  For each proper noun it returns the docid, the sentid, the index (or indexes) of the word dam within the sentence, the proper noun itself, and the begin and end index of the proper noun within the sentence.  The idea is that these candidates can be used to help qualify a sentence as a dam removal reference.

# This notebook is using Python 2.x

In [1]:
# Defines directories and credentials by running theStuff.py in this namespace
# (execfile exists in Python 2 only).  That script is presumably where the
# names database, pgu, passpg and directory used further down come from --
# TODO confirm against theStuff.py.
execfile('theStuff.py')

import psycopg2
import getpass
import psycopg2.extras
import pandas as pd
import numpy as np

In [2]:

# Open the PostgreSQL connection used by every query below.
# The settings are passed as keyword arguments rather than concatenated into a
# DSN string: in a hand-built DSN a password or name containing spaces or
# quotes would be split or misparsed, whereas psycopg2 quotes keyword values
# itself.
conn = psycopg2.connect(
    host='localhost',
    dbname=str(database),
    user=str(pgu),
    password=str(passpg),
)

In [3]:
# Identifies proper nouns in sentences containing the word dam.  These can be used to look up dam names, stream names, states and the like
# that may help to identify the main dam removal(s) referenced in a paper.

def findDam(words):
    """Return the positions in ``words`` of tokens that are the word "dam".

    A token counts when, ignoring case, its first or last hyphen-separated
    piece is exactly ``dam`` or ``dams``: "dam", "Dams", "dam-removal",
    "mill-dam".  Tokens that merely begin with the letters d-a-m, such as
    "damage" or "damp", are not reported (a plain prefix match would
    include them).

    Parameters
    ----------
    words : sequence of str
        The tokens of one sentence.

    Returns
    -------
    list of int
        Zero-based indexes of the matching tokens, in sentence order; empty
        when nothing matches.
    """
    dam_forms = ("dam", "dams")
    dam_index = []

    for indx, token in enumerate(words):
        # Only the outer pieces are checked, so "dam-break" and "check-dam"
        # qualify while "post-damage" does not.
        pieces = token.lower().split("-")
        if pieces[0] in dam_forms or pieces[-1] in dam_forms:
            dam_index.append(indx)
    return dam_index
        
def properNouns(cursor):
    """Collect the proper-noun mentions found in each sentence row.

    A mention starts at a token tagged "NNP" and extends over every directly
    following token tagged "NNP" or "NN".  Tokens already absorbed into a
    mention never start a new one, so partial phrases are not reported.

    Parameters
    ----------
    cursor : iterable of rows
        Each row is indexed positionally: ``row[0]`` is the docid, ``row[1]``
        the sentid, ``row[3]`` the list of words and ``row[4]`` the list of
        part-of-speech tags, one per word.  Requires the cursor from the
        postgres query of the initial NLP data.

    Returns
    -------
    list of dict
        One dict per mention with the keys ``mention_id``, ``docid``,
        ``sentid``, ``begin_index``, ``end_index``, ``proper_noun`` and
        ``dam_index`` (the result of ``findDam`` for the whole sentence,
        shared by every mention of that sentence).
    """
    noun_tags = ("NNP", "NN")
    d = []

    for sentence in cursor:
        # Identify fields within the row
        docid = sentence[0]
        sentid = sentence[1]
        words = sentence[3]
        poses = sentence[4]
        index_num = len(poses)   # total number of items within the sentence
        dam_index = findDam(words)

        # Reset for every sentence; -1 means no token has been consumed yet,
        # which replaces relying on an undefined name and a bare except.
        end_index = -1

        # range (not xrange) so the loop runs under Python 2 and Python 3.
        for i in range(index_num):
            # Skip tokens already consumed by the previous mention and
            # anything that cannot start a proper noun.
            if i <= end_index or poses[i] != "NNP":
                continue

            # Extend over the following NNP/NN tokens.  The bounds check comes
            # first so a sentence that ends with an NNP is handled as a
            # one-word mention instead of raising IndexError.
            last = i
            proper_noun = words[i]
            while last + 1 < index_num and poses[last + 1] in noun_tags:
                last += 1
                proper_noun += " " + words[last]

            # mention_id always carries the index of the last word (inclusive).
            mention_id = "%s_%d_%d_%d" % (docid, sentid, i, last)

            # NOTE(review): the reported end_index is kept exactly as before:
            # for a one-word mention it is the word's own index, for a longer
            # mention it is one past the last word.  Confirm which convention
            # consumers expect before unifying the two.
            end_index = i if last == i else last + 1

            d.append({"mention_id": mention_id, 'docid': docid, 'sentid': sentid, 'begin_index': i, 'end_index': end_index, 'proper_noun': proper_noun, 'dam_index': dam_index})

    return d


In [4]:
# Returns the sentences with the word dam in them ("dam", "dams", "dam-..." or
# "...-dam", ignoring case) and extracts their proper nouns.
cursor = conn.cursor()
try:
    cursor.execute("select distinct(b.*) from (select docid, sentid, unnest(words) as word from sentences) a left join sentences b on (a.docid=b.docid AND a.sentid=b.sentid) where (lower(a.word) = 'dam' OR lower(a.word) LIKE '%-dam' OR lower(a.word) LIKE 'dam-%' OR lower(a.word) = 'dams') ;")

    #Small query return for testing purposes

    #cursor.execute("select * from public.sentences where docid = '57a8c3f9cf58f19c6886b244' and (sentid in (20,21,22,23,24,25,26,27));")
    data = properNouns(cursor)
finally:
    # Close the cursor even when the query or the extraction raises.
    cursor.close()

# One row per proper-noun mention; show the first 150.
df = pd.DataFrame(data)
df.head(150)


            

Unnamed: 0,begin_index,dam_index,docid,end_index,mention_id,proper_noun,sentid
0,1,[19],54b43277e138239d86852119,3,54b43277e138239d86852119_28_1_2,I.M. Miller,28
1,6,[19],54b43277e138239d86852119,8,54b43277e138239d86852119_28_6_7,Marine Geology,28
2,22,[19],54b43277e138239d86852119,24,54b43277e138239d86852119_28_22_23,Elwha River,28
3,25,[19],54b43277e138239d86852119,27,54b43277e138239d86852119_28_25_26,Washington State,28
4,28,[19],54b43277e138239d86852119,30,54b43277e138239d86852119_28_28_29,Fig. 1a,28
5,23,[1],54b43277e138239d86852119,23,54b43277e138239d86852119_29_23_23,Gilbert,29
6,25,[1],54b43277e138239d86852119,25,54b43277e138239d86852119_29_25_25,Link,29
7,29,[1],54b43277e138239d86852119,29,54b43277e138239d86852119_29_29_29,Duda,29
8,4,[9],54b43277e138239d86852119,7,54b43277e138239d86852119_433_4_6,Elwha River ecosystem,433
9,13,[34],54bff8b8e1382389c54c3428,13,54bff8b8e1382389c54c3428_21_13_13,Fung,21


In [5]:
# Load the list of removed dams and keep only the mentions whose proper noun
# is exactly equal to a dam title.  A title that occurs more than once in the
# list yields one merged row per occurrence.
f = directory + "/app-template/testing/removedDams20151214.csv"
removedDams = pd.read_csv(f)
merged = df.merge(
    removedDams,
    how='inner',
    left_on='proper_noun',
    right_on='title',
)

In [7]:
# Display the first 150 mentions that matched a removed-dam title.
merged.head(n=150)

Unnamed: 0,begin_index,dam_index,docid,end_index,mention_id,proper_noun,sentid,title,gnis_name,state
0,42,"[9, 15, 21, 28, 36, 43, 50]",5579be78e138231c7c52c44d,44,5579be78e138231c7c52c44d_66_42_43,Kent Dam,66,Kent Dam,Cuyahoga River,OH
1,5,"[6, 17]",5579be78e138231c7c52c44d,7,5579be78e138231c7c52c44d_85_5_6,Kent Dam,85,Kent Dam,Cuyahoga River,OH
2,15,"[6, 17]",5579be78e138231c7c52c44d,18,5579be78e138231c7c52c44d_85_15_17,Munroe Falls Dam,85,Munroe Falls Dam,Cuyahoga River,OH
3,5,[7],5579be78e138231c7c52c44d,8,5579be78e138231c7c52c44d_303_5_7,Munroe Falls Dam,303,Munroe Falls Dam,Cuyahoga River,OH
4,39,[41],5579be78e138231c7c52c44d,42,5579be78e138231c7c52c44d_375_39_41,Munroe Falls Dam,375,Munroe Falls Dam,Cuyahoga River,OH
5,11,[13],5579be78e138231c7c52c44d,14,5579be78e138231c7c52c44d_526_11_13,Munroe Falls Dam,526,Munroe Falls Dam,Cuyahoga River,OH
6,15,[17],5579be78e138231c7c52c44d,18,5579be78e138231c7c52c44d_561_15_17,Munroe Falls Dam,561,Munroe Falls Dam,Cuyahoga River,OH
7,25,"[27, 44, 60, 77]",558f03abe13823109f3ee701,28,558f03abe13823109f3ee701_209_25_27,Mill Pond Dam,209,Mill Pond Dam,Third Herring Brook,MA
8,25,"[27, 44, 60, 77]",558f03abe13823109f3ee701,28,558f03abe13823109f3ee701_209_25_27,Mill Pond Dam,209,Mill Pond Dam,Chippewa River,MI
9,42,"[27, 44, 60, 77]",558f03abe13823109f3ee701,45,558f03abe13823109f3ee701_209_42_44,Mill Pond Dam,209,Mill Pond Dam,Third Herring Brook,MA
