# Generate Training Data

We wish to speed up the process of having a human being name entities that they recognize in a paragraph, and having their positions in the paragraph identified and placed in a syntax usable by spaCy's NER training routine.

## Open PDF file, extract page two and display as sentences

In [2]:
import spacy
import PyPDF2
nlp = spacy.load('en_core_web_sm')

#Open PDF file for reading
pdfFile = open("BarCvDescLJ11.pdf", mode="rb")
pdfReader = PyPDF2.PdfFileReader(pdfFile)

# Select a page to work on
pageNumber = 3

# Get text
OnePage = pdfReader.getPage(pageNumber-1) #0-based count
OnePageText = OnePage.extractText()

# Close PDF file
pdfFile.close()

# Remove newlinesxmx82k-&a. It appears multiple newlines together makes
# Spacy think that is the end of a sentence. The PDF reader reads the text in
# an odd fashion
OnePageText = OnePageText.replace('\n','')

# create a spaCy doc object from the page and break it into sentences
doc = nlp(OnePageText)
l=0
for sent in doc.sents:
     print(l, ": ", sent)
     l = l+1


0 :  3  
1 :  the Central Valley and the south-central coastal regions of California.     
2 :  CONRAD  
3 :  Conrad is a two-rowed spring malting barley.
4 :  It was released by Busch Agricultural Resources in 2005.
5 :  It was selected from the cross B1215/B88-5336.
6 :  Its experimental designation was 2B96-5057.
7 :  It has consistently plump grain.
8 :  It has medium late maturity
9 :  (heads about a half day earlier and matures about a half day later than B1202) and medium-tall plant height (averages about 3 cm shorter than B1202 and about 5 cm shorter than Harrington).
10 :  Straw strength is similar to B1202 (fair).
11 :  It has malt protein levels similar to Merit and Harrington, high levels of enzymes like Merit, and higher levels of extract and better malt modification than B1202.
12 :  At the time of release, its resistance to scald was similar to B1202 and slightly better than Harrington (moderately susceptible), and its resistance to net blotch (net form) was slightly bet

## Read in Per-line named entity file and match entities to sentence positions.

In [3]:
import re
import csv
import pandas as pd
fname = "barley_p3_ner.txt"

# Covert the nlp senetence generator into a list of sentences
sentences = list(doc.sents)

# Open the file of manually matched pairs (sentence # <tab> word phrase <tab> named entity)
# e.g.:
#  0      AC Metcalfe     CVAR
#  0      two-rowed       TRAT
#  0      barley          CROP
#  1      Agri-Food Candada   ORG
#  1      1997    DATE
file = open(fname)
reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
data = list()

for row in reader:
    try:
        (sentIndex, phrase, label) = row
        sent = sentences[int(sentIndex)].string.rstrip()
        
        # find all instances of the 'phrase' in the 'sent'.
        iter = re.finditer(r"\b"+phrase+r"\b", sent)
        indices = [m.start(0) for m in iter]
        
        # check to make sure the phrase the user said was there was indeed found
        if len(indices) == 0:
            raise ValueError
                
        # print out all instances
        for i in indices:
#            print(sentIndex, sent, phrase, "("+str(i), i+len(phrase), "'"+label+"')")
            data.append([sentIndex, sent, phrase, "("+str(i)+", "+str(i+len(phrase))+", '"+label+"')"])
            
    except:
        print("Handle manually: ", row)
        
df = pd.DataFrame(data, columns = ["Index", "Sentence", "Phrase", "MatchInfo"])
print(df)


Handle manually:  ['31', 'Crop Science 46:1396 (2006)', 'JRNL']
Handle manually:  ['41', '-amylase', 'TRAT']
Handle manually:  ['59', 'WA Sel 3564/Unitan//UT Short2*2', 'PED']
    Index                                           Sentence          Phrase  \
0       3       Conrad is a two-rowed spring malting barley.          Conrad   
1       3       Conrad is a two-rowed spring malting barley.       two-rowed   
2       3       Conrad is a two-rowed spring malting barley.          spring   
3       3       Conrad is a two-rowed spring malting barley.         malting   
4       3       Conrad is a two-rowed spring malting barley.          barley   
..    ...                                                ...             ...   
167    57  MILLENNIUM  Millennium is a six-rowed spring f...          barley   
168    58           It was released by the Utah AES in 2000.        Utah AES   
169    58           It was released by the Utah AES in 2000.            2000   
170    62  Its experimen

## Create a function to clean up overlapping intervals

In [36]:
import re
coordRegex = re.compile(r'(\d+), (\d+)')

def sortByStart(coords):
    """For use in sort routines, return object with lowest (X,Y) values"""
    # split out coordinates that come in as (5, 7, 'CVAR')
    mo = coordRegex.search(coords)
    return(int(mo.group(1)))

def overlaps(coord1, coord2):
    """Check if coordinates of the form 5, 7, 'CVAR' and 32, 46, 'TRAT' overlap"""
    mo1 = coordRegex.search(coord1)
    mo2 = coordRegex.search(coord2)
    coord1Low = int(mo1.group(1))
    coord1High = int(mo1.group(2))
    coord2Low = int(mo2.group(1))
    coord2High = int(mo2.group(2))
    
    if ((coord1High >= coord2Low) and (coord1Low <= coord2Low) or
        (coord2High >= coord1Low) and (coord2Low <= coord1Low)):
        return True
    else:
        return False

def keepFirst(coord1, coord2):
    """Given overlapping coordinates, return the wider encompassing one."""
    mo1 = coordRegex.search(coord1)
    mo2 = coordRegex.search(coord2)
    coord1Low = int(mo1.group(1))
    coord1High = int(mo1.group(2))
    coord2Low = int(mo2.group(1))
    coord2High = int(mo2.group(2))
 
    if (int(coord1High) - int(coord1Low)) >= (int(coord2High) - int(coord2Low)):
        return True
    else:
        return False

# print(sortByStart("(5, 7, 'CVAR')"))
# print("Should be false:", overlaps("(5, 7, 'CVAR')", "(32, 46, 'TRAT')"))
# print("Should be true:", overlaps("(26, 46, 'TRAT')", "(32, 46, 'TRAT')"))
# print("Keeper:", keepFirst("34, 46, 'TRAT'", "32, 46, 'TRAT'"))

def cleanIntervals(inputString=""):
    """order intervals like (5, 7, 'CVAR'), (32, 46, 'TRAT'), (26, 46, 'TRAT') and remove overlapping ones."""
    inputString = inputString.lstrip("(").rstrip(")")
    intervalList = inputString.split("), (")
    intervalList.sort(key = sortByStart)

    # Pairwise compare every interval in the list to every other interval to check overlap
    keeperList = [True]*len(intervalList) # Logic array to determine if each interval should be kept
    i=0
    for interval1 in intervalList:
        for interval2 in intervalList:
            if interval1 != interval2 and overlaps(interval1, interval2) and keepFirst(interval1, interval2) == False:
                keeperList[i] = False
        i = i+1
   
    # Build up the return interval list
    returnStr = "("
    for interval, isKeeper in zip(intervalList, keeperList):
        if isKeeper:
            returnStr = returnStr + interval + "), ("
    return (returnStr.rstrip("), (") + ")")
        
# cleanIntervals("(5, 7, 'CVAR'), (32, 46, 'TRAT'), (5, 9, 'CVAR'), (48, 55, 'ORG'), (26, 46, 'TRAT')")
cleanIntervals("(0, 8, 'CVAR'), (0, 5, 'CVAR'), (21, 26, 'PLAN'), (32, 37, 'CVAR')")

"(0, 8, 'CVAR'), (21, 26, 'PLAN'), (32, 37, 'CVAR')"

## Aggregate all matches for each sentence on a single line and output in spaCy training format

In [37]:
# use Pandas dataframes to aggregate all entity matches together for a single sentence
agg_rules = {'Sentence': 'first', 'Phrase': 'first', 'MatchInfo': lambda x: ', '.join(x)}
res = df.groupby('Index').agg(agg_rules)
#print(res)

# Now format it just like what is needed for the spaCy training module: 
# E.g.:
# ('Eight-Twelve is a six-rowed winter feed barley', {'entities': [(0, 12, 'CVAR'), (18, 27, 'TRAT'), (28, 39, 'TRAT'),(40, 46, 'CROP')]}),
records = res.to_dict('records')
for i in range(0,len(records)):
    print("    ('"+records[i]['Sentence']+"', {'entities': ["+cleanIntervals(records[i]['MatchInfo'])+"]}),")
    #print("    ('"+records[i]['Sentence']+"', {'entities': ["+records[i]['MatchInfo']+"]}),")


    ('Straw strength is similar to B1202 (fair).', {'entities': [(0, 14, 'TRAT'), (29, 34, 'CVAR')]}),
    ('It has malt protein levels similar to Merit and Harrington, high levels of enzymes like Merit, and higher levels of extract and better malt modification than B1202.', {'entities': [(38, 43, 'CVAR'), (48, 58, 'CVAR'), (60, 82, 'TRAT'), (88, 93, 'CVAR'), (99, 123, 'TRAT'), (128, 152, 'TRAT'), (158, 163, 'CVAR')]}),
    ('At the time of release, its resistance to scald was similar to B1202 and slightly better than Harrington (moderately susceptible), and its resistance to net blotch (net form) was slightly better than B1202 and Harrington (moderately resistant).', {'entities': [(28, 38, 'PPTD'), (42, 47, 'PATH'), (63, 68, 'CVAR'), (94, 104, 'CVAR'), (106, 128, 'PPTD'), (139, 149, 'PPTD'), (153, 163, 'PATH'), (200, 205, 'CVAR'), (210, 220, 'CVAR'), (222, 242, 'PPTD')]}),
    ('At the time of evaluation it was resistant to stripe rust.', {'entities': [(33, 42, 'PPTD'), (46, 57, 'PATH