# Generate Training Data

We wish to speed up the process of having a human being name entities that they recognize in a paragraph, and having their positions in the paragraph identified and placed in a syntax usable by spaCy's NER training routine.

## Open PDF file, extract page two and display as sentences

In [31]:
import spacy
import PyPDF2
nlp = spacy.load('en_core_web_sm')

#Open PDF file for reading
pdfFile = open("BarCvDescLJ11.pdf", mode="rb")
pdfReader = PyPDF2.PdfFileReader(pdfFile)

# Select a page to work on
pageNumber = 37

# Get text
OnePage = pdfReader.getPage(pageNumber-1) #0-based count
OnePageText = OnePage.extractText()

# Close PDF file
pdfFile.close()

# Remove newlinesxmx82k-&a. It appears multiple newlines together makes
# Spacy think that is the end of a sentence. The PDF reader reads the text in
# an odd fashion
OnePageText = OnePageText.replace('\n','')

# create a spaCy doc object from the page and break it into sentences
doc = nlp(OnePageText)
l=0
for sent in doc.sents:
     print(l, ": ", sent)
     l = l+1


0 :  37  
1 :  there are numerous barbs on lateral kernels.
2 :  The crease is wide, particularly at the awn end.
3 :  The kernel has a hump on the back, particularly on plump kernels.
4 :  Lateral kernels are moderately twisted.
5 :  Winter Tennessee is less susceptible to the major diseases than Atlas.
6 :  At the time of evaluation it was resistant to scald and susceptible to BYD.
7 :  It was evaluated as Entry 10 in the UC Regional Cereal Testing program from 1980-1981 for late fall planting in the Central Valley and the south-central coastal regions of California.    
8 :  WOCUS 71  
9 :  Wocus 71 is a six-rowed spring feed barley.
10 :  It was released by the University of California AES in 1972.
11 :  It is a composite of 500 head rows selected from Wocus on the basis of improved straw quality (height and strength), earlier maturity, and increased kernel size.
12 :  Wocus, which originated from the cross Coast/Lion//Winter Club made at Logan, Utah in 1949, was released by the Or

## Read in Per-line named entity file and match entities to sentence positions.

In [4]:
import re
import csv
import pandas as pd

fname = "Data/DavisLJ11/barley_p"+str(pageNumber)+"_ner.txt"

# Covert the nlp senetence generator into a list of sentences
sentences = list(doc.sents)

# Open the file of manually matched pairs (sentence # <tab> word phrase <tab> named entity)
# e.g.:
#  0      AC Metcalfe     CVAR
#  0      two-rowed       TRAT
#  0      barley          CROP
#  1      Agri-Food Candada   ORG
#  1      1997    DATE
file = open(fname)
reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
data = list()

for row in reader:
    try:
        (sentIndex, phrase, label) = row
        sent = sentences[int(sentIndex)].string.rstrip()
        
        # find all instances of the 'phrase' in the 'sent'.
        iter = re.finditer(r"\b"+phrase+r"\b", sent)
        indices = [m.start(0) for m in iter]
        
        # check to make sure the phrase the user said was there was indeed found
        if len(indices) == 0:
            raise ValueError
                
        # print out all instances
        for i in indices:
#            print(sentIndex, sent, phrase, "("+str(i), i+len(phrase), "'"+label+"')")
            data.append([sentIndex, sent, phrase, "("+str(i)+", "+str(i+len(phrase))+", '"+label+"')"])
            
    except:
        print("Handle manually: ", row)
        
df = pd.DataFrame(data, columns = ["Index", "Sentence", "Phrase", "MatchInfo"])
print(df)


Handle manually:  ['4', 'Ataco/Achira//Higo x UC 960', 'PED']
Handle manually:  ['35', 'Crop Science 30:1154-1155 (1990)', 'JRNL']
Handle manually:  ['39', 'Steptoe/2*Diamant /3/Minn Dwarf 64.98-8/Briggs/4/Asse', 'PED']
Handle manually:  ['56', 'Crop Science 43:437 (2003)', 'JRNL']
Handle manually:  ['61', 'Steptoe/2*Diamant/3/Minn Dwarf 64.98-8/Briggs/4/Asse', 'PED']
    Index                                           Sentence  \
0       2  Tamalpais is a six-rowed spring hulless feed/f...   
1       2  Tamalpais is a six-rowed spring hulless feed/f...   
2       2  Tamalpais is a six-rowed spring hulless feed/f...   
3       2  Tamalpais is a six-rowed spring hulless feed/f...   
4       2  Tamalpais is a six-rowed spring hulless feed/f...   
..    ...                                                ...   
145    69  UC 960 is short-statured with excellent straw ...   
146    70  UC 960 is early maturing (about one week earli...   
147    70  UC 960 is early maturing (about one week e

## Create a function to clean up overlapping intervals

In [5]:
import re
coordRegex = re.compile(r'(\d+), (\d+)')

def sortByStart(coords):
    """For use in sort routines, return object with lowest (X,Y) values"""
    # split out coordinates that come in as (5, 7, 'CVAR')
    mo = coordRegex.search(coords)
    return(int(mo.group(1)))

def overlaps(coord1, coord2):
    """Check if coordinates of the form 5, 7, 'CVAR' and 32, 46, 'TRAT' overlap"""
    mo1 = coordRegex.search(coord1)
    mo2 = coordRegex.search(coord2)
    coord1Low = int(mo1.group(1))
    coord1High = int(mo1.group(2))
    coord2Low = int(mo2.group(1))
    coord2High = int(mo2.group(2))
    
    if ((coord1High >= coord2Low) and (coord1Low <= coord2Low) or
        (coord2High >= coord1Low) and (coord2Low <= coord1Low)):
        return True
    else:
        return False

def keepFirst(coord1, coord2):
    """Given overlapping coordinates, return the wider encompassing one."""
    mo1 = coordRegex.search(coord1)
    mo2 = coordRegex.search(coord2)
    coord1Low = int(mo1.group(1))
    coord1High = int(mo1.group(2))
    coord2Low = int(mo2.group(1))
    coord2High = int(mo2.group(2))
 
    if (int(coord1High) - int(coord1Low)) >= (int(coord2High) - int(coord2Low)):
        return True
    else:
        return False

# print("Should be false:", overlaps("(5, 7, 'CVAR')", "(32, 46, 'TRAT')"))
# print("Should be true:", overlaps("(26, 46, 'TRAT')", "(32, 46, 'TRAT')"))
# print("Should be true:", overlaps("(26, 46, 'TRAT')", "(26, 46, 'TRAT')"))
# print("Keeper:", keepFirst("34, 46, 'TRAT'", "34, 46, 'TRAT'"))

def cleanIntervals(inputString=""):
    """order intervals like (5, 7, 'CVAR'), (32, 46, 'TRAT'), (26, 46, 'TRAT') and remove overlapping ones."""
    inputString = inputString.lstrip("(").rstrip(")")
    intervalList = inputString.split("), (")
    intervalList.sort(key = sortByStart)
#    print("Sorted Interval List:", intervalList)

    # Pairwise compare every interval in the list to every other interval to check overlap
    keeperList = [True]*len(intervalList) # Logic array to determine if each interval should be kept
    i=0
    for interval1 in intervalList:
        for interval2 in intervalList:
            if interval1 == interval2:
                if intervalList.index(interval1) != i: # when both are the same we reject the higher one
                    keeperList[i] = False
            else:
                if overlaps(interval1, interval2) and keepFirst(interval1, interval2) == False:
                    keeperList[i] = False
        i = i+1
        
#    print("keeperList:", keeperList)
   
    # Build up the return interval list
    returnStr = "("
    for interval, isKeeper in zip(intervalList, keeperList):
        if isKeeper:
            returnStr = returnStr + interval + "), ("
    return (returnStr.rstrip("), (") + ")")
        
# cleanIntervals("(5, 7, 'CVAR'), (32, 46, 'TRAT'), (5, 9, 'CVAR'), (48, 55, 'ORG'), (26, 46, 'TRAT')")
# cleanIntervals("(0, 8, 'CVAR'), (0, 5, 'CVAR'), (21, 26, 'PLAN'), (32, 37, 'CVAR')")
cleanIntervals("(0, 12, 'CVAR'), (39, 49, 'CVAR'), (39, 49, 'CVAR'), (71, 77, 'CVAR'), (71, 77, 'CVAR'), (92, 113, 'TRAT'), (140, 150, 'CVAR'), (140, 150, 'CVAR'), (181, 187, 'CVAR'), (181, 187, 'CVAR')")


"(0, 12, 'CVAR'), (39, 49, 'CVAR'), (71, 77, 'CVAR'), (92, 113, 'TRAT'), (140, 150, 'CVAR'), (181, 187, 'CVAR')"

## Aggregate all matches for each sentence on a single line and output in spaCy training format

In [10]:
# use Pandas dataframes to aggregate all entity matches together for a single sentence
agg_rules = {'Sentence': 'first', 'Phrase': 'first', 'MatchInfo': lambda x: ', '.join(x)}
res = df.groupby('Index').agg(agg_rules)
#print(res)

# Now format it just like what is needed for the spaCy training module: 
# E.g.:
# ('Eight-Twelve is a six-rowed winter feed barley', {'entities': [(0, 12, 'CVAR'), (18, 27, 'TRAT'), (28, 39, 'TRAT'),(40, 46, 'CROP')]}),
records = res.to_dict('records')
print("TRAIN_DATA = [")
maxr = len(records)
for i in range(0,maxr):
    print("    ('"+records[i]['Sentence']+"', {'entities': ["+cleanIntervals(records[i]['MatchInfo'])+"]})", end='')
    if (i == maxr-1):
        print()
    else:
        print(",")

print("]")

TRAIN_DATA = [
    ('It has full, rough awns and has good resistance to shattering.', {'entities': [(19, 23, 'PLAN'), (37, 61, 'TRAT')]}),
    ('Kernels are beige (non-blue, transparent aleurone which is classified as white).', {'entities': [(0, 7, 'PLAN'), (41, 49, 'PLAN')]}),
    ('As a hulless (naked) feed/food barley, Tamalpais should appeal to growers of organic barley for human consumption uses in breakfast cereal and soups.', {'entities': [(5, 12, 'TRAT'), (21, 25, 'TRAT'), (26, 30, 'TRAT'), (31, 37, 'CROP'), (39, 48, 'CVAR'), (85, 91, 'CROP')]}),
    ('Tamalpais is high in Beta-glucan (above 6%), a measure of soluble fiber, and may be useful in lowering human cholesterol levels.', {'entities': [(0, 9, 'CVAR'), (21, 32, 'TRAT')]}),
    ('At the time of release it was resistant to leaf rust and powdery mildew, moderately resistant to scald, net blotch, and BYD, and moderately susceptible to stripe rust.', {'entities': [(30, 39, 'PPTD'), (43, 52, 'PATH'), (57, 71, 'PATH'), (73, 93

In my current process, I am writing the above content to a file e.g., `Data/DavisLJ11/barley_p5_td.py`, adding any manual corrections (usually PED and JRNL entries) and then running the script `python3 py2json.py --doc 'BarCvDescLJ11.pdf' --url 'https://smallgrains.ucdavis.edu/cereal_files/BarCvDescLJ11.pdf' --chunk 5 Data/DavisLJ11/barley_p5_td.py Data/DavisLJ11/barley_p5_td.json` to create the JSON file for Training.