In [1]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.matcher import DependencyMatcher
from spacy.lang.en import English

In [2]:
# Read the cleaned article table from disk.
df = pd.read_csv('cleaned_fulltext_articles.csv')
# Label-based slice: rows with index labels up to and including 110,
# restricted to the 'fulltext' column; missing entries are dropped.
fulltext_head = df.loc[:110, 'fulltext']
list_text = fulltext_head.dropna().tolist()
# Concatenate all remaining articles into one space-separated string.
text = ' '.join(str(article) for article in list_text)

In [3]:
# Load the 'en_core_sci_lg' pipeline without its NER component.
# The component name is passed inside a list: an iterable of names is the
# form of `exclude` accepted across spaCy v3 releases, whereas a bare string
# is presumably only supported by newer ones — TODO confirm against the pinned version.
nlp = spacy.load('en_core_sci_lg', exclude=['ner'])

In [4]:
# Run the loaded pipeline over the concatenated article text; the result is a single Doc
doc = nlp(text)

In [7]:
# Materialise the sentence spans of the Doc into a list so they can be iterated later
sents = list(doc.sents)

## Pattern matching

In [8]:
# Token specifications shared by both matcher patterns.
temperature_token = {"TEXT": "temperature"}       # exact, case-sensitive text
growth_token = {"TEXT": "growth"}                 # exact, case-sensitive text
ascii_filler = {"IS_ASCII": True, "OP": "*"}      # zero or more ASCII tokens
digit_tokens = {"IS_DIGIT": True, "OP": "+"}      # one or more digit tokens

# "temperature" ... "growth" ... <digits>
pattern1 = [temperature_token, ascii_filler, growth_token, ascii_filler, digit_tokens]

# "growth" ... "temperature" ... <digits>
pattern2 = [growth_token, ascii_filler, temperature_token, ascii_filler, digit_tokens]

In [9]:
# Build a token matcher on the pipeline's vocabulary and register both
# word orders under the single key "temperature".
matcher = Matcher(nlp.vocab)
temperature_patterns = [pattern1, pattern2]
matcher.add("temperature", temperature_patterns)

In [10]:
# Search every sentence for the patterns and keep the sentences in which
# the matcher reports at least one match.
matcher_sents = [sentence for sentence in sents if matcher(sentence)]

In [64]:
# Show the sentences that were collected by the token matcher
matcher_sents

[The maximum temperature for growth was determined by incubating RSD broth cultures in temperature-controlled metal heat blocks at 1°C increments from 30 to 37°C and recording growth after 7, 14, and 21 days.,
 The optimal temperature for growth appeared to be between 28 and 30°C for all strains.,
 The maximum temperature for growth was between 31 and 32°C for the strains from sugarcane and between 35 and 36°C for the strains from Bermudagrass.,
 The optimal growth temperature was about 20°C.,
 The maximum growth temperature is 45°C, the optimum growth temperature is 32 to 35°C, and the minimum growth temperature is 25°C; growth does not occur at 20 and 50°C even after 2 months.,
 The temperature range for growth is 20 to 45.5°C, with optimum growth at 42°C.,
 However, these strains differ from B. cellulosolvens in producing isobutyric acid, liquefying gelatin, and having an optimum growth temperature of 30°C.,
 The temperature range for growth was determined on ISP medium 4.,
 The tem

In [10]:
# Render the dependency parse (arcs plus POS tags) of the matched sentences
# using displaCy's compact layout.
dep_render_options = {"compact": True}
displacy.render(matcher_sents, style="dep", options=dep_render_options)

## Dependency matching

In [14]:
# Dependency pattern for extracting a growth temperature.
# Node order in the list is: C, number, temp, growth.

# Anchor: any noun.
unit_node = {'RIGHT_ID': 'C', 'RIGHT_ATTRS': {'POS': 'NOUN'}}

# The anchor is the immediate head ('>') of a numeric modifier ...
number_node = {
    'LEFT_ID': 'C', 'REL_OP': '>', 'RIGHT_ID': 'number',
    'RIGHT_ATTRS': {'DEP': 'nummod', 'POS': 'NUM'},
}

# ... and of the subject token "temperature".
temp_node = {
    'LEFT_ID': 'C', 'REL_OP': '>', 'RIGHT_ID': 'temp',
    'RIGHT_ATTRS': {'DEP': 'nsubj', 'TEXT': 'temperature'},
}

# "temperature" itself is the immediate head of the modifier "growth".
growth_node = {
    'LEFT_ID': 'temp', 'REL_OP': '>', 'RIGHT_ID': 'growth',
    'RIGHT_ATTRS': {'DEP': 'nmod', 'TEXT': 'growth'},
}

dep_pattern = [unit_node, number_node, temp_node, growth_node]

In [10]:
# Alternative formulation of the growth-temperature pattern, anchored on the
# token "temperature" (the first RIGHT_ID of a pattern is its anchor).
# It is kept under its own name so that it does not silently overwrite
# `dep_pattern`: that pattern is the one registered with the
# DependencyMatcher, and the cells that print the matches index the token
# ids by that pattern's node order (C, number, temp, growth).
# Node order here is: temp, growth, C, num.
# NOTE(review): 'DEP': 'nsubj' is required of node 'C' (the head of
# "temperature", via '<'), not of "temperature" itself — confirm this is intended.
dep_pattern_temp_anchor = [
    {'RIGHT_ID': 'temp', 'RIGHT_ATTRS': {'TEXT': 'temperature'}},
    {'LEFT_ID': 'temp', 'REL_OP': '>', 'RIGHT_ID': 'growth', 'RIGHT_ATTRS': {'DEP': 'nmod', 'POS': 'NOUN'}},
    {'LEFT_ID': 'temp', 'REL_OP': '<', 'RIGHT_ID': 'C', 'RIGHT_ATTRS': {'DEP': 'nsubj'}},
    {'LEFT_ID': 'C', 'REL_OP': '>', 'RIGHT_ID': 'num', 'RIGHT_ATTRS': {'DEP': 'nummod'}},
]

In [15]:
# Create a dependency matcher on the pipeline's vocabulary and register the
# growth-temperature pattern under the key "temperature".
dep_matcher = DependencyMatcher(nlp.vocab)
dep_patterns = [dep_pattern]
dep_matcher.add("temperature", dep_patterns)

In [16]:
# Run the dependency matcher over the whole Doc and keep the returned matches
dep_matches = dep_matcher(doc)

In [17]:
# Inspect the raw matches: each entry pairs a match id with a list of token indices
dep_matches

[(5627807717403523368, [13383, 13381, 13375, 13378]),
 (5627807717403523368, [61323, 61319, 61313, 61316]),
 (5627807717403523368, [71471, 71467, 71459, 71462]),
 (5627807717403523368, [78319, 78317, 78312, 78315])]

In [16]:
# For every match, print the pattern name followed by the first two matched tokens
for match_id, token_ids in dep_matches:
    label = nlp.vocab[match_id].text  # look the pattern name up from its hash
    first_tok, second_tok = doc[token_ids[0]], doc[token_ids[1]]
    print(label, '\t', first_tok, '...', second_tok)

In [17]:
# For every match, print the pattern name followed by the first three matched tokens
for match_id, token_ids in dep_matches:
    label = nlp.vocab[match_id].text  # look the pattern name up from its hash
    first_tok, second_tok, third_tok = doc[token_ids[0]], doc[token_ids[1]], doc[token_ids[2]]
    print(label, '\t', first_tok, '...', second_tok, '...', third_tok)

In [23]:
# For every match, print the pattern name and the four matched tokens.
# The tokens are printed in the order 3rd, 4th, 2nd, 1st of the match's
# token-index list, not in list order.
for match_id, token_ids in dep_matches:
    label = nlp.vocab[match_id].text  # look the pattern name up from its hash
    first_tok, second_tok = doc[token_ids[0]], doc[token_ids[1]]
    third_tok, fourth_tok = doc[token_ids[2]], doc[token_ids[3]]
    print(label, '\t', third_tok, '...', fourth_tok, '...', second_tok, '...', first_tok)

temperature 	 temperature ... growth ... 37 ... C
temperature 	 temperature ... growth ... 35 ... C
temperature 	 temperature ... growth ... 30 ... growth
temperature 	 temperature ... growth ... 37 ... C
