# Extraction of growth temperature from full-text articles
A dependency pattern is used to extract the growth temperature from the full-text articles.

In [1]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.matcher import DependencyMatcher
from spacy.lang.en import English

In [2]:
# loads the cleaned articles and collects the full texts (missing entries dropped)
df = pd.read_csv('cleaned_fulltext_articles.csv')
list_text = df['fulltext'].dropna().tolist()
# all articles concatenated into one space-separated string
# NOTE(review): `text` is not used by any of the cells shown here
text = ' '.join(str(article) for article in list_text)

In [3]:
# loads the nlp pipeline without the NER component; the pattern below only
# relies on POS, DEP and TEXT attributes, so entities are not needed
# (component names are passed as a list rather than a bare string)
nlp = spacy.load('en_core_sci_lg', exclude=['ner'])

In [4]:
# dependency pattern for extracting the growth temperature
# ('>' means: the LEFT_ID token is the immediate head of the RIGHT_ID token)
dep_pattern = [
    # anchor: any noun (named 'C' after the unit it is meant to catch)
    {
        'RIGHT_ID': 'C',
        'RIGHT_ATTRS': {'POS': 'NOUN'},
    },
    # numeric modifier attached to the anchor, i.e. the temperature value
    {
        'LEFT_ID': 'C',
        'REL_OP': '>',
        'RIGHT_ID': 'number',
        'RIGHT_ATTRS': {'DEP': 'nummod', 'POS': 'NUM'},
    },
    # the word 'temperature' as nominal subject of the anchor
    {
        'LEFT_ID': 'C',
        'REL_OP': '>',
        'RIGHT_ID': 'temp',
        'RIGHT_ATTRS': {'DEP': 'nsubj', 'TEXT': 'temperature'},
    },
    # the word 'growth' as nominal modifier of 'temperature'
    {
        'LEFT_ID': 'temp',
        'REL_OP': '>',
        'RIGHT_ID': 'growth',
        'RIGHT_ATTRS': {'DEP': 'nmod', 'TEXT': 'growth'},
    },
]

In [5]:
# creates the dependency matcher on the pipeline's vocabulary and registers
# the growth-temperature pattern under the key 'temperature'
dep_matcher = DependencyMatcher(nlp.vocab)
dep_matcher.add('temperature', [dep_pattern])

In [6]:
# runs every article through the nlp pipeline, searches it for the pattern
# and saves the matched tokens in a list; prints the time the process takes

import time
start = time.perf_counter()  # monotonic clock, meant for measuring intervals

results = []
# nlp.pipe processes the texts as a stream in batches instead of issuing one
# separate pipeline call per article
for doc in nlp.pipe(list_text):
    # each match is (match_id, token_ids); the token ids follow the order of
    # the nodes in dep_pattern: C, number, temp, growth
    for _, (unit_id, number_id, temp_id, growth_id) in dep_matcher(doc):
        # stored as (temperature, growth, number, anchor noun)
        results.append((doc[temp_id], doc[growth_id], doc[number_id], doc[unit_id]))

end = time.perf_counter()
print(end - start)

4115.023907184601


In [7]:
# number of full-text articles that were processed
len(list_text)

14631

In [8]:
# shows the collected matches as (temperature, growth, number, anchor noun) token tuples
results

[(temperature, growth, 37, C),
 (temperature, growth, 35, C),
 (temperature, growth, 30, growth),
 (temperature, growth, 37, C),
 (temperature, growth, 37, C),
 (temperature, growth, 37, C),
 (temperature, growth, 28, C),
 (temperature, growth, 37, C),
 (temperature, growth, 31, C),
 (temperature, growth, 30, C),
 (temperature, growth, 34, C),
 (temperature, growth, 40, C),
 (temperature, growth, 37, C),
 (temperature, growth, 30, C),
 (temperature, growth, 30, C),
 (temperature, growth, 30, C),
 (temperature, growth, 30, C),
 (temperature, growth, 20–45, C),
 (temperature, growth, 37, C),
 (temperature, growth, 37, C),
 (temperature, growth, 30, C),
 (temperature, growth, 37, C),
 (temperature, growth, 37, C),
 (temperature, growth, 45, C),
 (temperature, growth, 20, C),
 (temperature, growth, 35, C),
 (temperature, growth, 25, C),
 (temperature, growth, 40, C),
 (temperature, growth, 40, C),
 (temperature, growth, 37, C),
 (temperature, growth, 35, C),
 (temperature, growth, 60, C),


In [9]:
len(results) # total number of matches; 11 of them were noted as false positives

656