# Extraction of growth temperature from fulltext articles
A spaCy dependency pattern is used to extract the growth temperature from the full-text articles.

In [1]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.matcher import DependencyMatcher
from spacy.lang.en import English

In [2]:
# Load the cleaned articles and keep every non-missing entry of the
# 'fulltext' column, once as a list (one item per article) and once as a
# single space-separated string.
# NOTE(review): `text` is not referenced by any later cell visible in this
# notebook; it is kept here unchanged.
df = pd.read_csv('cleaned_fulltext_articles.csv')
non_missing_fulltexts = df['fulltext'].dropna()
list_text = non_missing_fulltexts.tolist()
text = ' '.join(str(entry) for entry in list_text)

In [3]:
# Loads the 'en_core_sci_lg' pipeline without its NER component.
# `exclude` is given as a list of component names: that form is accepted by
# every spaCy v3 release, whereas a bare string is only documented for newer
# versions. The remaining components stay enabled because the dependency
# pattern used below matches on POS and DEP annotations.
nlp = spacy.load('en_core_sci_lg', exclude=['ner'])

In [4]:
# Dependency pattern describing a growth-temperature statement.
# The REL_OP '>' means "LEFT_ID is the immediate head of RIGHT_ID", so the
# pattern reads as a small tree:
#
#   C (any noun; presumably the unit token -- confirm against tokenizer output)
#   |-- number  (numeric modifier of C)
#   `-- temp    (the token "temperature", subject of C)
#       `-- growth  (the token "growth", nominal modifier of "temperature")
dep_pattern = [
    # anchor node
    {
        "RIGHT_ID": "C",
        "RIGHT_ATTRS": {"POS": "NOUN"},
    },
    # numeric value attached directly to the anchor
    {
        "LEFT_ID": "C",
        "REL_OP": ">",
        "RIGHT_ID": "number",
        "RIGHT_ATTRS": {"DEP": "nummod", "POS": "NUM"},
    },
    # the literal word "temperature" as subject of the anchor
    {
        "LEFT_ID": "C",
        "REL_OP": ">",
        "RIGHT_ID": "temp",
        "RIGHT_ATTRS": {"DEP": "nsubj", "TEXT": "temperature"},
    },
    # the literal word "growth" modifying "temperature"
    {
        "LEFT_ID": "temp",
        "REL_OP": ">",
        "RIGHT_ID": "growth",
        "RIGHT_ATTRS": {"DEP": "nmod", "TEXT": "growth"},
    },
]

In [5]:
# Build a DependencyMatcher on the pipeline's vocabulary and register the
# growth-temperature pattern under the key 'temperature'.
dep_matcher = DependencyMatcher(nlp.vocab)
dep_matcher.add('temperature', [dep_pattern])

In [10]:
# Runs every article through the nlp pipeline, searches each resulting doc
# for the dependency pattern and collects all hits in `results`; the elapsed
# wall-clock time of the whole pass is printed at the end.
#
# Each entry of `results` is a three-element list:
#   [index of the article in list_text,
#    matched token indices in pattern order (C, number, temp, growth),
#    the matched tokens reordered as [temp, growth, number, C]]

import time

# Number of texts handed to the pipeline per batch.
# NOTE(review): 32 is a conservative starting value for long full-text
# documents; tune it to the available memory.
BATCH_SIZE = 32

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during this long-running cell.
start = time.perf_counter()

results = []
# nlp.pipe processes the texts in batches and yields the docs in input order,
# so `count` is still the article's position in list_text.
for count, doc in enumerate(nlp.pipe(list_text, batch_size=BATCH_SIZE)):
    for _match_id, token_ids in dep_matcher(doc):
        # token_ids follows the order of the nodes in the pattern
        unit_i, number_i, temp_i, growth_i = token_ids
        results.append([count, token_ids, [doc[temp_i], doc[growth_i], doc[number_i], doc[unit_i]]])

end = time.perf_counter()
print(end - start)

4230.3361558914185


In [14]:
# number of articles with a non-missing full text (length of list_text)
len(list_text)

14631

In [15]:
# Display the collected matches as [article index, token indices, tokens].
results  # 11 false positives (count noted by the original author)

[[9, [1595, 1593, 1587, 1590], [temperature, growth, 37, C]],
 [33, [1659, 1655, 1649, 1652], [temperature, growth, 35, C]],
 [38, [575, 571, 563, 566], [temperature, growth, 30, growth]],
 [39, [3520, 3518, 3513, 3516], [temperature, growth, 37, C]],
 [95, [2338, 2336, 2330, 2332], [temperature, growth, 37, C]],
 [98, [2419, 2417, 2412, 2414], [temperature, growth, 37, C]],
 [99, [596, 594, 589, 591], [temperature, growth, 28, C]],
 [100, [732, 730, 726, 728], [temperature, growth, 37, C]],
 [129, [3297, 3293, 3288, 3290], [temperature, growth, 31, C]],
 [152, [3488, 3486, 3479, 3482], [temperature, growth, 30, C]],
 [182, [2680, 2678, 2673, 2676], [temperature, growth, 34, C]],
 [199, [850, 848, 842, 844], [temperature, growth, 40, C]],
 [251, [4037, 4035, 4031, 4033], [temperature, growth, 37, C]],
 [267, [2229, 2227, 2223, 2225], [temperature, growth, 30, C]],
 [267, [2471, 2469, 2465, 2467], [temperature, growth, 30, C]],
 [267, [2700, 2698, 2694, 2696], [temperature, growth, 30, 

In [16]:
# total number of pattern matches collected across all articles
len(results)

656

In [25]:
# Turn the collected matches into a table with one row per match.
# NOTE(review): this rebinds `df`, which previously held the article table.
result_columns = ['list_position', 'match_position', 'found_information']
df = pd.DataFrame(data=results, columns=result_columns)

In [26]:
# show the match table (the rendering is truncated for long frames)
df

Unnamed: 0,list_position,match_position,found_information
0,9,"[1595, 1593, 1587, 1590]","[temperature, growth, 37, C]"
1,33,"[1659, 1655, 1649, 1652]","[temperature, growth, 35, C]"
2,38,"[575, 571, 563, 566]","[temperature, growth, 30, growth]"
3,39,"[3520, 3518, 3513, 3516]","[temperature, growth, 37, C]"
4,95,"[2338, 2336, 2330, 2332]","[temperature, growth, 37, C]"
...,...,...,...
651,14545,"[1279, 1276, 1270, 1274]","[temperature, growth, 30, C]"
652,14584,"[2080, 2077, 2069, 2071]","[temperature, growth, 37, C]"
653,14599,"[3276, 3273, 3269, 3271]","[temperature, growth, 25, C]"
654,14614,"[1488, 1485, 1481, 1483]","[temperature, growth, 28, C]"


In [28]:
# Persist the match table as a UTF-8 encoded CSV file, leaving out the
# DataFrame index.
output_csv = 'dependency_pattern_results.csv'
df.to_csv(output_csv, index=False, encoding='utf-8')