# Matching subject-verb-object dependencies
Information is extracted from the articles by matching subject-verb-object dependency patterns.

In [1]:
import spacy
from spacy.matcher import DependencyMatcher
import pandas as pd

In [2]:
# reads the cleaned articles and joins the available full texts into one string
df = pd.read_csv('cleaned_fulltext_articles.csv')
fulltext_column = df.loc[:110, 'fulltext']  # label-based slice: the end label 110 is included
list_text = fulltext_column.dropna().tolist()
text = ' '.join(str(entry) for entry in list_text)

In [3]:
# loads nlp pipeline without the NER component;
# `exclude` is given as a list of component names, the form documented by spaCy
nlp = spacy.load('en_core_sci_lg', exclude=['ner'])

In [4]:
# runs the pipeline on the joined text and keeps the result as a single Doc
# NOTE(review): spaCy rejects texts longer than nlp.max_length - confirm the joined text stays below it
doc = nlp(text)

In [5]:
# defines the dependency pattern as a list of node dictionaries:
# an anchor verb followed by two of its direct children (REL_OP '>'),
# a nominal subject ('nsubj') and a direct object ('dobj')
dep_pattern = [{'RIGHT_ID': 'verb', 'RIGHT_ATTRS': {'POS': 'VERB'}}]
dep_pattern += [
    {
        'LEFT_ID': 'verb',
        'REL_OP': '>',
        'RIGHT_ID': node_id,
        'RIGHT_ATTRS': {'DEP': dep_label},
    }
    for node_id, dep_label in (('subject', 'nsubj'), ('d_object', 'dobj'))
]

In [6]:
# creates a dependency matcher on the pipeline's vocabulary and
# registers the pattern under the name 'nsubj_verb_dobj'
dep_matcher = DependencyMatcher(nlp.vocab)
dep_matcher.add('nsubj_verb_dobj', [dep_pattern])

In [7]:
# searches for matches in doc and saves them in a list;
# each entry is indexed below as (pattern id, sequence of matched token positions)
dep_matches = dep_matcher(doc)

In [8]:
# walks over every match and prints the pattern name followed by the matched
# tokens in pattern order: verb, subject, direct object
for match_id, token_ids in dep_matches:
    rule_name = nlp.vocab[match_id].text
    verb_token = doc[token_ids[0]]
    subject_token = doc[token_ids[1]]
    object_token = doc[token_ids[2]]
    print(rule_name, '\t', verb_token, '...', subject_token, '...', object_token)

nsubj_verb_dobj 	 describes ... article ... strain
nsubj_verb_dobj 	 determined ... study ... sequence
nsubj_verb_dobj 	 displayed ... trees ... positions
nsubj_verb_dobj 	 represents ... T ... species
nsubj_verb_dobj 	 showed ... it ... tolerance
nsubj_verb_dobj 	 hydrolyse ... T ... gelatin
nsubj_verb_dobj 	 contain ... genome ... gene
nsubj_verb_dobj 	 produce ... T ... H2S
nsubj_verb_dobj 	 contained ... species ... acids
nsubj_verb_dobj 	 contained ... Most ... diphosphatidylglycerol
nsubj_verb_dobj 	 contained ... T ... ubiquinone-8
nsubj_verb_dobj 	 include ... lipids ... diphosphatidylglycerol
nsubj_verb_dobj 	 include ... lipids ... aminophospholipid
nsubj_verb_dobj 	 contains ... Mameliella ... species
nsubj_verb_dobj 	 isolated ... we ... strain
nsubj_verb_dobj 	 indicated ... which ... potential
nsubj_verb_dobj 	 form ... strains ... colonies
nsubj_verb_dobj 	 form ... strains ... h
nsubj_verb_dobj 	 form ... strains ... C
nsubj_verb_dobj 	 cultivated ... we ... T
nsubj_ver