In [28]:
import spacy
import json
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("zh_core_web_sm")
#nlp = spacy.load("ru_core_news_lg")

In [30]:
from spacy.language import Doc
from spacy.tokens import Span
from dataclasses import dataclass
from typing import Dict, Tuple, Iterable, List
import re

@dataclass
class Match:
    pattern_name:str
    matched:str
    sentence:str
    
    def __repr__(self) -> str:
        return f"{self.pattern_name} : {self.sentence}"

class SyntaxRegexMatcher:
    """
    This class encapsulates the sentence regex patterns and methods to apply them to target documents
    """
    def __init__(self, language:str):
        if language == "en":
            self.patterns = {
                "it-cleft": r"\([^-]*-be-[^-]*-[^-]*.*\([iI]t-it-PRP-nsubj\).*\([^-]*-[^-]*-NN[^-]*-attr.*\([^-]*-[^-]*-VB[^-]*-(relcl|advcl)",
                "pseudo-cleft": r"\([^-]*-be-[^-]*-[^-]*.*\([^-]*-[^-]*-(WP|WRB)-(dobj|advmod)",
                "all-cleft" : r"(\([^-]*-be-[^-]*-[^-]*.*\([^-]*-all-(P)?DT)|(\([^-]*-all-(P)?DT-[^-]*.*\([^-]*-be-[^-]*)",
                "there-cleft": r"\([^-]*-be-[^-]*-[^-]*.*\([^-]*-there-EX-expl.*\([^-]*-[^-]*-[^-]*-attr.*\([^-]*-[^-]*-[^-]*-(relcl|acl)",
                "if-because-cleft" : r"\([^-]*-be-[^-]*-[^-]*.*\([^-]*-[^-]*-[^-]*-advcl\([^-*]*-if-IN-mark",
                "passive" : r"\([^-]*-[^-]*-(NN[^-]*|PRP|WDT)-nsubjpass.*\([^-]*-be-[^-]*-auxpass",
                "subj-relcl" : r"\([^-]*-[^-]*-[^-]*-relcl.*\([^-]*-[^-]*-(WP|WDT)-nsubj",
                "obj-relcl" : r"\([^-]*-[^-]*-NN[^-]*-(nsubj|attr).*\([^-]*-[^-]*-[^-]*-(relcl|ccomp).*\([^-]*-[^-]*-(WP|WDT|IN)-(pobj|dobj)",
                "tag-question" : r"\([^-]*-(do|be|could|can|have)-[^-]*-ROOT.*\(\?-\?-\.-punct",
                "coordinate-clause" : r"\([^-]*-[^-]*-CC-cc\).*\([^-]*-[^-]*-(VB[^-]*|JJ)-conj.*\([^-]*-[^-]*-[^-]*-nsubj",
                
            }

        elif language == "ru":
            print("using russian constructions")
            self.patterns = {
                "passive_rus" : r"\([^-]*-[^-]*-(NOUN[^-]*|PRON)-nsubj:pass.*\)",
                "parataxis_rus": r"\([^-]*-[^-]*-[^-]*-parataxis.*\)",
                "participle_gerund_rus": r"\([^-]*-[^-]*-(VERB-acl(?!:relcl)|ADJ-amod|VERB-advcl).*?\)",
                "conj_rus": r"\([^()]*-[^-]*-[^-]*-conj[^()]*\)",
                "nested_structure_rus": r"\([^-]*-[^-]*-(VERB-acl:relcl).*?\)",
                "one_word_sent_rus": r"\([^-]*-[^-]*-(NOUN|VERB)-ROOT\(.*?-PUNCT-punct\)\)"
            }

        self.language = language

    def print_patterns(self) -> None:
        for pattern_name, pattern in self.patterns.items():
            print(f"{pattern_name} : {pattern}\n")

        
    def _find_treegex_matches(self, doc:Doc) -> Tuple[Match]:
        """Iterates through a document's sentences, applying every regex to each sentence"""
        matches = []
        for sent in doc.sents:
            tree_string = self.linearize_tree(sent)
            for name, pattern in self.patterns.items():
                match = re.search(pattern, tree_string)
                if match:
                    matches.append(Match(name, match.group(), sent.text))
        return tuple(matches)

    def add_patterns(self, patterns:Dict[str,str]) -> None:
        """Updates the default patterns dictionary with a user supplied dictionary of {pattern_name:regex} pairs"""
        self.patterns.update(patterns)
        
    def remove_patterns(self, to_remove:Iterable[str]) -> None:
        """Given an iterable of pattern names, removes those patterns from the registered pattens list"""
        for pattern_name in to_remove:
            try:
                del self.patterns[pattern_name]
            except KeyError:
                raise KeyError(f"Pattern '{pattern_name}' not in registered patterns.")
            
            
    def match_document(self, document:Doc) -> Tuple[Match]:
        """
        Applies all registered patterns to one spaCy-generated document
        
        Args
        ----
            - document - a single spaCy document\n
        Returns
        -------
            - a tuple of sentence matches for a single document
        """
        return self._find_treegex_matches(document)

    def match_documents(self, documents:Iterable[Doc]) -> List[Tuple[Match]]:
        """
        Applies all registered patterns to a collection of spaCy-generated documents
        
        Args
        ----
            - documents - iterable of spacy documents\n
        Returns
        -------
            - A list of tuples such that each tuple contains one document's sentence matches
        """
        all_matches = []
        for document in documents:
            all_matches.append(self._find_treegex_matches(document))
        return all_matches
    
    def linearize_tree(self, sentence:Span) -> str:
        """Converts a spaCy dependency-parsed sentence into a linear tree string while preserving dependency relations"""
        
        def get_NT_count(sentence) -> int:
            """Returns the number of non-terminal nodes in a dep tree"""
            return sum([1 for token in sentence if list(token.children)])

        def ending_parenthesis(n:int) -> str:
            """Returns the appropriate amount of parenthesis to add to linearlized tree"""
            return f"{')' * n}"
        
        def parse_dependency_parse(sentence):
            """Processes a dependency parse in a bottom-up fashion"""
            stack = [sentence.root]
            result = ""
            while stack:
                token = stack.pop()
                if self.language == "en":
                    result += f"({token.text}-{token.lemma_}-{token.tag_}-{token.dep_}-{token.ent_type_}" 
                elif self.language == "ru":
                    result += f"({token.text}-{token.lemma_}-{token.tag_}-{token.dep_}-{token.morph}" 
                
                for child in reversed(list(token.children)):
                    stack.append(child)
                
                if not list(token.children):
                    result += ")"
            return result
        
        parse = parse_dependency_parse(sentence)
        nt_count = get_NT_count(sentence)
        print(f"{parse}{ending_parenthesis(nt_count)}")
        return f"{parse}{ending_parenthesis(nt_count)}"
    

In [3]:
matcher = SyntaxRegexMatcher(language="en")


In [23]:
russian_text_1 = "Причина проста – многие занимаются английским языком вне школы."

from spacy import displacy

# Visualize the dependency parse
options = {"compact": True, "bg": "#ffffff",
           "color": "#000000", "font": "Source Sans Pro"}
displacy.render(nlp(russian_text_1), style="dep", options=options)

document = nlp(russian_text_1)
for sent in document.sents:
    tree_string = matcher.linearize_tree(sent)

(проста-простой-ADJ-ROOT-Degree=Pos|Gender=Fem|Number=Sing|StyleVariant=Short(Причина-причина-NOUN-nsubj-Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing)(занимаются-заниматься-VERB-parataxis-Aspect=Imp|Mood=Ind|Number=Plur|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Mid(–-–-PUNCT-punct-)(многие-многие-NOUN-nsubj-Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur)(языком-язык-NOUN-obl-Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing(английским-английский-ADJ-amod-Case=Ins|Degree=Pos|Gender=Masc|Number=Sing)(школы-школа-NOUN-obl-Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing(вне-вне-ADP-case-)(.-.-PUNCT-punct-)))))


In [24]:
matches = matcher.match_document(document)
for match in matches:
    print(match)

(проста-простой-ADJ-ROOT-Degree=Pos|Gender=Fem|Number=Sing|StyleVariant=Short(Причина-причина-NOUN-nsubj-Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing)(занимаются-заниматься-VERB-parataxis-Aspect=Imp|Mood=Ind|Number=Plur|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Mid(–-–-PUNCT-punct-)(многие-многие-NOUN-nsubj-Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur)(языком-язык-NOUN-obl-Animacy=Inan|Case=Ins|Gender=Masc|Number=Sing(английским-английский-ADJ-amod-Case=Ins|Degree=Pos|Gender=Masc|Number=Sing)(школы-школа-NOUN-obl-Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing(вне-вне-ADP-case-)(.-.-PUNCT-punct-)))))
parataxis_rus : Причина проста – многие занимаются английским языком вне школы.
participle_gerund_rus : Причина проста – многие занимаются английским языком вне школы.


In [25]:
# text = "It was John who bought the butter."
# russian_text = "Это Джон купил масло."
russian_text = "У меня есть подруга, живущая в Мексике."
#from spacy import displacy

# Visualize the dependency parse
options = {"compact": True, "bg": "#ffffff",
           "color": "#000000", "font": "Source Sans Pro"}
displacy.render(nlp(russian_text), style="dep", options=options)

document = nlp(russian_text)
for sent in document.sents:
    tree_string = matcher.linearize_tree(sent)

(есть-быть-VERB-ROOT-Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act(меня-меня-PRON-obl-Case=Gen|Number=Sing|Person=First(У-у-ADP-case-)(подруга-подруга-NOUN-nsubj-Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing(живущая-жить-VERB-acl-Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act(,-,-PUNCT-punct-)(Мексике-мексика-PROPN-obl-Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing(в-в-ADP-case-)(.-.-PUNCT-punct-))))))


In [26]:
matches = matcher.match_document(document)
for match in matches:
    print(match)

(есть-быть-VERB-ROOT-Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act(меня-меня-PRON-obl-Case=Gen|Number=Sing|Person=First(У-у-ADP-case-)(подруга-подруга-NOUN-nsubj-Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing(живущая-жить-VERB-acl-Aspect=Imp|Case=Nom|Gender=Fem|Number=Sing|Tense=Pres|VerbForm=Part|Voice=Act(,-,-PUNCT-punct-)(Мексике-мексика-PROPN-obl-Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing(в-в-ADP-case-)(.-.-PUNCT-punct-))))))
participle_gerund_rus : У меня есть подруга, живущая в Мексике.


In [27]:
russian_text = "Прочитанная книга, лежала на столе"

document = nlp(russian_text)
for sent in document.sents:
    tree_string = matcher.linearize_tree(sent)

(лежала-лежать-VERB-ROOT-Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act(книга-книга-NOUN-nsubj-Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing(Прочитанная-прочитанная-ADJ-amod-Case=Nom|Degree=Pos|Gender=Fem|Number=Sing)(,-,-PUNCT-punct-)(столе-стол-NOUN-obl-Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing(на-на-ADP-case-))))


In [13]:
matcher.match_document(document)

(лежала-лежать-VERB-ROOT(книга-книга-NOUN-nsubj(Прочитанная-прочитанная-ADJ-amod)(,-,-PUNCT-punct)(столе-стол-NOUN-obl(на-на-ADP-case))))


(participle_gerund_rus : Прочитанная книга, лежала на столе,)

In [28]:
matches = matcher.match_document(document)
for match in matches:
    print(match)

(лежала-лежать-VERB-ROOT-Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act(книга-книга-NOUN-nsubj-Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing(Прочитанная-прочитанная-ADJ-amod-Case=Nom|Degree=Pos|Gender=Fem|Number=Sing)(,-,-PUNCT-punct-)(столе-стол-NOUN-obl-Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing(на-на-ADP-case-))))
participle_gerund_rus : Прочитанная книга, лежала на столе


In [29]:
russian_text = "Меня интересует музыка."

document = nlp(russian_text)
for sent in document.sents:
    tree_string = matcher.linearize_tree(sent)

(интересует-интересовать-VERB-ROOT-Aspect=Imp|Mood=Ind|Number=Sing|Person=Third|Tense=Pres|VerbForm=Fin|Voice=Act(Меня-меня-PRON-obj-Case=Acc|Number=Sing|Person=First)(музыка-музыка-NOUN-nsubj-Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing)(.-.-PUNCT-punct-))


In [481]:
linearize_tree(sent)

'(есть-быть-VERB-ROOT(меня-меня-PRON-obl(У-у-ADP-case)(подруга-подруга-NOUN-nsubj(живущая-жить-VERB-acl(,-,-PUNCT-punct)(Мексике-мексика-PROPN-obl(в-в-ADP-case)(.-.-PUNCT-punct))))))'

In [482]:
matches = matcher.match_document(document)
for match in matches:
    print(match)

participle_gerund_rus : У меня есть подруга, живущая в Мексике.


In [518]:
russian_text = "Зима."

document = nlp(russian_text)
for sent in document.sents:
    tree_string = linearize_tree(sent)

In [519]:
linearize_tree(sent)

'(Зима-зима-NOUN-ROOT(.-.-PUNCT-punct))'

In [520]:
matches = matcher.match_document(document)
for match in matches:
    print(match)

one_word_sent_rus : Зима.


In [508]:
matcher.add_patterns(
    {
        "passive_rus" : r"\([^-]*-[^-]*-(NOUN[^-]*|PRON)-nsubj:pass.*\)",
        "parataxis_rus": r"\([^-]*-[^-]*-[^-]*-parataxis.*\)",
        "participle_gerund_rus": r"\([^-]*-[^-]*-(VERB-acl(?!:relcl)|ADJ-amod|VERB-advcl).*?\)",
        "conj_rus": r"\([^()]*-[^-]*-[^-]*-conj[^()]*\)",
        "nested_structure_rus": r"\([^-]*-[^-]*-(VERB-acl:relcl).*?\)",
        "one_word_sent_rus": r"\([^-]*-[^-]*-(NOUN|VERB)-ROOT\(.*?-PUNCT-punct\)\)"
    }
)

In [517]:
matcher.print_patterns() 

it-cleft : \([^-]*-be-[^-]*-[^-]*.*\([iI]t-it-PRP-nsubj\).*\([^-]*-[^-]*-NN[^-]*-attr.*\([^-]*-[^-]*-VB[^-]*-(relcl|advcl)

pseudo-cleft : \([^-]*-be-[^-]*-[^-]*.*\([^-]*-[^-]*-(WP|WRB)-(dobj|advmod)

all-cleft : (\([^-]*-be-[^-]*-[^-]*.*\([^-]*-all-(P)?DT)|(\([^-]*-all-(P)?DT-[^-]*.*\([^-]*-be-[^-]*)

there-cleft : \([^-]*-be-[^-]*-[^-]*.*\([^-]*-there-EX-expl.*\([^-]*-[^-]*-[^-]*-attr.*\([^-]*-[^-]*-[^-]*-(relcl|acl)

if-because-cleft : \([^-]*-be-[^-]*-[^-]*.*\([^-]*-[^-]*-[^-]*-advcl\([^-*]*-if-IN-mark

passive : \([^-]*-[^-]*-(NN[^-]*|PRP|WDT)-nsubjpass.*\([^-]*-be-[^-]*-auxpass

subj-relcl : \([^-]*-[^-]*-[^-]*-relcl.*\([^-]*-[^-]*-(WP|WDT)-nsubj

obj-relcl : \([^-]*-[^-]*-NN[^-]*-(nsubj|attr).*\([^-]*-[^-]*-[^-]*-(relcl|ccomp).*\([^-]*-[^-]*-(WP|WDT|IN)-(pobj|dobj)

tag-question : \([^-]*-(do|be|could|can|have)-[^-]*-ROOT.*\(\?-\?-\.-punct

coordinate-clause : \([^-]*-[^-]*-CC-cc\).*\([^-]*-[^-]*-(VB[^-]*|JJ)-conj.*\([^-]*-[^-]*-[^-]*-nsubj

passive_rus : \([^-]*-[^-]*-(NOUN[^-]

In [516]:
matcher.remove_patterns([
    "one_noun_sent_rus"
#    "participle_gerund_rus"
])

In [None]:
#\([^-]*-[^-]*-(NOUN[^-]*|PRON)-nsubj:pass.*\([^-]*-будет|будут|была|был|были-[^-]*-aux:pass

In [44]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [26]:
nlp= spacy.load("en_core_web_lg")

In [48]:
text = "my cat is smaller than my dog"

document = nlp(text)
for sent in document.sents:
    tree_string = matcher.linearize_tree(sent)

(is-be-VBZ-ROOT-(cat-cat-NN-nsubj-(my-my-PRP$-poss-)(smaller-small-JJR-acomp-(than-than-IN-prep-(dog-dog-NN-pobj-(my-my-PRP$-poss-))))))


In [25]:
supports_cop = False
for token in document:
    print(f"Token: {token.text}, Dep: {token.dep_}")
    if token.dep_ == "cop":
        supports_cop = True

# Output result
if supports_cop:
    print("This spaCy model supports the 'cop' dependency label.")
else:
    print("This spaCy model does not support the 'cop' dependency label.")

Token: Bill, Dep: nsubj
Token: is, Dep: ROOT
Token: a, Dep: det
Token: teacher, Dep: attr
This spaCy model does not support the 'cop' dependency label.


In [101]:
linearize_tree(sent)

'(stolen-steal-VBN-ROOT(car-car-NN-nsubjpass)(was-be-VBD-auxpass)(night-night-NN-npadvmod(last-last-JJ-amod)))'

In [102]:
matches = matcher.match_document(document)
for match in matches:
    print(match)

passive : car was stolen last night


In [104]:
texts = ["How she paid for her food was with her credit card. When did Sarah say she was coming over?", 
         "English is spoken all over the world. When it was sunny, I went outside, but it started raining.",
         "She is the author that I have interviewed. They might have been invited to the party."]
docs = nlp.pipe(texts)
matches = matcher.match_documents(docs)
for match in matches:
    print(match)

()
()
()


In [105]:
matcher.print_patterns() 

it-cleft : \([^-]*-be-[^-]*-[^-]*.*\([iI]t-it-PRP-nsubj\).*\([^-]*-[^-]*-NN[^-]*-attr.*\([^-]*-[^-]*-VB[^-]*-(relcl|advcl)

pseudo-cleft : \([^-]*-be-[^-]*-[^-]*.*\([^-]*-[^-]*-(WP|WRB)-(dobj|advmod)

all-cleft : (\([^-]*-be-[^-]*-[^-]*.*\([^-]*-all-(P)?DT)|(\([^-]*-all-(P)?DT-[^-]*.*\([^-]*-be-[^-]*)

there-cleft : \([^-]*-be-[^-]*-[^-]*.*\([^-]*-there-EX-expl.*\([^-]*-[^-]*-[^-]*-attr.*\([^-]*-[^-]*-[^-]*-(relcl|acl)

if-because-cleft : \([^-]*-be-[^-]*-[^-]*.*\([^-]*-[^-]*-[^-]*-advcl\([^-*]*-if-IN-mark

passive : \([^-]*-[^-]*-(NN[^-]*|PRP|WDT)-nsubjpass.*\([^-]*-be-[^-]*-auxpass

subj-relcl : \([^-]*-[^-]*-[^-]*-relcl.*\([^-]*-[^-]*-(WP|WDT)-nsubj

obj-relcl : \([^-]*-[^-]*-NN[^-]*-(nsubj|attr).*\([^-]*-[^-]*-[^-]*-(relcl|ccomp).*\([^-]*-[^-]*-(WP|WDT|IN)-(pobj|dobj)

tag-question : \([^-]*-(do|be|could|can|have)-[^-]*-ROOT.*\(\?-\?-\.-punct

coordinate-clause : \([^-]*-[^-]*-CC-cc\).*\([^-]*-[^-]*-(VB[^-]*|JJ)-conj.*\([^-]*-[^-]*-[^-]*-nsubj



In [None]:
matcher.add_patterns(
    {
        "pattern_name_1" : r"regex_1",
        "pattern_name_2" : r"regex_2"
    }
)

In [62]:
#It was Jane’s car that got stolen last night.

#(was-be-VBD-ROOT(It-it-PRP-nsubj)(car-car-NN-attr(Jane-Jane-NNP-poss(’s-’s-POS-case)(stolen-steal-VBN-relcl

#\([^-]*-[^-]*-(NN[^-]*|PRP|WDT)-nsubjpass.*\([^-]*-be-[^-]*-auxpass

In [74]:
text = "Yesterday the book was bought."
# russian_text = "Это Джон купил масло."
#russian_text = "Вчера книга была куплена и прочитана."

#from spacy import displacy

# Visualize the dependency parse
#options = {"compact": True, "bg": "#ffffff",
          # "color": "#000000", "font": "Source Sans Pro"}
#displacy.render(nlp(russian_text), style="dep", options=options)

document = nlp(text)
for sent in document.sents:
    tree_string = linearize_tree(sent)

In [75]:
linearize_tree(sent)

'(Yesterday-yesterday-X-ROOT(the-the-X-flat:foreign)(book-book-X-flat:foreign)(was-was-X-flat:foreign)(bought-bought-X-flat:foreign)(.-.-PUNCT-punct))'