This notebook is an experiment in using spaCy's dependency matcher to identify sentences that describe work experience. As the results below show, it is very hard to get the dependency matcher to work on phrases. I pivoted to textacy patterns, which work much better. See the VERB_NOUN_Phrase_Extractor.ipynb notebook for the better approach.

In [1]:
import spacy
from spacy.util import filter_spans
from spacy import displacy
from spacy.tokens import Span, Doc
from spacy.language import Language
from spacy.matcher import DependencyMatcher
import re
import pandas as pd
import sys
import textacy


In [2]:
# Read the job-postings CSV and keep the `job_desc` column as a plain Python list.
lnkjobs = pd.read_csv("../data/dice_solution_architect_2021-12-14T17:45:56.csv")
jobdescs = list(lnkjobs["job_desc"])

In [14]:
def start_end_tokens(matches, doc) -> tuple:
    """Return the outermost token indices covered by a list of matches.

    Args:
        matches: Iterable of ``(match_id, token_ids)`` pairs. Only the first
            two entries of each ``token_ids`` are used: ``token_ids[0]`` (the
            anchor token) and ``token_ids[1]`` (the token matched against it).
        doc: Unused; kept so existing callers do not have to change.

    Returns:
        ``(smallest anchor index, largest matched index)``. Both values are
        token indices, so the second one is *inclusive*.

    Raises:
        ValueError: If ``matches`` is empty.
    """
    if not matches:
        # min()/max() would raise ValueError anyway; make the reason explicit.
        raise ValueError("start_end_tokens() needs at least one match")

    anchor_ids = [token_ids[0] for _, token_ids in matches]
    matched_ids = [token_ids[1] for _, token_ids in matches]
    return (min(anchor_ids), max(matched_ids))


In [26]:
def dep_matcher(doc, label="EXPERIENCE"):
    """Add one entity spanning every "experience" dependency match in ``doc``.

    A two-node dependency pattern is run over ``doc``: an anchor token whose
    lemma is in a fixed list of experience-related words, followed (REL_OP
    ``.*``: the anchor precedes the other token) by a NOUN, PROPN or VERB.
    If anything matches, a single span from the earliest anchor to the last
    matched token is appended to the existing entities, overlaps are resolved
    with ``filter_spans`` and the result is written back to ``doc.ents``.

    Args:
        doc: A parsed spaCy ``Doc`` (the pattern needs lemmas, POS tags and a
            dependency parse).
        label: Entity label given to the new span.

    Returns:
        The same ``doc`` object (mutated in place) when at least one match was
        found, otherwise ``None``.
    """
    # Use the Doc's own vocab so the function does not depend on a module-level
    # `nlp` object having been created before it is called.
    matcher = DependencyMatcher(doc.vocab)
    pattern = [
        {
            "RIGHT_ID": "anchor",
            "RIGHT_ATTRS": {
                "LEMMA": {
                    "IN": [
                        "define", "drive", "demonstrate", "experience",
                        "expertise", "excellent", "exceptional", "effective",
                        "require", "make", "manage", "work",
                    ]
                }
            },
        },
        {
            "LEFT_ID": "anchor",
            "REL_OP": ".*",
            "RIGHT_ID": "experience",
            "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "PROPN", "VERB"]}},
        },
    ]
    matcher.add("EXPERIENCE", [pattern])
    matches = matcher(doc)
    if not matches:
        return None

    first_token, last_token = start_end_tokens(matches, doc)
    entities = list(doc.ents)
    # Span's end index is exclusive; +1 keeps the last matched token inside
    # the entity instead of silently dropping it.
    entities.append(Span(doc, first_token, last_token + 1, label=label))

    # filter_spans removes overlapping spans so doc.ents stays valid.
    doc.ents = filter_spans(entities)
    return doc

In [27]:
nlp = spacy.load("en_core_web_sm")
@Language.component("annotate_experience_phrase")
def annotate_experience_phrase(doc):
    """Pipeline component that tags experience phrases via ``dep_matcher``.

    ``dep_matcher`` updates ``doc.ents`` in place and returns ``None`` when it
    finds nothing. A spaCy pipeline component must always hand a ``Doc`` back
    to the pipeline, so the (possibly unchanged) ``doc`` is returned either way.

    Args:
        doc: The ``Doc`` passed along by the pipeline.

    Returns:
        The same ``doc``, with an extra entity when a match was found.
    """
    dep_matcher(doc)
    return doc

In [28]:
# Add the registered component to the pipeline and print the resulting
# component order to confirm where it landed.
nlp.add_pipe("annotate_experience_phrase")
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'annotate_experience_phrase']


In [33]:
# Run the pipeline on the second job description and render its entities inline.
displacy.render(nlp(jobdescs[1]), style='ent', jupyter=True)

In [41]:
# Reload the stock model. This rebinds `nlp`, so from here on the pipeline no
# longer contains the custom "annotate_experience_phrase" component.
nlp = spacy.load("en_core_web_sm")
# Visualise the dependency parse of two short sample phrases.
displacy.render(nlp("propose strategic solution. collaborate with senior product leaders"), style='dep', jupyter=True)


In [60]:
# Token-level pattern: a VERB, then one or more ADP/ADJ tokens, then one or
# more NOUN tokens.
patterns = [{"POS": "VERB"}, {"POS": {"IN": ["ADP", "ADJ"]}, "OP": "+"}, {"POS":"NOUN", "OP": "+"}]
# Alternative string-form pattern, kept for reference only (not executed):
#patterns = ["POS:VERB:+ POS:ADP:? POS:ADJ:? POS:NOUN:*"]
doc = nlp("propose strategic solution. collaborate with senior product leaders")

# Print every span that matches. Because the trailing NOUN uses "+", the same
# phrase can be reported at several lengths (see the output below).
verb_chunks = textacy.extract.token_matches(doc, patterns)
for verb_chunk in verb_chunks:
    print(verb_chunk)


propose strategic solution
collaborate with senior product
collaborate with senior product leaders


In [42]:
# Try two dependency patterns on the sample text; both are registered under the
# single key "EXPERIENCE".
doc = nlp("propose strategic solution. collaborate with senior product leaders")
matcher = DependencyMatcher(nlp.vocab)
patterns = [
  # Pattern 1: VERB -> (direct child) ADP -> (direct child) ADJ/ADV.
  [
      {
          "RIGHT_ID": "verb_adj_noun_noun_noun",       # anchor node name (must be unique within the pattern)
          "RIGHT_ATTRS": {"POS": "VERB"}  # anchor token: any verb
      },
      {
          "LEFT_ID": "verb_adj_noun_noun_noun",
          "REL_OP": ">",
          "RIGHT_ID": "adposition", 
          "RIGHT_ATTRS": {"POS":{"IN": ["ADP"]}}
      },
      {
          "LEFT_ID": "adposition",
          "REL_OP": ">",
          "RIGHT_ID": "adjectives", 
          "RIGHT_ATTRS": {"POS":{"IN": ["ADJ", "ADV"]}}
      },
  ],
  # Pattern 2: NOUN that has an `amod` child and whose immediate head is a VERB.
  [
      {
          "RIGHT_ID": "noun_amod_verb",       # anchor node name (must be unique within the pattern)
          "RIGHT_ATTRS": {"POS": "NOUN"}  # anchor token: any noun
      },
      {
          "LEFT_ID": "noun_amod_verb",
          "REL_OP": ">",
          "RIGHT_ID": "modifier", 
          "RIGHT_ATTRS": {"DEP":{"IN": ["amod"]}}
      },
      {
          "LEFT_ID": "noun_amod_verb",
          "REL_OP": "<",
          "RIGHT_ID": "verb", 
          "RIGHT_ATTRS": {"POS": "VERB"}
      }
  ]
]

matcher.add("EXPERIENCE", patterns)
matches = matcher(doc)
print(matches)

# Unpack the first match for inspection in the following cells.
# NOTE(review): raises IndexError if `matches` is empty.
match_id, token_ids = matches[0]
# NOTE(review): the disabled loop below indexes `patterns` by token position,
# i.e. it treats the pattern index as if it were the node index.
#for i in range(len(token_ids)):
    #print(patterns[i][0]["RIGHT_ID"] + ":", doc[token_ids[i]].text)

[(7324372616739864093, [2, 1, 0])]


In [15]:
# Integer id of the first match (first element of the match tuple).
match_id

7324372616739864093

In [16]:
# Token indices of the first match, one per node of the matching pattern.
token_ids

[2, 1, 0]

In [18]:
# Token at the first matched index, i.e. the one bound to the pattern's anchor node.
doc[token_ids[0]]

solution

In [25]:
# RIGHT_ID of the anchor node of the second pattern in `patterns`.
patterns[1][0]["RIGHT_ID"]

'noun_amod_verb'

In [26]:
# Number of token indices in the first match.
len(token_ids)

3

In [47]:
# Re-parse the sample text, list its noun chunks, then print one line per token:
# text, dependency label, head text, head POS and the token's children.
doc = nlp("propose strategic solution. collaborate with senior product leaders")
for chunk in doc.noun_chunks:
    print(chunk)

print("\n ----------------------- \n")
for token in doc:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

strategic solution
senior product leaders

 ----------------------- 

propose ROOT propose VERB [solution, .]
strategic amod solution NOUN []
solution dobj propose VERB [strategic]
. punct propose VERB []
collaborate ROOT collaborate VERB [with]
with prep collaborate VERB [leaders]
senior amod leaders NOUN []
product compound leaders NOUN []
leaders pobj with ADP [senior, product]
