In [140]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import pandas as pd
nlp = spacy.load("en_core_web_sm")

### Lets check our rule on a larger corpus

In [141]:
active_passive = pd.read_csv('active_passive.csv')
active_passive.head(2)

Unnamed: 0,Active,Passive
0,He reads a novel.,A novel is read.
1,He does not cook food.,Food is not cooked by him.


In [142]:
active_passive_new = pd.read_csv('active_passive_upGrad_Q.csv', encoding='unicode_escape')
active_passive_new.head(2)

Unnamed: 0,Active,Passive
0,What is your name?,Is she being promoted as new assistant?
1,Is coffee serving here?,Women are said to live longer than men.


In [143]:
active_passive.shape

(40, 2)

In [144]:
active = active_passive['Active']
passive = active_passive['Passive']

In [145]:
active_new = active_passive_new['Active']
passive_new = active_passive_new['Passive']

### Create the rule

In [146]:
passive_rule = [{'DEP':'nsubjpass'}]
matcher = Matcher(nlp.vocab)
matcher.add('Rule',[passive_rule])

In [147]:
def is_passive(doc,matcher):
    if len(matcher(doc))>0:
        return True
    else:
        return False

### Check rule on active voice sentences

In [148]:
cnt = 0
for sent in active:
    doc = nlp(sent)
    if not is_passive(doc,matcher):
        cnt += 1
print(cnt)

40


In [149]:
cnt = 0
for sent in active_new:
    doc = nlp(sent)
    if not is_passive(doc,matcher):
        cnt += 1
print(cnt)

2


### Check rule on passive voice sentences

In [150]:
cnt = 0
for sent in passive:
    doc = nlp(sent)
    if is_passive(doc,matcher):
        cnt += 1
print(cnt)

39


In [151]:
cnt = 0
for sent in passive_new:
    doc = nlp(sent)
    if not is_passive(doc,matcher):
        cnt += 1
print(cnt)

0


### Let's troubleshoot

In [152]:
cnt = 0
missed = []
for sent in passive:
    doc = nlp(sent)
    if is_passive(doc,matcher):
        cnt += 1
    else:
        missed.append(doc)
print(cnt)

39


In [153]:
cnt = 0
missed_new = []
for sent in passive_new:
    doc = nlp(sent)
    if is_passive(doc,matcher):
        cnt += 1
    else:
        missed_new.append(doc)
print(cnt)

2


In [154]:
missed[0]

Is a table being bought by Ritika?

In [155]:
missed_new[0]

IndexError: list index out of range

In [None]:
missed[1]

IndexError: list index out of range

### Let's visualize their dependency trees

In [None]:
for doc in missed:
    displacy.render(doc, style="dep")

In [None]:
for doc in missed_new:
    displacy.render(doc, style="dep")

In [None]:
spacy.explain("auxpass")

'auxiliary (passive)'

[Dependencies](https://universaldependencies.org/docs/en/dep/)

### Update our rule
[Reference](https://spacy.io/usage/rule-based-matching)

In [None]:
passive_rule = [{'DEP':{"IN":['nsubjpass','auxpass']}}] 
 # old rule: passive_rule = [{'DEP':'nsubjpass'}]
matcher = Matcher(nlp.vocab)
matcher.add('Rule',[passive_rule])

In [None]:
cnt = 0
for sent in active:
    doc = nlp(sent)
    if not is_passive(doc,matcher):
        cnt += 1
print(cnt)

40


In [None]:
cnt = 0
for sent in active_new:
    doc = nlp(sent)
    if not is_passive(doc,matcher):
        cnt += 1
print(cnt)

2


In [None]:
cnt = 0
missed = []
for sent in passive:
    doc = nlp(sent)
    if is_passive(doc,matcher):
        cnt += 1
    else:
        missed.append(doc)
print(cnt)

40


In [None]:
cnt = 0
missed = []
for sent in passive_new:
    doc = nlp(sent)
    if is_passive(doc,matcher):
        cnt += 1
    else:
        missed.append(doc)
print(cnt)

2


## Summary
 - Always test your rules and hueristics on a larger corpus to see the effectiveness of the rules
 - One can write intricate matching rules using `matcher` object