In [1]:
import pandas as pd
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, LOWER, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP
import numpy as np
import en_core_web_sm
import wordninja # for splitting tokens lacking whitespace
# import jamspell
import contextualSpellCheck
import re
from collections import Counter

In [2]:
# read data
narratives = pd.read_csv("../data/narratives.csv")
intake = narratives.column_name.str.contains('take')
narratives = (narratives[intake])[["cr_id", "column_name", "text"]]
narratives = narratives.drop_duplicates()
df = narratives[0:5].copy()

In [3]:
test_text = df["text"][5]
test_text

'It is reported that the accused officer failed\nto terminate a motor vehicle pursuit when\nordered by Sergeant Hernandez\nIt is reported that the accused officer failed\nto terminate a motor vehicle pursuit when\nordered by Sergeant Hernandez\nIt is reported that the accused officer failed\nto terminate a motor vehicle pursuit when\nordered by sergeant Hernandez.\nIt is reported that the accused officer failed\nto terminate a motor vehicle pursuit when\nordered by Sergeant Hernandez'

### Strategy 0: Nada
NOTES:
- Literelly just use spacy to process text imputs.  does pretty good job tagging everything!

In [4]:
# set up NLP
nlp_0 = spacy.load('en_core_web_sm')
doc_test_0 = nlp_0(test_text)

for i,token in enumerate(doc_test_0):
    print(i)
    print(token.text,'--',token.lemma_,'---',token.pos_,'---',token.has_vector,'---',nlp_0.vocab.strings[str(token)], '---',token.is_stop)

0
It -- it --- PRON --- True --- 7859011591137717335 --- True
1
is -- be --- AUX --- True --- 3411606890003347522 --- True
2
reported -- report --- VERB --- True --- 11181246799942687462 --- False
3
that -- that --- SCONJ --- True --- 4380130941430378203 --- True
4
the -- the --- DET --- True --- 7425985699627899538 --- True
5
accused -- accuse --- VERB --- True --- 8915410849535181575 --- False
6
officer -- officer --- NOUN --- True --- 9228201189916158328 --- False
7
failed -- fail --- VERB --- True --- 4500079622559289248 --- False
8

 -- 
 --- SPACE --- True --- 962983613142996970 --- False
9
to -- to --- PART --- True --- 3791531372978436496 --- True
10
terminate -- terminate --- VERB --- True --- 974796105764162566 --- False
11
a -- a --- DET --- True --- 11901859001352538922 --- True
12
motor -- motor --- NOUN --- True --- 1640505308719491870 --- False
13
vehicle -- vehicle --- NOUN --- True --- 854351138829791262 --- False
14
pursuit -- pursuit --- NOUN --- True --- 14575054513

### Strategy 1: Pre-Tokenization Custom Filtering Pipe (Tamar's V1)
NOTES: 
- removing punctiation/stop works/keeps POS tagging from working properly
- Lets explore post processing teqniques

In [8]:
nlp_1 = spacy.load('en_core_web_sm')

@Language.component("repeats_1")
def repeats_1(doc):
  s = doc.text.lower()
  i = (s+" "+s).find(s, 1, -1)
  if i == -1:
    doc = doc
  else:
    doc = nlp_1.make_doc(s[:i-1])
  return(doc)

# component for removing stop words
@Language.component("stopwords_1")
def stopwords_1(doc):
  doc = [t.text for t in doc if not t.is_stop]
  doc = nlp_1.make_doc(' '.join(map(str, doc)))
  return(doc)

# component for removing punctuation
@Language.component("punctuation_1")
def punctuation_1(doc):
  doc = [t.text for t in doc if (not t.is_punct and not t.is_space)]
  doc = nlp_1.make_doc(' '.join(map(str, doc)))
  return(doc)


nlp_1.add_pipe("stopwords_1", name = "stopwords_1", before = "tagger") # add stopword remover to pipeline
# add punctuation remover to pipeline
nlp_1.add_pipe("punctuation_1", name = "punctuation_1", before = "tagger")
# add repeats remover to pipeline
nlp_1.add_pipe("repeats_1", name = "repeats_1", before = "tagger")
# nlp.add_pipe("contextual spellchecker")

doc_test_1 = nlp_1(test_text)
doc_test_1

print(nlp_1.pipe_names)

['tok2vec', 'stopwords_1', 'punctuation_1', 'repeats_1', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [9]:
for i,token in enumerate(doc_test_1):
    print(i)
    print(token.text,'--',token.lemma_,'---',token.pos_,'---',token.has_vector,'---',nlp_1.vocab.strings[str(token)], '---',token.is_stop)

0
reported -- reported --- ADP --- False --- 11181246799942687462 --- False
1
accused -- accused --- ADP --- False --- 8915410849535181575 --- False
2
officer -- officer --- ADP --- False --- 9228201189916158328 --- False
3
failed -- failed --- ADP --- False --- 4500079622559289248 --- False
4
terminate -- terminate --- ADP --- False --- 974796105764162566 --- False
5
motor -- motor --- ADP --- False --- 1640505308719491870 --- False
6
vehicle -- vehicle --- ADP --- False --- 854351138829791262 --- False
7
pursuit -- pursuit --- ADP --- False --- 14575054513208559317 --- False
8
ordered -- ordered --- ADP --- False --- 18198004002626200087 --- False
9
sergeant -- sergeant --- ADP --- False --- 10122840340774289451 --- False
10
hernandez -- hernandez --- ADP --- False --- 5211270583463062168 --- False


### Strategy 2: Filtering Docs After Tagging
NOTES:
- Filters are effectively applied
- Tokens loose important attributes (pos/lemma/vector)
- Token still maintain hash-ids

In [10]:
# set up strategy 2 NLP
nlp_2 = spacy.load('en_core_web_sm')


@Language.component("repeats_2")
def repeats_2(doc):
  s = doc.text.lower()
  i = (s+" "+s).find(s, 1, -1)
  if i == -1:
    doc = doc
  else:
    doc = nlp_2.make_doc(s[:i-1])
  return(doc)

# component for removing stop words
@Language.component("stopwords_2")
def stopwords_2(doc):
  doc = [t.text for t in doc if not t.is_stop]
  doc = nlp_2.make_doc(' '.join(map(str, doc)))
  return(doc)

# component for removing punctuation
@Language.component("punctuation_2")
def punctuation_2(doc):
  doc = [t.text for t in doc if (not t.is_punct and not t.is_space)]
  doc = nlp_2.make_doc(' '.join(map(str, doc)))
  return(doc)

#add sustom pipes
nlp_2.add_pipe("stopwords_2", name = "stopwords_2", last = True) # add stopword remover to pipeline
# add punctuation remover to pipeline
nlp_2.add_pipe("punctuation_2", name = "punctuation_2", last = True)
# add repeats remover to pipeline
nlp_2.add_pipe("repeats_2", name = "repeats_2", last = True)
# nlp.add_pipe("contextual spellchecker")

print(nlp_2.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'stopwords_2', 'punctuation_2', 'repeats_2']


In [11]:
doc_test_2 = nlp_2(test_text)
doc_test_2

reported accused officer failed terminate motor vehicle pursuit ordered sergeant hernandez

In [12]:
for i,token in enumerate(doc_test_2):
    print(i)
    print(token.text,'--',token.lemma_,'---',token.pos_,'---',token.has_vector,'---',nlp_2.vocab.strings[str(token)], '---',token.is_stop)

0
reported --  ---  --- False --- 11181246799942687462 --- False
1
accused --  ---  --- False --- 8915410849535181575 --- False
2
officer --  ---  --- False --- 9228201189916158328 --- False
3
failed --  ---  --- False --- 4500079622559289248 --- False
4
terminate --  ---  --- False --- 974796105764162566 --- False
5
motor --  ---  --- False --- 1640505308719491870 --- False
6
vehicle --  ---  --- False --- 854351138829791262 --- False
7
pursuit --  ---  --- False --- 14575054513208559317 --- False
8
ordered --  ---  --- False --- 18198004002626200087 --- False
9
sergeant --  ---  --- False --- 10122840340774289451 --- False
10
hernandez --  ---  --- False --- 5211270583463062168 --- False


### Strategy 3: Filtering Docs After Tagging
NOTES:
- Filters are ineffectively applied
- Tokens loose important attributes (pos/lemma/vector)
- Token still maintain hash-ids
- Cycled though each step in the pipeline and only works at "after = lemmatizer". syspeciously doesn't produce the same result at just the default "last = True"
- My understanding of spacy says this should work....

In [13]:
# set up strategy 3 NLP
nlp_3 = spacy.load('en_core_web_sm')


@Language.component("repeats_3")
def repeats_3(doc):
  s = doc.text.lower()
  i = (s+" "+s).find(s, 1, -1)
  if i == -1:
    doc = doc
  else:
    doc = nlp_3.make_doc(s[:i-1])
  return(doc)

# component for removing stop words
@Language.component("stopwords_3")
def stopwords_3(doc):
  doc = [t.text for t in doc if not t.is_stop]
  doc = nlp_3.make_doc(' '.join(map(str, doc)))
  return(doc)

# component for removing punctuation
@Language.component("punctuation_3")
def punctuation_3(doc):
  doc = [t.text for t in doc if (not t.is_punct and not t.is_space)]
  doc = nlp_3.make_doc(' '.join(map(str, doc)))
  return(doc)

#add custom pipes
nlp_3.add_pipe("stopwords_3", name = "stopwords_3", after = 'lemmatizer') # add stopword remover to pipeline
# add punctuation remover to pipeline
nlp_3.add_pipe("punctuation_3", name = "punctuation_3", after = 'lemmatizer')
# add repeats remover to pipeline
nlp_3.add_pipe("repeats_3", name = "repeats_3", after = 'lemmatizer')
# nlp.add_pipe("contextual spellchecker")

print(nlp_3.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'repeats_3', 'punctuation_3', 'stopwords_3']


In [14]:
doc_test_3 = nlp_3(test_text)

for i,token in enumerate(doc_test_3):
    print(i)
    print(token.text,'--',token.lemma_,'---',token.pos_,'---',token.has_vector,'---',nlp_2.vocab.strings[str(token)], '---',token.is_stop)

0
reported --  ---  --- False --- 11181246799942687462 --- False
1
accused --  ---  --- False --- 8915410849535181575 --- False
2
officer --  ---  --- False --- 9228201189916158328 --- False
3
failed --  ---  --- False --- 4500079622559289248 --- False
4
terminate --  ---  --- False --- 974796105764162566 --- False
5
motor --  ---  --- False --- 1640505308719491870 --- False
6
vehicle --  ---  --- False --- 854351138829791262 --- False
7
pursuit --  ---  --- False --- 14575054513208559317 --- False
8
ordered --  ---  --- False --- 18198004002626200087 --- False
9
Sergeant --  ---  --- False --- 2732174988197022273 --- False
10
Hernandez --  ---  --- False --- 5312260791442479864 --- False
11
reported --  ---  --- False --- 11181246799942687462 --- False
12
accused --  ---  --- False --- 8915410849535181575 --- False
13
officer --  ---  --- False --- 9228201189916158328 --- False
14
failed --  ---  --- False --- 4500079622559289248 --- False
15
terminate --  ---  --- False --- 974796105

### Strategy 4: Create list of Tokens w/ Preserved Attributes
NOTES:
- List of token maintains important token attributes
- fitleres by stop words, punctuation, repeats.
- Filters are ineffectively applied
- "repeated" elements are those with identical .text attributes and .pos_ attributes.  The idea is to avoid conflating identical words with different POS tags.  Can potentially update this to include other differentating features if it comes up.
- Over all good strategy for leveraging spacy's speech tagging and the going with a bag of words style model
- Potential imporvements:
     - Include custom word count attribute (count the number of occurances for a specific repeat)
     - Include word position information (? from origional doc or from filteres doc? lot to sort out, interesting to think about)
     - funky stuff happening with capitalizations

In [18]:
# set up NLP
nlp_4 = spacy.load('en_core_web_sm')
doc_test_4 = nlp_4(test_text)
tok_list_working = [t for t in doc_test_4 if not t.is_stop]
tok_list_working = [t for t in tok_list_working if (not t.is_punct and not t.is_space)]

# tok_list_working
count_dict = {}
for i, tok in enumerate(tok_list_working):
    count_dict[tok.text.lower() + "_" + tok.pos_] = []
for i, tok in enumerate(tok_list_working):
    count_dict[tok.text.lower() + "_" + tok.pos_].append(i)

unique_index = []
for element in count_dict:
    unique_index.append(count_dict[element][0])

tok_list_final = [tok_list_working[i] for i in unique_index]
tok_list_final

[reported,
 accused,
 officer,
 failed,
 terminate,
 motor,
 vehicle,
 pursuit,
 ordered,
 Sergeant,
 Hernandez]

In [19]:
for i,token in enumerate(tok_list_final):
    print(i)
    print(token.text,'--',token.lemma_,'---',token.pos_,'---',token.has_vector,'---',nlp_0.vocab.strings[str(token)], '---',token.is_stop)

0
reported -- report --- VERB --- True --- 11181246799942687462 --- False
1
accused -- accuse --- VERB --- True --- 8915410849535181575 --- False
2
officer -- officer --- NOUN --- True --- 9228201189916158328 --- False
3
failed -- fail --- VERB --- True --- 4500079622559289248 --- False
4
terminate -- terminate --- VERB --- True --- 974796105764162566 --- False
5
motor -- motor --- NOUN --- True --- 1640505308719491870 --- False
6
vehicle -- vehicle --- NOUN --- True --- 854351138829791262 --- False
7
pursuit -- pursuit --- NOUN --- True --- 14575054513208559317 --- False
8
ordered -- order --- VERB --- True --- 18198004002626200087 --- False
9
Sergeant -- Sergeant --- PROPN --- True --- 2732174988197022273 --- False
10
Hernandez -- Hernandez --- PROPN --- True --- 5312260791442479864 --- False


### Strategy 5: Explicity Filter and Reconstruct Doc object
NOTES:
- Filters are ineffectively applied
- Tokens vector for some reason aren't there any more
- Tokens maintain other important attributes: POS, lemma, etc etc
- Token still maintain hash-ids
- Maybe come up with a way to add element count as a custom attribute to the word in the doc?
- Basically we exploit the class structure of the token/doc objects to make a frankenstein doc.  not sure home much sense this makes for application but w/e.

In [23]:
def remove_tokens(doc, index_to_del, list_attr=[LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, LOWER, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP]):
    """
    Remove tokens from a Spacy *Doc* object without losing 
    associated information (PartOfSpeech, Dependance, Lemma, extensions, ...)
    
    Parameters
    ----------
    doc : spacy.tokens.doc.Doc
        spacy representation of the text
    index_to_del : list of integer 
         positions of each token you want to delete from the document
    list_attr : list, optional
        Contains the Spacy attributes you want to keep (the default is 
        [LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, LOWER, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP])
    Returns
    -------
    spacy.tokens.doc.Doc
        Filtered version of doc
    """
    
    np_array = doc.to_array(list_attr) # Array representation of Doc
    
    # Creating a mask: boolean array of the indexes to delete
    mask_to_del = np.ones(len(np_array), np.bool)
    mask_to_del[index_to_del] = 0
    
    np_array_2 = np_array[mask_to_del]
    doc2 = Doc(doc.vocab, words=[t.text for t in doc if t.i not in index_to_del])
    doc2.from_array(list_attr, np_array_2)
    
    ### Modification made by @yarongon https://gist.github.com/Jacobe2169/5086c7c4f6c56e9d3c7cfb1eb0010fe8#gistcomment-2941380
    # Handling user extensions
    #  The `doc.user_data` dictionary is holding the data backing user-defined attributes.
    #  The data is based on characters offset, so a conversion is needed from the
    #  old Doc to the new one.
    #  More info here: https://github.com/explosion/spaCy/issues/2532
    arr = np.arange(len(doc))
    new_index_to_old = arr[mask_to_del]
    doc_offset_2_token = {tok.idx : tok.i  for tok in doc}  # needed for the user data
    doc2_token_2_offset = {tok.i : tok.idx  for tok in doc2}  # needed for the user data
    new_user_data = {}
    for ((prefix, ext_name, offset, x), val) in doc.user_data.items():
        old_token_index = doc_offset_2_token[offset]
        new_token_index = np.where(new_index_to_old == old_token_index)[0]
        if new_token_index.size == 0:  # Case this index was deleted
            continue
        new_char_index = doc2_token_2_offset[new_token_index[0]]
        new_user_data[(prefix, ext_name, new_char_index, x)] = val
    doc2.user_data = new_user_data
    
    return doc2

In [24]:
nlp_5 = spacy.load('en_core_web_sm')
doc_test_5 = nlp_5(test_text)

#collect stop word/punctuation indices 
index_list = []
for i, tok in enumerate(doc_test_5):
    if tok.is_stop:
        index_list.append(i)
    if (tok.is_punct or tok.is_space):
        index_list.append(i)

#Collect repeated element indices
count_dict = {}

for i, tok in enumerate(doc_test_5):
    count_dict[tok.text.lower()] = []
for i, tok in enumerate(doc_test_5):
    count_dict[tok.text.lower()].append(i)

repeat_indeces = []
for element in count_dict:
    repeat_indeces +=count_dict[element][1:]

#combine unwanted indices
final_indeces = repeat_indeces + index_list
final_indeces

doc_test_5 = remove_tokens(doc_test_5,index_to_del=final_indeces)
print(doc_test_5)

reported accused officer failed terminate motor vehicle pursuit ordered Sergeant Hernandez 


In [25]:
for i,token in enumerate(doc_test_5):
    print(i)
    print(token.text,'--',token.lemma_,'---',token.pos_,'---',token.has_vector,'---',nlp_2.vocab.strings[str(token)], '---',token.is_stop)

0
reported -- report --- VERB --- False --- 11181246799942687462 --- False
1
accused -- accuse --- VERB --- False --- 8915410849535181575 --- False
2
officer -- officer --- NOUN --- False --- 9228201189916158328 --- False
3
failed -- fail --- VERB --- False --- 4500079622559289248 --- False
4
terminate -- terminate --- VERB --- False --- 974796105764162566 --- False
5
motor -- motor --- NOUN --- False --- 1640505308719491870 --- False
6
vehicle -- vehicle --- NOUN --- False --- 854351138829791262 --- False
7
pursuit -- pursuit --- NOUN --- False --- 14575054513208559317 --- False
8
ordered -- order --- VERB --- False --- 18198004002626200087 --- False
9
Sergeant -- Sergeant --- PROPN --- False --- 2732174988197022273 --- False
10
Hernandez -- Hernandez --- PROPN --- False --- 5312260791442479864 --- False


### Strategy 6: Add Custom Attributes Tagger
NOTES: note attempted