### We seek to implement the full strategy 4 pipeline.  The input is raw text and the output is a list of tokens processed by spaCy.  The pipeline schematic is described below:
1. Input normalization
2. spaCy Processing
    - tokenization
    - tagging
    - parsing
    - ner
3. Filtering
4. Bag of Tokens

#### Import packages

In [70]:
import pandas as pd
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA, DEP, LEMMA, LOWER, IS_PUNCT, IS_DIGIT, IS_SPACE, IS_STOP
import numpy as np
import en_core_web_sm
import wordninja # for splitting tokens lacking whitespace
# import jamspell
import contextualSpellCheck
import re
from collections import Counter

#### Define input normalization and token filtering functions

In [4]:
def input_normalization(text):
    '''
    Standardize a raw narrative string before it is handed to spaCy.

    input:
        text: raw text as string
    output:
        lower-cased copy of text in which every run of whitespace
        (spaces, tabs, newlines) is collapsed to a single space and
        leading/trailing whitespace is dropped
    '''
    # split() with no separator breaks on any whitespace run and discards
    # empty pieces, so re-joining on ' ' normalizes all spacing in one pass.
    return ' '.join(text.lower().split())

def token_filtering(doc):
    '''
    input:
        doc: spaCy doc object (any iterable of tokens exposing is_stop,
             is_punct, is_space and lemma_)
    output:
        list of the token objects kept, in document order
    Description:
        Drops stop words, punctuation and whitespace tokens, then keeps only
        the first token seen for each lemma (lemmas are compared
        case-insensitively).
    '''
    seen_lemmas = set()
    kept_tokens = []
    for token in doc:
        # Unwanted token classes are skipped before they can claim a lemma.
        if token.is_stop or token.is_punct or token.is_space:
            continue

        lemma_key = token.lemma_.lower()
        if lemma_key in seen_lemmas:
            # A token with this lemma was already kept; later repeats are dropped.
            continue

        seen_lemmas.add(lemma_key)
        kept_tokens.append(token)
    return kept_tokens

In [52]:
def lemma_filtering(doc):
    '''
    input:
        doc: spaCy doc object (any iterable of tokens exposing is_stop,
             is_punct, is_space and lemma_)
    output:
        list_of_lemmas: list of lemma strings (token.lemma_), in document
        order, with repeats kept so they can be counted afterwards
    Description:
        Drops stop words, punctuation and whitespace tokens and returns the
        lemma of every remaining token. The lemmas are returned exactly as
        stored on the token (no case folding).
    '''
    return [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
           
def list_to_bag(list):
    '''
    input:
        list: iterable of hashable items (in this notebook, lemma strings)
    output:
        collections.Counter mapping each distinct item to the number of
        times it occurs in the input
    '''
    # NOTE: the parameter name shadows the builtin `list`; it is left as-is
    # so that callers passing it by keyword keep working.
    return Counter(list)

#### Read Data
 - filter by intake allegation

In [71]:
# Load the narratives table and keep only the intake-allegation rows.
narratives = pd.read_csv("/Users/pedrogalarza/Documents/NYU-MSDS/2021_police-misconduct/misclass/data/narratives.csv")
# 'take' matches column names such as "Initial / Intake Allegation".
# na=False makes rows with a missing column_name count as non-matches; without
# it the mask contains NaN for those rows and boolean indexing raises.
intake = narratives.column_name.str.contains('take', na=False)
narratives = (narratives[intake])[["cr_id", "column_name", "text"]]
# Drop rows that are identical across all three retained columns.
narratives = narratives.drop_duplicates()
# Work on an independent copy of the first 100 rows so later column
# assignments do not touch (or warn about) the full narratives frame.
df = narratives[:100].copy()
df.head()

Unnamed: 0,cr_id,column_name,text
1,1049924,Initial / Intake Allegation,"THE REPORTING PARTY, WHO DID NOT\nWITNESS THE ..."
5,1050193,Initial / Intake Allegation,It is reported that the accused officer failed...
9,1050294,Initial / Intake Allegation,The reporting party (aD\nalleges that he was b...
12,1050294,Initial / Intake Allegation,The reporting party\nalleges that an unknown o...
13,1050508,Initial / Intake Allegation,The complainant alleges that the accused\noffi...


In [74]:
# Test out the pipeline on a single entry before running it over the whole frame.
nlp = spacy.load('en_core_web_sm')

# Take the row at position 3 (the fourth row) of the sample frame.
test_text = df['text'].iloc[3]
# Same stage order as the schematic: normalize -> spaCy -> filter -> bag.
test_text = input_normalization(test_text)
test_doc = nlp(test_text)
lemma_list = lemma_filtering(test_doc)
# Last expression of the cell: the resulting Counter is what the notebook displays.
list_to_bag(lemma_list)

Counter({'reporting': 4,
         'party': 4,
         'allege': 5,
         'unknown': 3,
         'officer': 3,
         'tase': 2,
         'reason': 2,
         'try': 1,
         'help': 1,
         'brother': 4,
         'ee': 1,
         'strike': 2,
         'pass': 2,
         'vehicle': 3,
         'ma': 1,
         'beat': 1,
         'ncer': 1,
         'club': 1,
         'call': 2,
         'kick': 1,
         'begin': 2,
         'walk': 1,
         'hé': 1,
         'observe': 1,
         'incident': 1,
         'run': 1,
         'rescue': 1,
         'time': 1,
         'police': 3,
         'arrive': 1,
         'fail': 2,
         'file': 1,
         'report': 1,
         'ar': 1,
         'behalf': 1,
         'provide': 1,
         'service': 1,
         'request': 1,
         'verbal': 1,
         'altercation': 1,
         'crowd': 1,
         'appear': 1,
         'physical': 1,
         'fight': 1,
         'ensue': 1,
         'sergeant': 1})

#### Implement

In [75]:
# Run the whole pipeline over the sample frame, adding one column per stage.
nlp = spacy.load('en_core_web_sm')

# Copy so the derived columns are not added to df itself.
df_tokenized = df.copy()
# Every stage reads exactly one column, so map over that column directly
# instead of going row by row with DataFrame.apply(axis=1): no per-row Series
# is built, and an empty frame simply yields empty columns.
df_tokenized['text_norm'] = df_tokenized['text'].apply(input_normalization)
# Wrapped in a lambda so pandas treats the spaCy pipeline as a plain callable.
df_tokenized['doc'] = df_tokenized['text_norm'].apply(lambda text: nlp(text))
# Tokens kept after filtering, first occurrence per lemma only.
df_tokenized['token_list'] = df_tokenized['doc'].apply(token_filtering)
# Lemma counts per narrative (repeats are counted, not dropped).
df_tokenized['bag_of_lemmas'] = df_tokenized['doc'].apply(lambda doc: list_to_bag(lemma_filtering(doc)))

# Last expression of the cell: displays the resulting frame.
df_tokenized

Unnamed: 0,cr_id,column_name,text,text_norm,doc,token_list,bag_of_lemmas
1,1049924,Initial / Intake Allegation,"THE REPORTING PARTY, WHO DID NOT\nWITNESS THE ...","the reporting party, who did not witness the i...","(the, reporting, party, ,, who, did, not, witn...","[reporting, party, witness, incident, alleges,...","{'reporting': 1, 'party': 1, 'witness': 1, 'in..."
5,1050193,Initial / Intake Allegation,It is reported that the accused officer failed...,it is reported that the accused officer failed...,"(it, is, reported, that, the, accused, officer...","[reported, accused, officer, failed, terminate...","{'report': 4, 'accuse': 4, 'officer': 4, 'fail..."
9,1050294,Initial / Intake Allegation,The reporting party (aD\nalleges that he was b...,the reporting party (ad alleges that he was be...,"(the, reporting, party, (, ad, alleges, that, ...","[reporting, party, ad, alleges, beaten, bounce...","{'reporting': 5, 'party': 5, 'ad': 1, 'allege'..."
12,1050294,Initial / Intake Allegation,The reporting party\nalleges that an unknown o...,the reporting party alleges that an unknown of...,"(the, reporting, party, alleges, that, an, unk...","[reporting, party, alleges, unknown, officer, ...","{'reporting': 4, 'party': 4, 'allege': 5, 'unk..."
13,1050508,Initial / Intake Allegation,The complainant alleges that the accused\noffi...,the complainant alleges that the accused offic...,"(the, complainant, alleges, that, the, accused...","[complainant, alleges, accused, officer, physi...","{'complainant': 1, 'allege': 1, 'accuse': 1, '..."
...,...,...,...,...,...,...,...
217,1073593,Initial / Intake Allegation,The reporting party victim alleged that when\n...,the reporting party victim alleged that when t...,"(the, reporting, party, victim, alleged, that,...","[reporting, party, victim, alleged, accused, o...","{'reporting': 2, 'party': 2, 'victim': 2, 'all..."
218,1074323,Initial / Intake Allegation,The reporting party alleged that an officer\np...,the reporting party alleged that an officer pu...,"(the, reporting, party, alleged, that, an, off...","[reporting, party, alleged, officer, pulled, v...","{'reporting': 1, 'party': 1, 'allege': 1, 'off..."
221,1074323,Initial / Intake Allegation,The reporting party alleged that an officer\np...,the reporting party alleged that an officer pu...,"(the, reporting, party, alleged, that, an, off...","[reporting, party, alleged, officer, pulled, v...","{'reporting': 1, 'party': 1, 'allege': 1, 'off..."
222,1074498,Initial / Intake Allegation,The complainant alleged that the accused\noffi...,the complainant alleged that the accused offic...,"(the, complainant, alleged, that, the, accused...","[complainant, alleged, accused, officers, appr...","{'complainant': 4, 'allege': 2, 'accuse': 2, '..."
