In [175]:
#####Set the column names appearing in the incident narratives .csv file, beginning with the individual case identifier.
##### As an example, shown here, we include 7 columns, one each for the case identifier
##### (i.e., PersonID) and the incident narrative text, as well as one column for each 
##### of our five binary labels that we identified and coded through our manual review process.

col_names = ["PersonID", "AcuteChronicPain", "RecentDispute", "RomanticRelationshipProblem", "ImmediateFamily_AtScene", "IntimatePartner_AtScene", "narrative"]

In [176]:
###Load your incident narrative data by inputting the file path within double quotation marks below, replacing the sample file path.

import pandas as pd
train = pd.read_csv(r"C:\Users\Box\code_files\narratives_file.csv", header=0, dialect='excel', encoding = "ISO-8859-1")

In [177]:
###This will tell you the shape of your incident narratives data file.

train.shape

(1462, 21)

In [178]:
###This will indicate the column names.

train.columns.values

array(['ï»¿PersonID', 'AbusiveRelationship', 'AcuteChronicPain',
       'caregiver', 'Bullying', 'Custody', 'Dementia', 'Abortion',
       'SubstanceAbuse', 'Infertility', 'Miscarriage', 'PostpartumMH',
       'RecentDispute', 'RomanticRelationshipProblem', 'Sexual_violence',
       'Sleep_Prob', 'Isolation_Lonliness', 'Terminal',
       'ImmediateFamily_AtScene', 'IntimatePartner_AtScene', 'narrative'],
      dtype=object)

In [179]:
###This is a nice check to read a narrative (here the 5th narrative) and ensure the incident narrative text mirrors what is in the .csv file.

print(train["narrative"][5])

 ME: VICTIM 60 YEAR OLD WHITE FEMALE DIED FROM A SELF-INFLICTED SHOTGUN TO CHEST WITH A .12 GAUGE DOUBLE BARREL SHOTGUN AT HER RESIDENCE.  VICTIMâS HUSBAND FOUND THE VICTIM SITTING OUT COUCH, AFTER HER HUSBAND REPORTED HEARING A LOUD NOISE. HE WENT INTO THE LIVING ROOM AND FOUND THE VICTIM. THERE WAS A CHILDRENâS SNORKEL CLOSE TO THE GUN.  IT IS BELIEVED THAT THE VICTIM USED THIS TO PULL THE TRIGGER.  EMS RESPONDED TO THE HOME. HOWEVER, NO RESUSCITATIVE MEASURES WERE TAKEN.  VICTIM WAS PRONOUNCED ON SCENE BY CORONER.  VICTIM SUFFERED FROM DEPRESSION FOR TEN (10) YEARS.  THIS WAS DUE TO SEVERAL DEATHS TO FAMILY MEMBERS. SHE WAS TAKING MEDICATIONS TO HELP WITH HER DEPRESSION.  OVER THE PAST COUPLE OF MONTHS, THE VICTIMâS HUSBAND HAS PICKED UP ON SIGNS THAT THE VICTIM MAY BE DEPRESSED AGAIN.  THE HUSBAND ADDRESSED THIS CONCERN WITH THE VICTIM. DURING THAT CONVERSATION, SHE ADMITTED THAT SHE HAS STOPPED TAKING HER MEDICATION.  TEN (10) YEARS AGO, VICTIM VOICED SUICIDAL IDEATIONS.  NO

In [180]:
###Likewise, you can import and use BeautifulSoup to check and ensure the incident narrative text mirrors what is in the .csv file.

from bs4 import BeautifulSoup             

example1 = BeautifulSoup(train["narrative"][5]) 

print(train["narrative"][5])
print(example1.get_text())


In [None]:
###### NLP Preprocessing the Incident Narratives

In [182]:
###Here we can remove punctuation and non-alphanumeric symbols so that we are left with words.

import re
# Use regular expressions to do a find-and-replace
letters_only = re.sub("[^a-zA-Z]",           # The pattern to search for
                      " ",                   # The pattern to replace it with
                      example1.get_text() )  # The text to search
print(letters_only)

ME  VICTIM    YEAR OLD WHITE FEMALE DIED FROM A SELF INFLICTED SHOTGUN TO CHEST WITH A     GAUGE DOUBLE BARREL SHOTGUN AT HER RESIDENCE   VICTIM   S HUSBAND FOUND THE VICTIM SITTING OUT COUCH  AFTER HER HUSBAND REPORTED HEARING A LOUD NOISE  HE WENT INTO THE LIVING ROOM AND FOUND THE VICTIM  THERE WAS A CHILDREN   S SNORKEL CLOSE TO THE GUN   IT IS BELIEVED THAT THE VICTIM USED THIS TO PULL THE TRIGGER   EMS RESPONDED TO THE HOME  HOWEVER  NO RESUSCITATIVE MEASURES WERE TAKEN   VICTIM WAS PRONOUNCED ON SCENE BY CORONER   VICTIM SUFFERED FROM DEPRESSION FOR TEN      YEARS   THIS WAS DUE TO SEVERAL DEATHS TO FAMILY MEMBERS  SHE WAS TAKING MEDICATIONS TO HELP WITH HER DEPRESSION   OVER THE PAST COUPLE OF MONTHS  THE VICTIM   S HUSBAND HAS PICKED UP ON SIGNS THAT THE VICTIM MAY BE DEPRESSED AGAIN   THE HUSBAND ADDRESSED THIS CONCERN WITH THE VICTIM  DURING THAT CONVERSATION  SHE ADMITTED THAT SHE HAS STOPPED TAKING HER MEDICATION   TEN      YEARS AGO  VICTIM VOICED SUICIDAL IDEATIONS   NO 

In [183]:
###Download the following NLTK packages.
import nltk
nltk.download('punkt')
nltk.download('stopwords') 

In [184]:
###Import the stop word list.

from nltk.corpus import stopwords
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [185]:
###### NLP Preprocessing: Including token stemming (and lemmatization). Choose either suboption 4b below for Snowball stemming
###### or 4a below for lemmatization using #s to disable one or the other.

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

lemmer=WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def narrative_to_words( raw_narrative ):
    # Function to convert a raw narrative to a string of words
    # The input is a single string (a raw text narrative), and 
    # the output is a single string (a preprocessed text narrative)
    #
    # 1. Remove HTML
    narrative_text = BeautifulSoup(raw_narrative).get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", narrative_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()  
    
    # 4a. Lemmatize the individual words
    #####newcorpus=[' '.join([lemmer.lemmatize(words) for words in text.split(' ')]) for text in words]
    
    # 4b. Stemming the individual words
    newcorpus=[' '.join([stemmer.stem(words) for words in text.split(' ')]) for text in words]
    
    # 5. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 6. Remove stop words
    meaningful_words = [w for w in newcorpus if not w in stops]   
    #
    # 7. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))   

In [186]:
###From our previous example, we can see what the preprocessing steps do to each narrative report.

clean_narrative = narrative_to_words(train["narrative"][5])
print(clean_narrative)

victim year old white femal die self inflict shotgun chest gaug doubl barrel shotgun resid victim husband found victim sit couch husband report hear loud nois went live room found victim children snorkel close gun believ victim use pull trigger em respond home howev resuscit measur taken victim pronounc scene coron victim suffer depress ten year due sever death famili member take medic help depress past coupl month victim husband pick sign victim may depress husband address concern victim dure convers admit stop take medic ten year ago victim voic suicid ideat note intent found scene detail co victim year old white femal die self inflict gunshot wound chest gaug shotgun victim place resid victim use snorkel engag firearm victim mother commit suicid approxim year grandson granddaught kill car wreck victim talk suicid never attempt victim left note detail time victim year old white femal die self inflict gunshot wound chest shotgun victim place resid detail time


In [187]:
###Get the number of text narratives based on the dataframe column size
num_narratives = train["narrative"].size

###Initialize an empty list to hold the clean reviews
clean_train_narratives = []

In [188]:
###Loop over each text narrative; create an index i that goes from 0 to the length
### of the text narrative list 
for i in range( 0, num_narratives ):
    # Call our function for each one, and add the result to the list of
    # clean reviews
    clean_train_narratives.append( narrative_to_words( train["narrative"][i] ) )

In [189]:
###Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             ngram_range = (1,2), \
                             max_features = 1000) 

### fit_transform() does two functions: First, it fits the model
### and learns the vocabulary; second, it transforms our training data
### into feature vectors. The input to fit_transform should be a list of 
### strings.

train_data_features = vectorizer.fit_transform(clean_train_narratives)


In [190]:
### Weight word tokens using TF-IDF technique.

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(train_data_features)

### Convert the TF-IDF results to an array

tfidf_features = X_train_tfidf.toarray()

In [191]:
###Set your features (X) to be the TF-IDF weighted features from the previous step.

### IMPORTANT: Set you label of interest (y) here. This will change for different labels.
### For example, IntimatePartner_AtScene would be set as our label below.

X = tfidf_features
y = train["IntimatePartner_AtScene"]

In [194]:
### Take a look at the words in the vocabulary.

vocab = vectorizer.get_feature_names()
print(vocab)

['abdomen', 'abl', 'abus', 'access', 'accord', 'accord report', 'accord victim', 'act', 'addict', 'addit', 'address', 'admit', 'advis', 'age', 'ago', 'alcohol', 'alcohol abus', 'aliv', 'alon', 'along', 'alprazolam', 'also', 'also found', 'also report', 'also state', 'alway', 'ammunit', 'amount', 'ani', 'ani prior', 'anoth', 'answer', 'antidepress', 'anxieti', 'anxieti depress', 'anymor', 'anyth', 'apart', 'appar', 'appar gunshot', 'appar self', 'appar suicid', 'appear', 'approxim', 'approxim hour', 'area', 'argu', 'argument', 'arm', 'around', 'around hour', 'arrest', 'arriv', 'arriv found', 'arriv home', 'arriv pronounc', 'arriv scene', 'ask', 'assist', 'attempt', 'attempt suicid', 'attempt threat', 'attempt victim', 'auto', 'automat', 'automat handgun', 'automat pistol', 'autopsi', 'avail', 'away', 'bac', 'back', 'backyard', 'bag', 'basement', 'bathroom', 'bathtub', 'becam', 'becaus', 'becom', 'bed', 'bedroom', 'bedroom resid', 'beer', 'befor', 'befor incid', 'began', 'behavior', 'beh

In [195]:
import numpy as np

### Sum the counts of each vocabulary word.

dist = np.sum(tfidf_features, axis=0)

### For each, print the vocabulary word and the frequency it 
### appears in the training set.

for tag, count in zip(vocab, dist): print(count, tag)

6.29240627276257 abdomen
7.817713233039054 abl
21.556492515607864 abus
3.685824917306344 access
17.41640762114624 accord
7.334962757462632 accord report
3.372440192545998 accord victim
4.4740471282644805 act
4.075255007845817 addict
7.833842586032708 addit
6.111794050021267 address
5.495639115599844 admit
12.516563642591601 advis
8.74054356038504 age
18.437746255022265 ago
26.89615248281266 alcohol
7.784408354826711 alcohol abus
12.33496028289731 aliv
4.657886872687203 alon
7.4780564839389605 along
5.198018427324151 alprazolam
29.517285903850887 also
3.3297355204388266 also found
4.63309748934102 also report
5.071228730014727 also state
5.091002197790329 alway
3.9623503130090096 ammunit
4.6187846713555984 amount
19.113163749411992 ani
6.597946924098546 ani prior
12.82534990949863 anoth
6.542648615986751 answer
4.911334066512435 antidepress
15.824946223383597 anxieti
4.558883489862856 anxieti depress
6.620963230482594 anymor
3.7686815005867333 anyth
9.301092701396584 apart
20.7243302741

In [196]:
######
###### Supervised Machine Learning: Classifying and Testing Mode Performance
######

In [197]:
#####
#####Gradient Boosting (GB) Classifier
#####

In [198]:
###Create a 20% hold-out test set
### If you are using this process to tune and identify a best model's hyperparameters,
### it is a good practice to conduct this experiment with multiple (e.g., 5) random train_test_split
### seeds (e.g, random_state = 72, below) and multiple random model seeds (e.g., random_state=891, below).

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.20, random_state=72)

In [199]:
###Defining the GB hyperparameter choices. You can learn more about this classifier
### and the hyperparameter options at 
### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html.

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform
from scipy.stats import uniform, loguniform
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt  

param_grid_gb = params = {
                  "loss": ['exponential'],
                  "learning_rate": sp_randFloat(),
                  "subsample"    : sp_randFloat(),
                  "n_estimators" : sp_randInt(100, 1000),
                  "max_depth"    : sp_randInt(4, 10),
                  "min_samples_split" : [2, 5, 10],
                  "min_samples_leaf": [1, 2, 4],
                  "max_features" : ["log2","sqrt"],
                  "criterion": ["friedman_mse",  "mse"],
}

In [200]:
###Conduct 5-fold CV to validate the model while optimizing the model hyperparameters to maximize F1.

import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

gb = GradientBoostingClassifier(random_state=891)

# Random search of parameters, using 5 fold cross validation, 
# search across 50 different combinations, and use all available cores
clf_random_gb = RandomizedSearchCV(estimator=gb, param_distributions=param_grid_gb,
                              scoring='f1', n_iter = 100,
                              cv = 5, verbose=2, n_jobs=-1, random_state=891)

# Fit the random search model
clf_random_gb.fit(X_train, Y_train);
Y_pred=clf_random_gb.predict(X_test)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [201]:
###Dispay the best model hyperparameters identified through the process above. You will use these later to
### specify your best GB model when applying the label of interest to your broader sample of unlabeled 
### data in NVDRS.

clf_random_gb.best_params_

{'criterion': 'friedman_mse',
 'learning_rate': 0.5573336512484721,
 'loss': 'exponential',
 'max_depth': 7,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 995,
 'subsample': 0.85095136244715}

In [202]:
def evaluate(model, X_test, Y_test):
    predictions = model.predict(X_test)
    errors = abs(predictions - Y_test)
    mape = 100 * np.mean(errors / Y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    

In [203]:
###Examine the best model's performance metrics for this combination of random seeds.

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8783    0.9352    0.9058       216
           1     0.7778    0.6364    0.7000        77

    accuracy                         0.8567       293
   macro avg     0.8280    0.7858    0.8029       293
weighted avg     0.8519    0.8567    0.8517       293



In [204]:
###You can also examine specific performance metrics as such.

roc_auc_score(Y_test, Y_pred)
#roc_auc_score(Y_test, clf_random_gb.best_estimator_.predict(X_test))

0.7857744107744108