In [None]:
##### Column names appearing in the incident narratives .csv file, beginning with
##### the individual case identifier.
##### This example uses 7 columns: the case identifier (PersonID), one column for
##### each of the five binary labels identified and coded through manual review,
##### and the incident narrative text.

col_names = [
    "PersonID",
    "AcuteChronicPain",
    "RecentDispute",
    "RomanticRelationshipProblem",
    "ImmediateFamily_AtScene",
    "IntimatePartner_AtScene",
    "narrative",
]

In [None]:
### Load your incident narrative data. Replace the sample file path below
### (inside the double quotation marks) with the path to your own .csv file.

import pandas as pd

train = pd.read_csv(
    r"C:\Users\Box\code_files\narratives_file.csv",
    header=0,                   # the first row of the file holds the column names
    dialect='excel',
    encoding="ISO-8859-1",
)

In [None]:
### Show the shape of the incident narratives data as a (rows, columns) tuple.

train.shape

In [None]:
### Show the column names of the loaded data as an array.

train.columns.values

In [None]:
### Sanity check: print one narrative (here the one with index label 5, i.e., the sixth row
### under the default 0-based index) and ensure the text mirrors what is in the .csv file.

print(train["narrative"][5])

In [None]:
### Likewise, you can use BeautifulSoup to strip any markup from a narrative and
### ensure the remaining text mirrors what is in the .csv file.

from bs4 import BeautifulSoup

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (so results can differ between machines) and emits a
# GuessedAtParserWarning.
example1 = BeautifulSoup(train["narrative"][5], "html.parser")

print(train["narrative"][5])    # raw narrative as loaded from the file
print(example1.get_text())      # the same narrative with markup removed


In [None]:
###### NLP Preprocessing the Incident Narratives

In [None]:
### Replace punctuation, digits and every other non-letter character with a space
### so that we are left with words only.

import re

# Regular-expression find-and-replace: each character outside a-z / A-Z
# in the example narrative becomes a single space.
letters_only = re.sub("[^a-zA-Z]", " ", example1.get_text())
print(letters_only)

In [None]:
### Download the NLTK resources used by this notebook.
import nltk

# 'stopwords' backs the stop word list and 'wordnet' is required by
# WordNetLemmatizer (lemmatization suboption 4a in narrative_to_words);
# 'punkt' is kept from the original list of downloads.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
### Import NLTK's stop word corpus and print its English stop word list.

from nltk.corpus import stopwords
print(stopwords.words("english"))

In [None]:
###### NLP Preprocessing: token stemming or lemmatization.
###### Inside narrative_to_words below, choose either suboption 4b (Snowball stemming)
###### or suboption 4a (lemmatization) by using #s to disable one or the other.

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# Created once here so narrative_to_words does not rebuild them per narrative.
# stemmer is used by suboption 4b; lemmer is only needed if suboption 4a is enabled.
lemmer=WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def narrative_to_words( raw_narrative ):
    """Convert one raw incident narrative into a cleaned, space-separated string.

    Steps: strip markup, keep letters only, lower-case and tokenize, drop
    English stop words, stem (or lemmatize) each remaining token, and join the
    tokens back into a single string.

    Parameters
    ----------
    raw_narrative : str
        Raw narrative text. A missing value (None or NaN) is treated as an
        empty narrative.

    Returns
    -------
    str
        The preprocessed narrative; an empty string if nothing remains.
    """
    # 0. Missing narratives are None/NaN rather than str; nothing to clean.
    if raw_narrative is None or (
        isinstance(raw_narrative, float) and raw_narrative != raw_narrative
    ):
        return ""
    #
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers are installed (and no
    #    GuessedAtParserWarning is raised for every narrative).
    narrative_text = BeautifulSoup(raw_narrative, "html.parser").get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", narrative_text)
    #
    # 3. Convert to lower case, split into individual words, and drop stop
    #    words. Searching a set is much faster than searching a list, so the
    #    stop words are converted to a set. Filtering happens BEFORE
    #    stemming because stems such as "becaus", "veri" or "onli" no longer
    #    match the stop word list and would otherwise survive.
    stops = set(stopwords.words("english"))
    tokens = [w for w in letters_only.lower().split() if w not in stops]

    # 4a. Lemmatize the individual words
    #####newcorpus = [lemmer.lemmatize(w) for w in tokens]

    # 4b. Stemming the individual words
    newcorpus = [stemmer.stem(w) for w in tokens]

    # 5. Also drop any token whose normalized form is itself a stop word
    #    (this keeps the filter the function applied after stemming).
    meaningful_words = [w for w in newcorpus if w not in stops]
    #
    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

In [None]:
### Using the same example narrative as before (index 5), show what the preprocessing
### steps do to a narrative report.

clean_narrative = narrative_to_words(train["narrative"][5])
print(clean_narrative)

In [None]:
### Get the number of text narratives from the size of the "narrative" column.
num_narratives = train["narrative"].size

### Initialize an empty list to hold the cleaned narratives.
clean_train_narratives = []

In [None]:
### Preprocess every narrative, in file order, into a fresh list.
### The list is built in a single assignment (instead of appending to a list
### created in another cell) so this cell is idempotent: re-running it cannot
### add duplicate entries. Iterating over the column values also avoids the
### label-based lookup train["narrative"][i], which fails when the dataframe
### does not have the default 0..n-1 index.
clean_train_narratives = [narrative_to_words(text) for text in train["narrative"]]

In [None]:
### Build the "CountVectorizer" object, which is scikit-learn's bag of words tool.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    ngram_range=(1, 2),     # single words and two-word sequences
    max_features=1000,      # cap on the vocabulary size
)

### fit_transform() does two things in one call: it learns the vocabulary from
### the cleaned narratives, then turns those narratives into feature vectors.
### Its input should be a list of strings.

train_data_features = vectorizer.fit_transform(clean_train_narratives)


In [None]:
### Re-weight the token counts from the vectorizer using the TF-IDF technique.

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(train_data_features)

### Convert the TF-IDF result to an array via .toarray().

tfidf_features = X_train_tfidf.toarray()

In [None]:
### Set your features (X) to be the TF-IDF weighted features from the previous step.

### IMPORTANT: Set your label of interest (y) here. This will change for different labels.
### For example, IntimatePartner_AtScene is set as our label below.

X = tfidf_features
y = train["IntimatePartner_AtScene"]

In [None]:
### Take a look at the terms in the vocabulary.

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (available since 1.0), so support both. .tolist()
# turns the returned array into a plain list of str, matching the old return type.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

In [None]:
import numpy as np

### Total TF-IDF weight of each vocabulary term, summed over all narratives
### (axis 0 runs over the rows of the feature array).

dist = tfidf_features.sum(axis=0)

### For each vocabulary term, print its summed weight followed by the term.

for tag, count in zip(vocab, dist):
    print(count, tag)

In [None]:
######
###### Supervised Machine Learning: Classifying and Testing Model Performance
######

In [None]:
#####
#####Gradient Boosting (GB) Classifier
#####

In [None]:
### Create a 20% hold-out test set.
### If you are using this process to tune and identify a best model's hyperparameters,
### it is good practice to repeat this experiment with multiple (e.g., 5) random train_test_split
### seeds (e.g., random_state=72, below) and multiple random model seeds (e.g., random_state=891, used for the model).
### NOTE(review): X was produced by a CountVectorizer/TfidfTransformer fitted on all
### narratives before this split, so the hold-out rows contributed to the vocabulary and
### TF-IDF weights -- consider fitting those steps on the training rows only.

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.20, random_state=72)

In [None]:
###Defining the GB hyperparameter choices. You can learn more about this classifier
### and the hyperparameter options at 
### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html.

from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform
from scipy.stats import uniform, loguniform
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt  

def _gb_mse_criterion():
    """Return the mean-squared-error criterion name for the installed scikit-learn.

    scikit-learn 1.0 renamed the "mse" criterion to "squared_error" and 1.2
    removed the old spelling, while releases before 1.0 only accept "mse".
    Sampling an unsupported name makes every such search candidate fail.
    """
    import re
    try:
        import sklearn
        major, minor = (int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
    except (ImportError, ValueError):
        # Version could not be determined: assume a current release.
        return "squared_error"
    return "squared_error" if (major, minor) >= (1, 0) else "mse"


# Search space for the randomized search: lists are sampled uniformly,
# scipy distributions are sampled through their rvs() method.
# `params` is kept as a second name for the same dictionary.
param_grid_gb = params = {
                  "loss": ['exponential'],
                  "learning_rate": sp_randFloat(),           # uniform on [0, 1)
                  "subsample"    : sp_randFloat(),           # uniform on [0, 1)
                  "n_estimators" : sp_randInt(100, 1000),    # integers 100..999
                  "max_depth"    : sp_randInt(4, 10),        # integers 4..9
                  "min_samples_split" : [2, 5, 10],
                  "min_samples_leaf": [1, 2, 4],
                  "max_features" : ["log2","sqrt"],
                  "criterion": ["friedman_mse", _gb_mse_criterion()],
}

In [None]:
###Conduct 5-fold CV to validate the model while optimizing the model hyperparameters to maximize F1.

import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Base estimator; random_state fixes the model seed.
gb = GradientBoostingClassifier(random_state=891)

# Randomized hyperparameter search: n_iter=100 sampled combinations, each
# scored by F1 under 5-fold cross validation, using all available cores.
clf_random_gb = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param_grid_gb,
    n_iter=100,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=891,
)

# Fit the search on the training split, then predict labels for the hold-out set.
clf_random_gb.fit(X_train, Y_train)
Y_pred = clf_random_gb.predict(X_test)


In [None]:
### Display the best model hyperparameters identified through the process above. You will use these later to
### specify your best GB model when applying the label of interest to your broader sample of unlabeled
### data in NVDRS.

clf_random_gb.best_params_

In [None]:
def evaluate(model, X_test, Y_test):
    """Print the average absolute error of ``model`` on a test set and return its accuracy.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X_test : features, passed unchanged to ``model.predict``.
    Y_test : array-like of true labels (e.g., the 0/1 labels used in this notebook).

    Returns
    -------
    float
        Percentage (0-100) of predictions that equal the true label.
    """
    predictions = np.asarray(model.predict(X_test))
    truth = np.asarray(Y_test)
    errors = np.abs(predictions - truth)
    # Accuracy is the share of exact matches. The earlier regression-style
    # formula (100 - mean(errors / Y_test)) divides by zero for every label
    # equal to 0, and its result was never printed or returned.
    accuracy = 100 * np.mean(predictions == truth)
    print('Model Performance')
    print('Average Error: {:0.4f}'.format(np.mean(errors)))
    return accuracy
    

In [None]:
### Examine the best model's performance metrics on the hold-out set for this combination of random seeds.

from sklearn.metrics import classification_report
print(classification_report(Y_test,Y_pred, digits=4))

In [None]:
### You can also examine specific performance metrics, e.g. the area under the ROC curve.

# ROC AUC is a ranking metric, so score it with the predicted probability of the
# positive class (column 1) rather than the hard 0/1 predictions in Y_pred,
# which collapse the ROC curve to a single operating point.
roc_auc_score(Y_test, clf_random_gb.predict_proba(X_test)[:, 1])