In [None]:
#####Section 1. You can use this code to apply your best GB (or other algorithm) model using the
##### hyperparameters you found earlier. The following NLP preprocessing steps are the same
##### as explained earlier. The ML modeling steps are different and are covered in Section 2, below.

# Column names expected in the narratives file: an ID, five binary
# circumstance labels, and the free-text narrative.
col_names = [
    "PersonID",
    "AcuteChronicPain",
    "RecentDispute",
    "RomanticRelationshipProblem",
    "ImmediateFamily_AtScene",
    "IntimatePartner_AtScene",
    "narrative",
]

In [None]:
import pandas as pd
# Load the labeled narratives; the first row of the file supplies the column names.
train = pd.read_csv(
    r"C:\Users\Box\code_files\narratives_file.csv",
    header=0,
    dialect="excel",
    encoding="utf-8-sig",
)

In [None]:
# Dimensions of the loaded training data as (rows, columns).
train.shape

In [None]:
# Column labels of the loaded training data.
train.columns.values

In [None]:
# Show one raw narrative (row label 25) as a sanity check of the loaded text.
print(train.loc[25, "narrative"])

In [None]:
###USING BEAUTIFUL SOUP FOR PREPROCESSING

from bs4 import BeautifulSoup             

# Parse one sample narrative so its markup can be stripped with get_text().
# The parser is named explicitly: without it BeautifulSoup emits a warning and
# picks whichever parser happens to be installed, which can differ between machines.
example1 = BeautifulSoup(train["narrative"][25], "html.parser")

In [None]:
# Print the raw narrative, then the same narrative with markup removed, one per line.
print(train.loc[25, "narrative"], example1.get_text(), sep="\n")

In [None]:
import re
# Find-and-replace with a regular expression: every character that is not an
# ASCII letter becomes a single space.
letters_only = re.sub(pattern="[^a-zA-Z]", repl=" ", string=example1.get_text())
print(letters_only)

In [None]:
from nltk.corpus import stopwords # Import the stop word list
# Display NLTK's English stop-word list (the same list narrative_to_words filters against).
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand — confirm.
print(stopwords.words("english"))

In [None]:
######NLP Preprocessing: Including token stemming (or lemmatization). In the function below, choose either step 4b
###### (Snowball stemming) or step 4a (lemmatization), using #s to disable the one you do not want.

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# Shared normalizers for narrative_to_words(): the Snowball stemmer is the one
# in use (step 4b); the lemmatizer is only needed if step 4a is enabled instead.
lemmer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

def narrative_to_words( raw_narrative ):
    """Convert one raw narrative into a cleaned, space-separated string of words.

    Processing steps: strip HTML markup, replace every non-letter with a
    space, lower-case and split on whitespace, Snowball-stem each token, then
    drop tokens found in NLTK's English stop-word list.

    Parameters
    ----------
    raw_narrative : str or bytes
        The raw narrative text. Any other value (for example the float NaN
        that pandas produces for an empty CSV cell, or None) is treated as an
        empty narrative.

    Returns
    -------
    str
        The preprocessed narrative; an empty string when there is no text.
    """
    # Empty CSV cells arrive as float NaN, which BeautifulSoup cannot parse.
    # Map any non-text value to an empty narrative so a single missing row
    # does not abort the whole preprocessing loop.
    if not isinstance(raw_narrative, (str, bytes)):
        return ""
    #
    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers are installed (and to avoid the
    #    "no parser was explicitly specified" warning).
    narrative_text = BeautifulSoup(raw_narrative, "html.parser").get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", narrative_text)
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4a. Lemmatize the individual words (alternative to 4b; enable only one)
    #####newcorpus = [lemmer.lemmatize(word) for word in words]

    # 4b. Stem the individual words. Each token is already a single word, so
    #     it can be passed to the stemmer directly.
    newcorpus = [stemmer.stem(word) for word in words]

    # 5. In Python, searching a set is much faster than searching
    #    a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 6. Remove stop words.
    #    NOTE(review): filtering runs after stemming, so a stop word whose stem
    #    differs from the listed form would not be removed — confirm this
    #    ordering is intended.
    meaningful_words = [w for w in newcorpus if w not in stops]
    #
    # 7. Join the words back into one string separated by space,
    #    and return the result.
    return " ".join(meaningful_words)

In [None]:
# Preview the cleaning function on one sample training narrative (row label 25).
clean_narrative = narrative_to_words(train.loc[25, "narrative"])
print(clean_narrative)

In [None]:
# Count the text narratives: one per row of the "narrative" column
num_narratives = len(train["narrative"])

# Start from an empty list that will collect the cleaned narratives
clean_train_narratives = list()

In [None]:
# Clean every training narrative, in row order.
# The list is rebuilt from scratch on every run, so re-executing this cell
# cannot append a second copy of each narrative (which would leave the feature
# matrix with more rows than there are labels). Iterating over the column
# directly also removes the dependence on a separately computed row count.
clean_train_narratives = [narrative_to_words(raw_narrative) for raw_narrative in train["narrative"]]

In [None]:
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words settings: word-level unigrams and bigrams, at most 1000 features,
# with scikit-learn's own tokenizer / preprocessor / stop-word options left unset
# (the narratives were already cleaned above).
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    ngram_range=(1, 2),
    stop_words=None,
    max_features=1000,
)

# fit_transform() learns the vocabulary from the cleaned training narratives
# and, in the same call, turns them into feature vectors. Its input is a list
# of strings.
train_data_features = vectorizer.fit_transform(clean_train_narratives)


In [None]:
# ! Convert word tokens frequency to TF-IDF features !
# The counts are derived from the cleaned narratives inside this cell rather
# than read from `train_data_features`: that name is overwritten below with the
# TF-IDF array, so re-running the cell would otherwise apply TF-IDF to
# already-transformed features.
train_counts = vectorizer.transform(clean_train_narratives)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(train_counts)


# Convert the TF-IDF results to an array
train_data_features = X_train_tfidf.toarray()

In [None]:
# Take a look at the words in the vocabulary.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back to the old method on older releases.
# Either way `vocab` ends up as a plain list of strings.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(vocab)

In [None]:
######
###### Section 2. Using the best GB model to predict the binary label of
###### interest for unlabeled samples. You can also do this process with a different
###### classifier (e.g., RF or SVM).
######

In [None]:
###Train the GB model using the best hyperparameters identified earlier.

from sklearn.ensemble import GradientBoostingClassifier

### The binary label to model. Change this one value to train for a different circumstance label.
LABEL_COLUMN = "IntimatePartner_AtScene"

### The following hyperparameter values are just examples. Input your own hyperparameters here.
### random_state is fixed because subsample < 1 and max_features='sqrt' make training
### stochastic; without a seed the fitted model (and its predictions) change on every run.
gb = GradientBoostingClassifier(
    criterion='friedman_mse',
    learning_rate=0.7816184107748736,
    loss='exponential',
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=410,
    subsample=0.7770667453608728,
    random_state=42,
)

### Now fit:
gb = gb.fit(train_data_features, train[LABEL_COLUMN])

In [None]:
###Read a .csv file with unlabeled incident narratives. E.g., a .csv file with two columns:
### one with the PersonID and the second with the incident narrative text.
test = pd.read_csv(r"C:\Users\Box\code_files\unlabeled_narratives_file.csv", header=0, dialect='excel', encoding = "utf-8-sig")

print(test.shape)

# Clean the unlabeled narratives one by one with the same preprocessing used
# for training. A separate counter name is used so the training row count in
# `num_narratives` is not overwritten.
num_test_narratives = len(test["narrative"])
clean_test_narratives = []

for i, raw_narrative in enumerate(test["narrative"], start=1):
    if i % 1000 == 0:
        print("Review %d of %d\n" % (i, num_test_narratives))
    clean_test_narratives.append(narrative_to_words(raw_narrative))

# Get a bag of words for the test set using the vocabulary learned from the training data
test_data_features = vectorizer.transform(clean_test_narratives)

# Re-use the TfidfTransformer that was fitted on the training data and only
# transform here. Fitting a new transformer on the unlabeled data would compute
# different IDF weights, so the features would no longer be on the scale the
# model was trained on.
X_test_tfidf = tfidf_transformer.transform(test_data_features)

# X_test_tfidf is left as a sparse matrix; it is passed to the model as-is below.



In [None]:
# Preview the cleaning function on one unlabeled narrative (row label 1).
clean_narrative = narrative_to_words(test.loc[1, "narrative"])
print(clean_narrative)

In [None]:
###Use the GB model to make outcome label predictions using the unlabeled incident narrative features.
# `result` holds the predicted label for each row of X_test_tfidf, in the same row order as `test`.
result = gb.predict(X_test_tfidf)

In [None]:
###Apply and output the GB model label predictions to a .csv file.

# Build a two-column table: the person identifier from the unlabeled file and
# the label the model predicted for that person's narrative.
output = pd.DataFrame({"PersonID": test["PersonID"], "PREDICTION": result})

# Write the table to a CSV file. It will contain the binary measure
# (e.g., 0 = no or 1 = yes) indicating whether the NLP/ML pipeline predicted
# that a female firearm suicide decedent experienced the circumstance of
# interest. To repeat this for a different binary circumstance label, change
# the label of interest throughout the code above.
output.to_csv(
    "C:\\Users\\Box\\label_predictions_model_GB_labelname.csv",
    index=False,
    quoting=3,  # 3 is csv.QUOTE_NONE: fields are written without quote characters
)