# CSAA

In [None]:
# === Libraries ===
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import csv
nltk.download('wordnet')
from nltk.corpus import stopwords
from wordcloud import WordCloud
from textblob import TextBlob
from textblob import Word
from nltk.stem import PorterStemmer
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV, train_test_split

# === Potential libraries to use === 
#from bs4 import BeautifulSoup
#from nltk import word_tokenize, sent_tokenize
#from nltk.stem import LancasterStemmer, WordNetLemmatizer

## Data Understanding

In [85]:
# Change to the directory where the notes and claims data is
cwd = os.getcwd()

# Reading the note_data3.csv to a pandas dataframe
notes_old = pd.read_csv('note_data4.csv')
#claims = pd.read_csv('claim_data4.csv')

# Use a subset to avoid memory error (first hundred thousand values)
#n_samples = 100000
#notes_subset = notes_old[:n_samples]

In [None]:
# First five values of notes_subset
notes_subset.head()

In [None]:
# Information about notes_subset (i.e column types and dimensions)
notes_subset.info()

In [None]:
# Text data for notes for basic exploration
text = notes['body']
text.head()

In [None]:
# Dimensions of notes
notes.shape

In [None]:
# Number of null values
sum(notes['body'].isna() == True)

## Data Cleaning

In [None]:
# Removing all the null values
notes = notes[notes['body'].notnull()]

In [None]:
# Converting the text data to all lowercase to avoid multiple copies of the same word 
notes['body'] = notes['body'].apply(lambda x: " ".join(x.lower() for x in x.split()))
notes['body'].head()

In [None]:
# Removing punctuation as it doesn't add any information 
notes['body'] = notes['body'].str.replace('[^\w\s]','')
notes['body'].head()

In [None]:
# Removal of Stop Words as it also won't add any information
stop_nltk = list(stop_words.ENGLISH_STOP_WORDS)
stop_tm = list(pd.read_csv('tm_stop.csv').iloc[:,1])

stopWords = list(set().union(stop_nltk, stop_tm))

notes['body'] = notes['body'].apply(lambda x: " ".join(x for x in x.split() if x not in stopWords))
notes['body'].head()

In [None]:
# Lemmatization
notes['body'] = notes['body'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
notes['body'].head()

In [None]:
# Stemming 
st = PorterStemmer()
notes['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
notes['body'].head()

In [26]:
# Stop words using CSAA data after stemming/lemmatization
CSAA_stop_words = ["how", "you", "absolut", "account", "actual", "address", "afternoon", "agent", "ahead", "alreadi", "alright", "alrighti", 
"also", "alway","answer", "anyth", "apolog", "appreci", "assist", "auto", "automat", "avail", "gonna", 
"good", "great", "guess", "hello", "help", "hope","just", "kind", "kinda", "know", "like", "mhmm",
"might", "name", "number", "phone","pleas","pleasur","polici","probabl", "problem", "question", "quick",
"welcom", "well", "went", "whatev", "whenev", "wonder", "xxxx", "xxxxx", "cam", "xxxxxx", "xxxxxxx", "xxxxxxxx", 
"xxxxxxxxx", "xxxxxxxxxx", "xxxxxxxxxxx", "yeah", "youd", "youll", "youv", "help", "wanna", "want", "much", 
"today", "sure", "call", "right", "xaxx",  "xoxxxxx", "xxxc", "okay", "um", "uh", "yeah", "thank", "policy",
"let", "yes", "mean", "moment", "don", "going", "hi", "bye", "need", "calling", "oh", "ll", "hold",
"day", "think", "look", "got", "did", "correct", "aaa", "aaa insurance", "dot", "com", "thanks", "holding",
"br", "xx", "xxx", "appreciate", "business", "representative", "available", "sir", "ma", "michelle", "kimberly",
"nancy","line", "morning", "actually", "gonzalez", "minute", "welcome", "perfect", "verify", "zip", "code",
"axxxx", "verify zip code", "verify zip", "patience", "mister", "sorry", "exactly", "ve", "reached", "place",
"minutes", "ask", "couple", "questions", "insurance", "needs", "ma'am", ">", "<", "'ll", "happy",
"'m", "'re", "'s", "'re", "x", "gon", "na", "n't", "na", "ca", "wan", "does",
"taken", "care", "little", "bit", "double", "check", "maybe", "said", "say", "really", "cause", "way",
"date", "july", "august", "year", "month", "customer", "service", "recorded", "monitored", "quality", "center",
"xxxxxxxxxxxxxxxx", "best", "possible", "speaking", "bring", "able", "xxxxxxxxxxxxxxxx", "xxxxxxxxxxxxxxxx",
"apologize", "services", "providing", "doing", "able", "ready", "looks", "apologize", "provide", "speak", "doing",
"absolutely", "access", "wonderful", "information", "enjoy", "rest", "assisting", "guy", "november", "december", 
"somebody","honda", "huh", "pleasure", "choosing", "supervisor", "evening", "waiting", "weekend", "october", "dollar", "guys", "brief",
"wait", "contact", "alrighty", "nice", "ok", "trying", "mind", "reason", "miss", "getting", "thousand",
"looking", "course", "didn", "press", "ensure", "set", "loyal", "conversation", "pull",
"clmt", "clmts" , "claimant"  , "claimants"  , "clmnt","csaa",  "mr" ,  "wb", "nb", "sb", "eb"  , "claim", "injury", 
"office", "letter", "form" , "to", "from", "at", "in", "with", "within", "injured", "advise", "send", 
"vehicle","received", "insured", "insure", "bodily", "liability", "adjuster", "loss", "info", "left", "right", "email", "phone", "call", "time",
'ccdoclink','injury', 'james','mary','john','patricia','robert', 'jennifer','michael','linda','william','elizabeth',
'david','barbara','richard','susan','joseph','jessica','thomas','sarah','charles','margaret','christopher','karen',
'daniel','nancy','matthew','lisa','anthony','betty','donald','dorothy','mark','sandra','paul','ashley','steven',
'kimberly','andrew','donna','kenneth','emily','george','carol','joshua','michelle','kevin','amanda','brian',
'melissa','edward','deborah','ronald','stephanie','timothy','rebecca','jason','laura','jeffrey','helen','ryan',
'sharon','jacob','cynthia','gary','kathleen','nicholas','amy','eric','shirley','stephen','angela','jonathan',
'anna','larry','ruth','justin','brenda','scott','pamela','brandon','nicole','frank','katherine','benjamin',
'samantha','gregory','christine','raymond','catherine','samuel','virginia','patrick','debra','alexander','rachel',
'jack','janet','dennis','emma','jerry','carolyn','tyler','maria','aaron','heather','henry','diane','jose','julie',
'douglas','joyce','peter','evelyn','adam','joan','nathan','victoria','zachary','kelly','walter','christina','kyle',
'lauren','harold','frances','carl','martha','jeremy','judith', 'gerald','cheryl','keith','megan','roger','andrea',
'arthur','olivia','terry','ann',';awrence','jean','sean','alice','christian','jacqueline','ethan','hannah','austin',
'doris','joe','kathryn','albert','gloria','jesse','teresa','willie','sara','billy','janice','bryan','marie','bruce',
'julia','noah','grace','jordan','judy','dylan','theresa','ralph','madison','roy','beverly','alan','denise','wayne',
'marilyn','eugene','amber','juan','danielle','gabriel','rose','louis','brittany','russell','diana', 'randy',
'abigail','vincent','natalie','philip', 'jane','logan','lori','bobby','alexis','harry','tiffany','johnny','kayla'] 

#notes['body'] = notes['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
#notes['body'].head()

In [None]:
# Removing common words as these words might be too vague or general (top 12 most common words)
common_words = pd.Series(' '.join(notes['body']).split()).value_counts()[:12]
common_words

## Ask if we need to increase number of common words to remove more or less 

In [None]:
# Removing common words as these words might be too vague or general (top 12 most common words) (cont.)
common_words = list(common_words.index)
notes['body'] = notes['body'].apply(lambda x: " ".join(x for x in x.split() if x not in common_words))
notes['body'].head()

In [None]:
# Removing rare words as these words will not offer any info (top 10000 least used words)
rare_words = pd.Series(' '.join(notes['body']).split()).value_counts()[-1000:]
rare_words

In [None]:
# Removing rare words as these words will not offer any info (top 50000ish least used words) (cont.)
rare_words = list(rare_words.index)
notes['body'] = notes['body'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_words))
notes['body'].head()

In [None]:
# Removing names as these words are not helpful in our analysis (could add more)
names_list = ['macmeekin', 'brum', 'eliza', 'greg', 'stein', 'howard', 'emeka', 'ellison', 
              'rick', 'gayle', 'slater', 'feeny']

notes['body'] = notes['body'].apply(lambda x: " ".join(x for x in x.split() if x not in names_list))

In [None]:
# Removing rows with nan in the body column
notes_subset1['body'] = notes_subset1['body'][notes_subset1['body'] != 'nan']
notes_subset2['body'] = notes_subset2['body'][notes_subset2['body'] != 'nan']
notes_subset3['body'] = notes_subset3['body'][notes_subset3['body'] != 'nan']
notes_subset4['body'] = notes_subset4['body'][notes_subset4['body'] != 'nan']
notes_subset5['body'] = notes_subset5['body'][notes_subset5['body'] != 'nan']
notes_subset6['body'] = notes_subset6['body'][notes_subset6['body'] != 'nan']
notes_subset7['body'] = notes_subset7['body'][notes_subset7['body'] != 'nan']
notes_subset8['body'] = notes_subset8['body'][notes_subset8['body'] != 'nan']
notes_subset9['body'] = notes_subset9['body'][notes_subset9['body'] != 'nan']
notes_subset10['body'] = notes_subset10['body'][notes_subset10['body'] != 'nan']

In [None]:
# B/c running the code for this section can be time consuming, I decided to import to csv 
notes.to_csv('notes_processingV3_names.csv', encoding='utf-8', index=False)

#notes = pd.read_csv('notes_processingV2.csv')
#notes['body] = notes['body].astype(str)

## Term Document Matrix/Term Frequency-Inverse Document Frequency

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
X = vec.fit_transform(notes['body'])
tdm = pd.DataFrame(X.toarray(), columns = vec.get_feature_names())

In [None]:
# Function based on Term Document Matrix (not focusing on weights/importance)
def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color='white',
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

show_wordcloud(notes['body'], title = 'Term Document Matrix')

In [None]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
                       stop_words= 'english',ngram_range=(1,2))

notes_tfidf = tfidf.fit_transform(notes['body'].values.astype('U'))

In [None]:
freqs = [(word, notes_tfidf.getcol(idx).sum()) for word, idx in tfidf.vocabulary_.items()]
w = WordCloud(width=800,height=600,mode='RGBA',background_color='white',max_words=2000).fit_words(freqs)
plt.imshow(w)
plt.show()

## Topic Modeling

In [5]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx + 1))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
no_features = 1000
no_topics = 5
no_top_words = 10

In [50]:
subset = [notes_subset1['body'], notes_subset2['body'], notes_subset3['body'],
          notes_subset4['body'], notes_subset5['body'], notes_subset6['body'], 
          notes_subset7['body'], notes_subset8['body'], notes_subset9['body'], 
         notes_subset10['body']]

param = [10, 15, 10, 15, 20, 10, 15, 20, 15, 15]

for i in range(10):
    
    # TF for LDA
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10)
    tf = tf_vectorizer.fit_transform(subset[i])
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA 
    lda = LatentDirichletAllocation(n_components = 10, max_iter = 10, learning_method='online', 
                                    batch_size = 1000, learning_offset = param[i], learning_decay = 0.7).fit(tf)

    results_file = "results_LDA/results_{}_LDA.csv".format(i)
    with open(results_file, 'a') as f:
        writer = csv.writer(f)
        for t in range(10):
            writer.writerow(["Topic {}, {}".format(t+1, 
                                ', '.join([tf_feature_names[i] for i in lda.components_.argsort()[t][:-10-1:-1]]))])   
for i in range(10):
    
    # TFIDF for NMF
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10)
    tfidf = tfidf_vectorizer.fit_transform(subset[i])
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    # NMF algorithm
    nmf = NMF(n_components = 10, alpha=.01, init='nndsvd').fit(tfidf)

    results_file = "results_NMF/results_{}_NMF.csv".format(i+1)
    with open(results_file, 'a') as f:
        writer = csv.writer(f)
        for t in range(10):
            writer.writerow(["Topic {}, {}".format(t+1, 
                                ', '.join([tfidf_feature_names[i] for i in nmf.components_.argsort()[t][:-10-1:-1]]))])
            
for i in range(10):
    
    # TF for LDA (using n-grams)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10, input = 'content', analyzer = 'word',
                                   ngram_range=[1,2], max_features = 5000)
    tf = tf_vectorizer.fit_transform(subset[i])
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA 
    lda = LatentDirichletAllocation(n_components = 10, max_iter = 10, learning_method='online', 
                                    batch_size = 1000, learning_decay = 0.7, learning_offset = param[i]).fit(tf)

    results_file = "results_LDAgram/results_{}_LDAgram.csv".format(i+1)
    with open(results_file, 'a') as f:
        writer = csv.writer(f)
        for t in range(10):
            writer.writerow(["Topic {}, {}".format(t+1, 
                                ', '.join([tf_feature_names[i] for i in lda.components_.argsort()[t][:-10-1:-1]]))])

In [51]:
subset = [notes_subset1['body'], notes_subset2['body'], notes_subset3['body'],
          notes_subset4['body'], notes_subset5['body'], notes_subset6['body'], 
          notes_subset7['body'], notes_subset8['body'], notes_subset9['body'], 
         notes_subset10['body']]

param = [10, 15, 10, 15, 20, 10, 15, 20, 15, 15]

for i in range(10):
    
    # TF for LDA
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10)
    tf = tf_vectorizer.fit_transform(subset[i])
    tf_feature_names = tf_vectorizer.get_feature_names()

    # Run LDA 
    lda = LatentDirichletAllocation(n_components = 10, max_iter = 10, learning_method='online', 
                                    batch_size = 1000, learning_offset = param[i], learning_decay = 0.7).fit(tf)

    results_file = "results_LDA/results_{}_LDA.csv".format(i)
    with open(results_file, 'a') as f:
        writer = csv.writer(f)
        for t in range(10):
            writer.writerow(["Topic {}, {}".format(t+1, 
                                ', '.join([tf_feature_names[i] for i in lda.components_.argsort()[t][:-10-1:-1]]))])  

KeyboardInterrupt: 

In [None]:
# Define Search Param
search_params = {'n_components': [10], 
                 'learning_decay': [0.9, 0.7], 
                 'learning_offset': [10, 15, 20], 
                 'batch_size': [1000]}

# Init the Model
lda = LatentDirichletAllocation(learning_method = 'online')

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10)
data_vectorized = tf_vectorizer.fit_transform(notes_subset10['body'])
model.fit(data_vectorized)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

In [34]:
st = PorterStemmer()

notes_subset1 = pd.read_csv('notes_subset/notes_subset1_v2.csv')
notes_subset1['body'] = notes_subset1['body'].astype(str)
notes_subset1 = notes_subset1[notes_subset1['body'] != 'nan']
notes_subset1['body'] = notes_subset1['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
notes_subset1['body'] = notes_subset1['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))

notes_subset2 = pd.read_csv('notes_subset/notes_subset2_v2.csv')
notes_subset2['body'] = notes_subset2['body'].astype(str)
notes_subset2 = notes_subset2[notes_subset2['body'] != 'nan']
notes_subset2['body'] = notes_subset2['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset2['body'] = notes_subset2['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset3 = pd.read_csv('notes_subset/notes_subset3_v2.csv')
notes_subset3['body'] = notes_subset3['body'].astype(str)
notes_subset3 = notes_subset3[notes_subset3['body'] != 'nan']
notes_subset3['body'] = notes_subset3['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset3['body'] = notes_subset3['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset4 = pd.read_csv('notes_subset/notes_subset4_v2.csv')
notes_subset4['body'] = notes_subset4['body'].astype(str)
notes_subset4 = notes_subset4[notes_subset4['body'] != 'nan']
notes_subset4['body'] = notes_subset4['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset4['body'] = notes_subset4['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset5 = pd.read_csv('notes_subset/notes_subset5_v2.csv')
notes_subset5['body'] = notes_subset5['body'].astype(str)
notes_subset5 = notes_subset5[notes_subset5['body'] != 'nan']
notes_subset5['body'] = notes_subset5['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset5['body'] = notes_subset5['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset6 = pd.read_csv('notes_subset/notes_subset6_v2.csv')
notes_subset6['body'] = notes_subset6['body'].astype(str)
notes_subset6 = notes_subset6[notes_subset6['body'] != 'nan']
notes_subset6['body'] = notes_subset6['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset6['body'] = notes_subset6['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset7 = pd.read_csv('notes_subset/notes_subset7_v2.csv')
notes_subset7['body'] = notes_subset7['body'].astype(str)
notes_subset7 = notes_subset7[notes_subset7['body'] != 'nan']
notes_subset7['body'] = notes_subset7['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset7['body'] = notes_subset7['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset8 = pd.read_csv('notes_subset/notes_subset8_v2.csv')
notes_subset8['body'] = notes_subset8['body'].astype(str)
notes_subset8 = notes_subset8[notes_subset8['body'] != 'nan']
notes_subset8['body'] = notes_subset8['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset8['body'] = notes_subset8['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset9 = pd.read_csv('notes_subset/notes_subset9_v2.csv')
notes_subset9['body'] = notes_subset9['body'].astype(str)
notes_subset9 = notes_subset9[notes_subset9['body'] != 'nan']
notes_subset9['body'] = notes_subset9['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset9['body'] = notes_subset9['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

notes_subset10 = pd.read_csv('notes_subset/notes_subset10_v2.csv')
notes_subset10['body'] = notes_subset10['body'].astype(str)
notes_subset10 = notes_subset10[notes_subset10['body'] != 'nan']
notes_subset10['body'] = notes_subset10['body'].apply(lambda x: " ".join(x for x in x.split() if x not in CSAA_stop_words))
notes_subset10['body'] = notes_subset10['body'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

In [8]:
notes_subset1 = pd.read_csv('notes_subset/notes_subset1.csv')
notes_subset1['body'] = notes_subset1['body'].astype(str)
notes_subset1 = notes_subset1[notes_subset1['body'] != 'nan']

In [9]:
import gensim
import pyLDAvis
import pyLDAvis.sklearn

# TF for LDA
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10)
tf = tf_vectorizer.fit_transform(notes_subset1['body'])

lda = LatentDirichletAllocation(n_components = 10, max_iter = 10, learning_method='online', learning_decay = 0.7,
                                learning_offset = 20, batch_size = 1000)

lda.fit(tf)

prepare = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer, mds='tsne')

In [None]:
pyLDAvis.show(prepare)


Note: if you're in the IPython notebook, pyLDAvis.show() is not the best command
      to use. Consider using pyLDAvis.display(), or pyLDAvis.enable_notebook().
      See more information at http://pyLDAvis.github.io/quickstart.html .

You must interrupt the kernel to end this command

Serving to http://127.0.0.1:8888/    [Ctrl-C to exit]


127.0.0.1 - - [27/Feb/2019 16:20:17] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [27/Feb/2019 16:20:17] "GET /LDAvis.css HTTP/1.1" 200 -
127.0.0.1 - - [27/Feb/2019 16:20:18] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [27/Feb/2019 16:20:18] "GET /LDAvis.js HTTP/1.1" 200 -
