In [11]:
import warnings
warnings.filterwarnings('ignore')
import os
import codecs
import json
import pandas as pd

## load dataset

In [2]:
path = 'yelp_data/health_text_sentiment_full.csv'

In [4]:
df = pd.read_csv(path)
df.drop(['Unnamed: 0'],axis=1,inplace=True)
bs = df[u'business_id'].unique()


In [5]:
health_ids = set()
for i in bs:
    health_ids.add(i)
health_ids = frozenset(health_ids)

In [7]:
type(health_ids)

frozenset

In [8]:
len(health_ids)

4572

In [9]:
ls

[0m[01;36;40mintermediate_yelp[0m/  [01;32mUntitled.ipynb[0m*  [01;36;40myelp_data[0m/


In [12]:
intermediate_directory = os.path.join('intermediate_yelp')

review_txt_filepath = os.path.join(intermediate_directory,
                                   'review_text_all.txt')

In [13]:
review_txt_filepath

'intermediate_yelp/review_text_all.txt'

In [14]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    
    review_count = 0

    # create & open a new file in write mode
    with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:

        # open the existing review json file
        with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not about a restaurant, skip to the next one
                if review[u'business_id'] not in restaurant_ids:
                    continue

                # write the restaurant review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1

    print (u'''Text from {:,} Healthcare reviews
              written to the new txt file.'''.format(review_count))
    
else:
    
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        for review_count, line in enumerate(review_txt_file):
            pass
        
    print (u'Text from {:,} Healthcare reviews in the txt file.'.format(review_count + 1))

Text from 64,006 Healthcare reviews in the txt file.
CPU times: user 1.8 s, sys: 797 ms, total: 2.59 s
Wall time: 2.62 s


## Load Spacy

In [15]:
%%time
import spacy
import pandas as pd
import itertools as it

nlp = spacy.load('en_core_web_lg')

CPU times: user 15.5 s, sys: 5.94 s, total: 21.5 s
Wall time: 20.4 s


## Phrase modeling

_Phrase modeling_ is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the the words in our reviews and looking for words that _co-occur_ (i.e., appear one after another) together much more frequently than you would expect them to by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:

$$\frac{count(A\ B) - count_{min}}{count(A) * count(B)} * N > threshold$$

In [16]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [17]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    # w.lemma_ if w.lemma_ != '-PRON-' else w.lower_ for w in d
    return token.is_punct or token.is_space or token.lemma_ =='-PRON-' 

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    #w.lemma_ if w.lemma_ != '-PRON-' else w.lower_ for w in d
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [18]:
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')

In [19]:
%%time

if 0 == 1:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(review_txt_filepath):
            f.write(sentence + '\n')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 13.8 µs


In [20]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [23]:
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print (' '.join(unigram_sentence))
    print ('')

be awful

mom go to dr todd lincoln on n 27th ave by jc lincoln hospital north

be early

would give a five and be nice go there

now let see how long take dr sink to send the medical record

the staff be very polite and professional

one of the previous reviewer complain about the staff

but do not believe be the office personnel

be the insurance coverage the patient choose

be see by dr. stone and think be thorough and professional



In [24]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

In [25]:
%%time

if 0 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 641 ms, sys: 594 ms, total: 1.23 s
Wall time: 1.21 s


In [26]:
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [27]:
%%time

if 0 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 14.3 µs


In [28]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [29]:
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print( u' '.join(bigram_sentence))
    print( u'')

be awful

mom go to dr todd lincoln on n 27th_ave by jc lincoln hospital north

be early

would give a five and be nice go there

now let see how long take dr sink to send the medical_record

the staff be very polite and professional

one of the previous_reviewer complain_about the staff

but do not believe be the office personnel

be the insurance_coverage the patient choose

be see by dr. stone and think be thorough and professional



In [30]:
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')

In [31]:
%%time

if 0 == 1:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

CPU times: user 734 ms, sys: 578 ms, total: 1.31 s
Wall time: 1.32 s


In [32]:
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')

In [33]:
%%time

if 0 == 1:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 14.1 µs


In [34]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [35]:
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print( u' '.join(trigram_sentence))
    print( u'')

be awful

mom go to dr todd lincoln on n 27th_ave by jc lincoln hospital north

be early

would give a five and be nice go there

now let see how long take dr sink to send the medical_record

the staff be very polite and professional

one of the previous_reviewer complain_about the staff

but do not believe be the office personnel

be the insurance_coverage the patient choose

be see by dr. stone and think be thorough and professional



In [36]:
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_all.txt')

In [37]:
from spacy.lang.en import STOP_WORDS

In [38]:
%%time
if 0 == 1:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in STOP_WORDS] # spacy.en.language_data.STOP_WORDS
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 13.8 µs


In [40]:
print ('Original:' + u'\n')

for review in it.islice(line_review(review_txt_filepath), 11, 12):
    print( review)

print ('----' + u'\n')
print ('Transformed:' + '\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 11, 12):
        print( review)

Original:

Excellent, Excellent facility.  I have had several procedures done here and have been absolutely pleased with everything.  The overall experience from the front desk to Dr. Hall, to Evelyn the aesthetician has been wonderful.  Everyone is friendly, the facility is state of the art and the results are AMAZING.

I spend a lot of time in the sun and have experienced severe sun damage, primarily the ugly brown sun spots that connect and take over your skin!  After consultations regarding what I hoped to achieve I was given suggestions on what would work best for my skin.  This was done at first with Shannon.  She is terrific!  Very down to earth and funny. This really puts you at ease as you discuss procedures and financials; also important, given that these procedures are obvious luxuries for most people .  There is no high pressure sales and they always point you to promotions that might be coming up and how to get the most bang for your buck. 

Evelyn, the aesthetician is won

## Topic Modeling with Latent Dirichlet Allocation (_LDA_)

In [42]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle as pickle

In [43]:
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [44]:
%%time

if 0 == 1:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 23.7 ms


In [45]:
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')

In [46]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [47]:
%%time
if 0 == 1:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

CPU times: user 15.6 ms, sys: 46.9 ms, total: 62.5 ms
Wall time: 27.3 ms


In [48]:
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [49]:
%%time
if 0 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,# change to 10 or 20
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 15.6 ms, sys: 15.6 ms, total: 31.2 ms
Wall time: 36.9 ms


In [50]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print ('{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print ('{:20} {:.3f}'.format(term, round(frequency, 3)))

In [51]:
explore_topic(topic_number=0)

term                 frequency

good                 0.012
dr.                  0.011
surgery              0.009
great                0.008
office               0.007
staff                0.007
help                 0.007
eye                  0.006
need                 0.006
tell                 0.006
year                 0.006
come                 0.006
know                 0.006
work                 0.006
feel                 0.006
experience           0.006
ask                  0.005
pain                 0.005
glass                0.005
patient              0.005
find                 0.005
look                 0.005
care                 0.005
want                 0.005
insurance            0.004


In [52]:
for i in range(5):
    print('\nTOPIC:{}'.format(i))
    print('--------------\n')
    explore_topic(topic_number=i)


TOPIC:0
--------------

term                 frequency

good                 0.012
dr.                  0.011
surgery              0.009
great                0.008
office               0.007
staff                0.007
help                 0.007
eye                  0.006
need                 0.006
tell                 0.006
year                 0.006
come                 0.006
know                 0.006
work                 0.006
feel                 0.006
experience           0.006
ask                  0.005
pain                 0.005
glass                0.005
patient              0.005
find                 0.005
look                 0.005
care                 0.005
want                 0.005
insurance            0.004

TOPIC:1
--------------

term                 frequency

office               0.022
staff                0.021
good                 0.013
care                 0.013
friendly             0.009
dr.                  0.009
great                0.009
need                 0

In [53]:
topic_names = {0: u'good_dr_surgery_staff',
               1: u'office_staff_good_friendly',
               2: u'patient_dr_care_need',
               3: u'tell_appointment_office_ask',
               4: u'wait_appointment_hour_room'}

In [57]:
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [56]:
%%time

if 1 == 1:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)

TypeError: write() argument must be str, not bytes

In [58]:
pyLDAvis.display(LDAvis_prepared)

##  Describing text with LDA


In [59]:
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_txt_filepath),
                          review_number, review_number+1))[0]

In [92]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    #review_lda = sorted(review_lda, key=lambda topic_number, freq : -freq)
    review_lda = sorted(review_lda, key=lambda  x:x, reverse=True)

    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        #print( round(freq, 3))
        #print('{:25}'.format(topic_names[topic_number]))
        print( '{:25} {}'.format(topic_names[topic_number], round(freq, 3)))

In [93]:
sample_review = get_sample_review(50)
print (sample_review)

I am so happy I chose Dr. Nachbar for my breast augmentation.  Surgery (cosmetic) or not is a big decision. His website was very informative - advising people the proper questions to ask cosmetic surgeons. The staff was welcoming and kind (especially Tammy). Dr. Nachbar is realistic with clients about their expectations. He does not impose any preferences and allows you the client to make the proper decision for yourself (ex: how many ccs to get for your breasts).  He is  very energetic and you can tell he loves what he does. His sense of humor helps diffuse any nervousness you have when talking with him. I had my breast augmentation done May 2, 2014, and I'm still happy with the results. His description of the healing process and what to expect during recovery was on point. Thanks Dr. Nachbar for giving me a lil more pep in my step.



In [94]:
lda_description(sample_review)

patient_dr_care_need      0.32499998807907104
good_dr_surgery_staff     0.6650000214576721


In [95]:
sample_review = get_sample_review(100)
print (sample_review)

I've been taking my kids here for a while and I absolutely love this office. We currently live in the Goodyear Area and I'm more than ok with taking a 30 minute drive to get to the office. My kids see doctor Shepherd and she is awesome. Very knowledgeable, caring, understanding, and very gentle with my children. She always available for advice or to answer any questions that I may have. Anita, the MA is just as awesome. She's easy going and very knowledgeable as well. They make their patients feel like they're at home and that's what I love most. When I call and leave a message for the nurses, they call back ASAP and that's another thing about this place, you don't have to wait hours for a response on questions that you have. Front desk are always cheerful and greet you with a warm smile. This office is the way to go!



In [96]:
lda_description(sample_review)

wait_appointment_hour_room 0.22200000286102295
office_staff_good_friendly 0.765999972820282


## Word Vector Embeddng w/ Word2Vec

In [97]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

In [100]:
%%time
if 0 == 1:

    # initiate the model and perform the first epoch of training
    food2vec = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=20, sg=1, workers=4)
    
    food2vec.save(word2vec_filepath)

    # perform another 11 epochs of training
    for i in range(1,12):

        food2vec.train(trigram_sentences, total_examples=food2vec.corpus_count,epochs=food2vec.iter)
        food2vec.save(word2vec_filepath)
        
# load the finished model from disk
food2vec = Word2Vec.load(word2vec_filepath)
food2vec.init_sims()

print ('{} training epochs so far.'.format(food2vec.train_count))

12 training epochs so far.
CPU times: user 62.5 ms, sys: 31.2 ms, total: 93.8 ms
Wall time: 94.1 ms


  setattr(self, attrib, None)


In [101]:
print ('{:,} terms in the food2vec vocabulary.'.format(len(food2vec.wv.vocab)))

9,291 terms in the food2vec vocabulary.


In [124]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in food2vec.wv.vocab.items()]

In [125]:
type(ordered_vocab)

list

In [126]:
len(ordered_vocab)

9291

In [128]:
#ordered_vocab.sort(key=lambda x: int(x[-2]))


In [130]:
# sort by the term counts, so the most common terms appear first
#ordered_vocab = sorted(ordered_vocab, key=lambda (term, index, count): -count)
ordered_vocab.sort(key=lambda x: int(x[-2]))

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(food2vec.wv.syn0norm[term_indices, :],
                            index=ordered_terms)

word_vectors

  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
be,0.042933,-0.176900,-0.060522,0.091755,0.073617,-0.081214,-0.016272,-0.054262,0.145700,-0.005230,...,0.099128,-0.053480,-0.104721,-0.184649,-0.044308,-0.083005,0.059811,-0.150380,-0.004636,0.025479
the,0.181669,0.056968,0.122268,-0.053141,0.054383,-0.005181,-0.058436,-0.108587,0.059872,-0.078648,...,-0.053087,-0.133193,-0.195876,-0.033889,-0.034484,0.041712,0.041752,0.041761,-0.031564,-0.065673
and,-0.027977,-0.093760,-0.134843,-0.020117,0.050309,0.021667,0.014467,-0.026640,0.049923,-0.048959,...,-0.000373,0.040053,-0.148304,-0.161915,-0.076207,-0.075621,-0.135197,-0.118997,0.046171,-0.176436
to,-0.076072,-0.081850,-0.019214,-0.133938,-0.183947,-0.024512,-0.012810,-0.063802,-0.057252,0.094553,...,-0.088816,-0.116148,0.000213,-0.127997,-0.079446,0.065030,0.015098,-0.006998,0.005195,-0.195575
a,0.161378,0.014931,-0.131659,0.038403,0.002810,-0.069324,-0.174914,-0.064322,0.080454,0.092493,...,0.125761,-0.040567,-0.133840,0.041466,-0.189314,0.027838,0.058632,-0.043934,-0.050714,-0.213498
have,-0.057448,-0.044276,0.049703,0.148065,0.145300,-0.008531,-0.024994,0.067563,0.067965,0.022814,...,0.077080,-0.117574,-0.224957,-0.081998,0.013219,0.150981,0.021599,-0.064803,-0.009894,-0.190133
not,0.228159,-0.073528,-0.088336,-0.084566,-0.087192,-0.075298,-0.089664,0.044693,0.029984,0.022093,...,0.121594,-0.034836,-0.029510,-0.239696,-0.091788,0.002522,0.096719,0.091435,0.007963,-0.080688
in,0.034166,-0.108147,0.072113,0.088481,-0.112871,0.024506,0.003362,0.053450,0.122645,0.109856,...,-0.044295,-0.145574,-0.010095,-0.028182,-0.036435,-0.024611,-0.067651,0.171765,0.048070,-0.081051
for,0.103073,-0.079893,0.000233,-0.019848,-0.149982,-0.019849,-0.099845,-0.052035,-0.041476,0.050031,...,-0.005120,-0.101272,-0.046348,-0.099904,-0.088544,-0.001502,-0.062394,-0.031133,-0.023727,-0.121474
of,0.155389,-0.001476,-0.008817,-0.103103,0.020160,0.004912,0.067281,-0.018430,-0.078751,0.001184,...,0.107283,-0.268399,-0.066568,0.049931,-0.245855,-0.004564,-0.095756,0.040281,0.071002,0.059810


In [131]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in food2vec.most_similar(positive=[token], topn=topn):

        print ('{:20} {}'.format(word, round(similarity, 3)))

In [132]:
get_related_terms('nurse')

medical_assistant    0.733
tech                 0.717
assistant            0.71
ma                   0.696
male_nurse           0.682
cna                  0.666
nurses               0.653
staff_member         0.652
nursing_staff        0.651
take_vital           0.637


  import sys


In [133]:
get_related_terms('surgery')

procedure            0.796
outpatient_surgery   0.653
surgical_procedure   0.638
cataract_surgery     0.62
hysterectomy         0.618
septoplasty          0.599
hip_replacement      0.595
operation            0.589
extraction           0.588
revision             0.586


  import sys


In [134]:
get_related_terms('insurance', topn=20)

insurance_company    0.804
coverage             0.741
insurance_coverage   0.736
bcbs                 0.707
aetna                0.667
insurance_carrier    0.665
medicare             0.66
blue_cross_blue_shield 0.651
secondary_insurance  0.649
tricare              0.643
deductible           0.642
vsp                  0.638
meet_deductible      0.634
ppo                  0.633
uhc                  0.616
employer             0.604
accept_insurance     0.589
cover_100            0.589
medicaid             0.586
fsa                  0.586


  import sys


## Word Algebra

In [135]:
def word_algebra(add=[], subtract=[], topn=1):
    """
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    """
    answers = food2vec.most_similar(positive=add, negative=subtract, topn=topn)
    
    for term, similarity in answers:
        print (term)

In [139]:
word_algebra(add=['hospital', 'pay'])

emergency_room


  import sys


In [142]:
word_algebra(add=['clinic', u'pay'], subtract=['cash'])

facility


  import sys


In [145]:
word_algebra(add=['healthcare', 'hospital'], subtract=[u'doctor'])

medical_center


  import sys


In [147]:
word_algebra(add=['hospital', 'nurse'], subtract=['doctor'])

icu


  import sys


## Word Vector Visualization with t-SNE

In [148]:
from sklearn.manifold import TSNE

In [149]:
tsne_input = word_vectors.drop(STOP_WORDS, errors=u'ignore')
tsne_input = tsne_input.head(5000)

In [150]:
tsne_input.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
doctor,0.12841,-0.045598,-0.020055,0.129487,-0.107814,-0.051737,-0.066861,-0.046774,0.186836,-0.052215,...,-0.030753,-0.036436,-0.135651,0.048613,-0.021071,-0.097397,-0.067135,0.056866,-0.013078,-0.029537
time,-0.10954,-0.108372,-0.016455,-0.047754,0.01028,-0.05934,-0.095351,-0.175868,-0.02009,-0.052397,...,-0.042784,0.074686,-0.135983,-0.133364,0.000593,0.058011,0.100018,0.081836,0.047969,-0.15468
office,0.103377,-0.214433,-0.14161,-0.035546,0.048305,-0.000686,-0.182957,-0.025223,-0.063836,-0.032186,...,0.04453,0.073312,-0.300989,0.074976,-0.058334,-0.037948,-0.040693,-0.031566,-0.092688,-0.050862
staff,0.024923,-0.107977,-0.062548,-0.050128,-0.090761,-0.090017,-0.110744,-0.015567,0.141665,-0.1948,...,-0.117127,0.038619,-0.2013,-0.085548,0.029608,-0.040393,0.027264,-0.158741,-0.05051,-0.029322
wait,0.098476,-0.170564,-0.093773,0.063028,0.007691,-0.182316,-0.125817,-0.08799,0.07277,0.058992,...,0.045787,0.055854,0.006357,-0.081294,-0.035885,0.130823,0.014768,-0.070723,0.156905,-0.063365


In [178]:
tsne_filepath = os.path.join(intermediate_directory,
                             'tsne_model')

tsne_vectors_filepath = os.path.join(intermediate_directory,
                                     'tsne_vectors.npy')

In [179]:
tsne_filepath

'intermediate_yelp/tsne_model'

In [180]:
model_path= 'intermediate_yelp/tsne_model'

In [None]:
%%time

if 1 == 1:
    
    tsne = TSNE()
    tsne_vectors = tsne.fit_transform(tsne_input.values)
    
    with open(model_path, 'w') as f:
        pickle.dump(tsne, f)

    pd.np.save(tsne_vectors_filepath, tsne_vectors)
    
with open(tsne_filepath) as f:
    tsne = pickle.load(f)
    
tsne_vectors = pd.np.load(tsne_vectors_filepath)

In [193]:
tsne_vectors = pd.DataFrame(tsne_vectors,
                            index=pd.Index(tsne_input.index),
                            columns=['x_coord', 'y_coord'])

In [195]:
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
doctor,-43.014004,-33.771454
time,12.929251,74.010422
office,-48.071701,-10.937896
staff,-33.174458,-16.113323
wait,20.043264,65.545578


In [199]:
ls intermediate_yelp/

[0m[01;32mbigram_model_all[0m*                 [01;32mtrigram_dict_all.dict[0m*
[01;32mbigram_sentences_all.txt[0m*         [01;32mtrigram_model_all[0m*
[01;32mlda_model_all[0m*                    [01;32mtrigram_sentences_all.txt[0m*
[01;32mlda_model_all.expElogbeta.npy[0m*    [01;32mtrigram_transformed_reviews_all.txt[0m*
[01;32mlda_model_all.id2word[0m*            [01;32mtsne_model[0m*
[01;32mlda_model_all.state[0m*              [01;32mtsne_vectors.csv[0m*
[01;32mldavis_prepared[0m*                  [01;32mtsne_vectors.npy[0m*
[01;32mreview_text_all.txt[0m*              [01;32mtsne_vectors.tsv[0m*
[01;32mtopic_names.pkl[0m*                  [01;32munigram_sentences_all.txt[0m*
[01;32mtrigram_bow_corpus_all.mm[0m*        [01;32mword2vec_model_all[0m*
[01;32mtrigram_bow_corpus_all.mm.index[0m*


In [198]:
tsne_vectors.to_csv('intermediate_yelp/tsne_vectors.tsv')

In [200]:
tsne_vectors.head()

Unnamed: 0,x_coord,y_coord
doctor,-43.014004,-33.771454
time,12.929251,74.010422
office,-48.071701,-10.937896
staff,-33.174458,-16.113323
wait,20.043264,65.545578


In [201]:
tsne_vectors['word'] = tsne_vectors.index

### Plotting with Bokeh

In [207]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
from bokeh.io import export_png

output_notebook()

In [208]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                  active_scroll=u'wheel_zoom'
                  )

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!
show(tsne_plot);

In [211]:
export_png(tsne_plot, filename="plot.png")

'/mnt/c/Users/frank/Documents/linux_dev/github/nlp_healthcare_reviews/models/unsupervised/plot.png'

![title](plot.png)
