In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
path = 'yelp_data/health_text_sentiment.csv'
df = pd.read_csv(path)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

df = df[['stars',
 'clean_text',
 'sent_value_clean',
 'sent_score_clean']]

In [5]:
df.head()

Unnamed: 0,stars,clean_text,sent_value_clean,sent_score_clean
0,1,please stay away place bad care imaginable sta...,-0.036719,negative
1,5,husband patient dr byrne last year half last m...,0.069479,positive
2,4,dr byrne great doctor great bed side manner ex...,0.317778,positive
3,3,raise review dr bryne receptive daughter go an...,0.002806,positive
4,1,wish could give star bad office ever horrible ...,-0.084259,negative


In [6]:
data = df.copy()

In [7]:
data = data.ix[np.where((data.stars==1)|(data.stars==5))]

data.head()

Unnamed: 0,stars,clean_text,sent_value_clean,sent_score_clean
0,1,please stay away place bad care imaginable sta...,-0.036719,negative
1,5,husband patient dr byrne last year half last m...,0.069479,positive
4,1,wish could give star bad office ever horrible ...,-0.084259,negative
5,1,go emergency room kidney stone attack emergenc...,0.133618,positive
6,5,dr byrne excellent doctor right skill include ...,0.210708,positive


In [8]:
data.stars.replace(1,0,inplace=True)
data.stars.replace(5,1,inplace=True)

In [9]:
data.head()

Unnamed: 0,stars,clean_text,sent_value_clean,sent_score_clean
0,0,please stay away place bad care imaginable sta...,-0.036719,negative
1,1,husband patient dr byrne last year half last m...,0.069479,positive
4,0,wish could give star bad office ever horrible ...,-0.084259,negative
5,0,go emergency room kidney stone attack emergenc...,0.133618,positive
6,1,dr byrne excellent doctor right skill include ...,0.210708,positive


## split test train

In [13]:
from sklearn.model_selection import train_test_split

In [11]:
 train_test_split?

In [14]:
review = data['clean_text'].values.astype(str)
sentiments = data['stars'].values

In [15]:
X_train, X_test, y_train, y_test = train_test_split(review, sentiments, test_size=0.33, random_state=42)

## Pre-processing

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer  # two classes of text conv to 1&0's

In [17]:
count_vect = CountVectorizer() # counts the occurance of text in doc

In [18]:
X_train_counts = count_vect.fit_transform(X_train)

In [19]:
list(count_vect.vocabulary_.items())[0:3]

[('client', 5210), ('lenscrafter', 16571), ('year', 32366)]

In [20]:
len(count_vect.vocabulary_)

32641

In [21]:
lab_bin = LabelBinarizer()
y_train_bin = lab_bin.fit_transform(y_train)
y_test_bin = lab_bin.fit_transform(y_test)

## Train

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
clf = MultinomialNB().fit(X_train_counts, y_train_bin.ravel())

In [24]:
len(clf.coef_[0])

32641

In [25]:
import collections

In [26]:
importanceCount = collections.Counter()

In [27]:
for word, imp in zip(count_vect.vocabulary_.keys(), clf.coef_[0]):
    importanceCount[word] = imp

In [28]:
importanceCount.most_common()[-10:]

[('partridg', -13.885211978371142),
 ('blurb', -13.885211978371142),
 ('deputy', -13.885211978371142),
 ('gunbelt', -13.885211978371142),
 ('lode', -13.885211978371142),
 ('baranoff', -13.885211978371142),
 ('sript', -13.885211978371142),
 ('winfrey', -13.885211978371142),
 ('schedd', -13.885211978371142),
 ('snort', -13.885211978371142)]

In [29]:
importanceCount.most_common()[0:10]

[('xiao', -3.7565429524952396),
 ('aliyah', -4.024579736141444),
 ('mattress', -4.472012398383457),
 ('expirienc', -4.495053179819124),
 ('subscriber', -4.501674887933563),
 ('askaris', -4.562436177065171),
 ('dx', -4.630759031776639),
 ('seizure', -4.639504462557669),
 ('christie', -4.655559295362586),
 ('bergner', -4.709256033256791)]

## Now test

In [30]:
X_test_counts = count_vect.transform(X_test)

In [31]:
pred = clf.predict(X_test_counts)

In [32]:
from sklearn.metrics import average_precision_score

In [33]:
print('Preicision = {:f}'.format(average_precision_score(y_test_bin,pred)))

Preicision = 0.955101


## sanity check

In [34]:
clf.predict(count_vect.transform(['this nurse couldnt even measure my temperature corectly']))

array([0])

## prob of the prediction

In [35]:
clf.predict_proba(count_vect.transform(['this nurse couldnt even measure my temperature corectly']))

array([[0.93845429, 0.06154571]])

In [36]:
rev0 = '''
If you plan on working here for benefits, run away, especially if you want to have a baby here. You would think being an employee would offer some perks, but it does not.  They changed their insurance plans at the end of the year, so people who planned a pregnancy in 2017 were screwed if their babies were born in 2018. $1000's in in hospital bills (and this is for an uncomplicated delivery). 

If Dinette is your nurse, ask for another one. My experience was terrible with her. I called to file a complaint about her and was told they would look into it. No one has ever contacted me!

Watch out for your itemized billing statements. They charge you for the hospital, the nurses, the anesthesiologist, OB and labs all separately. This is why our healthcare is going down the hole. There is no possible way to tell if you are being charged properly. Also, they charge you to admit you.
'''

rev1 = '''
First off this is a review of the cafe.  This is by far the best hospital food I've had.  Their cafe honestly better than Luby's cafe food.  They offer a wide variety of selections from fresh to order sandwiches, a BBQ area, Asian Stir frys, Homestyle cooking meals, Tex-Mex stand, Pasta Bar, Pizza, Sushi, fresh made salads and soup, and a made to order grill.  But what makes this a 5 star is their desserts.  Their desserts are amazing!  The presentation looks like something out of la madeline.  I am willing to make the walk from work to this cafe.
'''

rev00 = '''
Attn Small Business Owners and Independence Insurance Policy Holders. Beginning in  March 2018, received a call from Houston Methodist Hospital concerning a bill for 289.20.  On March 16, 2018, paid the amount over the phone with a Customer Service representative receiving a confirmation from my bank stating the bill was paid with an authorization code time 11:53 a.m. As of May 3rd, 2018, my patient portal still reflecting a payment due on the same bill listed above. Contacted Houston Methodist Hospital again for the same bill their representative called me back in March 2018 and payment was not credited to my bill. 

I receive a call from the same customer service department dated 24th of May, 2108, this representative from Houston Methodist department stating my insurance payment has been posted from the EOB(Explanation of Benefits) for the outstanding bill of 30, 033.20 and an outstanding credit (REFUND) of $594.09 will be processed. The female representative will process the refund amount stated above.  She left the contact number 832-667-6291 (does not work) concerning her message. 

Now returning to my mailbox on June 7, 2018 inside I found a bill from Houston Methodist Hospital for the amount listed above for 30,033.20. OK- DUMB ASS - I am totally sick of you and will use all multimedia to post my complaint plus letters to the Board Members and Chief Executive Officer. I have kept a copy of this post for my file !
'''
rev11 = '''
This review is solely based on being a visitor and not a patient.

Houston Methodist is a top notch hospital in terms of patient care and customer service to visitors. From the cafeteria staff to nurses, from the security to the surgeons, every staff member greet you with a smile and goes above and beyond to serve you. 

Here are few compliments about this hospital:

1. Food- The cafeteria food is amazing and reasonably price. From pasta, pizza, sushi, fried fish sandwiches and more, they have it all. Did I mentioned that it was reasonably priced? I love that the hospital is not trying to overcharge you when you are frequently visiting here. 

2. Staff- They are very friendly and sensitive to your reason of visiting the facility. The nurse staff was  personable to family and loved one needing the care. We felt comforted knowing that our loved one was in good hands.

ONLY ISSUE is parking. There should be a special discount or program for visitors who frequently visit for weeks or months. Charging $13 a day can be very taxing on a family. Especially if they have more than one family member driving to the facility.
'''

rev000 = '''
The medical care we received at Methodist was very good.

Billing and customer service is beyond terrible! If I lost a leg across the street from Methodist...I would find a way to drive across town to a different hospital.

My son got stitches from a nurse. The procedure never involved a doctor or any medicine, and probably took an hour from start to finish. I received a bill weeks later for nearly $2500! For stitches! 

I also received a bill from the physician's office for the same work (but only for about $250, which I happily paid). Mind you, the physician's office is billing for the people that actually did the work! Methodist is supposed to only be charging for use of the facility, equipment, and medications.

Comparing the two bills, it became apparent that not only was Methodist charging me for work done by the physician's office (double charging), they were charging far more than the physicians were. Further, nearly all of the charges were incorrect (they listed the wrong trauma level, wrong part of the body!, and wrong length of the laceration). 

When I called billing to correct these issues, they were not able to answer anything. You may as well be talking to burger king workers about your medical billing. They insist that everything is correct, while not being able to explain what any of it means, or answer your questions. Finally they transfer you to their supervisor, which is always voicemail, and then they never call you back. I have made about 10 calls to their office now, and six months later, the issue is still unresolved.

The physician's office even told me that Methodist is their worst client, and never handles their billing correctly.
'''

In [37]:
clf.predict(count_vect.transform([rev000]))

array([0])

In [38]:
clf.predict_proba(count_vect.transform([rev000]))

array([[1.00000000e+00, 1.13148292e-20]])