In [1]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import time
import spacy
import pickle
import textacy
from textblob import TextBlob
from text_cleanup import normalize_corpus

## Data Directory

In [32]:
# data folder path
data_directory= os.path.join( '..','data','clean_data')
data_directory_pickle = os.path.join('..','data','pickle')

# data file location
dataset = os.path.join(data_directory,'health_text_data.csv')

In [4]:
df = pd.read_csv(dataset)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44918 entries, 0 to 44917
Data columns (total 5 columns):
business_id    44918 non-null object
review_id      44918 non-null object
name           44918 non-null object
stars          44918 non-null int64
text           44918 non-null object
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [6]:
df.head()

Unnamed: 0,business_id,review_id,name,stars,text
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,Healthcare Partner,5,"Memorial Day Weekend.. I can't Thank Dr, Shuc..."
2,2hpi6pXIFf0taDIYCoNIuw,5wf0Y31knIyBcbS7ZKrW9Q,Healthcare Partner,5,This is not my primary doctor and I was pleasa...
3,2hpi6pXIFf0taDIYCoNIuw,pNVZVifb7FHtr7xMAbLZ4Q,Healthcare Partner,5,Went there the first time for myself and was n...
4,2hpi6pXIFf0taDIYCoNIuw,OfFzPtU9N0ooeQbIg6yR-g,Healthcare Partner,1,Terrible assistance. I was there to see a rash...


## Replace medial names

In [7]:
MEDICAL_MAP = {
'GYN':'Gynecologist',
'RN': 'Registered Nurse',
'PA-C':'Physician Assistant',
'PA':'Physician Assistant',
'NP-C':'Nurse Practioner',
'NP':'Nurse Practioner',
'PT'   : 'Physical Therapist',
'MD': 'Physician',
'Dr.': 'Doctor',
'Dr,': 'Doctor',
'Dr': 'Doctor',
'PHN' :  'Public Health Nurse',
'RNC' : 'Certified Registered Nurse',
'gyn':'Gynecologist',
'rn': 'Registered Nurse',
'pa-c':'Physician Assistant',
'pa':'Physician Assistant',
'np-c':'Nurse Practioner',
'np':'Nurse Practioner',
'pt' : 'Physical Therapist',
'md': 'Physician',
'dr.': 'Doctor',
'dr,': 'Doctor',
'dr': 'Doctor',
'phn' :  'Public Health Nurse',
'rnc' : 'Certified Registered Nurse'
}

In [9]:
def replace_medical(text):
    for word in text.split():
        if word  in MEDICAL_MAP:
            text = text.replace(word, MEDICAL_MAP[word])
    return text

## test

In [10]:
df.text[0]

"If your aim is to waste hours upon hours of your life (literally 2+) when you're very ill only to meet with a PA instead of a Dr. who insists on giving you nothing but over-the-counter meds, even after you call back and plead with him (ahem, Carlos Vasquez, PA-C), then this is the spot for you. Seriously, I've never been to a dirtier, less competent, more irksome place than Healthcare Partners Medical Group Urgent Care. I never take meds and was sick enough to really need them. This PA (who, btw, mispronounces his own name) is the least capable and sensitive person I've ever met. What a worthless facility and staff."

In [11]:
replace_medical(df.text[0])

"If your aim is to waste hours upon hours of your life (literally 2+) when you're very ill only to meet with a Physician Assistant instead of a Doctor who insists on giving you nothing but over-the-counter meds, even after you call back and plead with him (ahem, Carlos Vasquez, Physician Assistant-C), then this is the spot for you. Seriously, I've never been to a dirtier, less competent, more irksome place than Healthcare Partners Medical Group Urgent Care. I never take meds and was sick enough to really need them. This Physician Assistant (who, btw, mispronounces his own name) is the least capable and sensitive person I've ever met. What a worthless facility and staff."

## Apply it

In [12]:
%%time
df['text'] = df['text'].apply(replace_medical)

CPU times: user 1.12 s, sys: 125 ms, total: 1.25 s
Wall time: 1.27 s


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44918 entries, 0 to 44917
Data columns (total 5 columns):
business_id    44918 non-null object
review_id      44918 non-null object
name           44918 non-null object
stars          44918 non-null int64
text           44918 non-null object
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [14]:
df.head()

Unnamed: 0,business_id,review_id,name,stars,text
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,Healthcare Partner,5,Memorial Day Weekend.. I can't Thank Doctor S...
2,2hpi6pXIFf0taDIYCoNIuw,5wf0Y31knIyBcbS7ZKrW9Q,Healthcare Partner,5,This is not my primary doctor and I was pleasa...
3,2hpi6pXIFf0taDIYCoNIuw,pNVZVifb7FHtr7xMAbLZ4Q,Healthcare Partner,5,Went there the first time for myself and was n...
4,2hpi6pXIFf0taDIYCoNIuw,OfFzPtU9N0ooeQbIg6yR-g,Healthcare Partner,1,Terrible assistance. I was there to see a rash...


In [15]:
df.text[1]

"Memorial Day Weekend..  I can't Thank Doctor Shucmacher, his head nurse and staff for saving my Life...  I had an allergic reaction and they immediately went into action when I arrived and I can't Thank them enough... It just so happened (with follow up) that I have a growth on my tongue that swells when I have an allergic reaction causing me to not be able  to swallow and difficulty speaking and breathing...  The whole staff was amazing, caring and truly interested in what they could do to help me...  They were an AMAZING STAFF and I can't thank them enough....THANK YOU From THE Bottom of my HEART!  Holly Hernandez/Pahrump NV"

# Preprocess the text


In [16]:
%%time
df['processed'] = df['text'].map(lambda x: textacy.preprocess.preprocess_text(x, lowercase=True, 
                                                                                    no_urls=True, 
                                                                                    no_punct=True, 
                                                                                    no_numbers=True))

CPU times: user 27.2 s, sys: 219 ms, total: 27.4 s
Wall time: 27.9 s


In [17]:
df.text[1030]

'I was very pleased with the friendly service, short wait and the thoroughness of the Doctor Facility was very clean and welcoming aswell.'

In [18]:
df.processed[1030]

'i was very pleased with the friendly service short wait and the thoroughness of the doctor facility was very clean and welcoming aswell'

## save index

In [40]:
np.save(data_directory_pickle +'/health_data_index.npy', df.index)

# Tokeni

In [19]:
nlp = textacy.load_spacy("en_core_web_sm", disable = ("tagger", "parser", "ner", "textcat"))

In [21]:
def token_filter(token): 
    '''shortent tokens & less than 4 stop words'''
    return not (token.is_stop | len(token.text) <= 4)

## Create our corpus

In [28]:
%%time
docs = df['processed'].astype('str').astype('unicode').tolist()


CPU times: user 15.6 ms, sys: 15.6 ms, total: 31.2 ms
Wall time: 17.5 ms


In [33]:
len(docs)

44918

In [29]:
filtered_tokens = []
start = time.time()
i = 1
for doc in nlp.pipe(docs, disable=['tagger', 'parser', 'ner', 'textcat'], batch_size=10000):
    try:
        tokens = [token.lemma_ for token in doc if token_filter(token)]
        filtered_tokens.append(tokens)
        i += 1
        if i % 10000 == 0:
            print(f'Tokenized {i} documents in {(time.time()-start)/60} minutes')
    except:
        print(f'Document {i} has an encoding error/has error characters.')

Tokenized 10000 documents in 0.8295161525408427 minutes
Tokenized 20000 documents in 1.709996183713277 minutes
Tokenized 30000 documents in 2.532618741194407 minutes
Tokenized 40000 documents in 3.325513219833374 minutes


In [37]:
len(filtered_tokens)

44918

## Save tokenized data

In [36]:
with open(data_directory_pickle + '/health_data_tokenized.pkl', 'wb') as f:
    pickle.dump(filtered_tokens, f)

## save dataframe data

In [41]:
with open(data_directory_pickle + '/health_data.pkl', 'wb') as f:
    pickle.dump(df, f)

## Sentiment Analysis
   * **Polarity**: How positive or negative a word is. -1 is very negative. +1 is very positive.
   * **Subjectivity**: How subjective, or opinionated a word is. 0 is fact. +1 is very much an opinion.

In [None]:
%%time
polarity = lambda x: TextBlob(x).sentiment.polarity
subjectivity = lambda x: TextBlob(x).sentiment.subjectivity

# df['polarity'] = df['text'].apply(polarity)
# df['subjectivity'] = df['text'].apply(subjectivity)
df['polarity_processed'] = df['processed'].apply(polarity)
df['subjectivity_processed'] = df['processed'].apply(subjectivity)

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.stars.value_counts()

In [None]:
five = df.groupby('stars').get_group(5)
one = df.groupby('stars').get_group(1)

In [None]:
five.polarity.mean()

In [None]:
five.polarity_processed.mean()

In [None]:
one.polarity.mean()

In [None]:
one.polarity_processed.mean()

In [None]:
#df[['polarity','polarity_processed']]
df['polarity'] - df['polarity_processed']

In [None]:
def num2words(num):
    nums_20_90 = ['Twenty','Thirty','Forty','Fifty','Sixty','Seventy','Eighty','Ninety']
    nums_0_19 = ['Zero','One','Two','Three','Four','Five','Six','Seven','Eight',"Nine", 'Ten','Eleven','Twelve','Thirteen','Fourteen','Fifteen','Sixteen','Seventeen','Eighteen','Nineteen']
    nums_dict = {100: 'hundred',1000:'thousand', 1000000:'million', 1000000000:'billion'}
    if num < 20:
        return nums_0_19[num]
    if num < 100:
        return nums_20_90[num/10-2] + ('' if num%10 == 0 else ' ' +  nums_0_19[num%10])
    # find the largest key smaller than num
    maxkey = max([key for key in nums_dict.keys() if key <= num])
    return num2words(num/maxkey) + ' ' + nums_dict[maxkey] + ('' if num%maxkey == 0 else ' ' + num2words(num%maxkey))

In [None]:
text = 'i was 42 one years old'

In [None]:
#!pip install num2words

In [None]:
from num2words import num2words

In [None]:
num2words('42',lang='en')

In [None]:
num2words('i was 42 one years old')