In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import time
import spacy
import pickle
import textacy
from textblob import TextBlob
from contractions import MEDICAL_MAP,CONTRACTION_MAP

## Data Directory

In [3]:
# data folder path
data_directory= os.path.join( '..','data','clean_data')
data_directory_pickle = os.path.join('..','data','pickle')

# data file location
dataset = os.path.join(data_directory,'health_text_data.csv')

In [4]:
df = pd.read_csv(dataset)
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44918 entries, 0 to 44917
Data columns (total 6 columns):
business_id        44918 non-null object
review_id          44918 non-null object
health_business    44918 non-null object
name               44918 non-null object
stars              44918 non-null int64
text               44918 non-null object
dtypes: int64(1), object(5)
memory usage: 2.1+ MB


In [6]:
df.head()

Unnamed: 0,business_id,review_id,health_business,name,stars,text
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,urgent care,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,urgent care,Healthcare Partner,5,"Memorial Day Weekend.. I can't Thank Dr, Shuc..."
2,2hpi6pXIFf0taDIYCoNIuw,5wf0Y31knIyBcbS7ZKrW9Q,urgent care,Healthcare Partner,5,This is not my primary doctor and I was pleasa...
3,2hpi6pXIFf0taDIYCoNIuw,pNVZVifb7FHtr7xMAbLZ4Q,urgent care,Healthcare Partner,5,Went there the first time for myself and was n...
4,2hpi6pXIFf0taDIYCoNIuw,OfFzPtU9N0ooeQbIg6yR-g,urgent care,Healthcare Partner,1,Terrible assistance. I was there to see a rash...


## Replace medical titles and contractions

In [7]:
def replace_contraction_medical(text):
    for word in text.split():
        if word  in MEDICAL_MAP:
            text = text.replace(word, MEDICAL_MAP[word])
        if word  in CONTRACTION_MAP:
            text = text.replace(word, CONTRACTION_MAP[word])
    return text

## test

In [8]:
df.text[1]

"Memorial Day Weekend..  I can't Thank Dr, Shucmacher, his head nurse and staff for saving my Life...  I had an allergic reaction and they immediately went into action when I arrived and I can't Thank them enough... It just so happened (with follow up) that I have a growth on my tongue that swells when I have an allergic reaction causing me to not be able  to swallow and difficulty speaking and breathing...  The whole staff was amazing, caring and truly interested in what they could do to help me...  They were an AMAZING STAFF and I can't thank them enough....THANK YOU From THE Bottom of my HEART!  Holly Hernandez/Pahrump NV"

In [9]:
replace_contraction_medical(df.text[1])

'Memorial Day Weekend..  I cannot Thank Doctor Shucmacher, his head nurse and staff for saving my Life...  I had an allergic reaction and they immediately went into action when I arrived and I cannot Thank them enough... It just so happened (with follow up) that I have a growth on my tongue that swells when I have an allergic reaction causing me to not be able  to swallow and difficulty speaking and breathing...  The whole staff was amazing, caring and truly interested in what they could do to help me...  They were an AMAZING STAFF and I cannot thank them enough....THANK YOU From THE Bottom of my HEART!  Holly Hernandez/Pahrump NV'

## Apply it

In [10]:
%%time
df['text'] = df['text'].apply(replace_contraction_medical)

CPU times: user 1.02 s, sys: 62.5 ms, total: 1.08 s
Wall time: 1.05 s


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44918 entries, 0 to 44917
Data columns (total 6 columns):
business_id        44918 non-null object
review_id          44918 non-null object
health_business    44918 non-null object
name               44918 non-null object
stars              44918 non-null int64
text               44918 non-null object
dtypes: int64(1), object(5)
memory usage: 2.1+ MB


In [12]:
df.head()

Unnamed: 0,business_id,review_id,health_business,name,stars,text
0,2hpi6pXIFf0taDIYCoNIuw,4ar9LmGU4rQ3vXFj325HCg,urgent care,Healthcare Partner,1,If your aim is to waste hours upon hours of yo...
1,2hpi6pXIFf0taDIYCoNIuw,mZo59NzNBPr9RegkzjIGVA,urgent care,Healthcare Partner,5,Memorial Day Weekend.. I cannot Thank Doctor ...
2,2hpi6pXIFf0taDIYCoNIuw,5wf0Y31knIyBcbS7ZKrW9Q,urgent care,Healthcare Partner,5,This is not my primary doctor and I was pleasa...
3,2hpi6pXIFf0taDIYCoNIuw,pNVZVifb7FHtr7xMAbLZ4Q,urgent care,Healthcare Partner,5,Went there the first time for myself and was n...
4,2hpi6pXIFf0taDIYCoNIuw,OfFzPtU9N0ooeQbIg6yR-g,urgent care,Healthcare Partner,1,Terrible assistance. I was there to see a rash...


In [13]:
df.text[1]

'Memorial Day Weekend..  I cannot Thank Doctor Shucmacher, his head nurse and staff for saving my Life...  I had an allergic reaction and they immediately went into action when I arrived and I cannot Thank them enough... It just so happened (with follow up) that I have a growth on my tongue that swells when I have an allergic reaction causing me to not be able  to swallow and difficulty speaking and breathing...  The whole staff was amazing, caring and truly interested in what they could do to help me...  They were an AMAZING STAFF and I cannot thank them enough....THANK YOU From THE Bottom of my HEART!  Holly Hernandez/Pahrump NV'

# Preprocess the text


In [14]:
%%time
df['processed'] = df['text'].map(lambda x: textacy.preprocess.preprocess_text(x, lowercase=True, 
                                                                                    no_urls=True, 
                                                                                    no_punct=True, 
                                                                                    no_numbers=True))

CPU times: user 14.4 s, sys: 78.1 ms, total: 14.5 s
Wall time: 14.5 s


In [17]:
df.processedessed[1]

'memorial day weekend i cannot thank doctor shucmacher his head nurse and staff for saving my life i had an allergic reaction and they immediately went into action when i arrived and i cannot thank them enough it just so happened with follow up that i have a growth on my tongue that swells when i have an allergic reaction causing me to not be able to swallow and difficulty speaking and breathing the whole staff was amazing caring and truly interested in what they could do to help me they were an amazing staff and i cannot thank them enough thank you from the bottom of my heart holly hernandez pahrump nv'

## Sentiment Analysis
   * **Polarity**: How positive or negative a word is. -1 is very negative. +1 is very positive.
   * **Subjectivity**: How subjective, or opinionated a word is. 0 is fact. +1 is very much an opinion.

In [15]:
%%time
polarity = lambda x: TextBlob(x).sentiment.polarity
subjectivity = lambda x: TextBlob(x).sentiment.subjectivity

df['polarity'] = df['processed'].apply(polarity)
df['subjectivity'] = df['processed'].apply(subjectivity)

CPU times: user 1min 5s, sys: 297 ms, total: 1min 5s
Wall time: 1min 5s


## save dataframe data

In [16]:
with open(data_directory_pickle + '/health_data.pkl', 'wb') as f:
    pickle.dump(df, f)