## Sentiment Analysis

In [1]:
import warnings
# Ignore every warning raised from this point on.
# NOTE(review): this also hides deprecation warnings from pandas and other
# libraries; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
path = 'yelp_data/health_text.csv'

# Read the reviews and discard the 'Unnamed: 0' column in one expression
# (presumably a row index written by an earlier to_csv call; confirm).
df = pd.read_csv(path).drop(columns=['Unnamed: 0'])

In [4]:
# Check column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 3 columns):
stars         64006 non-null int64
text          64006 non-null object
clean_text    64006 non-null object
dtypes: int64(1), object(2)
memory usage: 1.5+ MB


In [5]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
2,4,Dr. Byrne is a great doctor! She has great bed...,dr byrne great doctor great bed side manner ex...
3,3,I'm raising my review as Dr Bryne's has been m...,raise review dr bryne receptive daughter go an...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...


## TextBlob

In [6]:
from textblob import TextBlob
import itertools

## Iterate over the text and get a sentiment value

In [7]:
def get_sentiment(dataframe, polarity_fn=None):
    '''Score the polarity of the raw and the cleaned review text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns 'stars', 'text' and 'clean_text'.
    polarity_fn : callable, optional
        Maps a single text value to a polarity score. When omitted,
        ``TextBlob(text).sentiment.polarity`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns 'stars', 'sent_value' (score of 'text') and
        'sent_value_clean' (score of 'clean_text'), one row per input row.
        The result carries the same index as ``dataframe`` so it can be
        joined back to the input on the index.

    The start time and the total elapsed time are printed as progress output.
    '''
    if polarity_fn is None:
        def textblob_polarity(text):
            return TextBlob(text).sentiment.polarity
        polarity_fn = textblob_polarity

    t1 = datetime.now()
    print('starting: {}'.format(t1))

    # Iterate the text columns directly; iterrows() would build a Series for
    # every row only to read three fields from it.
    sent_val = [polarity_fn(text) for text in dataframe['text']]
    sent_val_clean = [polarity_fn(text) for text in dataframe['clean_text']]
    print("Done with values:")

    # Build the frame in one step. 'stars' is passed as a plain array so it is
    # placed positionally instead of being re-aligned against the index.
    sent_df = pd.DataFrame(
        {
            'stars': dataframe['stars'].values,
            'sent_value': sent_val,
            'sent_value_clean': sent_val_clean,
        },
        columns=['stars', 'sent_value', 'sent_value_clean'],
        index=dataframe.index,
    )
    print('Done, total time: {}'.format(datetime.now() - t1))
    return sent_df

In [8]:
# Show the input frame again right before it is scored.
df.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
2,4,Dr. Byrne is a great doctor! She has great bed...,dr byrne great doctor great bed side manner ex...
3,3,I'm raising my review as Dr Bryne's has been m...,raise review dr bryne receptive daughter go an...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...


In [9]:
# Score every review in df; the recorded run below took roughly three minutes.
sentiment_df = get_sentiment(df)

starting: 2018-09-25 22:14:31.392362
Done with values:
Done, total time: 0:02:55.605755


In [10]:
# Preview the polarity scores next to the star ratings.
sentiment_df.head()

Unnamed: 0,stars,sent_value,sent_value_clean
0,1,-0.050645,-0.036719
1,5,0.024962,0.069479
2,4,0.513333,0.317778
3,3,-0.035714,0.002806
4,1,-0.048246,-0.084259


## Get categorical value

- Create a function that maps a sentiment value to a categorical label (positive, negative, or neutral).

In [11]:
def sent_score(sent_val):
    '''Map a numeric sentiment value to a categorical label.

    Returns 'positive' for values above zero, 'negative' for values below
    zero, and 'neutral' for anything else (zero included).
    '''
    if sent_val > 0:
        return 'positive'
    if sent_val < 0:
        return 'negative'
    return 'neutral'

#### Test

In [12]:
# Spot-check input: raw-text polarity of the row labelled 0.
sentiment_df.sent_value[0]

-0.05064484126984127

In [13]:
# Spot-check: the label sent_score assigns to the row-0 polarity.
sent_score(sentiment_df.sent_value[0])

'negative'

In [14]:
%%time
sentiment_df['sent_score'] = sentiment_df.sent_value.apply(sent_score)
sentiment_df['sent_score_clean'] = sentiment_df.sent_value_clean.apply(sent_score)

CPU times: user 31.2 ms, sys: 15.6 ms, total: 46.9 ms
Wall time: 41.4 ms


In [15]:
# Check that both label columns exist and inspect their non-null counts.
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 5 columns):
stars               64006 non-null int64
sent_value          64006 non-null float64
sent_value_clean    64006 non-null float64
sent_score          64006 non-null object
sent_score_clean    64006 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 2.4+ MB


In [16]:
# Preview the scores together with their categorical labels.
sentiment_df.head()

Unnamed: 0,stars,sent_value,sent_value_clean,sent_score,sent_score_clean
0,1,-0.050645,-0.036719,negative,negative
1,5,0.024962,0.069479,positive,positive
2,4,0.513333,0.317778,positive,positive
3,3,-0.035714,0.002806,negative,positive
4,1,-0.048246,-0.084259,negative,negative


In [17]:
# Show the original frame again before the scores are merged back into it.
df.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
2,4,Dr. Byrne is a great doctor! She has great bed...,dr byrne great doctor great bed side manner ex...
3,3,I'm raising my review as Dr Bryne's has been m...,raise review dr bryne receptive daughter go an...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...


## Merge with original dataframe

In [18]:
# Keep only the df columns that sentiment_df does not already have, so that
# merging the two frames adds no duplicate columns. Index.difference returns
# the labels sorted; the column order is rearranged later in the notebook.
cols_to_use = df.columns.difference(sentiment_df.columns)

In [19]:
# Join the scores and the text columns row by row on the index.
data_sentiment = pd.merge(
    left=sentiment_df,
    right=df[cols_to_use],
    how='outer',
    left_index=True,
    right_index=True,
)

In [20]:
# Check the merged frame's row count, columns and non-null counts.
data_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64006 entries, 0 to 64005
Data columns (total 7 columns):
stars               64006 non-null int64
sent_value          64006 non-null float64
sent_value_clean    64006 non-null float64
sent_score          64006 non-null object
sent_score_clean    64006 non-null object
clean_text          64006 non-null object
text                64006 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 6.4+ MB


## Rearrange columns

In [21]:
# List the current column order before rearranging it.
list(data_sentiment.columns)

['stars',
 'sent_value',
 'sent_value_clean',
 'sent_score',
 'sent_score_clean',
 'clean_text',
 'text']

In [22]:
# Put each text column directly before the score and label derived from it.
data_sentiment = data_sentiment[[
    'stars',
    'text', 'sent_value', 'sent_score',
    'clean_text', 'sent_value_clean', 'sent_score_clean',
]]

In [23]:
# Preview the reordered frame.
data_sentiment.head()

Unnamed: 0,stars,text,sent_value,sent_score,clean_text,sent_value_clean,sent_score_clean
0,1,Please stay away from this place if you can! I...,-0.050645,negative,please stay away place bad care imaginable sta...,-0.036719,negative
1,5,My husband has been a patient of Dr. Byrne for...,0.024962,positive,husband patient dr byrne last year half last m...,0.069479,positive
2,4,Dr. Byrne is a great doctor! She has great bed...,0.513333,positive,dr byrne great doctor great bed side manner ex...,0.317778,positive
3,3,I'm raising my review as Dr Bryne's has been m...,-0.035714,negative,raise review dr bryne receptive daughter go an...,0.002806,positive
4,1,I wish I could give 0 stars. Worst office I've...,-0.048246,negative,wish could give star bad office ever horrible ...,-0.084259,negative


In [24]:
# Rebinds `path` (previously the input file) to the output location.
path = 'yelp_data/health_text_sentiment.csv'
# NOTE(review): to_csv writes the index by default, so the saved file gains an
# unnamed first column that pd.read_csv surfaces as 'Unnamed: 0' (the same
# kind of column this notebook drops right after loading its input). Pass
# index=False once it is confirmed that no downstream reader expects it.
data_sentiment.to_csv(path)