## Sentiment Values
- textblob

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
# Read the review export and discard the 'Unnamed: 0' column in one chained
# expression, so `df` is bound once to the final frame.
path = 'yelp_data/health_text_full.csv'
df = pd.read_csv(path).drop('Unnamed: 0', axis=1)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 14 columns):
business_id        64006 non-null object
categories         64006 non-null object
name               64006 non-null object
state              64006 non-null object
categories_list    64006 non-null object
cool               64006 non-null int64
date               64006 non-null object
funny              64006 non-null int64
review_id          64006 non-null object
stars              64006 non-null int64
text               64006 non-null object
useful             64006 non-null int64
user_id            64006 non-null object
clean_text         64006 non-null object
dtypes: int64(4), object(10)
memory usage: 6.8+ MB


In [5]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list,cool,date,funny,review_id,stars,text,useful,user_id,clean_text
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2018-03-03,0,TNNkSmMfshsD3G60jTNjDA,1,Please stay away from this place if you can! I...,2,xv2V2GO5IZYvtw4oW7gQ1w,please stay away place bad care imaginable sta...
1,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2015-11-29,0,v-iKdstPdCxJr8zV1ZMdrw,5,My husband has been a patient of Dr. Byrne for...,1,SjvWP7c9toeZoV_q62zhTA,husband patient dr byrne last year half last m...
2,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2016-06-03,0,BmNDRCV9_NzQ_KCChyfdEw,4,Dr. Byrne is a great doctor! She has great bed...,2,sZVHm1aLtvyH9trAc2_MgA,dr byrne great doctor great bed side manner ex...
3,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2017-03-17,0,t_TKVMxKFYm9Hl-TIO7UUw,3,I'm raising my review as Dr Bryne's has been m...,3,8Y_irXocZdZxLs_qgzpjBw,raise review dr bryne receptive daughter go an...
4,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2016-08-31,0,NWRrpGRgWZBBj3lvCZGVKA,1,I wish I could give 0 stars. Worst office I've...,1,hVKPDGpG12z7vpScXaSakw,wish could give star bad office ever horrible ...


## Textblob

In [6]:
from textblob import TextBlob
import itertools

## iterate over text and get sentiment value

In [7]:
def get_sentiment(dataframe):
    '''Compute a TextBlob polarity score for the raw and the cleaned review text.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'text', 'clean_text' and 'stars'.

    Returns
    -------
    pandas.DataFrame
        One row per input row, carrying the same index as ``dataframe``, with
        the columns 'stars' (copied from the input), 'sent_value' (polarity of
        'text') and 'sent_value_clean' (polarity of 'clean_text').

    Progress and the total elapsed time are printed to stdout.
    '''
    t1 = datetime.now()
    print('starting: {}'.format(t1))
    # Walk the text columns directly; iterrows() would build a full Series for
    # every row just to read three fields from it.
    sent_val = [TextBlob(raw).sentiment.polarity for raw in dataframe['text']]
    sent_val_clean = [TextBlob(cleaned).sentiment.polarity
                      for cleaned in dataframe['clean_text']]
    print("Done with values:")
    # Keep the caller's index: the result is merged back onto the source frame
    # by index, which only lines up if both frames share the same labels.
    # `.values` strips the index from 'stars' so duplicate labels cannot
    # trigger a reindex error.
    sent_df = pd.DataFrame(
        {
            'stars': dataframe['stars'].values,
            'sent_value': sent_val,
            'sent_value_clean': sent_val_clean,
        },
        index=dataframe.index,
        columns=['stars', 'sent_value', 'sent_value_clean'],
    )
    print('Done, total time: {}'.format(datetime.now() - t1))
    return sent_df

In [8]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list,cool,date,funny,review_id,stars,text,useful,user_id,clean_text
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2018-03-03,0,TNNkSmMfshsD3G60jTNjDA,1,Please stay away from this place if you can! I...,2,xv2V2GO5IZYvtw4oW7gQ1w,please stay away place bad care imaginable sta...
1,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2015-11-29,0,v-iKdstPdCxJr8zV1ZMdrw,5,My husband has been a patient of Dr. Byrne for...,1,SjvWP7c9toeZoV_q62zhTA,husband patient dr byrne last year half last m...
2,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2016-06-03,0,BmNDRCV9_NzQ_KCChyfdEw,4,Dr. Byrne is a great doctor! She has great bed...,2,sZVHm1aLtvyH9trAc2_MgA,dr byrne great doctor great bed side manner ex...
3,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2017-03-17,0,t_TKVMxKFYm9Hl-TIO7UUw,3,I'm raising my review as Dr Bryne's has been m...,3,8Y_irXocZdZxLs_qgzpjBw,raise review dr bryne receptive daughter go an...
4,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2016-08-31,0,NWRrpGRgWZBBj3lvCZGVKA,1,I wish I could give 0 stars. Worst office I've...,1,hVKPDGpG12z7vpScXaSakw,wish could give star bad office ever horrible ...


In [9]:
# Score every review in `df`; the run recorded below took just under three
# minutes for the 64,006-row frame.
sentiment_df = get_sentiment(df)

starting: 2018-09-30 09:44:02.059801
Done with values:
Done, total time: 0:02:51.716279


In [10]:
sentiment_df.head()

Unnamed: 0,stars,sent_value,sent_value_clean
0,1,-0.050645,-0.036719
1,5,0.024962,0.069479
2,4,0.513333,0.317778
3,3,-0.035714,0.002806
4,1,-0.048246,-0.084259


## get categorical value

- create a function that returns categorical value from the sentiment value

In [11]:
def sent_score(sent_val):
    '''Turn a numeric sentiment value into a categorical label.

    Returns 'positive' for values above zero, 'negative' for values below
    zero, and 'neutral' for anything else.
    '''
    if sent_val > 0:
        return 'positive'
    if sent_val < 0:
        return 'negative'
    return 'neutral'

#### Test

In [12]:
sentiment_df.sent_value[0]

-0.05064484126984127

In [13]:
sent_score(sentiment_df.sent_value[0])

'negative'

In [14]:
%%time
# Label each polarity column element-wise with sent_score.
sentiment_df['sent_score'] = sentiment_df['sent_value'].map(sent_score)
sentiment_df['sent_score_clean'] = sentiment_df['sent_value_clean'].map(sent_score)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 40 ms


In [15]:
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64006 entries, 0 to 64005
Data columns (total 5 columns):
stars               64006 non-null int64
sent_value          64006 non-null float64
sent_value_clean    64006 non-null float64
sent_score          64006 non-null object
sent_score_clean    64006 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 2.4+ MB


In [16]:
sentiment_df.head()

Unnamed: 0,stars,sent_value,sent_value_clean,sent_score,sent_score_clean
0,1,-0.050645,-0.036719,negative,negative
1,5,0.024962,0.069479,positive,positive
2,4,0.513333,0.317778,positive,positive
3,3,-0.035714,0.002806,negative,positive
4,1,-0.048246,-0.084259,negative,negative


In [17]:
df.head()

Unnamed: 0,business_id,categories,name,state,categories_list,cool,date,funny,review_id,stars,text,useful,user_id,clean_text
0,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2018-03-03,0,TNNkSmMfshsD3G60jTNjDA,1,Please stay away from this place if you can! I...,2,xv2V2GO5IZYvtw4oW7gQ1w,please stay away place bad care imaginable sta...
1,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2015-11-29,0,v-iKdstPdCxJr8zV1ZMdrw,5,My husband has been a patient of Dr. Byrne for...,1,SjvWP7c9toeZoV_q62zhTA,husband patient dr byrne last year half last m...
2,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2016-06-03,0,BmNDRCV9_NzQ_KCChyfdEw,4,Dr. Byrne is a great doctor! She has great bed...,2,sZVHm1aLtvyH9trAc2_MgA,dr byrne great doctor great bed side manner ex...
3,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2017-03-17,0,t_TKVMxKFYm9Hl-TIO7UUw,3,I'm raising my review as Dr Bryne's has been m...,3,8Y_irXocZdZxLs_qgzpjBw,raise review dr bryne receptive daughter go an...
4,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","Lauren Byrne, MD",AZ,"['urologists', ' doctors', ' health & medical']",0,2016-08-31,0,NWRrpGRgWZBBj3lvCZGVKA,1,I wish I could give 0 stars. Worst office I've...,1,hVKPDGpG12z7vpScXaSakw,wish could give star bad office ever horrible ...


## Merge with original dataframe

In [25]:
# Keep only the columns of `df` that are not already in `sentiment_df`
# (i.e. everything except 'stars'), so the merge creates no duplicate columns.
# Note: Index.difference returns the labels sorted, which is why the merged
# frame lists these columns alphabetically.
cols_to_use = df.columns.difference(sentiment_df.columns)

In [26]:
# Index-on-index outer join: sentiment columns first, then the remaining
# review columns selected in `cols_to_use`.
data_sentiment = sentiment_df.join(df[cols_to_use], how='outer')

In [27]:
data_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64006 entries, 0 to 64005
Data columns (total 18 columns):
stars               64006 non-null int64
sent_value          64006 non-null float64
sent_value_clean    64006 non-null float64
sent_score          64006 non-null object
sent_score_clean    64006 non-null object
business_id         64006 non-null object
categories          64006 non-null object
categories_list     64006 non-null object
clean_text          64006 non-null object
cool                64006 non-null int64
date                64006 non-null object
funny               64006 non-null int64
name                64006 non-null object
review_id           64006 non-null object
state               64006 non-null object
text                64006 non-null object
useful              64006 non-null int64
user_id             64006 non-null object
dtypes: float64(2), int64(4), object(12)
memory usage: 11.8+ MB


## Rearrange columns

In [28]:
list(data_sentiment.columns)

['stars',
 'sent_value',
 'sent_value_clean',
 'sent_score',
 'sent_score_clean',
 'business_id',
 'categories',
 'categories_list',
 'clean_text',
 'cool',
 'date',
 'funny',
 'name',
 'review_id',
 'state',
 'text',
 'useful',
 'user_id']

In [22]:
#data_sentiment = data_sentiment[['stars','text','sent_value','sent_score','clean_text', 'sent_value_clean','sent_score_clean']]

In [29]:
data_sentiment.head()

Unnamed: 0,stars,sent_value,sent_value_clean,sent_score,sent_score_clean,business_id,categories,categories_list,clean_text,cool,date,funny,name,review_id,state,text,useful,user_id
0,1,-0.050645,-0.036719,negative,negative,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","['urologists', ' doctors', ' health & medical']",please stay away place bad care imaginable sta...,0,2018-03-03,0,"Lauren Byrne, MD",TNNkSmMfshsD3G60jTNjDA,AZ,Please stay away from this place if you can! I...,2,xv2V2GO5IZYvtw4oW7gQ1w
1,5,0.024962,0.069479,positive,positive,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","['urologists', ' doctors', ' health & medical']",husband patient dr byrne last year half last m...,0,2015-11-29,0,"Lauren Byrne, MD",v-iKdstPdCxJr8zV1ZMdrw,AZ,My husband has been a patient of Dr. Byrne for...,1,SjvWP7c9toeZoV_q62zhTA
2,4,0.513333,0.317778,positive,positive,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","['urologists', ' doctors', ' health & medical']",dr byrne great doctor great bed side manner ex...,0,2016-06-03,0,"Lauren Byrne, MD",BmNDRCV9_NzQ_KCChyfdEw,AZ,Dr. Byrne is a great doctor! She has great bed...,2,sZVHm1aLtvyH9trAc2_MgA
3,3,-0.035714,0.002806,negative,positive,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","['urologists', ' doctors', ' health & medical']",raise review dr bryne receptive daughter go an...,0,2017-03-17,0,"Lauren Byrne, MD",t_TKVMxKFYm9Hl-TIO7UUw,AZ,I'm raising my review as Dr Bryne's has been m...,3,8Y_irXocZdZxLs_qgzpjBw
4,1,-0.048246,-0.084259,negative,negative,y-4xTZNKVm8mAZpiXMS5ZA,"urologists, doctors, health & medical","['urologists', ' doctors', ' health & medical']",wish could give star bad office ever horrible ...,0,2016-08-31,0,"Lauren Byrne, MD",NWRrpGRgWZBBj3lvCZGVKA,AZ,I wish I could give 0 stars. Worst office I've...,1,hVKPDGpG12z7vpScXaSakw


In [31]:
# Persist the merged reviews + sentiment frame as CSV.
# `path` is rebound here; it previously pointed at the input file.
# to_csv is called without index=False, so the row index is written as an
# unnamed leading column.
path = 'yelp_data/health_text_sentiment_full.csv'
data_sentiment.to_csv(path)