In [2]:
import pandas as pd
import datetime
from re import sub
from decimal import Decimal

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')

from langdetect import detect

from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

# Raise pandas' display limits so long/wide frames are shown without truncation
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/danielacollaguazo/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Importing Data 

In [3]:
# Load the raw reviews table from the project's data folder (path is relative to this notebook)
reviews_csv_path = "../../data/new-york-city-airbnb-open-data/reviews.csv"
df_reviews = pd.read_csv(reviews_csv_path)

## Sentiment Analysis of review comments

In [4]:
# Preview the first five rows to check which columns were loaded
df_reviews.head(n=5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2060,158,2008-09-22,2865,Thom,"very nice neighborhood,close enough to ""A"" tra..."
1,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...
2,2595,19176,2009-12-05,53267,Cate,Great experience.
3,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
4,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."


In [5]:
# Build the VADER analyzer a single time. Constructing SentimentIntensityAnalyzer
# loads the lexicon, so creating a fresh instance for every review (as a
# per-call constructor would) repeats that work once per row.
_vader_analyzer = SentimentIntensityAnalyzer()


def vader_polarity_compound(x):
    """Return the compound sentiment polarity of the text ``x``.

    Parameters
    ----------
    x : str
        The sentence or review text to score.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(x)``.
    """
    return _vader_analyzer.polarity_scores(x)['compound']


# We can retrieve scores for positive, negative or neutral sentiment.
# We will use the compound: a normalized value: norm_score = score / math.sqrt((score * score) + alpha)
print(_vader_analyzer.polarity_scores('VADER is smart, handsome, and funny.'))

{'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}


In [6]:
# Share of reviews whose comment text is missing, expressed as a percentage
missing_comment_count = df_reviews['comments'].isnull().sum()
review_count = df_reviews['comments'].shape[0]
"{}% of reviews have empty comments".format((missing_comment_count / review_count) * 100)

'0.05879231633876933% of reviews have empty comments'

In [7]:
# Rows without a comment are a negligible share of the data, so remove them
# from df_reviews in place (any row whose 'comments' value is missing is dropped)
df_reviews.dropna(axis=0, how='any', subset=['comments'], inplace=True)

The function below predicts the language of a review. Language detection needs a minimum amount of text to work with, so the function checks the length of the string it receives before attempting detection.

In [8]:
def predict_lang(x, min_chars=50):
    """Guess the language code of a piece of text.

    Parameters
    ----------
    x : str
        Text to classify. Any non-string value (e.g. ``None`` or a float NaN
        coming from a missing comment) is treated as undetectable instead of
        raising ``TypeError`` from ``len``.
    min_chars : int, default 50
        Detection is only attempted when ``len(x)`` is strictly greater than
        this value; shorter texts are reported as unknown.

    Returns
    -------
    str
        The value returned by ``detect(x)``, or ``''`` when ``x`` is not a
        string, is too short, or detection raises.

    Notes
    -----
    NOTE(review): langdetect documents its results as non-deterministic unless
    ``DetectorFactory.seed`` is set; consider seeding it for reproducible labels.
    """
    # Guard clause: missing values and short texts never reach the detector.
    if not isinstance(x, str) or len(x) <= min_chars:
        return ''
    try:
        return detect(x)
    except Exception:
        # Deliberate best-effort: a text the detector cannot handle is
        # labelled as unknown rather than aborting the whole column.
        return ''

In [9]:
# Label every review with its detected language ('' when none could be determined)
df_reviews['review_lang'] = df_reviews['comments'].apply(predict_lang)

Using the pre-trained VADER sentiment model from NLTK to create polarity scores for all reviews:

In [10]:
# Score each review's comment text with the VADER compound polarity
df_reviews['polarity'] = df_reviews['comments'].map(vader_polarity_compound)

We observe that a lot of reviews didn't get a language value. This is because their text was too short (50 characters or fewer) for language detection to be attempted, or because detection failed.

In [13]:
# Number of reviews per detected language, keeping missing values in the tally
df_reviews['review_lang'].value_counts(dropna=False)

en       1009550
          141117
fr         41577
es         38270
de         14027
it          6810
pt          5802
ko          2991
nl          2920
zh-cn       2668
ru          2113
ja          1617
da           838
sv           776
pl           337
no           332
ca           325
cs           289
fi           250
tr           121
hu            84
he            67
el            55
af            46
sk            39
vi            33
ro            27
th            26
zh-tw         25
ar            16
hr            13
so            11
sl            10
et            10
uk             9
id             8
bg             6
tl             5
cy             4
lt             2
sq             1
Name: review_lang, dtype: int64

Reviews were exported for both English and Spanish. I checked the sentiment of the Spanish reviews and it wasn't accurate, which leads me to believe that this algorithm works best for English text.

In [15]:
# Export the enriched reviews without the index.
# Note: the file is tab-separated even though its name ends in .csv.
df_reviews.to_csv('reviews_with_sentiment_and_lang.csv', index=False, sep='\t')