In [1]:
from tensorflow.python.client import device_lib
def get_available_devices():
    """Return the names of all local devices reported by TensorFlow's device_lib."""
    device_names = []
    for device in device_lib.list_local_devices():
        device_names.append(device.name)
    return device_names
print(get_available_devices())

['/device:CPU:0', '/device:XLA_CPU:0', '/device:GPU:0', '/device:XLA_GPU:0']


In [2]:
import tables
import numpy as np
import pandas as pd
import re, string, spacy
import nltk
nltk.download('vader_lexicon')
nltk.download('punkt')
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import unidecode
import en_core_web_md
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm

tqdm.pandas()
%matplotlib inline
warnings.filterwarnings("ignore")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/dickyalsyah/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/dickyalsyah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from pandas import Panel


### Load Dataset from insideairbnb.com

In [3]:
# Download the gzipped reviews CSV of the 2020-05-06 New York City snapshot
# directly from Inside Airbnb (requires network access).
newyork_reviews = pd.read_csv('http://data.insideairbnb.com/united-states/ny/new-york-city/2020-05-06/data/reviews.csv.gz')

In [4]:
newyork_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2060,158,2008-09-22,2865,Thom,"very nice neighborhood,close enough to ""A"" tra..."
1,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...
2,2595,19176,2009-12-05,53267,Cate,Great experience.
3,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
4,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."


In [5]:
newyork_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226554 entries, 0 to 1226553
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   listing_id     1226554 non-null  int64 
 1   id             1226554 non-null  int64 
 2   date           1226554 non-null  object
 3   reviewer_id    1226554 non-null  int64 
 4   reviewer_name  1226544 non-null  object
 5   comments       1225808 non-null  object
dtypes: int64(3), object(3)
memory usage: 56.1+ MB


### Clean Our Review Text

In [6]:
newyork_reviews.isnull().sum()

listing_id         0
id                 0
date               0
reviewer_id        0
reviewer_name     10
comments         746
dtype: int64

In [7]:
# Discard reviews without any comment text (row-wise drop is dropna's default).
newyork_reviews.dropna(subset=['comments'], inplace=True)

In [8]:
newyork_reviews['comments'].isnull().sum()

0

In [9]:
# Cast every comment to str so the regex-based cleaning helpers can be applied to it.
newyork_reviews['comments'] = newyork_reviews['comments'].astype('str')

In [10]:
# Ordered (pattern, replacement) rules: irregular contractions must be expanded
# before the generic suffix rules, so the order of this table is significant.
_CONTRACTION_RULES = (
    # specific
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    # NOTE(review): "ain't" can also stand for "is not"/"are not"; the original
    # mapping is kept unchanged.
    (r"ain\'t", "i am not"),
    # general
    (r"n\'t", " not"),
    (r"\'re", " are"),
    # NOTE(review): this also rewrites possessives ("host's" -> "host is").
    (r"\'s", " is"),
    # Only the stand-alone word "im" is expanded. The previous pattern r"\s+im"
    # also fired on every word that merely starts with "im"
    # (" important" -> " i amportant").
    (r"\bim\b", "i am"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontracted(phrase):
    """Lower-case ``phrase`` and expand common English contractions.

    Parameters
    ----------
    phrase : str
        Raw text.

    Returns
    -------
    str
        The lower-cased text with contractions such as "won't", "they're"
        or "i've" replaced by their expanded forms.
    """
    phrase = phrase.lower()
    # Typographic apostrophes (U+2019) are normalised first; otherwise
    # contractions typed as "don’t" would not match any rule below.
    phrase = phrase.replace("\u2019", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def clean_text(txt):
    """Normalise a raw review so it can be scored sentence-wise.

    Steps, in order: expand contractions (which also lower-cases the text),
    blank out double quotes, drop periods that follow whitespace, replace
    CRLF line breaks, insert a space after '.' or ',' when the next
    character is not whitespace, replace ``/ _ - : ]`` with spaces, remove
    every whitespace-delimited token that contains a digit, transliterate
    with ``unidecode`` and finally collapse runs of whitespace.

    Parameters
    ----------
    txt : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text with single spaces between tokens.
    """
    sentence = decontracted(txt)
    sentence = re.sub(r'\"', r' ', sentence)
    sentence = re.sub(r'\s+\.', r' ', sentence)
    sentence = re.sub(r'\r\n', r' ', sentence)
    sentence = re.sub(r'(?<=[.,])(?=[^\s])', r' ', sentence)
    sentence = re.sub(r'\/|\_|\-|\:|]', r' ', sentence)
    # Raw string: "\S" and "\d" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    sentence = re.sub(r'\S*\d\S*', '', sentence)
    sentence = unidecode.unidecode(sentence)
    return ' '.join(sentence.split())

In [11]:
# Run clean_text over every review; progress_apply (registered by tqdm.pandas())
# behaves like Series.apply but shows a progress bar.
newyork_reviews['clean_comments'] = newyork_reviews['comments'].progress_apply(clean_text)

100%|██████████| 1225808/1225808 [02:12<00:00, 9225.63it/s] 


### Build Sentiment Scores with VADER (Lexicon-Based)

This section demonstrates sentiment analysis with VADER (Valence Aware Dictionary and sEntiment Reasoner). VADER relies on a dictionary that maps lexical features to emotion intensities, called sentiment scores.

The sentiment score of a text is obtained by summing up the intensity of each word in the text. In this approach, each word in the lexicon is rated as positive or negative and, in many cases, how positive or negative.
Let's play around a bit and get familiar with this package:

In [12]:
# One shared analyzer: constructing SentimentIntensityAnalyzer inside the
# scoring function would rebuild it for every single review.
_VADER_ANALYZER = SentimentIntensityAnalyzer()


def vader_compound(s):
    """Return the VADER 'compound' polarity score of the text ``s``."""
    return _VADER_ANALYZER.polarity_scores(s)['compound']

In [13]:
%%timeit

# newyork_reviews['polarity'] = newyork_reviews['clean_comments'].progress_apply(vader_compound)
# newyork_reviews.to_hdf('newyork_reviews.h5', 'newyork_reviews', mode = 'w')

In [14]:
# Re-use the cached scores when the HDF5 file exists; otherwise compute the
# polarity column and cache it, so the notebook also runs from a clean checkout.
try:
    newyork_reviews = pd.read_hdf('newyork_reviews.h5', 'newyork_reviews')
except FileNotFoundError:
    newyork_reviews['polarity'] = newyork_reviews['clean_comments'].progress_apply(vader_compound)
    newyork_reviews.to_hdf('newyork_reviews.h5', 'newyork_reviews', mode = 'w')

The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.

1. positive sentiment : (compound score >= 0.05)
2. neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
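3. negative sentiment : (compound score <= -0.05)

Note: the `sentiment` function defined in the next cell applies stricter cut-offs of ±0.5 instead of the ±0.05 convention listed above.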


Source : https://github.com/cjhutto/vaderSentiment

In [15]:
def sentiment(polarity, threshold=0.5):
    """Map a compound polarity score to a sentiment label.

    Parameters
    ----------
    polarity : float
        Compound score, e.g. as returned by ``vader_compound``.
    threshold : float, optional
        Symmetric cut-off. Scores ``>= threshold`` are 'Positive', scores
        ``<= -threshold`` are 'Negative'. The default of 0.5 keeps this
        notebook's existing labelling; the VADER convention quoted in the
        notebook text is 0.05, which can be requested with ``threshold=0.05``.

    Returns
    -------
    str
        'Positive', 'Negative' or 'Neutral'.
    """
    if polarity >= threshold:
        return 'Positive'
    if polarity <= -threshold:
        return 'Negative'
    return 'Neutral'

In [16]:
# Label every review from its compound polarity score using the sentiment() helper.
newyork_reviews['sentiment'] = newyork_reviews['polarity'].progress_apply(sentiment)

100%|██████████| 1225808/1225808 [00:01<00:00, 987887.75it/s] 


In [17]:
%%time
# Number of whitespace-separated tokens in each cleaned comment.
newyork_reviews['comment_length'] = newyork_reviews['clean_comments'].str.split().str.len()

### Cleaning Text: Removing Punctuation and Stop Words

In [18]:
nlp = en_core_web_md.load()
# Kept as a plain string on purpose: pre_processor tests `word in punctuations`.
punctuations = string.punctuation
# Work on a private copy so spaCy's shared STOP_WORDS set is not mutated, and use
# discard() so re-running this cell does not raise KeyError once 'not' is gone.
stop_words = set(spacy.lang.en.stop_words.STOP_WORDS)
stop_words.discard('not')  # keep negations: they carry sentiment
# NOTE(review): 'new york' is a two-word entry and presumably never equals a
# single spaCy token — confirm whether it has any effect.
stop_words |= {'th', 'rd', 'nd', 'etc', 'want', 'new york', 'nyc', 'ny', 've', 're', 'll'}
# Every single letter a-z (the hand-written list had skipped 'g').
stop_words |= set(string.ascii_lowercase)
    
def pre_processor(text):
    """Reduce a raw review to its lemmatised, stop-word-free content words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Lower-cased lemmas joined by single spaces, without stop words
        and punctuation tokens.
    """
    # Clean basic text
    text = re.sub(r'[0-9]', r' ', text)
    # Irregular negations first (same rules as decontracted); the generic
    # "n't" rule alone would turn them into "wo not" / "ca not".
    text = re.sub(r'won\'t', r'will not', text, flags=re.IGNORECASE)
    text = re.sub(r'can\'t', r'can not', text, flags=re.IGNORECASE)
    text = re.sub(r'n\'t', r' not', text)
    text = re.sub(r'\'t', r' not', text)
    text = re.sub(r'\ |\?|\.|\_|\'|\-|\!|\/|\;|\:', r' ', text)
    text = unidecode.unidecode(text) # Remove accent word

    # Creating token object, which is used to create documents with linguistic annotations.
    doc = nlp(text)

    # Lemmatizing each token and converting each token into lowercase;
    # tokens whose lemma is the "-PRON-" placeholder keep their lower-cased surface form.
    lemmas = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in doc
    ]

    # Removing stop words and punctuation. `punctuations` is a string, so this
    # is a substring test: it also discards empty strings left over after strip().
    kept = [
        lemma for lemma in lemmas
        if lemma not in stop_words and lemma not in punctuations
    ]

    return " ".join(kept)

In [19]:
%%time
# newyork_reviews['comments_meaningful'] = newyork_reviews['comments'].progress_apply(pre_processor)
# newyork_reviews['polarity_meaningful'] = newyork_reviews['comments_meaningful'].progress_apply(vader_compound)
# newyork_reviews.to_hdf('newyork_reviews.h5', key = 'newyork_reviews', mode = 'w')

In [20]:
# Load the cached frame when it already holds the spaCy-based columns; otherwise
# compute them from the in-memory frame and refresh the cache, so this cell does
# not depend on a file produced by an earlier session.
try:
    cached_reviews = pd.read_hdf('newyork_reviews.h5', 'newyork_reviews')
except FileNotFoundError:
    cached_reviews = None

if cached_reviews is not None and 'polarity_meaningful' in cached_reviews.columns:
    newyork_reviews = cached_reviews
else:
    newyork_reviews['comments_meaningful'] = newyork_reviews['comments'].progress_apply(pre_processor)
    newyork_reviews['polarity_meaningful'] = newyork_reviews['comments_meaningful'].progress_apply(vader_compound)
    newyork_reviews.to_hdf('newyork_reviews.h5', key = 'newyork_reviews', mode = 'w')

In [21]:
%%time
# Label the scores of the stop-word-free text with the same sentiment() helper.
newyork_reviews['sentiment_meaningful'] = newyork_reviews['polarity_meaningful'].progress_apply(sentiment)

100%|██████████| 1225808/1225808 [00:01<00:00, 1025375.34it/s]


CPU times: user 1.2 s, sys: 14 ms, total: 1.22 s
Wall time: 1.21 s


In [22]:
%%time
# Token count of the stop-word-free text. Computed here rather than relying on
# the column being present in a previously saved cache file.
newyork_reviews['comment_meaningful_length'] = newyork_reviews['comments_meaningful'].str.split().str.len()

In [23]:
newyork_reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,clean_comments,polarity,sentiment,comment_length,comments_meaningful,polarity_meaningful,sentiment_meaningful,comment_meaningful_length
0,2060,158,2008-09-22,2865,Thom,"very nice neighborhood,close enough to ""A"" tra...","very nice neighborhood, close enough to a trai...",0.8928,Positive,46,nice neighborhood close train comfortable bed ...,0.937,Positive,23
1,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r\nNous avons app...,notre sejour de trois nuits. nous avons apprec...,-0.3612,Neutral,120,notre sejour de trois nuits nous avon apprecie...,0.2263,Neutral,110
2,2595,19176,2009-12-05,53267,Cate,Great experience.,great experience.,0.6249,Positive,2,great experience,0.6249,Positive,2
3,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...,i have stayed with my friend at the midtown ca...,0.9248,Positive,92,stay friend midtown castle day lovely place bi...,0.9761,Positive,38
4,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en...","we have been staying here for about nights, en...",0.9229,Positive,66,stay night enjoy center city sleep short way m...,0.9042,Positive,28


In [24]:
# Persist the fully annotated frame to the same HDF5 file/key that the
# read_hdf cells above load from (mode='w' opens the file for writing).
newyork_reviews.to_hdf('newyork_reviews.h5', 'newyork_reviews', mode = 'w')