## TEXT ANALYTICS DEMO ON HOTEL REVIEWS DATASET

### Load required libraries

In [56]:
import pandas as pd
import numpy as np

# For sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# For text preprocessing
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# For topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation



[nltk_data] Downloading package punkt to C:\Users\SPARTA-
[nltk_data]     USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\SPARTA-
[nltk_data]     USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SPARTA-USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Import hotel reviews dataset

In [3]:
reviews = pd.read_excel("C:/Users/SPARTA-USER/Documents/GitHub/shopee-code-league-2021/01_Data/hotel_reviews.xlsx")
reviews.head()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,95 Route 17k,Hotels,Newburgh,US,41.505388,-74.073291,Howard Johnson Inn - Newburgh,12550-5009,NY,2009-12-24T00:00:00Z,2017-04-17T01:54:07Z,,,3,The only thing wrong with the room was the sme...,OK Room,,,
1,95 Route 17k,Hotels,Newburgh,US,41.505388,-74.073291,Howard Johnson Inn - Newburgh,12550-5009,NY,2010-06-12T00:00:00Z,2017-04-17T01:54:07Z,,,3,I simply needed a cost effective place to stay...,Decent for the Money,,,
2,95 Route 17k,Hotels,Newburgh,US,41.505388,-74.073291,Howard Johnson Inn - Newburgh,12550-5009,NY,2012-10-11T00:00:00Z,2017-04-17T01:54:07Z,,,3,The location was great. However the rooms need...,Good Location,,,
3,95 Route 17k,Hotels,Newburgh,US,41.505388,-74.073291,Howard Johnson Inn - Newburgh,12550-5009,NY,2009-09-30T00:00:00Z,2017-04-17T01:54:07Z,,,3,The hotel room was ok for the money I paid but...,OK room for the money,,A verified traveler,
4,95 Route 17k,Hotels,Newburgh,US,41.505388,-74.073291,Howard Johnson Inn - Newburgh,12550-5009,NY,2016-07-01T00:00:00Z,2017-04-17T01:54:07Z,,,3,"It was good for the price, good service we jus...",good for the price,,Marjorie,


### Sentiment Scoring

Create a new column for the VADER scores.

In [22]:
analyser = SentimentIntensityAnalyzer()   # Function that generates the sentiment score of a text

reviews["VADER_raw"] = reviews["reviews.text"].apply(lambda t: str(analyser.polarity_scores(t)))
reviews["VADER_compound_score"] = reviews["VADER_raw"].apply(lambda s: float(s.split("compound': ")[1].replace("}","")))
reviews[["reviews.text","VADER_raw","VADER_compound_score"]].head()


Unnamed: 0,reviews.text,VADER_raw,VADER_compound_score
0,The only thing wrong with the room was the sme...,"{'neg': 0.109, 'neu': 0.739, 'pos': 0.151, 'co...",0.0516
1,I simply needed a cost effective place to stay...,"{'neg': 0.039, 'neu': 0.772, 'pos': 0.189, 'co...",0.8074
2,The location was great. However the rooms need...,"{'neg': 0.098, 'neu': 0.606, 'pos': 0.295, 'co...",0.7351
3,The hotel room was ok for the money I paid but...,"{'neg': 0.0, 'neu': 0.838, 'pos': 0.162, 'comp...",0.5927
4,"It was good for the price, good service we jus...","{'neg': 0.0, 'neu': 0.686, 'pos': 0.314, 'comp...",0.8271


Check the overall average reviews sentiment score.

In [25]:
reviews["VADER_compound_score"].mean()

0.17654579831932765

### Topic Modeling

Perform text preprocessing.  

In [68]:
# Drop duplicates
reviews.drop_duplicates(subset="reviews.text",keep="first",inplace=True)

# Create new column for review text to be processed
reviews["text2"] = reviews["reviews.text"]

# Remove punctuations from text2
reviews["text2"] = reviews["text2"].apply(lambda x: re.sub(r'\[.*?\]', '', x))
reviews["text2"] = reviews["text2"].apply(lambda x: re.sub(r'[%s]' % re.escape(string.punctuation), '', x))

# Remove numbers
reviews.text2 = reviews.text2.str.replace('\d+', '')

# Remove double spacing
reviews["text2"] = reviews["text2"].apply(lambda x: x.replace("  "," "))

# Remove leading and trailing spaces
reviews["text2"] = reviews["text2"].apply(lambda c: c.strip())

# Convert to lowercase
reviews["text2"] = reviews["text2"].apply(lambda c: c.lower())

# Lemmatize text
#----- Create function to convert nltk tag to wordnet tag -----#
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

#----- Create function that will lemmatize a sentence -----#
def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

lemmatizer = WordNetLemmatizer()
reviews["text2"] =  reviews.apply(lambda x: lemmatize_sentence(x["text2"]), axis=1)  

# Remove stopwords
stopwords_list = set(list(stopwords.words('english')) + list(ENGLISH_STOP_WORDS))

def remove_stopwords (text):
    raw_tokenized = text.lower().split()
    
    clean_text = ""
    for w in raw_tokenized:
        if w not in stopwords_list:
            clean_text = clean_text + " " + w
        else:
            clean_text = clean_text
    return clean_text

reviews["text2"] = reviews["text2"].apply(lambda w: remove_stopwords(w))

# Remove -PRON-
reviews["text2"] = reviews["text2"].str.replace('-pron-', '')  
reviews["text2"] = reviews["text2"].str.replace('-PRON-', '')

# View cleansed text column
reviews["text2"].head()

0     thing wrong room smell mustymoldy like room w...
1     simply need cost effective place stay visit w...
2     location great room need renovate desperately...
3     hotel room ok money pay hallway walk room sme...
4     good price good service use sleepand good cab...
Name: text2, dtype: object

Create a document-term matrix representation of the reviews.

In [69]:
vectorizer = TfidfVectorizer(analyzer='word', min_df=10, stop_words='english', lowercase=True, token_pattern='[a-zA-Z0-9]{3,}')
data_vectorized = vectorizer.fit_transform(reviews["text2"])

Build LDA model with sklearn.

In [78]:
lda_model = LatentDirichletAllocation(n_components=4)
lda_output = lda_model.fit_transform(data_vectorized)

Create a document-topic dataframe.  Get the dominant topic per review record.

In [79]:
# Make the pandas dataframe
topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
df_review_topic = pd.DataFrame(np.round(lda_output, 4), columns=topicnames)

# Get dominant topic for each review
df_review_topic["Dominant_Topic_Wt"] = df_review_topic.max(axis=1)
dominant_topic = np.argmax(df_review_topic.values, axis=1)
df_review_topic["Dominant_Topic"] = dominant_topic
df_review_topic.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Dominant_Topic_Wt,Dominant_Topic
0,0.0746,0.7645,0.0771,0.0839,0.7645,1
1,0.2803,0.6104,0.0549,0.0544,0.6104,1
2,0.0874,0.0925,0.0874,0.7326,0.7326,3
3,0.0622,0.065,0.8104,0.0624,0.8104,2
4,0.0905,0.0916,0.0897,0.7282,0.7282,3


To enable labeling of the topics generated, print the top words per topic.  Likewise, print the top reviews per topic.

In [80]:
# Create a function that will print the top words per topic
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
             for i in topic.argsort()[:-n_top_words - 1:-1]]))

# Show top 10 words per topic
print("Topics found via LDA:")
print_topics(lda_model, vectorizer, 10)         

Topics found via LDA:

Topic #0:
dirty chambre room hotel stay carpet convenient overnight tub little

Topic #1:
room hotel clean stay breakfast comfortable need right fine night

Topic #2:
room helpful stay pour door pay hotel nous nice staff

Topic #3:
good room hotel price clean bed stay great place breakfast


In [81]:
# Show the top reviews per topic
sub_reviews = reviews[["reviews.text"]]
df_review_topic = pd.merge(left=df_review_topic,right=sub_reviews,left_index=True,right_index=True)
df_review_topic2 = df_review_topic.groupby(["Dominant_Topic"]).apply(lambda x: x.sort_values(["Dominant_Topic_Wt"], ascending = False)).reset_index(drop=True)
top_10_mentions_x_topic = df_review_topic2.groupby("Dominant_Topic").head(10)[["Dominant_Topic","reviews.text"]]
top_10_mentions_x_topic.to_excel("C:/Users/SPARTA-USER/Documents/GitHub/shopee-code-league-2021/03_Output/Top10_Reviews_x_Topic.xlsx")
print(top_10_mentions_x_topic)

     Dominant_Topic                                       reviews.text
0                 0  Hotel needed updating - very old. The toilet w...
1                 0  The hotel was not horrible for the price. The ...
2                 0  The staff are very nice and the location was c...
3                 0  It was terrible that i left next day even when...
4                 0          Pour le prix c'est correct mais sans plus
5                 0  Stayed two seperate nights, Sunday and Friday,...
6                 0  The hotel was getting work done on the plumbin...
7                 0  Didn't know if I was gonna like it cuz the out...
8                 0  This motel served it's purpose for my visit.  ...
9                 0  Not so great. My band mates and I drove 20 hou...
124               1  A quick overnight at this location was pretty ...
125               1  Nice staff but Old facilities.  Obviously a sm...
126               1  NO HEAT IN 1ST ROOM LITTLE HEAT IN 2ND MOLD IN...
127   