# Library Loading

In [7]:
#Core
import pandas as pd
import numpy as np

#Language Detection
from langdetect import detect

#Noise Removal
import string
import re
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

#display adjust
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime

# Data Loading

In [2]:
df_kaggle = pd.read_csv('Britannia_Kaggle.csv')
df_booking = pd.read_csv('Britannia_Booking.csv')
df_tripadvisor = pd.read_csv('Britannia_Tripadvisor.csv')

In [3]:
df_booking['Review_date'] = df_booking['Review_date'].str.split(':').str[1]

In [14]:
df_booking['Review_date'] = pd.to_datetime(df_booking['Review_date'])

In [16]:
df_booking['Date_stayed'] = df_booking['Date_stayed'].str.split(' · ').str[0]

In [17]:
df_kaggle['Tags'] = df_kaggle['Tags'].str.replace('[', '')
df_kaggle['Tags'] = df_kaggle['Tags'].str.replace(']', '')
df_kaggle['Tags'] = df_kaggle['Tags'].str.replace("'", "")
df_kaggle['Tags'] = df_kaggle['Tags'].str.split(',')

In [18]:
Aggregate = pd.DataFrame(columns = ['Review_Date','Reviewer_Nationality',
                                    'Room_stayed','Length_stayed','Trip_type'
                                   'Positive_Review','Negative_Review',
                                   'Condense_Positive_Review','Condense_Negative_Review'])

In [19]:
Review_Date = df_booking['Review_date']
Reviewer_Nationality = df_booking['Country']
Room_stayed = df_booking['Room_stayed']
Length_stayed = df_booking['Date_stayed']
Trip_type = df_booking['Trip_type']
Positive_Review = df_booking['Positive']
Negative_Review = df_booking['Negative']
Condense_Positive_Review = df_booking['Cleaned_Positive']
Condense_Negative_Review = df_booking['Cleaned_Negative']

In [20]:
df_kaggle['Room_stayed'] = None
df_kaggle['Date_stayed']= None
df_kaggle['Trip_type']= None
for i in range(len(df_kaggle)):
    try:
        df_kaggle['Room_stayed'][i] =  df_kaggle['Tags'][i][2]
        df_kaggle['Date_stayed'][i] = df_kaggle['Tags'][i][3]
        df_kaggle['Trip_type'][i] = df_kaggle['Tags'][i][0]
    except:
        pass

In [21]:
Review_Date.append(df_kaggle['Review_Date'])
Reviewer_Nationality.append(df_kaggle['Reviewer_Nationality'])
Room_stayed.append(df_kaggle['Room_stayed'])
Length_stayed.append(df_kaggle['Date_stayed'])
Trip_type.append(df_kaggle['Trip_type'])
Positive_Review.append(df_kaggle['Positive_Review'])
Negative_Review.append(df_kaggle['Negative_Review'])
Condense_Positive_Review.append(df_kaggle['Cleaned_Positive_Review'])
Condense_Negative_Review.append(df_kaggle['Cleaned_Negative_Review'])

0                         cig box room think forget clean
1       dirty windows dead bug find bath also heat not...
2                                                     NaN
3       first room give not clean notify staff change ...
4                                                     NaN
                              ...                        
3598                                             negative
3599                                 concierge uninformed
3600                                           complaints
3601    really shabby run hotel need total refurbish j...
3602    stain carpet peel wallpaper scruffy scuff bedr...
Length: 17496, dtype: object

In [22]:
Aggregate['Review_Date'] = Review_Date
Aggregate['Reviewer_Nationality'] = Reviewer_Nationality
Aggregate['Room_stayed'] = Room_stayed
Aggregate['Length_stayed'] = Length_stayed
Aggregate['Trip_type']= Trip_type
Aggregate['Positive_Review']= Positive_Review
Aggregate['Negative_Review']= Negative_Review
Aggregate['Condense_Positive_Review']= Condense_Positive_Review
Aggregate['Condense_Negative_Review'] = Condense_Negative_Review

# TOPIC MODELLING

In [23]:
'''Spacy'''
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")

'''Embedding and Pipeline'''
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import LatentDirichletAllocation

In [24]:
stoplist =['not','like','work','need',
           'really','nan','nothing',
           'dislike','every','thing',
           'think','anything',
          'bite','tire','everything', 'perfect','ok']

In [25]:
df = Aggregate[Aggregate['Condense_Negative_Review'].notna()].reset_index()

In [26]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist,ngram_range=(2,4))
lda = LatentDirichletAllocation(n_components=15)
pipe = make_pipeline(tfidf_vectorizer, lda)
pipe.fit(df['Condense_Negative_Review'])

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(ngram_range=(2, 4),
                                 stop_words=['not', 'like', 'work', 'need',
                                             'really', 'nan', 'nothing',
                                             'dislike', 'every', 'thing',
                                             'think', 'anything', 'bite',
                                             'tire', 'everything', 'perfect',
                                             'ok'])),
                ('latentdirichletallocation',
                 LatentDirichletAllocation(n_components=15))])

In [27]:
def print_top_words(model, feature_names, n_top_words):
    phrases = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += ", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        phrases.append(message)
        #print(message)
    #print(phrases)
    return phrases
phrases = print_top_words(lda, tfidf_vectorizer.get_feature_names(), n_top_words=1)
phrases

['Topic #0: decor update',
 'Topic #1: room window',
 'Topic #2: decor date',
 'Topic #3: bed comfortable',
 'Topic #4: bed uncomfortable',
 'Topic #5: room cold',
 'Topic #6: room clean',
 'Topic #7: free wifi',
 'Topic #8: staff rude',
 'Topic #9: park free',
 'Topic #10: room refurbish',
 'Topic #11: room noisy',
 'Topic #12: room date',
 'Topic #13: room update',
 'Topic #14: pay wifi']

In [28]:
condense_phrases = ['decor','wifi','room','breakfast','bed','pool','park','staff','air','price']

In [29]:
def take_string(text):
    return text.split(": ",1)[1]

for i in range(len(phrases)):
    phrases[i] = take_string(phrases[i])

In [30]:
def get_matches(my_text):
    sentence = nlp(my_text)
    topic = []
    for phrase in phrases:
        phrase = phrase.split()
        patterns = [nlp(text) for text in phrase]
        phrase_matcher = PhraseMatcher(nlp.vocab)
        phrase_matcher.add('AI', None, *patterns)
        matched_phrases = phrase_matcher(sentence)
        if len(matched_phrases) > 0:
            topic.append(" ".join(phrase))
        else:
            pass
    return topic

In [31]:
df['Topics'] = 'Others'
df.head(2)

Unnamed: 0,index,Review_Date,Reviewer_Nationality,Room_stayed,Length_stayed,Trip_typePositive_Review,Negative_Review,Condense_Positive_Review,Condense_Negative_Review,Trip_type,Positive_Review,Topics
0,0,2020-08-17,India,Standard Double Room without Window,1 night,,There was cig box in the room which i think th...,room quality hotel staff bar,cig box room think forget clean,Solo traveler,Room quality.. hotel staff.. the bar..,Others
1,1,2021-02-13,United Kingdom,Standard Double Room,1 night,,Dirty windows and dead bug found in the bath a...,bed size room bath tub feature bathroom location,dirty windows dead bug find bath also heat not...,Couple,"The bed, size of room, the bath tub feature in...",Others


In [32]:
for i in range(len(df)):
    df['Topics'].iloc[i] = get_matches(df['Condense_Negative_Review'].iloc[i])

In [33]:
for i in range(len(df)):
    if len(df['Topics'].iloc[i]) == 0:
        df['Topics'].iloc[i] = 'Others'
    else:
        pass

# Customized Search

In [34]:
condense_phrases = ['decor','wifi','room','breakfast','bed','pool','park','staff','air','price']

In [35]:
def get_matches(my_text):
    sentence = nlp(my_text)
    topic = []
    for i in range(len(condense_phrases)):
        phrase = condense_phrases[i]
        patterns = nlp(phrase)
        phrase_matcher = PhraseMatcher(nlp.vocab)
        phrase_matcher.add('AI', None, patterns)
        matched_phrases = phrase_matcher(sentence)
        if len(matched_phrases) > 0:
            topic.append(phrase)
        else:
            pass
    return topic

## Data for Dashboard

In [36]:
df['Cleaned_Topic'] = 'Others'

for i in range(len(df)):
    df['Cleaned_Topic'].iloc[i] = get_matches(df['Condense_Negative_Review'].iloc[i])

In [37]:
for i in range(len(df)):
    if len(df['Cleaned_Topic'].iloc[i]) == 0:
        df['Cleaned_Topic'].iloc[i] = 'Others'
    else:
        pass

In [38]:
df['Condense_Negative_Review'][8]

'show photo id not make clear arrival'

In [39]:
df['Cleaned_Topic'][8]

'Others'

In [40]:
df.to_csv('Data POC.csv')