* NLTK
* Scikit-learn

In [17]:
import pandas as pd
df = pd.read_csv('/Users/cynding/Desktop/Brandeis 2021 Fall/BUS 256A - Marketing Analytics/final project/marriott_hotel_reviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3179 entries, 0 to 3178
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Customer         3176 non-null   object 
 1   Member_Type      2277 non-null   object 
 2   Traveler_Type    3164 non-null   object 
 3   Date             3179 non-null   object 
 4   Review           3156 non-null   object 
 5   Total_Score      3179 non-null   float64
 6   Cleanliness      3158 non-null   float64
 7   Dining           2849 non-null   float64
 8   Location         3155 non-null   float64
 9   Service          3155 non-null   float64
 10  Amenities        3118 non-null   float64
 11  Value_for_Money  3152 non-null   float64
 12  Hotel            3179 non-null   object 
dtypes: float64(7), object(6)
memory usage: 323.0+ KB


### Clean Data

In [19]:
df['Review']=df['Review'].astype(str)

In [45]:
# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

'''
Part-Of-Speech (POS) tagging: assign a tag to every word to define 
if it corresponds to a noun, a verb etc. using the WordNet lexical database
'''
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text: transform every word into their root form 
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
df['Review'] = df['Review'].apply(lambda x: clean_text(x))

In [50]:
df['Review'] = df['Review'].str.replace("n't"," not")
df['Review'] = df['Review'].str.replace("I'm","I am")
df['Review'] = df['Review'].str.replace("I've","I have")
df['Review'] = df['Review'].str.replace("they're","they are")
df['Review'] = df['Review'].str.replace("they've","they have")
df['Review'] = df['Review'].str.replace("might've","might have")
df['Review'] = df['Review'].str.replace("must've","must have")
df['Review'] = df['Review'].str.replace("should've","should have")
df['Review'] = df['Review'].str.replace("would've","would have")
df['Review'] = df['Review'].str.replace("could've","could have")
df['Review'] = df['Review'].str.replace("we're","we are")
df['Review'] = df['Review'].str.replace("'ll"," will")
df['Review'] = df['Review'].str.replace("It's","It is")
df['Review'] = df['Review'].str.replace("it's","It is")
df['Review'] = df['Review'].str.replace("where's","where is")
df['Review'] = df['Review'].str.replace("that's","that is")
df['Review'] = df['Review'].str.replace("who's","who is")

In [51]:
df['Review'].head()

0    wonderful night stay… beautiful room comfortab...
1                    excellent hotel reception perfect
2                                           prepayment
3    hubby stay overnight staff hotel pleasant help...
4    great cost-benefit i will back always need gre...
Name: Review, dtype: object

### Feature Engineering

In [30]:
# add sentiment anaylsis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
df['sentiments'] = df['Review'].apply(lambda x: sid.polarity_scores(x))
df = pd.concat([df.drop(['sentiments'], axis=1), df['sentiments'].apply(pd.Series)], axis=1)

In [47]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [52]:
# Remove rows with NA Value
df = df[df['Cleanliness'].notna()]
df = df[df['Dining'].notna()]
df = df[df['Location'].notna()]
df = df[df['Service'].notna()]
df = df[df['Amenities'].notna()]
df = df[df['Value_for_Money'].notna()]

df_x = df['Review']
df_y = df['Total_Score']
df_y1 = df['Cleanliness']
df_y2 = df['Dining']
df_y3 = df['Location']
df_y4 = df['Service']
df_y5 = df['Amenities']
df_y6 = df['Value_for_Money']

In [56]:
x_train, x_test, y_train, y_test = train_test_split(df_x,df_y,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()
a.shape

mnb = MultinomialNB()
y_train = y_train.astype('int')
mnb.fit(x_traincv,y_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y_pred = mnb.predict(x_testcv)

actual = np.array(y_test)

count = 0
for i in range (len(y_pred)):
    if y_pred[i] == actual[i]:
        count = count+1
print('The accuracy rate is',count/len(y_pred))

The accuracy rate is 0.6742857142857143


In [55]:
x_train, x_test, y1_train, y1_test = train_test_split(df_x,df_y1,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()


mnb = MultinomialNB()
y1_train = y1_train.astype('int')
mnb.fit(x_traincv,y1_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y1_pred = mnb.predict(x_testcv)


y1_actual = np.array(y1_test)


count1 = 0
for i in range (len(y1_pred)):
    if y1_pred[i] == y1_actual[i]:
        count1 = count1+1
print('The accuracy rate is',count1/len(y1_pred))

The accuracy rate is 0.7185714285714285


In [57]:
# %% Dining 69.10%

x_train, x_test, y2_train, y2_test = train_test_split(df_x,df_y2,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()


mnb = MultinomialNB()
y2_train = y2_train.astype('int')
mnb.fit(x_traincv,y2_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y2_pred = mnb.predict(x_testcv)


y2_actual = np.array(y1_test)


count2 = 0
for i in range (len(y2_pred)):
    if y2_pred[i] == y2_actual[i]:
        count2 = count2+1
print('The accuracy rate is',count2/len(y2_pred))

The accuracy rate is 0.7157142857142857


In [58]:
# %% Location 70.29%

x_train, x_test, y3_train, y3_test = train_test_split(df_x,df_y3,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()


mnb = MultinomialNB()
y3_train = y3_train.astype('int')
mnb.fit(x_traincv,y3_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y3_pred = mnb.predict(x_testcv)


y3_actual = np.array(y1_test)


count3 = 0
for i in range (len(y3_pred)):
    if y3_pred[i] == y3_actual[i]:
        count3 = count3+1
print('The accuracy rate is',count3/len(y3_pred))

The accuracy rate is 0.71


In [59]:
x_train, x_test, y4_train, y4_test = train_test_split(df_x,df_y4,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()


mnb = MultinomialNB()
y4_train = y4_train.astype('int')
mnb.fit(x_traincv,y4_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y4_pred = mnb.predict(x_testcv)


y4_actual = np.array(y1_test)


count4 = 0
for i in range (len(y4_pred)):
    if y4_pred[i] == y4_actual[i]:
        count4 = count4+1
print('The accuracy rate is',count4/len(y4_pred))

The accuracy rate is 0.7171428571428572


In [60]:
# %% Amenities = 72.16%

x_train, x_test, y5_train, y5_test = train_test_split(df_x,df_y5,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()


mnb = MultinomialNB()
y5_train = y5_train.astype('int')
mnb.fit(x_traincv,y5_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y5_pred = mnb.predict(x_testcv)


y5_actual = np.array(y1_test)


count5 = 0
for i in range (len(y5_pred)):
    if y5_pred[i] == y5_actual[i]:
        count5 = count5+1
print('The accuracy rate is',count5/len(y5_pred))


The accuracy rate is 0.7242857142857143


In [61]:
x_train, x_test, y6_train, y6_test = train_test_split(df_x,df_y6,test_size=0.25,random_state=0)

cv = TfidfVectorizer(stop_words = 'english')
x_traincv = cv.fit_transform(x_train.values.astype('U'))
a=x_traincv.toarray()


mnb = MultinomialNB()
y6_train = y6_train.astype('int')
mnb.fit(x_traincv,y6_train)

x_testcv = cv.transform(x_test.values.astype('U'))
y6_pred = mnb.predict(x_testcv)


y6_actual = np.array(y1_test)


count6 = 0
for i in range (len(y6_pred)):
    if y6_pred[i] == y6_actual[i]:
        count6 = count6+1
print('The accuracy rate is',count6/len(y6_pred))

The accuracy rate is 0.72


In [63]:
from prettytable import PrettyTable
x=PrettyTable()
x.field_names=['Total Score','Cleaniness','Dining','Location','Service','Amenities','Value for Money']
x.add_row(["{:.2%}".format(count/len(y_pred)),"{:.2%}".format(count1/len(y1_pred)),"{:.2%}".format(count2/len(y2_pred)),"{:.2%}".format(count3/len(y3_pred)),"{:.2%}".format(count4/len(y4_pred)),"{:.2%}".format(count5/len(y5_pred)),"{:.2%}".format(count6/len(y6_pred))])
print(x)

+-------------+------------+--------+----------+---------+-----------+-----------------+
| Total Score | Cleaniness | Dining | Location | Service | Amenities | Value for Money |
+-------------+------------+--------+----------+---------+-----------+-----------------+
|    67.43%   |   71.86%   | 71.57% |  71.00%  |  71.71% |   72.43%  |      72.00%     |
+-------------+------------+--------+----------+---------+-----------+-----------------+
