In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import pandas as pd


In [3]:
data = pd.read_csv("tripadvisor_hotel_reviews.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  109 non-null    object
 1   Rating  109 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ KB


In [5]:
data.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [6]:
data['Rating'][0]

4

In [7]:
en_stopwords = stopwords.words('english')
en_stopwords.remove('not')

data['no_stopwords'] = data['Review'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in en_stopwords]))

In [9]:
data['no_stopwords'][0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe noisy neighbors, aveda bath products nice, not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience pay 40 parking night,'

In [14]:
data['no_stopwords_no_punct'] = data['no_stopwords'].apply(lambda x: re.sub(r"[*]","star",x))
data['no_stopwords_no_punct'] = data['no_stopwords_no_punct'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [19]:
data['no_stopwords_no_punct'].head()

0    nice hotel expensive parking got good deal sta...
1    ok nothing special charge diamond member hilto...
2    nice rooms not 4star experience hotel monaco s...
3    unique great stay wonderful time hotel monaco ...
4    great stay great stay went seahawk game awesom...
Name: no_stopwords_no_punct, dtype: object

In [21]:
data['tokenized'] = data['no_stopwords_no_punct'].apply(lambda x: word_tokenize(x))
data['tokenized'].head()

0    [nice, hotel, expensive, parking, got, good, d...
1    [ok, nothing, special, charge, diamond, member...
2    [nice, rooms, not, 4star, experience, hotel, m...
3    [unique, great, stay, wonderful, time, hotel, ...
4    [great, stay, great, stay, went, seahawk, game...
Name: tokenized, dtype: object

In [22]:
ps = PorterStemmer()
data['stemmed'] = data['tokenized'].apply(lambda tokens: [ps.stem(tokem) for tokem in tokens])
data['stemmed'].head()

0    [nice, hotel, expens, park, got, good, deal, s...
1    [ok, noth, special, charg, diamond, member, hi...
2    [nice, room, not, 4star, experi, hotel, monaco...
3    [uniqu, great, stay, wonder, time, hotel, mona...
4    [great, stay, great, stay, went, seahawk, game...
Name: stemmed, dtype: object

In [23]:
lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['tokenized'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
data['lemmatized'].head()

0    [nice, hotel, expensive, parking, got, good, d...
1    [ok, nothing, special, charge, diamond, member...
2    [nice, room, not, 4star, experience, hotel, mo...
3    [unique, great, stay, wonderful, time, hotel, ...
4    [great, stay, great, stay, went, seahawk, game...
Name: lemmatized, dtype: object

In [26]:
tokens_clean = sum(data['tokenized'], [])


In [27]:
unigrams = (pd.Series(nltk.ngrams(tokens_clean, 1)).value_counts())

print(unigrams)

(hotel,)           267
(room,)            212
(great,)           126
(not,)             122
(stay,)             95
                  ... 
(175,)               1
(smackagainst,)      1
(roomlocation,)      1
(stomach,)           1
(connected,)         1
Name: count, Length: 2763, dtype: int64


In [30]:
bigrams = (pd.Series(nltk.ngrams(tokens_clean, 2)).value_counts())

print(bigrams)

(great, location)    24
(space, needle)      21
(hotel, monaco)      16
(great, hotel)       12
(staff, friendly)    12
                     ..
(open, sliding)       1
(picky, wanted)       1
(nice, able)          1
(downtown, nice)      1
(food, raffles)       1
Name: count, Length: 8362, dtype: int64


In [31]:
trigrams = (pd.Series(nltk.ngrams(tokens_clean, 3)).value_counts())

print(trigrams)

(pike, place, market)               8
(view, space, needle)               5
(hotel, great, location)            5
(inn, queen, anne)                  4
(room, king, bed)                   4
                                   ..
(warwick, seattle, hotel)           1
(seattle, hotel, dissapointment)    1
(hotel, dissapointment, trip)       1
(dissapointment, trip, 3)           1
(hotel, right, street)              1
Name: count, Length: 9294, dtype: int64
