In [1]:
import pandas as pd
import csv

In [2]:
df=pd.read_csv('reviews.csv')

In [3]:
df.head()

Unnamed: 0,Text,Sentiment,Topic
0,"The rooms are extremely small, practically onl...",negative,Comfort
1,Room safe did not work.,negative,Facilities
2,Mattress very comfortable.,positive,Comfort
3,"Very uncomfortable, thin mattress, with plasti...",negative,Comfort
4,No bathroom in room,negative,Facilities


In [4]:
data=[row for row in csv.reader(open("reviews.csv"))]
data=data[1:]
data[:5]

[['The rooms are extremely small, practically only a bed.',
  'negative',
  'Comfort'],
 ['Room safe did not work.', 'negative', 'Facilities'],
 ['Mattress very comfortable.', 'positive', 'Comfort'],
 ['Very uncomfortable, thin mattress, with plastic cover that rustles every time you move.',
  'negative',
  'Comfort'],
 ['No bathroom in room', 'negative', 'Facilities']]

In [5]:
reviews = [row[0] for row in data]
sentiment = [row[1] for row in data]
topics = [row[2] for row in data]

In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
 
def process_text(text): #clean text
    text = re.sub('[^A-Za-z]', ' ', text.lower())
    tokenized_text = word_tokenize(text)
    clean_text = [
        stemmer.stem(word) for word in tokenized_text
        if word not in stopwords.words('english')
    ]
    return clean_text

In [7]:
cleaned_reviews=[" ".join(process_text(review)) for review in reviews] #clean reviews

### Topic Classification

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
vectors = matrix.fit_transform(cleaned_reviews).toarray()
vectors.shape

(207, 500)

In [9]:
matrix

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=1000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
from sklearn.model_selection import train_test_split
vectors_train, vectors_test, topics_train, topics_test = train_test_split(vectors, topics)

In [11]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(vectors_train, topics_train)
topics_pred = classifier.predict(vectors_test)

from sklearn.metrics import classification_report
print(classification_report(topics_test, topics_pred))

              precision    recall  f1-score   support

 Cleanliness       0.57      0.67      0.62         6
     Comfort       0.58      0.79      0.67        19
  Facilities       0.79      0.56      0.65        27

    accuracy                           0.65        52
   macro avg       0.65      0.67      0.64        52
weighted avg       0.69      0.65      0.65        52



In [12]:
comf = 'I really like the room. The bed sheets are so comfortable.'
cleanli = 'The room was not tidy. But the sheets were nice.'
facil = 'The room has nice amenities. The mini bar was well-equipped.'
new = [comf,cleanli,facil]

In [13]:
cleaned_new=[" ".join(process_text(rev)) for rev in new]
new_vec = matrix.transform(cleaned_new).toarray()
pred = classifier.predict(new_vec)
pd.DataFrame(list(zip(new,pred)),columns=['reviews','topic'])

Unnamed: 0,reviews,topic
0,I really like the room. The bed sheets are so ...,Comfort
1,The room was not tidy. But the sheets were nice.,Comfort
2,The room has nice amenities. The mini bar was ...,Facilities


### Topic Modeling

We will use the cleaned_review variable used above.

In [14]:
from gensim import corpora
r = [process_text(x) for x in cleaned_reviews] #list of words of each review
dictionary = corpora.Dictionary(r)
corpus = [dictionary.doc2bow(rev) for rev in r]

unable to import 'smart_open.gcs', disabling that module


In [15]:
r[:3]

[['room', 'extrem', 'small', 'practic', 'bed'],
 ['room', 'safe', 'work'],
 ['mattress', 'comfort']]

In [16]:
from gensim import models
model = models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)
topics = model.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.014*"water" + 0.012*"place" + 0.011*"hot"')
(1, '0.089*"room" + 0.022*"shower" + 0.020*"small"')
(2, '0.036*"bed" + 0.030*"room" + 0.027*"clean"')
