In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the hotel reviews dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('tripadvisor_hotel_reviews.csv')

In [3]:
# Preview the first rows to check the available columns (Review, Rating).
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
# Count missing values in each column.
df.isnull().sum()

Review    0
Rating    0
dtype: int64

In [5]:
# Collect reviews that carry no usable text. Missing values are reported by the
# isnull() check; this catches strings that are empty or whitespace-only.
# `not text.strip()` also flags "" (for which str.isspace() returns False), and
# the isinstance guard skips non-string cells instead of raising AttributeError.
blanks = [
    text
    for text in df["Review"]
    if isinstance(text, str) and not text.strip()
]

In [6]:
# Display the collected reviews; an empty list means none were flagged.
blanks

[]

In [7]:
# Number of reviews per star rating, ordered from least to most frequent.
df["Rating"].value_counts().sort_values()

1    1421
2    1793
3    2184
4    6039
5    9054
Name: Rating, dtype: int64

In [8]:
def rating(rating):
    """Map a star rating to a coarse sentiment label.

    Ratings above 3 and up to 5 give "Positive"; ratings above 0 and up to 3
    give "Negative". Any other value falls through and yields None.
    """
    if 3 < rating <= 5:
        label = "Positive"
    elif 0 < rating <= 3:
        label = "Negative"
    else:
        # Out-of-range input: no label is assigned.
        label = None
    return label
# Derive the binary sentiment label from the star rating, element by element.
df["Rating_posorneg"] = df["Rating"].map(rating)

In [10]:
# Confirm that the Rating_posorneg column is present and looks sensible.
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg
0,nice hotel expensive parking got good deal sta...,4,Positive
1,ok nothing special charge diamond member hilto...,2,Negative
2,nice rooms not 4* experience hotel monaco seat...,3,Negative
3,"unique, great stay, wonderful time hotel monac...",5,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive


In [10]:
# Class balance of the rating-derived sentiment labels.
df["Rating_posorneg"].value_counts()

Positive    15093
Negative     5398
Name: Rating_posorneg, dtype: int64

In [12]:
#Sentiment Analysis

In [11]:
import nltk
# Make sure the VADER lexicon resource is available locally for NLTK.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\onata\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Create a single analyzer instance that is reused for every review below.
sentanalyze = SentimentIntensityAnalyzer()

In [13]:
# Score every review with VADER; each cell of "Scores" holds the dict returned
# by polarity_scores (keys: neg, neu, pos, compound).
df["Scores"] = df["Review"].apply(sentanalyze.polarity_scores)

In [17]:
# Check the newly added Scores column.
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg,Scores
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co..."
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com..."
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp..."
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com..."
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co..."


In [14]:
# Pull the "compound" entry out of each score dict into its own numeric column.
df["compound"] = df["Scores"].map(lambda scores: scores["compound"])

In [15]:
# Check the newly added compound column.
df.head()

Unnamed: 0,Review,Rating,Rating_posorneg,Scores,compound
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797


In [16]:
# Turn the compound score into a binary label: non-negative -> Positive,
# negative -> Negative.
# NOTE(review): a score of exactly 0 lands in the Positive class — confirm
# that this is the intended cut-off.
is_non_negative = df["compound"] >= 0
df["compound_posorneg"] = np.where(is_non_negative, "Positive", "Negative")

In [17]:
# Inspect the first 100 rows to compare the rating-based and VADER-based labels
# side by side.
df.head(100)

Unnamed: 0,Review,Rating,Rating_posorneg,Scores,compound,compound_posorneg
0,nice hotel expensive parking got good deal sta...,4,Positive,"{'neg': 0.072, 'neu': 0.643, 'pos': 0.285, 'co...",0.9747,Positive
1,ok nothing special charge diamond member hilto...,2,Negative,"{'neg': 0.11, 'neu': 0.701, 'pos': 0.189, 'com...",0.9787,Positive
2,nice rooms not 4* experience hotel monaco seat...,3,Negative,"{'neg': 0.081, 'neu': 0.7, 'pos': 0.219, 'comp...",0.9889,Positive
3,"unique, great stay, wonderful time hotel monac...",5,Positive,"{'neg': 0.06, 'neu': 0.555, 'pos': 0.385, 'com...",0.9912,Positive
4,"great stay great stay, went seahawk game aweso...",5,Positive,"{'neg': 0.135, 'neu': 0.643, 'pos': 0.221, 'co...",0.9797,Positive
...,...,...,...,...,...,...
95,"excellent stay staff friendly helpful, nice ho...",4,Positive,"{'neg': 0.0, 'neu': 0.428, 'pos': 0.572, 'comp...",0.9761,Positive
96,return going seattle booked hotel knowing budg...,4,Positive,"{'neg': 0.112, 'neu': 0.651, 'pos': 0.237, 'co...",0.9097,Positive
97,terrible hotel approximately 2 weeks ago april...,1,Negative,"{'neg': 0.115, 'neu': 0.749, 'pos': 0.136, 'co...",0.2350,Positive
98,great price okay experience stayed inn queen a...,3,Negative,"{'neg': 0.123, 'neu': 0.615, 'pos': 0.262, 'co...",0.9771,Positive


In [18]:
# Sanity-check the analyzer on a single hand-written sentence.
example = "The hotel is bad"
sentanalyze.polarity_scores(example)

{'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'compound': -0.5423}

# TEXT CLASSIFICATION

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Features are the raw review texts; the target is the rating-derived label.
X, y = df["Review"], df["Rating_posorneg"]

In [21]:
# Hold out 30% of the reviews for evaluation. random_state pins the shuffle so
# the split, and every score computed from it, is reproducible after a kernel
# restart instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [23]:
# Text-classification pipeline: TF-IDF features feeding a linear SVM.
textclf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC()),
    ]
)

In [24]:
# Fit the vectorizer and the classifier on the training split only.
textclf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [25]:
# Predict sentiment labels for the held-out reviews.
preds = textclf.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score
# Share of held-out reviews whose predicted label matches the rating-derived one.
# NOTE(review): about 74% of all reviews are labelled Positive, so accuracy
# should be read against that majority-class baseline.
print(accuracy_score(y_test,preds))

0.897690305790501


In [28]:
# Classify one hand-written review. The pipeline is given a list of documents,
# so the single text is wrapped in a one-element list.
newreview = ["Hotel was bushy"]
textclf.predict(newreview)


array(['Positive'], dtype=object)