In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv("Final_Dataset.csv")
dataset.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Review,Ratings,Date
0,0,0,Booked 3 rooms for a family staycation but sad...,1,Dec 2020
1,1,1,We booked a Bay Suite for family of 5 with a s...,5,Dec 2020
2,2,2,World Class Service! Highly recommended for an...,5,Dec 2020
3,3,3,Our family of 2 adults and 2 kids had a thoro...,4,Dec 2020
4,4,4,If you are looking for an unforgettable stayca...,5,Dec 2020


In [3]:
dataset.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1,inplace=True)
dataset.head()

Unnamed: 0,Review,Ratings,Date
0,Booked 3 rooms for a family staycation but sad...,1,Dec 2020
1,We booked a Bay Suite for family of 5 with a s...,5,Dec 2020
2,World Class Service! Highly recommended for an...,5,Dec 2020
3,Our family of 2 adults and 2 kids had a thoro...,4,Dec 2020
4,If you are looking for an unforgettable stayca...,5,Dec 2020


In [4]:
import nltk
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from nltk import FreqDist
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.util import ngrams
from nltk.corpus import stopwords, wordnet
import numpy as np
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sonih\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sonih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def cleanReviews(s):
    s = s.lower()                   
    s = re.sub(r'[^\w\s]', ' ', s)  
    s = re.sub(r'[\d+]', ' ', s)    
    s = s.strip()                   
    s = re.sub(' +', ' ', s)        
    return s

In [6]:
dataset["Review"] = dataset["Review"].apply(lambda x: cleanReviews(x))

In [7]:
stop_words = stopwords.words("english")

In [8]:
dataset["Review"] = dataset["Review"].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

In [9]:
dataset.head()

Unnamed: 0,Review,Ratings,Date
0,booked rooms family staycation sadly forgettab...,1,Dec 2020
1,booked bay suite family seamless booking proce...,5,Dec 2020
2,world class service highly recommended anyone ...,5,Dec 2020
3,family adults kids thoroughly enjoyable stayca...,4,Dec 2020
4,looking unforgettable staycation experience pl...,5,Dec 2020


In [10]:
newData = dataset.drop(['Date'], axis=1)
newData.head()

Unnamed: 0,Review,Ratings
0,booked rooms family staycation sadly forgettab...,1
1,booked bay suite family seamless booking proce...,5
2,world class service highly recommended anyone ...,5
3,family adults kids thoroughly enjoyable stayca...,4
4,looking unforgettable staycation experience pl...,5


In [11]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
newData['Review']=newData['Review'].apply(str)

In [12]:
tokenizer = RegexpTokenizer(r'\w+')
newData["Review"] = newData["Review"].apply(lambda x: tokenizer.tokenize(x))

In [13]:
lemm = WordNetLemmatizer()

In [14]:
def toWordNet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatizeText(text):
    nltk_tagged = nltk.pos_tag(text)
    wordnet_tagged = map(lambda x: (x[0], toWordNet(x[1])), nltk_tagged)
    lemm_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            lemm_sentence.append(word)
        else:
            lemm_sentence.append(lemm.lemmatize(word, tag))
    return lemm_sentence

In [15]:
newData["Review"] = newData["Review"].apply(lambda x: lemmatizeText(x))

In [16]:
newData.head()

Unnamed: 0,Review,Ratings
0,"[book, room, family, staycation, sadly, forget...",1
1,"[book, bay, suite, family, seamless, book, pro...",5
2,"[world, class, service, highly, recommend, any...",5
3,"[family, adult, kid, thoroughly, enjoyable, st...",4
4,"[look, unforgettable, staycation, experience, ...",5


In [17]:
for i in range(len(newData['Review'])):
    newData['Review'][i] = ' '.join([str(elem) for elem in newData['Review'][i]])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
newData['Ratings'] = newData['Ratings'].apply(lambda x: 'Positive' if x > 3 else('Neutral' if x==3 else 'Negative'))

newData.head()

Unnamed: 0,Review,Ratings
0,book room family staycation sadly forgettable ...,Negative
1,book bay suite family seamless book process ho...,Positive
2,world class service highly recommend anyone be...,Positive
3,family adult kid thoroughly enjoyable staycati...,Positive
4,look unforgettable staycation experience pleas...,Positive


In [19]:
newData['Ratings'].value_counts()

Positive    9545
Neutral     1149
Negative     756
Name: Ratings, dtype: int64

In [21]:
df_Positive = newData[newData['Ratings'] == 'Positive'][0:1000]
df_Neutral = newData[newData['Ratings'] == 'Neutral']
df_Negative = newData[newData['Ratings'] == 'Negative']

In [22]:
df_Neutral_over = df_Neutral.sample(1000, replace=True)
df_Negative_over = df_Negative.sample(1000, replace=True)
finalData = pd.concat([df_Positive, df_Neutral_over, df_Negative_over], axis=0)

In [24]:
finalData.shape

(3000, 2)

In [25]:
finalData.head()

Unnamed: 0,Review,Ratings
1,book bay suite family seamless book process ho...,Positive
2,world class service highly recommend anyone be...,Positive
3,family adult kid thoroughly enjoyable staycati...,Positive
4,look unforgettable staycation experience pleas...,Positive
5,great staycation celebrate anniversary alice p...,Positive


In [26]:
finalData.to_csv("modelData.csv")