Pandas is imported as pd and used as it is a module to help in structing data and manipulating it.

In [4]:
import pandas as pd
dataset = pd.read_csv("Restaurant_review.tsv",delimiter='\t')

In [5]:
# .Head() prints the first 5 rows of the dataset

In [6]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
# we are declaring review so that we apply the cleaning process on it and test it on the dataset

In [8]:
review="The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer."

In [9]:
# re is a library which is used for regular expressions

In [10]:
import re

In [11]:
# re.sub is used to substitute the characters in the string and removes other than alphabets

In [12]:
review1 = re.sub('[^a-zA-Z]' ,repl=' ', string=review)

In [None]:
# we are going to convert the string into lower case to avoid the confusion between the words

In [13]:
review2=review1.lower()

In [None]:
# nltk is module used to perform natural language processing

In [15]:
import nltk

In [16]:
#nltk.download('stopwords')

In [None]:
# we are going to import stopwords from nltk.corpus to remove the words which are not useful for the sentiment analysis

In [None]:
# we are going to import stem from nltk.stem.porter to perform stemming which is used to convert the words into their root form

In [17]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [18]:
ps.stem('catching')

'catch'

In [None]:
# we are going to import english stopwords from stopwords

In [19]:
english_stopwords=stopwords.words('english')

In [None]:
# We are going to remove not,no from that list as they are useful for the sentiment analysis

In [20]:
english_stopwords.remove('not')
english_stopwords.remove('no')

In [None]:
#We are split the words which are filtered by stopwords and stemming and join them to form a sentence

In [21]:
words=review2.split(' ')

In [None]:
 # we are going to use stemming on every word in the list of words with the following loop

In [22]:
corpus = []
for i in words:
    if i not in english_stopwords:
        corpus.append(ps.stem(i))
print(corpus)


['potato', 'like', 'rubber', 'could', 'tell', 'made', 'ahead', 'time', 'kept', 'warmer', '']


In [23]:
review2

'the potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer '

In [24]:
' '.join(corpus)

'potato like rubber could tell made ahead time kept warmer '

In [25]:
#for i in dataset['Review']

In [None]:
# we are going to do the same operations on the dataset which are done the sample sentence

In [26]:
reviews = []
for i in range(1000):
    review = dataset['Review'][i]
    review1 = re.sub('[^a-zA-Z]' ,repl=' ', string=review)
    review2=review1.lower()
    words=review2.split(' ')
    corpus = []
    for i in words:
        if i not in english_stopwords:
            corpus.append(ps.stem(i))
    sent = ' '.join(corpus)
    reviews.append(sent)


In [27]:
print(reviews)

['wow    love place ', 'crust not good ', 'not tasti textur nasti ', 'stop late may bank holiday rick steve recommend love ', 'select menu great price ', 'get angri want damn pho ', 'honeslti tast fresh  ', 'potato like rubber could tell made ahead time kept warmer ', 'fri great ', 'great touch ', 'servic prompt ', 'would not go back ', 'cashier no care ever say still end wayyy overpr ', 'tri cape cod ravoli  chicken  cranberri   mmmm ', 'disgust pretti sure human hair ', 'shock no sign indic cash ', 'highli recommend ', 'waitress littl slow servic ', 'place not worth time  let alon vega ', 'not like ', 'burritto blah ', 'food  amaz ', 'servic also cute ', 'could care less    interior beauti ', 'perform ', 'right    red velvet cake     ohhh stuff good ', '  never brought salad ask ', 'hole wall great mexican street taco  friendli staff ', 'took hour get food   tabl restaur food luke warm  sever run around like total overwhelm ', 'worst salmon sashimi ', 'also combo like burger  fri  be

In [None]:
# we are going to use countvectorizer to convert the words into vectors and Textblob to perform sentiment analysis on the dataset

In [28]:
import string
from collections import Counter
from textblob import TextBlob

In [None]:
# we are going to calculate percentage of positive and negative reviews in the dataset through Textblob

In [29]:
k=0.0
for i in reviews:
    blob = TextBlob(i)
    sent = blob.sentiment.polarity
    k += sent
print(k)

122.18139430014425


In [None]:
# we are going to import sklearn to perform machine learning algorithms on the dataset or vectors of words

In [30]:
import sklearn

In [None]:
# to get vector of words and their count we are going to use countvectorizer

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=800)

In [32]:
cv.fit(reviews)

In [None]:
# This is the positivity rate of the reviews in the dataset

In [34]:
print(k)

122.18139430014425


In [None]:
# This is the vector of words and their count

In [33]:
cv.vocabulary_

{'wow': 786,
 'love': 385,
 'place': 486,
 'crust': 164,
 'not': 441,
 'good': 289,
 'tasti': 679,
 'textur': 685,
 'nasti': 428,
 'stop': 654,
 'late': 363,
 'may': 399,
 'rick': 562,
 'recommend': 544,
 'select': 605,
 'menu': 408,
 'great': 292,
 'price': 507,
 'get': 282,
 'want': 750,
 'damn': 169,
 'pho': 480,
 'tast': 677,
 'fresh': 271,
 'potato': 499,
 'like': 373,
 'rubber': 576,
 'could': 153,
 'tell': 681,
 'made': 390,
 'ahead': 10,
 'time': 695,
 'kept': 354,
 'warmer': 752,
 'fri': 272,
 'touch': 704,
 'servic': 609,
 'would': 784,
 'go': 286,
 'back': 47,
 'cashier': 118,
 'no': 438,
 'care': 117,
 'ever': 227,
 'say': 596,
 'still': 651,
 'end': 218,
 'overpr': 459,
 'tri': 708,
 'ravoli': 535,
 'chicken': 128,
 'mmmm': 416,
 'disgust': 192,
 'pretti': 506,
 'sure': 666,
 'human': 333,
 'hair': 303,
 'sign': 618,
 'highli': 318,
 'waitress': 747,
 'littl': 376,
 'slow': 627,
 'worth': 783,
 'let': 370,
 'vega': 723,
 'food': 269,
 'amaz': 18,
 'also': 15,
 'cute': 168,