In [2]:
import pandas as pd
# Load the tab-separated restaurant reviews and preview the first rows.
dataset = pd.read_csv('data/Restaurant_Reviews.tsv', sep='\t')
dataset.head()


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Check how many rows and columns were loaded.
dataset.shape

(1000, 2)

In [4]:
#importing all the nlp packages
import re
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Shared text-processing helpers used by the review-cleaning cell.
# NOTE(review): stopwords and WordNetLemmatizer rely on NLTK data packages
# ('stopwords', 'wordnet') being available locally - confirm they are installed.
tokenizer = WhitespaceTokenizer()
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [5]:
#Clean the reviews: lowercase, strip punctuation, drop stopwords, lemmatize and stem
stop_word_set = set(stop_words)  # set lookup is O(1); scanning the list is O(n) per token


def clean_review(text):
    """Return `text` as a space-joined string of cleaned, stemmed tokens.

    Steps, in order: lowercase; replace every character outside [a-z0-9]
    with a space; split on whitespace; drop English stopwords; lemmatize
    and then stem each remaining token.

    NOTE(review): the stopword filter also removes negations, e.g.
    "Crust is not good." becomes "crust good", so the negative signal is
    lost for such reviews.
    """
    text = re.sub('[^a-z0-9]', ' ', text.lower())
    kept = (w for w in tokenizer.tokenize(text) if w not in stop_word_set)
    return ' '.join(stemmer.stem(lemmatizer.lemmatize(w)) for w in kept)


# Iterate over the values directly: dataset.Review[i] is a label lookup and
# only works while the index is the default 0..n-1 range.
corpus = [clean_review(raw) for raw in dataset.Review]

In [6]:
# Inspect the first five cleaned reviews.
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [7]:
#Bag of Words (BoW): learn the vocabulary that maps each corpus term to a column
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer().fit(corpus)  # fit returns the fitted vectorizer itself
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [8]:
#vocabulary_ maps each word (feature) to its column index in the BoW matrix
cv.vocabulary_

{'wow': 1566,
 'love': 822,
 'place': 1048,
 'crust': 346,
 'good': 615,
 'tasti': 1383,
 'textur': 1395,
 'nasti': 920,
 'stop': 1332,
 'late': 782,
 'may': 854,
 'bank': 114,
 'holiday': 688,
 'rick': 1169,
 'steve': 1325,
 'recommend': 1135,
 'select': 1225,
 'menu': 872,
 'great': 629,
 'price': 1082,
 'get': 599,
 'angri': 55,
 'want': 1516,
 'damn': 355,
 'pho': 1036,
 'honestli': 692,
 'tast': 1381,
 'fresh': 575,
 'potato': 1074,
 'like': 805,
 'rubber': 1183,
 'could': 319,
 'tell': 1388,
 'made': 833,
 'ahead': 37,
 'time': 1416,
 'kept': 765,
 'warmer': 1518,
 'fri': 576,
 'touch': 1434,
 'servic': 1235,
 'prompt': 1094,
 'would': 1564,
 'go': 609,
 'back': 105,
 'cashier': 235,
 'care': 227,
 'ever': 483,
 'say': 1210,
 'still': 1327,
 'end': 467,
 'wayyy': 1526,
 'overpr': 987,
 'tri': 1444,
 'cape': 223,
 'cod': 281,
 'ravoli': 1125,
 'chicken': 258,
 'cranberri': 332,
 'mmmm': 892,
 'disgust': 406,
 'pretti': 1081,
 'sure': 1367,
 'human': 707,
 'hair': 649,
 'shock': 12

In [10]:
# Column labels for the BoW matrix: the vocabulary terms in alphabetical order,
# which matches the column indices shown in cv.vocabulary_.
features = sorted(cv.vocabulary_)

In [11]:
#Transform text to matrix for further modelling
bow_counts = cv.transform(corpus).toarray()  # dense term-count matrix, one row per review
X = pd.DataFrame(bow_counts, columns=features)
y = dataset['Liked']  # target labels

In [12]:
# Count of the word 'wow' in the first review. The column index is looked up
# from the fitted vocabulary instead of hard-coding 1566, which would silently
# point at a different word if the corpus or vocabulary changed.
X.iloc[0, cv.vocabulary_['wow']]

1

Now we can use a classification model to classify the reviews.

Validation of the model

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score

# Hold out part of the data for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [14]:
from sklearn.linear_model import LogisticRegression

# Train on the training split, then report mean accuracy on the held-out split.
model = LogisticRegression().fit(X_train, y_train)
model.score(X_test, y_test)

0.7

In [15]:
# Mean accuracy over 4 cross-validation folds on the BoW features.
# NOTE(review): X was built with a vocabulary fit on the whole corpus
# (cv.fit(corpus)), so every fold's features were derived with its validation
# rows visible - consider a Pipeline if a strictly held-out estimate is needed.
cross_val_score(LogisticRegression(),X,y,cv=4).mean()

0.7749999999999999

Deployment of the model

To predict on new reviews, we have to transform them into a matrix as well, using the same fitted BoW transformer (and the same text cleaning that was applied to the training corpus).

In [16]:
# New, unseen reviews to classify with the trained model.
review = ['good food good place','plate dirt','nice decor']

In [17]:
# The vectorizer was fit on cleaned, stemmed text, so new reviews must go
# through the same steps (lowercase, strip punctuation, drop stopwords,
# lemmatize, stem) before cv.transform; otherwise raw words such as "loved"
# would never match the stemmed vocabulary ("love").
review_clean = [
    ' '.join(
        stemmer.stem(lemmatizer.lemmatize(w))
        for w in tokenizer.tokenize(re.sub('[^a-z0-9]', ' ', r.lower()))
        if w not in stop_words
    )
    for r in review
]

# Refit on all labelled data, then predict (1 = liked, 0 = not liked).
model.fit(X, y)
model.predict(cv.transform(review_clean).toarray())

array([1, 0, 1], dtype=int64)

We can also use TF-IDF instead of BoW to transform the text into a matrix.

In [18]:
#TF-IDF technique to convert corpus into X
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer().fit(corpus)  # fit returns the fitted vectorizer itself

# NOTE: features, X and y are rebound here, replacing the BoW versions.
features = sorted(tfidf.vocabulary_)
X = pd.DataFrame(tfidf.transform(corpus).toarray(), columns=features)
y = dataset['Liked']

In [19]:
# Mean accuracy over 4 cross-validation folds, this time on the TF-IDF features.
cross_val_score(LogisticRegression(),X,y,cv=4).mean()

0.7769999999999999

In [20]:
# Preview the TF-IDF matrix: one column per vocabulary term, one row per review.
X.head()

Unnamed: 0,00,10,100,11,12,15,17,1979,20,2007,...,year,yellow,yellowtail,yelper,yet,yucki,yukon,yum,yummi,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Term -> column index mapping learned by the TF-IDF vectorizer.
tfidf.vocabulary_

{'wow': 1566,
 'love': 822,
 'place': 1048,
 'crust': 346,
 'good': 615,
 'tasti': 1383,
 'textur': 1395,
 'nasti': 920,
 'stop': 1332,
 'late': 782,
 'may': 854,
 'bank': 114,
 'holiday': 688,
 'rick': 1169,
 'steve': 1325,
 'recommend': 1135,
 'select': 1225,
 'menu': 872,
 'great': 629,
 'price': 1082,
 'get': 599,
 'angri': 55,
 'want': 1516,
 'damn': 355,
 'pho': 1036,
 'honestli': 692,
 'tast': 1381,
 'fresh': 575,
 'potato': 1074,
 'like': 805,
 'rubber': 1183,
 'could': 319,
 'tell': 1388,
 'made': 833,
 'ahead': 37,
 'time': 1416,
 'kept': 765,
 'warmer': 1518,
 'fri': 576,
 'touch': 1434,
 'servic': 1235,
 'prompt': 1094,
 'would': 1564,
 'go': 609,
 'back': 105,
 'cashier': 235,
 'care': 227,
 'ever': 483,
 'say': 1210,
 'still': 1327,
 'end': 467,
 'wayyy': 1526,
 'overpr': 987,
 'tri': 1444,
 'cape': 223,
 'cod': 281,
 'ravoli': 1125,
 'chicken': 258,
 'cranberri': 332,
 'mmmm': 892,
 'disgust': 406,
 'pretti': 1081,
 'sure': 1367,
 'human': 707,
 'hair': 649,
 'shock': 12

In [22]:
# TF-IDF weight of the word 'wow' in the first review. The column index is
# looked up from the fitted vocabulary instead of hard-coding 1566, which would
# silently point at a different word if the vocabulary changed.
X.iloc[0, tfidf.vocabulary_['wow']]

0.7681483384958535

In [24]:
#idf measures how rare a word is across the documents in the corpus.
#The more documents a term appears in, the lower its idf value.
tfidf.idf_[tfidf.vocabulary_['recommend']]

4.96431580014878