In [1]:
# Mount Google Drive into the Colab filesystem so the dataset can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd

In [3]:
# Read the tab-separated restaurant reviews file from the mounted Drive.
data = pd.read_csv(
    r'/content/drive/MyDrive/Restaurant_Reviews.tsv',
    sep='\t',
)

In [4]:
# Peek at the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# frac=1 draws every row, i.e. a full shuffle; the seed keeps the order repeatable.
data_shuffled = data.sample(
    frac=1,
    random_state=42,
)

In [6]:
# First five rows after shuffling; the index shows the original row positions.
data_shuffled.head(n=5)

Unnamed: 0,Review,Liked
521,If you haven't gone here GO NOW!,1
737,Try them in the airport to experience some tas...,1
740,The restaurant is very clean and has a family ...,1
660,"I personally love the hummus, pita, baklava, f...",1
411,"Come hungry, leave happy and stuffed!",1


In [7]:
# Count how many reviews carry each value of the Liked label.
liked_column = data_shuffled['Liked']
liked_column.value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the shuffled reviews for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_shuffled['Review'],
    data_shuffled['Liked'],
    test_size=0.2,
    random_state=42,
)

In [10]:
# Mean number of whitespace-separated words per training review, rounded to an int.
total_words = sum(len(review.split()) for review in X_train)
round(total_words / len(X_train))

11

In [11]:
# Sizes of the training and test splits.
(len(X_train), len(X_test))

(800, 200)

In [12]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [13]:
# Text-to-integer layer: lowercases, strips punctuation, splits on whitespace,
# caps the vocabulary at 10000 tokens and emits sequences of exactly 11 ids
# (11 is the rounded average review length computed earlier in the notebook).
text_vectorizer = TextVectorization(
    max_tokens=10000,
    output_mode="int",
    output_sequence_length=11,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
)

In [14]:
# Build the vocabulary from the training reviews only.
text_vectorizer.adapt(data=X_train)

In [15]:
# Sanity check: encode one sentence into its fixed-length token-id sequence.
demo_sentences = ["Food and service was really amazing"]
text_vectorizer(demo_sentences)

<tf.Tensor: shape=(1, 11), dtype=int64, numpy=array([[13,  3, 18,  5, 43, 73,  0,  0,  0,  0,  0]])>

In [16]:
tf.random.set_seed(42)
from tensorflow.keras import layers

# Embedding table: one 128-dimensional vector per token id. input_dim matches the
# max_tokens given to text_vectorizer and input_length matches its output length.
embedding = layers.Embedding(
    input_dim=10000,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=11,
    name="embedding_1",
)

embedding

<keras.layers.embeddings.Embedding at 0x7f151e916990>

In [17]:
# Vectorize one sentence, then look up the embedding vector for each token id.
sample_token_ids = text_vectorizer(["Food and service was really amazing"])
sample = embedding(sample_token_ids)
sample

<tf.Tensor: shape=(1, 11, 128), dtype=float32, numpy=
array([[[-0.01906666, -0.01067507,  0.00669341, ...,  0.04949195,
         -0.02162114, -0.02049688],
        [-0.04284013, -0.01489798, -0.0159496 , ..., -0.01166106,
          0.03061062,  0.01972148],
        [-0.01299536, -0.03285937,  0.02369577, ...,  0.01118424,
         -0.04721683,  0.00531303],
        ...,
        [ 0.01645621, -0.00589932, -0.01471175, ..., -0.02511839,
          0.00912381, -0.00024097],
        [ 0.01645621, -0.00589932, -0.01471175, ..., -0.02511839,
          0.00912381, -0.00024097],
        [ 0.01645621, -0.00589932, -0.01471175, ..., -0.02511839,
          0.00912381, -0.00024097]]], dtype=float32)>

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [19]:
# Baseline model: TF-IDF features fed into a multinomial naive Bayes classifier.
baseline_steps = [
    ("Tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
]
model_0 = Pipeline(baseline_steps)

model_0.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('Tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [20]:
# Score the baseline on the held-out split and keep the value for later reference.
nb_score = model_0.score(X=X_test, y=y_test)
nb_score

0.85

In [21]:
# Ask the baseline to classify an empty review.
empty_review = [""]
model_0.predict(empty_review)

array([0])

In [22]:
# Second vectorizer with no vocabulary cap (max_tokens=None) and no fixed
# output length (output_sequence_length=None).
text_vectorizer_2 = TextVectorization(
    max_tokens=None,
    output_mode="int",
    output_sequence_length=None,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
)

In [23]:
# Build the second vectorizer's vocabulary from the training reviews.
text_vectorizer_2.adapt(data=X_train)

In [24]:
tf.random.set_seed(42)
from tensorflow.keras import layers

# text_vectorizer_2 was created with max_tokens=None, so its vocabulary size is
# whatever adapt() found in the training text. A hard-coded input_dim can be
# smaller than that, and any token id >= input_dim would then index past the end
# of the embedding table. Size the table from the adapted vocabulary instead.
# NOTE(review): get_vocabulary() is expected to include the padding and OOV
# entries for output_mode="int" -- confirm on the installed TensorFlow version.
vocab_size_2 = len(text_vectorizer_2.get_vocabulary())

embedding_1 = layers.Embedding(input_dim=vocab_size_2,
                               output_dim=128,
                               embeddings_initializer="uniform",
                               name="embedding_1")

embedding_1

<keras.layers.embeddings.Embedding at 0x7f15100cf550>

In [25]:
# Same TF-IDF + multinomial naive Bayes pipeline as model_0, fitted on the same split.
model_1 = Pipeline(steps=[
    ("Tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

model_1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('Tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [26]:
# Score model_1 on the held-out split.
model_1.score(X=X_test, y=y_test)

0.85

In [27]:
import pickle

In [28]:
# Serialize the fitted pipeline to disk. The context manager closes (and
# therefore flushes) the file even if pickling raises, so the pickle on disk is
# complete before anything tries to read it back.
with open('restaurant_nlp.pkl', 'wb') as file:
    pickle.dump(model_1, file)

In [31]:
# Reload the pipeline that this notebook pickled earlier. The context manager
# releases the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('restaurant_nlp.pkl', 'rb') as mod:
    rest = pickle.load(mod)

In [33]:
# Smoke-test the reloaded pipeline on a short phrase.
probe_review = ["Worst food"]
rest.predict(probe_review)

array([0])

In [34]:
# Score the reloaded pipeline on the held-out split; it should match the score
# obtained before pickling.
rest.score(X=X_test, y=y_test)

0.85

In [35]:
# Score on the training split, for comparison with the held-out score above.
rest.score(X=X_train, y=y_train)

0.96