# Zomato Recommendation System

## importing required packages and libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re

import warnings
warnings.filterwarnings("ignore")


In [2]:
from sklearn.metrics import r2_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
import re
import spacy
nlp=spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

## loading the dataset

In [4]:
df=pd.read_csv("zomato.csv")

In [5]:
df.head(2)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari


## cleaning and performing feature engineering on the data

### Cleaning and performing feature engineering involves the following:
#### 1) Removing unrequired columns
#### 2) Removing Duplicates
#### 3) Handeling missing values
#### 4) Data Transformation
#### 5) Data Cleaning

In [6]:
df.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')


### Removing unrequired columns
#### Removing url, dish_liked, phone

In [7]:
df.drop(["url","dish_liked","phone"], axis=1, inplace=True)

In [8]:
## removing duplicated values in the database

In [9]:
df.duplicated().sum()

43

In [10]:
df.drop_duplicates(inplace=True)

In [12]:
## checking for missing values in the database

In [14]:
df.isnull().sum()

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64

In [15]:
df.dropna(inplace=True)

In [16]:
df.shape

(43499, 14)

## renaming columns

In [18]:
df.rename(columns={"approx_cost(for two people)":"cost","listed_in(type)":"type","listed_in(city)":"city"}, inplace=True)

## performing transformations

In [22]:
df["cost"]=df.cost.astype(str)

In [23]:
df.cost=df.cost.apply(lambda x: x.replace(",","."))

In [24]:
df.cost=df.cost.astype(float)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43499 entries, 0 to 51716
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   address       43499 non-null  object 
 1   name          43499 non-null  object 
 2   online_order  43499 non-null  object 
 3   book_table    43499 non-null  object 
 4   rate          43499 non-null  object 
 5   votes         43499 non-null  int64  
 6   location      43499 non-null  object 
 7   rest_type     43499 non-null  object 
 8   cuisines      43499 non-null  object 
 9   cost          43499 non-null  float64
 10  reviews_list  43499 non-null  object 
 11  menu_item     43499 non-null  object 
 12  type          43499 non-null  object 
 13  city          43499 non-null  object 
dtypes: float64(1), int64(1), object(12)
memory usage: 5.0+ MB


#### managing rate, we have already removed missing rate values, now removing NEW values

In [26]:
def funt(x):
    if type(x)==str:
        return x.replace("/5"," ")
    else:
        return x

In [27]:
df=df[df.rate!="NEW"]

In [28]:
df.rate=df.rate.astype(str)
df=df[df.rate!="-"]
df.rate=df.rate.apply(lambda x: funt(x))

In [29]:
df.rate=df.rate.astype(float)

In [30]:
## manaeging columns
df.name = df.name.apply(lambda x:x.title())
df.online_order.replace(('Yes','No'),(True, False),inplace=True)
df.book_table.replace(('Yes','No'),(True, False),inplace=True)


In [31]:
df.sample(3)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city
38804,"Nahar's Heritage Hotel, 14, St. Marks Road, Ba...",Sidewalk Cafe - Nahar'S Heritage Hotel,False,False,3.9,135,St. Marks Road,Casual Dining,"Italian, Fast Food",800.0,"[('Rated 5.0', 'RATED\n Absolutely superb pla...",[],Dine-out,Lavelle Road
30842,"478, Ground Floor, Krishna Temple Road, Korama...",Natuna Seafood House,True,False,4.1,76,Koramangala 5th Block,Quick Bites,Seafood,450.0,"[('Rated 5.0', 'RATED\n Really delicious food...",[],Delivery,Koramangala 5th Block
29940,"1st Floor, 1st Cross, Besides Bosch, Koramanga...",Yumyumsouth,True,False,3.5,36,Koramangala 7th Block,Delivery,South Indian,600.0,"[('Rated 1.0', ""RATED\n The food used to be p...","['Grand Madras Tiffin', 'Uthappam Delight Tiff...",Delivery,Koramangala 5th Block


In [32]:
rest = list(df['name'].unique())

In [33]:
df["mean"]=0

In [34]:

for i in rest:
    #df[df['name']==i]["mean"] = df[df['name']==i]["rate"].mean()
    df['mean'][df['name'] == i] = df['rate'][df['name'] == i].mean()


In [35]:
df.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,mean
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,4.118182
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,4.1
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,3.8
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,3.7
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,3.8


### text transformations
#### lower casing
#### removal of punctuation
#### removal of stopwords
#### removal of urls
#### spelling correction

In [36]:
d=df.copy()

In [37]:
def splitter(x):
    val=[]
    for words in x.split():
        ch= [char for char in words if char not in ".,()/1234+%[]5?6';\"7890:{}-_!"]
        w="".join(ch)
        val.append(w)
    return " ".join(val)

In [38]:
def funt(x):
    x= re.sub(" \d+"," ",x)
    #x= re.sub("http\S+\s*"," ", x)##url removal
    #x= re.sub("[a-zA-Z.-_]+[a-zA-Z.-_]+\.[a-zA-Z.-_]+"," ",x)## website removal
    #x= re.sub("[a-zA-Z.-_]+@[a-zA-Z.-_]+\.[a-zA-Z.-_]+"," ",x) ##email removal
    #x= re.sub("#\S+"," ",x)### hastage removal
    #x= re.sub("@\S+"," ",x)###mention removal
    x=" ".join([t for t in x.split() if t not in "!@#$:%^&-_*(),."])## special character remobal
    x=x.lower()
    x=x.replace("rated"," ")
    x=re.sub("\s+"," ",x)
    x=" ".join([t for t in x.split() if t not in stopwords])
    x="".join(x)
    x=x.replace("  "," " )
    return x

In [39]:
df.reviews_list=df.reviews_list.apply(lambda x: splitter(x))

In [40]:
df.reviews_list=df.reviews_list.apply(lambda x: funt(x))

In [41]:
df.reviews_list=df.reviews_list.apply(lambda x: x.replace("\\n", ""))

In [42]:
df[['reviews_list', 'cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
46817,fried rice gud taste yummyã\xã\xã\xã\xã\...,"Fast Food, Street Food"
12482,wonderful idea charging ice creams weight fin...,Desserts
16073,time place hsr layout serves delicious samosa...,"Street Food, North Indian"
45015,came place birthday year nice place veg food ...,"North Indian, Street Food"
11801,tender coconut milk shake sandwich yesterday ...,"Fast Food, Street Food"


In [43]:
df.columns

Index(['address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'cuisines', 'cost', 'reviews_list',
       'menu_item', 'type', 'city', 'mean'],
      dtype='object')

In [44]:
df.drop(["address","rest_type","votes","type","menu_item"],axis=1, inplace=True)

In [45]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,mean
0,Jalsa,True,True,4.1,Banashankari,"North Indian, Mughlai, Chinese",800.0,beautiful place dine inthe interiors mughal e...,Banashankari,4.118182
1,Spice Elephant,True,False,4.1,Banashankari,"Chinese, North Indian, Thai",800.0,dinner family turned good choose suitable age...,Banashankari,4.1
2,San Churro Cafe,True,False,3.8,Banashankari,"Cafe, Mexican, Italian",800.0,ambience good pocket friendly cafe quantity g...,Banashankari,3.8
3,Addhuri Udupi Bhojana,False,False,3.7,Banashankari,"South Indian, North Indian",300.0,great food proper karnataka style meals twice...,Banashankari,3.7
4,Grand Village,False,False,3.8,Basavanagudi,"North Indian, Rajasthani",600.0,good restaurant neighbourhood buffet system p...,Banashankari,3.8


## TFIDF Vectorizer

In [46]:
zomato=df.sample(frac=0.35)

In [47]:
zomato.set_index("name", inplace=True)

In [107]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,mean
0,Jalsa,True,True,4.1,Banashankari,"North Indian, Mughlai, Chinese",800.0,beautiful place dine inthe interiors mughal er...,Banashankari,4.118182
1,Spice Elephant,True,False,4.1,Banashankari,"Chinese, North Indian, Thai",800.0,dinner family turned good choose suitable ages...,Banashankari,4.1
2,San Churro Cafe,True,False,3.8,Banashankari,"Cafe, Mexican, Italian",800.0,ambience good pocket friendly cafe quantity go...,Banashankari,3.8
3,Addhuri Udupi Bhojana,False,False,3.7,Banashankari,"South Indian, North Indian",300.0,great food proper karnataka style meals twice ...,Banashankari,3.7
4,Grand Village,False,False,3.8,Basavanagudi,"North Indian, Rajasthani",600.0,good restaurant neighbourhood buffet system pr...,Banashankari,3.8


In [77]:
indices=pd.Series(zomato.index)

In [53]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(zomato['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

### Main function 

In [62]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(zomato.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'mean', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(zomato[['cuisines','mean', 'cost']][zomato.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','mean', 'cost'], keep=False)
    df_new = df_new.sort_values(by='mean', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new


## Testing the Recommendation system

In [64]:
recommend('Chattar Mattar')

TOP 10 RESTAURANTS LIKE Chattar Mattar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,mean,cost
Purani Dilli By Anand Sweets,"North Indian, Continental, Street Food",4.114286,500.0
Deli Chats And Sweets,"Street Food, Mithai",4.1,200.0
Sukh Sagar Food Court,"North Indian, South Indian, Chinese, Desserts,...",3.8,350.0
Chaatimes,"Street Food, Fast Food",3.776667,250.0
Chaatimes,"Street Food, Fast Food",3.776667,200.0
Baba Bhature,North Indian,3.7,200.0
The Modak,"North Indian, Street Food, Desserts, Chinese",3.6,400.0
Imli,"North Indian, Beverages, Fast Food, Street Food",3.6,250.0
Imli,"North Indian, Street Food",3.6,800.0
Chattar Mattar,"North Indian, Chinese",3.5,150.0
