In [5]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
import nltk
nltk.download('stopwords')
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import streamlit as st




# Read the raw Zomato export into memory.
zomato_real = pd.read_csv("/content/zomato.csv.zip")
zomato_real.head()

# Build the working frame in one pass over a copy of the raw data:
#   1. discard columns that are not used further down,
#   2. drop exact duplicate rows,
#   3. drop every row that has at least one missing value,
#   4. give the awkwardly named columns short names.
zomato = (
    zomato_real
    .drop(columns=['url', 'dish_liked', 'phone'])
    .drop_duplicates()
    .dropna(how='any')
    .rename(columns={
        'approx_cost(for two people)': 'cost',
        'listed_in(type)': 'type',
        'listed_in(city)': 'city',
    })
)

# Data transformations
# 'cost' is read as text. The comma is presumably a thousands separator
# (e.g. "1,200"), so it is removed; replacing it with '.' would parse
# "1,200" as 1.2 instead of 1200.0.
zomato['cost'] = (
    zomato['cost'].astype(str).str.replace(',', '', regex=False).astype(float)
)

# Removing 'NEW' and '-' values from 'rate' and reset index
zomato = zomato.loc[~zomato['rate'].isin(['NEW', '-'])].reset_index(drop=True)

# Remove '/5' from 'rate' and convert what is left to a number
zomato['rate'] = (
    zomato['rate'].str.replace('/5', '', regex=False).str.strip().astype('float')
)

# Title-case restaurant names and turn the Yes/No flags into booleans.
# The results are assigned back to the frame explicitly: calling
# replace(..., inplace=True) on a column obtained via attribute access is a
# chained operation that pandas does not guarantee will update the frame.
zomato['name'] = zomato['name'].str.title()
yes_no_to_bool = {'Yes': True, 'No': False}
zomato['online_order'] = zomato['online_order'].replace(yes_no_to_bool)
zomato['book_table'] = zomato['book_table'].replace(yes_no_to_bool)

# Compute Mean Rating
# Every row receives the average 'rate' of all rows sharing its name.
# A single grouped transform replaces the per-restaurant loop with chained
# assignment, which was quadratic and relied on writing through a temporary.
restaurants = list(zomato['name'].unique())
zomato['Mean Rating'] = zomato.groupby('name')['rate'].transform('mean')

# Normalize 'Mean Rating' onto the 1-5 range, rounded to two decimals
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(1, 5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

# Lower casing reviews
zomato["reviews_list"] = zomato["reviews_list"].str.lower()

# Remove punctuations
import string
PUNCT_TO_REMOVE = string.punctuation
# Deletion table built once at import time instead of on every call; the
# function is applied to every review, so rebuilding it per row is wasted work.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text: str) -> str:
    """Return *text* with every character in ``PUNCT_TO_REMOVE`` deleted.

    All other characters, including whitespace, are left untouched.
    """
    return text.translate(_PUNCT_TABLE)

# Remove URLs
# This step runs BEFORE punctuation removal: once ':', '/' and '.' have been
# stripped from the text, the pattern below can no longer recognise a link.
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')


def remove_urls(text: str) -> str:
    """Return *text* with ``http(s)://...`` and ``www. ...`` links removed."""
    return URL_PATTERN.sub(r'', text)


zomato["reviews_list"] = zomato["reviews_list"].apply(remove_urls)

# Remove punctuations (remove_punctuation is defined just above this section)
zomato["reviews_list"] = zomato["reviews_list"].apply(remove_punctuation)

# Remove stopwords
# Imported here because nothing earlier in this cell brings `stopwords` into
# scope; the corpus itself is fetched by nltk.download('stopwords') at the top.
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text) -> str:
    """Return *text* without the words contained in ``STOPWORDS``.

    The input is converted with ``str()``, split on whitespace, filtered, and
    re-joined with single spaces, so runs of whitespace are collapsed.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


zomato["reviews_list"] = zomato["reviews_list"].apply(remove_stopwords)

# Columns that play no part in the recommender are dropped here.
zomato = zomato.drop(columns=['address', 'rest_type', 'type', 'menu_item', 'votes'])

# Keep a random half of the rows and key them by restaurant name.
# NOTE(review): no random_state is passed, so a different half is drawn on
# every run -- confirm that non-reproducible results are acceptable.
df_percent = zomato.sample(frac=0.5).set_index('name')

# Positional lookup: row number -> restaurant name, in df_percent's row order.
indices = pd.Series(df_percent.index)

# TF-IDF representation of the cleaned review text, one row per restaurant row.
tfidf = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

# All-pairs dot products between the review vectors. The result is a square
# matrix whose rows and columns follow df_percent's row order.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Recommendation function
def recommend(name, cosine_similarities=cosine_similarities):
    """Return up to ten restaurants whose reviews resemble those of *name*.

    Args:
        name: Restaurant name as it appears in the index of ``df_percent``.
        cosine_similarities: Square similarity matrix whose rows follow the
            row order of ``df_percent``. Defaults to the module-level matrix.

    Returns:
        A DataFrame indexed by restaurant name with the columns 'cuisines',
        'Mean Rating' and 'cost', sorted by 'Mean Rating' (highest first) and
        limited to 10 rows. Returns ``None``, after printing a message, when
        *name* is not present in the sampled data.
    """
    # Check if the restaurant exists in the dataset
    if name not in indices.values:
        print(f"{name} not found in the dataset.")
        return None

    columns = ['cuisines', 'Mean Rating', 'cost']

    # Row position of the first entry carrying this name
    query_pos = indices[indices == name].index[0]

    # Similarity of every row to the queried row, most similar first
    scores = pd.Series(cosine_similarities[query_pos]).sort_values(ascending=False)

    # Take 31 rows rather than 30: the queried row is normally its own best
    # match, so one slot is expected to be discarded below.
    top_positions = list(scores.iloc[0:31].index)

    # Names of the most similar rows, leaving out the queried restaurant so it
    # is never recommended back to the caller.
    candidate_names = [
        df_percent.index[pos]
        for pos in top_positions
        if df_percent.index[pos] != name
    ]

    # One randomly chosen row per candidate (a name can occur on several rows),
    # concatenated in a single call instead of growing a frame inside a loop.
    picks = [
        df_percent.loc[df_percent.index == candidate, columns].sample()
        for candidate in candidate_names
    ]
    if picks:
        df_new = pd.concat(picks)
    else:
        # Nothing but the queried restaurant was found: return an empty frame
        # with the expected columns instead of failing inside pd.concat.
        df_new = df_percent[columns].iloc[0:0]

    # Drop rows that share cuisines, rating and cost, then keep the ten best.
    # NOTE(review): keep=False removes every copy of a duplicated row, not
    # just the extras -- confirm this is the intended de-duplication.
    df_new = df_new.drop_duplicates(subset=columns, keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)

    print(f'TOP {len(df_new)} RESTAURANTS LIKE {name} WITH SIMILAR REVIEWS:')

    return df_new


# Test the recommendation system
recommend('Pai Vihar')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS:


Unnamed: 0_level_0,cuisines,Mean Rating,cost
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Vegetarea,South Indian,3.67,250.0
Cinnamon,"North Indian, Chinese, Biryani",3.62,550.0
New Friends,"North Indian, Continental, Chinese, Steak",3.58,900.0
Juice Junction Food Court,"Juices, South Indian, Chinese, Fast Food",3.5,200.0
Shanthi Sagar,"South Indian, North Indian, Chinese, Street Fo...",3.44,400.0
Dakshin Kitchen,South Indian,3.32,100.0
Sri Sai Bhavan,South Indian,3.32,300.0
Juice 99,"Fast Food, Beverages",3.17,150.0
Melange - Hotel Ekaa,"North Indian, Chinese, Continental, Mangalorean",2.81,900.0
Tamarind,"Chinese, North Indian, Continental",2.16,750.0
