In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Reading the dataset

In [3]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/zomato.csv')

In [4]:
df.sample(5)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
16342,https://www.zomato.com/bangalore/naaniz-biryan...,"G 04, Lavanya Serenity, 13th D Cross, Venkatap...",Naani'z Biryani,No,No,3.3/5,6,+91 7337735365,Koramangala 1st Block,Quick Bites,,Biryani,300,"[('Rated 5.0', ""RATED\n Good food, value of m...",[],Delivery,HSR
21949,https://www.zomato.com/bangalore/a-m-biryani-h...,"958/60/A2A, Opposite BGS School, Hulimavu, Ban...",A M Biryani Hotel,Yes,No,3.3/5,6,+91 8294750933,Bannerghatta Road,Quick Bites,,"North Indian, Chinese, Biryani, Fast Food",300,"[('Rated 3.5', 'RATED\n Not so popular Hotel ...","['Biryani Rice', 'Ambur Chicken Biryani', 'Chi...",Delivery,JP Nagar
42155,https://www.zomato.com/bangalore/pelican-marat...,"84/1, kadubeesanahalli, Panathur Main Road",Pelican,No,No,3.4 /5,20,065535 43666\n065535 43666,Marathahalli,"Pub, Casual Dining",,"North Indian, Chinese",1200,"[('Rated 5.0', 'RATED\n Nice place, good serv...",[],Drinks & nightlife,Marathahalli
22334,https://www.zomato.com/bangalore/gelato-italia...,"Shop 9, Cashier Layout, 3rd Cross, Thavarekere...",Gelato Italiano,Yes,No,3.8/5,32,+91 9916341943,BTM,"Food Court, Dessert Parlor",,"Ice Cream, Bakery, Desserts",200,"[('Rated 4.0', ""RATED\n Tried black currant a...","['Butterscotch Indulge', 'Fruit and Nut Overlo...",Delivery,JP Nagar
51457,https://www.zomato.com/bangalore/south-treat-w...,"3rd Floor, Virginia Mall, Varthur Main Road, W...",South Treat,Yes,No,3.4 /5,5,+91 8861028111,Whitefield,Food Court,,South Indian,400,"[('Rated 3.0', 'RATED\n i loved the chettinad...",[],Dine-out,Whitefield


In [5]:
df.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

# Dropping unnecessary columns

In [6]:
# menu_item,reviews list,dish liked columns have too many empty values which provide no information. So dropping them too
# city and location are related. Location cannot be choosen as a feature as it has too many unique values
df_copy=df.drop(['url','address','phone', 'reviews_list','menu_item','dish_liked','location','name'],axis=1)

# Renaming the columns

In [11]:
df_copy=df_copy.rename(columns={'listed_in(city)':'location',
                                'rate':'rating',
                                'rest_type':'type',
                                'listed_in(type)':'listed_type',
                                'approx_cost(for two people)':'cost'
                                })

# Feature Engineering

In [13]:
# 1. votes column
# dropping restaurants with no votes
df_copy=df_copy[df_copy['votes']!=0]
df_copy = df_copy.reset_index(drop=True)

In [14]:
# 2. rating column
# converting rating of the form x/5 to x
import math
def apply_convert(value):
    val = str(value).split('/')
    return round(float(val[0]),1)
df_copy['rating'] = df_copy['rating'].apply(apply_convert)
df_copy = df_copy[df_copy['rating'].apply(lambda x: not math.isnan(x))]

In [15]:
# 3. cost column
# replacing the missing values in cost_for_two column by mean of the groupby objects
df_copy['cost'] = pd.to_numeric(df_copy['cost'], errors='coerce').astype('Float64')
df_copy['cost'] = df_copy.groupby(['location', 'listed_type'])['cost'].transform(lambda x: x.fillna(x.mean()))
df_copy['cost'] = df_copy.groupby(['location'])['cost'].transform(lambda x: x.fillna(x.mean()))

In [16]:
# 4. cuisines/dishes column
# each cuisne offered at atleast 100 restaurants is taken as a feature

df_copy.dropna(subset='cuisines',inplace=True)
df_copy = df_copy.reset_index(drop=True)

dic={}
for i in df_copy['cuisines']:
  for j in i.split(','):
    if j.lstrip() in dic:
      dic[j.lstrip()]+=1
    else:
      dic[j.lstrip()]=1

cuisines_to_keep=[]
for i,j in dic.items():
  if j>100:
    cuisines_to_keep.append(i)

df_copy['cuisines']=df_copy['cuisines'].apply(lambda x: [value.lstrip() for value in x.split(',')])

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

one_hot_encoded = pd.DataFrame(mlb.fit_transform(df_copy['cuisines']), columns=mlb.classes_)

one_hot_encoded = one_hot_encoded[cuisines_to_keep]

df_combined = pd.concat([df_copy, one_hot_encoded], axis=1)

df_combined = df_combined.loc[~(df_combined[cuisines_to_keep] == 0).all(axis=1)]

df_combined = df_combined.drop(columns=['cuisines']).reset_index(drop=True)

In [17]:
# 5. listed_type column

df_combined.drop(columns=['listed_type'],inplace=True)

In [18]:
# 6. type column
# similar to cuisines column. The type column is one hot encoded with each type as a feature

df_combined.dropna(subset='type',inplace=True)
df_combined = df_combined.reset_index(drop=True)

df_combined['type']=df_combined['type'].apply(lambda x: [value.lstrip() for value in x.split(',')])

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

one_hot_encoded = pd.DataFrame(mlb.fit_transform(df_combined['type']), columns=mlb.classes_)

df_final = pd.concat([df_combined, one_hot_encoded], axis=1)

df_final = df_final.drop(columns=['type'])

In [19]:
# 7. location column
# location column is also one hot encoded

df_final = pd.get_dummies(df_final, columns=['location'], prefix='', prefix_sep='')

In [20]:
# 8. online order and book table columns

df_final['online_order'] = df_final['online_order'].map({'Yes': 1, 'No': 0})
df_final['book_table'] = df_final['book_table'].map({'Yes': 1, 'No': 0})

# Final columns

In [21]:
df_final.columns

Index(['online_order', 'book_table', 'rating', 'votes', 'cost', 'North Indian',
       'Mughlai', 'Chinese', 'Thai', 'Cafe',
       ...
       'Lavelle Road', 'MG Road', 'Malleshwaram', 'Marathahalli',
       'New BEL Road', 'Old Airport Road', 'Rajajinagar', 'Residency Road',
       'Sarjapur Road', 'Whitefield'],
      dtype='object', length=113)

# Standard Scaling

In [22]:
from sklearn.preprocessing import StandardScaler
cost_values = df_final['cost'].values.reshape(-1, 1)
scaler = StandardScaler()
new = scaler.fit_transform(cost_values)

# Train-Test split

In [23]:
from sklearn.model_selection import train_test_split
y=df_final['rating']
X=df_final.drop(['rating'],axis=1)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Model selection and training

In [24]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()

In [25]:
rfr.fit(X_train,y_train)

In [26]:
print(rfr.score(X_train,y_train))
print(rfr.score(X_test,y_test))

0.9830813437789361
0.8945167715780609
