In [None]:
# !pip install pandas
# !pip install scikit-learn
# !pip install matplotlib
# !pip install plotly

In [33]:
# Step 1:
# Import libraries
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import plotly.express as px

In [34]:
# Step 2:
# loading the dataset
df = pd.read_csv('dataset/swiggy.csv')

In [77]:
df.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [35]:
# Step 3:
# Checking for missing value
df.isnull().sum()

id                0
name             86
city              0
rating           86
rating_count     86
cost            131
cuisine          99
lic_no          229
link              0
address          86
menu              0
dtype: int64

In [None]:
# To check which rows have name as NA and to check other features in those rows
# df["name"].isnull().to_csv('dataset/swiggy_name.csv')

In [36]:
# Step 4:
# Row with name : NA are dropped because important feature like rating, rating_count, cost, cuisine, lic_no is also NA
df = df.dropna(subset=['name'])

In [None]:
    # To check null values in cuisine and cost columns
null_cuisine = df[df['cuisine'].isnull()]
null_cost = df[df['cost'].isnull()]
null = pd.concat([null_cuisine, null_cost]).drop_duplicates()
    # To check which rows have cuisine or cost as NA
null.to_csv('dataset/swiggy_null_cuisine_cost.csv')

In [37]:
# Step 5:
# Converting cost to float
df["cost"] = df["cost"].replace('₹ ', "", regex=True).astype(float)

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148455 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            148455 non-null  int64  
 1   name          148455 non-null  object 
 2   city          148455 non-null  object 
 3   rating        148455 non-null  object 
 4   rating_count  148455 non-null  object 
 5   cost          148410 non-null  float64
 6   cuisine       148442 non-null  object 
 7   lic_no        148312 non-null  object 
 8   link          148455 non-null  object 
 9   address       148455 non-null  object 
 10  menu          148455 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 13.6+ MB


In [38]:
# Step 6:
# Filling missing values in cost and cuisine columns
for col in ["cost", "cuisine"]:
     default_value = 0 if col == "cost" else "Unknown"
     df[col] = (
          df.groupby('name')[col]
          .transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
          .fillna(default_value)
          )

  .transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  .transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
  .transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


In [39]:
# Step 7:
# Checking for missing values
df.isnull().sum()

id                0
name              0
city              0
rating            0
rating_count      0
cost              0
cuisine           0
lic_no          143
link              0
address           0
menu              0
dtype: int64

In [40]:
# Step 8:
# Converting rating to float
df["rating"]=df["rating"].replace('--',0).astype(float)

In [85]:
# Checking for duplicate rows
df.duplicated().sum()

np.int64(0)

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148455 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            148455 non-null  int64  
 1   name          148455 non-null  object 
 2   city          148455 non-null  object 
 3   rating        148455 non-null  float64
 4   rating_count  148455 non-null  object 
 5   cost          148455 non-null  float64
 6   cuisine       148455 non-null  object 
 7   lic_no        148312 non-null  object 
 8   link          148455 non-null  object 
 9   address       148455 non-null  object 
 10  menu          148455 non-null  object 
dtypes: float64(2), int64(1), object(8)
memory usage: 13.6+ MB


In [154]:
df.to_csv('dataset/swiggy_cleaned.csv', index=False)

In [76]:
for col in df.columns:
    print(f"{col}: {df[col].nunique()}")

id: 148455
name: 112818
city: 821
rating: 42
rating_count: 8
cost: 364
cuisine: 2133
lic_no: 108763
link: 148455
address: 148401
menu: 148455


In [41]:
# Step 9:
# Splitting city into area and main city
df[["area", "city_main"]] = df["city"].str.split(',', n=1, expand=True)
# cleaning area and city_main columns
df["area"] = df["area"].str.strip()
df["city_main"] = df["city_main"].str.strip()
# Filling missing values in city_main column
df['city_main'].fillna("Other", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['city_main'].fillna("Other", inplace=True)


In [42]:
# step 10:
df.drop(columns=['city'], inplace=True)

In [None]:
# Checking cuisine column number pffered in each row
cuisine_1, cuisine_2, cuisine_3 = [], [], []

for cuisines in df['cuisine']:
    cuisine_list = [c.strip() for c in cuisines.split(',')]
    cuisine_1.append(cuisine_list[0])
    if len(cuisine_list) > 1:
        cuisine_2.append(cuisine_list[0])
    if len(cuisine_list) > 2:
        cuisine_3.append(cuisine_list[0])
    
print("cusine_1 :", len(cuisine_1))
print("cusine_2 :", len(cuisine_2))
print("cusine_3 :", len(cuisine_3))


cusine_1 : 148455
cusine_2 : 108072
cusine_3 : 0


In [43]:
# Step 11:
# splitting cuisine into cuisine_1 and cuisine_2
df[["cuisine_1", "cuisine_2"]] = df["cuisine"].str.split(',', n=1, expand=True)

In [137]:
df.to_csv('dataset/swiggy_final.csv', index=False)

In [44]:
# Step 12:
# Enconding cuisine_1 and cuisine_2 columns using OneHotEncoder and saving it as pickle file
oneHot_encoder_cuisine = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encode_oneHot_cuisine = oneHot_encoder_cuisine.fit_transform(df[["cuisine_1", "cuisine_2"]]) 
pickle.dump(oneHot_encoder_cuisine, open('pickles/oneHot_cuisine.pkl', 'wb'))
encoded_df_oneHot_cuisine = pd.DataFrame(encode_oneHot_cuisine, columns=oneHot_encoder_cuisine.get_feature_names_out(['cuisine_1', 'cuisine_2']))
df = pd.concat([df.reset_index(drop=True), encoded_df_oneHot_cuisine.reset_index(drop=True)], axis=1)
df.drop(columns=['cuisine_1', 'cuisine_2', 'cuisine'], inplace=True)

In [92]:
len(df.columns)

244

In [45]:
# Step 13:
# Enconding city_main column using OneHotEncoder and saving it as pickle file
oneHot_encoder_city = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encode_oneHot_city = oneHot_encoder_city.fit_transform(df[['city_main']])
pickle.dump(oneHot_encoder_city, open('pickles/oneHot_city.pkl', 'wb'))
encoded_df_oneHot_city = pd.DataFrame(encode_oneHot_city, columns=oneHot_encoder_city.get_feature_names_out(['city_main']))
df = pd.concat([df.reset_index(drop=True), encoded_df_oneHot_city.reset_index(drop=True)], axis=1)
df.drop(columns=['city_main'], inplace=True)
 

In [94]:
len(df.columns)

271

In [46]:
# Step 14:
area_to_oneHot = False  # Set to False to use LabelEncoder instead of OneHotEncoder
if area_to_oneHot:
    # Encoding area column using OneHotEncoder and saving it as pickle file
    oneHot_encoder_area = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encode_oneHot_area = oneHot_encoder_area.fit_transform(df[['area']])
    pickle.dump(oneHot_encoder_area, open('pickles/oneHot_area.pkl', 'wb'))
    encoded_df_oneHot_area = pd.DataFrame(encode_oneHot_area, columns=oneHot_encoder_area.get_feature_names_out(['area']))
    df = pd.concat([df.reset_index(drop=True), encoded_df_oneHot_area.reset_index(drop=True)], axis=1)
    df.drop(columns=['area'], inplace=True)
else:
    # Encoding area column using LabelEncoder and saving it as pickle file
    label_encoder_area = LabelEncoder()
    df["area"]= label_encoder_area.fit_transform(df['area'])
    pickle.dump(label_encoder_area, open('pickles/label_area.pkl', 'wb'))
    

In [96]:
len(df.columns)

271

In [39]:
# --Ignored--
df.to_csv('dataset/swiggy_test.csv')

In [47]:
# Step 15:
# Encoding name column using LabelEncoder and saving it as pickle file
label_encoder_name = LabelEncoder()
df["name"]=label_encoder_name.fit_transform(df["name"])
pickle.dump(label_encoder_name, open('pickles/label_name.pkl', 'wb'))

In [98]:
len(df.columns)

271

In [48]:
# Step 16:
# Encoding rating_count column using LabelEncoder and saving it as pickle file
label_encoder_rating_count = LabelEncoder()
df["rating_count"]=label_encoder_rating_count.fit_transform(df["rating_count"])
pickle.dump(label_encoder_rating_count, open('pickles/label_rating_count.pkl', 'wb'))

In [100]:
len(df.columns)

271

In [49]:
# Step 17:
# Drop lic_no, address, menu and link columns as they are not impactfull for analysis
df.drop(columns=['lic_no', 'address', 'menu', 'link'], inplace=True)

In [102]:
len(df.columns)

267

In [50]:
# Step 18:
# index id
df.set_index('id', inplace=True)

In [104]:
len(df.columns)

266

In [None]:
df.corr()

In [86]:
df.corr().to_csv('dataset/corr.csv', index=False)

In [105]:
# Step ignored because data performences better without standardization 
# Standardizing the dataset and saving it as pickle file
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
pickle.dump(scaler, open('pickles/scaler.pkl', 'wb'))

In [51]:
# Step 19:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import numpy as np

In [146]:
# Elbow method to find optimal number of clusters
inertia = []
k = range(1, 20)
for i in k:
    # model = KMeans(n_clusters=i).fit(df_scaled)
    model = KMeans(n_clusters=i).fit(df)
    inertia.append(model.inertia_) 

In [147]:
px.line(x=k,y=inertia)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [52]:
# Step 20:
# clustering with optimal number of clusters
best_k = 6  # From elbow graph
model = KMeans(n_clusters=best_k).fit(df)
df['cluster'] = model.predict(df)
pickle.dump(model, open('pickles/kmeans_model.pkl', 'wb'))

In [None]:
# Prediction Process

In [None]:
### loading Ecoders 
# oneHot_encoder_cuisine
# oneHot_encoder_city
# label_encoder_area
# label_encoder_name
# label_encoder_rating_count

### loadind Model
# model

In [21]:
user_input = {
    'name': ['Hotel Aasare'],
    'rating': [0],
    'rating_count': ["Too Few Ratings"],
    'cost': [0],
    'area': ['Indiranagar'],
    'cuisine_1': ["Biryani"],
    'cuisine_2': ['South Indian'],
    'city_main': ['Bangalore']
}
user_df = pd.DataFrame(user_input)
 

In [None]:
choosen_name = label_encoder_name.transform(user_df['name'])
choosen_area = label_encoder_area.transform(user_df['area'])
choosen_rating_count = label_encoder_rating_count.transform(user_df['rating_count'])
choosen_cuisine = oneHot_encoder_cuisine.transform(user_df[['cuisine_1', "cuisine_2"]])
choosen_city = oneHot_encoder_city.transform(user_df[['city_main']])
 

In [None]:
# Convert label encoded / numeric features to 2D
name_2d = choosen_name.reshape(1, -1)
rating_2d = np.array([[user_df['rating'].iloc[0]]])
rating_count_2d = choosen_rating_count.reshape(1, -1)
cost_2d = np.array([[user_df['cost'].iloc[0]]])
area_2d = choosen_area.reshape(1, -1)

# Cuisine and city are already 2D from OneHotEncoder
cuisine_2d = choosen_cuisine
city_2d = choosen_city

# Now stack safely
final_vector = np.hstack([
    name_2d,
    rating_2d,
    rating_count_2d,
    cost_2d,
    area_2d,
    cuisine_2d,
    city_2d
])
 

In [74]:
len(final_vector[0])

266

In [71]:
df.columns

Index(['name', 'rating', 'rating_count', 'cost', 'area',
       'cuisine_1_8:15 To 11:30 Pm', 'cuisine_1_Afghani', 'cuisine_1_African',
       'cuisine_1_American', 'cuisine_1_Andhra',
       ...
       'city_main_Nagpur', 'city_main_New BEL Road,Bangalore',
       'city_main_Noida', 'city_main_Other', 'city_main_Pune',
       'city_main_Surat', 'city_main_Vadodara', 'city_main_Vijayawada',
       'city_main_Vizag', 'cluster'],
      dtype='object', length=267)

In [None]:
cluster = model.predict(final_vector)
 



In [None]:
recommended = df[df['cluster'] == cluster[0]]
 

In [156]:
recommended.reset_index(inplace=True)
df.reset_index(inplace=True)

mask = df['id'].isin(recommended['id'])
recommended_decoded = df[mask].copy()

Unnamed: 0,index,id,name,rating,rating_count,cost,area,cuisine_1_8:15 To 11:30 Pm,cuisine_1_Afghani,cuisine_1_African,...,city_main_Nagpur,"city_main_New BEL Road,Bangalore",city_main_Noida,city_main_Other,city_main_Pune,city_main_Surat,city_main_Vadodara,city_main_Vijayawada,city_main_Vizag,cluster
1,1,531342,47417,4.4,4,200.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
8,8,156602,41048,4.2,3,100.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
13,13,156601,47928,0.0,7,100.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
23,23,427610,48304,0.0,7,300.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
30,30,475283,47963,0.0,7,200.0,1,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148442,148442,193858,45550,0.0,7,120.0,815,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
148443,148443,407798,56999,0.0,7,150.0,815,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
148449,148449,214210,53004,0.0,7,300.0,815,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
148451,148451,562647,56581,0.0,7,300.0,815,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


In [56]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def recommend_restaurants(user_input, df, model,
                          label_encoder_name,
                          label_encoder_area,
                          label_encoder_rating_count,
                          oneHot_encoder_cuisine,
                          oneHot_encoder_city,
                          top_n=10):

    # -------------------------
    # Convert dict → DataFrame
    # -------------------------
    user_df = pd.DataFrame([user_input])

    # -------------------------
    # 1. NAME PLACEHOLDER FIX
    # -------------------------
    # If user does not enter any name (usually they won't)
    # we use a safe default name from training dataframe
    safe_name = df['name'].iloc[0]   # first name in training data

    # Encode this name
    enc_name = label_encoder_name.transform([safe_name]).reshape(1, -1)

    # -------------------------
    # 2. Encode numeric + label encoded values
    # -------------------------
    enc_rating = np.array([[user_df['rating'].iloc[0]]])
    enc_cost = np.array([[user_df['cost'].iloc[0]]])
    enc_rating_count = label_encoder_rating_count.transform(
        user_df['rating_count']
    ).reshape(1, -1)
    enc_area = label_encoder_area.transform(
        user_df['area']
    ).reshape(1, -1)

    # -------------------------
    # 3. OneHot encodings
    # -------------------------
    enc_cuisine = oneHot_encoder_cuisine.transform(
        user_df[['cuisine_1', 'cuisine_2']]
    )
    enc_city = oneHot_encoder_city.transform(
        user_df[['city']]
    )

    # -------------------------
    # 4. Build FINAL VECTOR (Exact feature order used in training)
    # -------------------------
    final_vector = np.hstack([
        enc_name,           # feature_name (required)
        enc_rating,         # feature_rating
        enc_rating_count,   # feature_rating_count
        enc_cost,           # feature_cost
        enc_area,           # feature_area
        enc_cuisine,        # feature_cuisine_*
        enc_city            # feature_city_*
    ])

    # -------------------------
    # 5. Predict Cluster
    # -------------------------
    cluster = model.predict(final_vector)[0]
    cluster_df = df[df['cluster'] == cluster].copy()

    # -------------------------
    # 6. Cosine Similarity Ranking
    # -------------------------
    feature_cols = [c for c in df.columns if c.startswith("feature_")]
    cluster_vectors = cluster_df[feature_cols].values

    sim_scores = cosine_similarity(final_vector, cluster_vectors)[0]
    cluster_df["similarity_score"] = sim_scores

    # -------------------------
    # 7. Sort Top N Recommendations
    # -------------------------
    recommended = cluster_df.sort_values(
        by="similarity_score",
        ascending=False
    ).head(top_n)

    return recommended[[
        "name", "area", "city",
        "rating", "rating_count", "cost",
        "cuisine_1", "cuisine_2",
        "similarity_score"
    ]]


In [58]:
user_input = {
    'name': ['Hotel Aasare'],
    "rating": 4.0,
    "rating_count": "50+ ratings",
    "cost": 250,
    "area": "Indiranagar",
    "cuisine_1": "Biryani",
    "cuisine_2": "South Indian",
    "city_main": "Bangalore"
}

recommend_restaurants(
    user_input,
    df=df,
    model=model,
    label_encoder_name=label_encoder_name,
    label_encoder_area=label_encoder_area,
    label_encoder_rating_count=label_encoder_rating_count,
    oneHot_encoder_cuisine=oneHot_encoder_cuisine,
    oneHot_encoder_city=oneHot_encoder_city,
    top_n=5
)

ValueError: y contains previously unseen labels: np.int64(1583)