In [61]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
df = pd.read_csv(r"D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\swiggy_raw.csv")

In [63]:
df.head()

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [64]:
df.shape

(148541, 11)

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


In [66]:
df.describe()

Unnamed: 0,id
count,148541.0
mean,363466.378912
std,167890.977174
min,211.0
25%,233320.0
50%,412628.0
75%,502223.0
max,581031.0


In [69]:
df.duplicated().sum()

np.int64(0)

In [70]:
df.isna().sum()

id                0
name             86
city              0
rating           86
rating_count     86
cost            131
cuisine          99
lic_no          229
link              0
address          86
menu              0
dtype: int64

In [71]:
df.dropna(inplace=True)

In [72]:
df.columns

Index(['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine',
       'lic_no', 'link', 'address', 'menu'],
      dtype='object')

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148255 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148255 non-null  int64 
 1   name          148255 non-null  object
 2   city          148255 non-null  object
 3   rating        148255 non-null  object
 4   rating_count  148255 non-null  object
 5   cost          148255 non-null  object
 6   cuisine       148255 non-null  object
 7   lic_no        148255 non-null  object
 8   link          148255 non-null  object
 9   address       148255 non-null  object
 10  menu          148255 non-null  object
dtypes: int64(1), object(10)
memory usage: 13.6+ MB


In [74]:
df['rating'] = df['rating'].replace('--', np.nan)
df['rating'] = df['rating'].astype(float)

In [75]:
def clean_rating_count(x):
    if pd.isna(x):
        return 0
    
    x = str(x).strip()
    x = x.replace("+ ratings", "")
    x = x.replace("+", "")
    x = x.replace("Too Few Ratings", "0")
    x = x.replace(",", "")
    
    if "K" in x:
        return float(x.replace("K", "")) * 1000
    
    try:
        return float(x)
    except:
        return 0

df["rating_count"] = df["rating_count"].apply(clean_rating_count)

In [76]:
df['cost'] = df['cost'].str.replace('₹', '', regex=False).astype(float)
df['cost'].fillna(df['cost'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cost'].fillna(df['cost'].median(), inplace=True)


In [77]:
df['rating'].fillna(df['rating'].mean(), inplace=True)
df['city'] = df['city'].str.split(',').str[-1]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(), inplace=True)


In [78]:
df['cuisine'] = (df['cuisine'].str.lower().str.split(',').str[0].str.strip())

In [79]:
df['cuisine'].unique()

array(['beverages', 'sweets', 'fast food', 'italian-american',
       'continental', 'north indian', 'snacks', 'indian', 'juices',
       'tandoor', 'punjabi', 'mughlai', 'pizzas', 'chinese', 'ice cream',
       'chaat', 'bakery', 'american', 'european', 'biryani',
       'south indian', 'desserts', 'street food', 'nepalese', 'paan',
       'healthy food', 'bengali', 'thalis', 'waffle', 'arabian', 'combo',
       'tibetan', 'burgers', 'bihari', 'italian', 'salads', 'kebabs',
       'asian', 'thai', 'north eastern', 'cafe', 'pan-asian', 'lebanese',
       'maharashtrian', 'mexican', 'pastas', 'rajasthani', 'gujarati',
       'seafood', 'japanese', 'sushi', 'middle eastern', 'french',
       'hyderabadi', 'grill', 'svanidhi street food vendor', 'afghani',
       'oriental', 'home food', 'barbecue', 'korean', 'kerala', 'andhra',
       'british', 'oriya', 'portuguese', 'lucknowi', 'mangalorean',
       'chettinad', 'mediterranean', 'naga', 'turkish', 'assamese',
       'steakhouse', 'coas

In [80]:
df = df[~df['cuisine'].str.contains(
    'discount|offer|delivery|pm|am|grocery',
    case=False,
    na=False
)]

In [81]:
df

Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,3.894513,0.0,200.0,beverages,22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.400000,50.0,200.0,sweets,12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.800000,100.0,100.0,beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.700000,20.0,250.0,fast food,22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
5,158204,Sam Uncle,Abohar,3.600000,20.0,200.0,continental,22119652000052,https://www.swiggy.com/restaurants/sam-uncle-c...,"Sam Uncle, hanumangarh road near raja bajaj sh...",Menu/158204.json
...,...,...,...,...,...,...,...,...,...,...,...
148536,553122,The Food Delight,Yavatmal,3.894513,0.0,200.0,fast food,21522053000452,https://www.swiggy.com/restaurants/the-food-de...,"The Food Delight, 94MC+X35, New Singhania Naga...",Menu/553122.json
148537,562647,MAITRI FOODS & BEVERAGES,Yavatmal,3.894513,0.0,300.0,pizzas,license,https://www.swiggy.com/restaurants/maitri-food...,"MAITRI FOODS & BEVERAGES, POLIC MITRYA SOCIETY...",Menu/562647.json
148538,559435,Cafe Bella Ciao,Yavatmal,3.894513,0.0,300.0,fast food,21522251000378,https://www.swiggy.com/restaurants/cafe-bella-...,"Cafe Bella Ciao, SHOP NO 2 NEMANI MARKET SBI S...",Menu/559435.json
148539,418989,GRILL ZILLA,Yavatmal,3.894513,0.0,250.0,continental,21521251000241,https://www.swiggy.com/restaurants/grill-zilla...,"GRILL ZILLA, SHO NO 2/6, POSTEL GROUND CHOWPAT...",Menu/418989.json


In [82]:
df_cleaned = df[['name','city','rating','rating_count','cost','cuisine']]

In [83]:
df_cleaned.reset_index(drop=True, inplace=True)

In [84]:
df_cleaned.to_csv(r"D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\cleaned_data.csv", index=False)


In [86]:
len(df_cleaned)

141739

In [87]:
categorical_cols = ['city','cuisine']
numerical_cols = ['rating','rating_count','cost']

In [88]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded_cat = encoder.fit_transform(df_cleaned[categorical_cols])


In [89]:
encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(['city','cuisine'])
)


encoded_df = pd.concat(
    [df[numerical_cols].reset_index(drop=True), encoded_cat_df],
    axis=1
)

In [90]:
encoded_df = encoded_df.fillna(0)

In [91]:
encoded_df.to_csv(r"D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\encoded_data.csv", index=False)

In [93]:
with open("D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

  with open("D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\encoder.pkl", "wb") as f:


In [94]:
scaler = StandardScaler()

scaled_data = scaler.fit_transform(encoded_df)

scaled_df = pd.DataFrame(scaled_data,columns=encoded_df.columns,index=df.index)

In [95]:
scaled_df.to_csv(r"D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\scaled_data.csv", index=False)

In [96]:
with open(r"D:\PROJECTS\SWIGGY_RESTAURANT_RECOMMENDATION_SYSTEM\DATA\scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)