In [2]:
# Step 1: Import tools we need
import pandas as pd

# Step 2: Read the raw Zomato restaurant listings from disk
raw_csv_path = '../data/zomato.csv'
df = pd.read_csv(raw_csv_path)

# Step 3: Preview the first 5 rows (5 is also head()'s default)
df.head(5)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [3]:
# Clean the 'rate' column: turn text such as "4.1/5" into the number 4.1.
#
# The cell is written so it can be re-run safely: astype(str) means the
# .str accessor also works when 'rate' has already been converted to numbers
# by a previous run (the original raised an AttributeError in that case).
rate_text = (
    df['rate']
    .astype(str)                          # missing values become the text 'nan'
    .str.replace('/5', '', regex=False)   # Remove the "/5" suffix (literal match)
    .str.strip()                          # Drop stray spaces left around the number
    .replace({'NEW': '0', '-': '0'})      # Whole-value placeholders "NEW" / "-" -> 0
)

# Anything still not parseable (including the text 'nan') becomes NaN
df['rate'] = pd.to_numeric(rate_text, errors='coerce')

# Show cleaned ratings
df[['name', 'rate']].head(10)

Unnamed: 0,name,rate
0,Jalsa,4.1
1,Spice Elephant,4.1
2,San Churro Cafe,3.8
3,Addhuri Udupi Bhojana,3.7
4,Grand Village,3.8
5,Timepass Dinner,3.8
6,Rosewood International Hotel - Bar & Restaurant,3.6
7,Onesta,4.6
8,Penthouse Cafe,4.0
9,Smacznego,4.2


In [4]:
# Clean 'votes' — make sure it's a number; unparseable or missing values become 0
df['votes'] = pd.to_numeric(df['votes'], errors='coerce').fillna(0)

# Clean 'approx_cost(for two people)' — remove thousands separators ("1,200")
# and convert to a number; unparseable or missing values become 0.
#
# astype(str) keeps this cell re-runnable: the .str accessor would raise if
# the column were already numeric (after a previous run, or if the CSV
# happened to contain no commas and was parsed as numbers on load).
cost_col = 'approx_cost(for two people)'
cost_text = df[cost_col].astype(str).str.replace(',', '', regex=False)
df[cost_col] = pd.to_numeric(cost_text, errors='coerce').fillna(0)

# Show cleaned data
df[['name', 'votes', cost_col]].head(10)

Unnamed: 0,name,votes,approx_cost(for two people)
0,Jalsa,775,800.0
1,Spice Elephant,787,800.0
2,San Churro Cafe,918,800.0
3,Addhuri Udupi Bhojana,88,300.0
4,Grand Village,166,600.0
5,Timepass Dinner,286,600.0
6,Rosewood International Hotel - Bar & Restaurant,8,800.0
7,Onesta,2556,600.0
8,Penthouse Cafe,324,700.0
9,Smacznego,504,550.0


In [5]:
# Write the cleaned restaurant table to disk (row index is not written)
cleaned_csv_path = '../data/zomato_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)

print("✅ Data cleaned and saved to data/zomato_cleaned.csv")

✅ Data cleaned and saved to data/zomato_cleaned.csv


In [6]:
import pandas as pd
import numpy as np

# List of restaurant names from cleaned data.
# One entry per row of df, so a name that appears in several rows is
# proportionally more likely to be drawn by np.random.choice below.
restaurant_names = df['name'].tolist()

# Number of synthetic orders to generate
n_orders = 500

# Create fake data
np.random.seed(42)  # Fixed seed so the same fake orders are produced on every run

# NOTE: the random draws happen in the order the dict entries are written;
# reordering the entries would change the generated values.
# NOTE: np.random.randint excludes its upper bound.
data = {
    'order_id': range(1001, 1001 + n_orders),  # 1001, 1002, ...
    'user_id': [f"U{np.random.randint(100,999)}" for _ in range(n_orders)],  # U100 to U998
    'restaurant_name': np.random.choice(restaurant_names, n_orders),
    'order_value': np.random.randint(100, 1000, n_orders),  # ₹100 to ₹999
    'delivery_time_mins': np.random.randint(20, 60, n_orders),  # 20 to 59 mins
    # One order per hour starting 2024-03-01, keeping only the date part.
    # A Timedelta is used instead of the 'H' alias, which newer pandas deprecates.
    'order_date': pd.date_range(
        '2024-03-01', periods=n_orders, freq=pd.Timedelta(hours=1)
    ).strftime('%Y-%m-%d'),
    'rating_given': np.round(np.random.uniform(2.5, 5.0, n_orders), 1)  # 2.5 to 5.0 stars
}

# Create DataFrame
orders_df = pd.DataFrame(data)

# Save to CSV
orders_df.to_csv('../data/orders.csv', index=False)

print(" Fake orders created and saved to data/orders.csv")
orders_df.head(10)

 Fake orders created and saved to data/orders.csv


Unnamed: 0,order_id,user_id,restaurant_name,order_value,delivery_time_mins,order_date,rating_given
0,1001,U202,Zero Mile Punjab,888,52,2024-03-01,3.7
1,1002,U535,Chinese Fire Dragon,104,47,2024-03-01,3.0
2,1003,U960,The Terrace at Gilly's Redefined,565,27,2024-03-01,3.4
3,1004,U370,California Burrito,703,53,2024-03-01,4.4
4,1005,U206,Tandoor Box,781,54,2024-03-01,4.9
5,1006,U171,Sea Horse,928,51,2024-03-01,2.7
6,1007,U800,Bhojohori Manna,633,43,2024-03-01,3.8
7,1008,U120,Mycakeshop,504,33,2024-03-01,4.7
8,1009,U714,Nossa Goa,937,51,2024-03-01,2.7
9,1010,U221,Inhouse Burger,612,35,2024-03-01,4.6


In [8]:
# Load cleaned restaurant data
restaurants_df = pd.read_csv('../data/zomato_cleaned.csv')

# Load fake orders data
orders_df = pd.read_csv('../data/orders.csv')

# STEP 1: Group orders by restaurant → one row of stats per restaurant name
order_stats = orders_df.groupby('restaurant_name').agg(
    total_orders=('order_id', 'count'),          # How many orders?
    avg_rating=('rating_given', 'mean'),         # Average rating
    avg_delivery_time=('delivery_time_mins', 'mean')  # Avg delivery time
).reset_index()

# STEP 2: Merge with restaurant data.
# Left join keeps every restaurant row; the stats are matched on name only, so
# every row sharing a name receives the same stats. The join key from the
# right side ('restaurant_name') remains as an extra column in the result.
combined_df = restaurants_df.merge(order_stats, left_on='name', right_on='restaurant_name', how='left')

# STEP 3: Fill missing values (some restaurants have no orders yet).
# NOTE: a 0 in avg_rating / avg_delivery_time therefore means "no orders",
# not a real measurement.
stat_cols = ['total_orders', 'avg_rating', 'avg_delivery_time']
combined_df[stat_cols] = combined_df[stat_cols].fillna(0)

# STEP 4: Create new useful columns (FEATURE ENGINEERING!)

# How many cuisines does this restaurant have? (missing cuisines count as 1)
combined_df['cuisine_count'] = combined_df['cuisines'].str.split(', ').str.len().fillna(1)

# Is this restaurant expensive? (create categories)
# pd.cut intervals are right-closed: (0, 300] Low, (300, 600] Medium, (600, inf) High.
# The top bin is open-ended so that costs above 5000 are labelled 'High'
# instead of silently becoming NaN. A cost of exactly 0 (the fill value used
# for unknown costs when the data was cleaned) still gets no category.
combined_df['cost_category'] = pd.cut(
    combined_df['approx_cost(for two people)'],
    bins=[0, 300, 600, float('inf')],
    labels=['Low', 'Medium', 'High']
)

# Is rating high? (above 4.0) — 1 for yes, 0 for no (missing ratings give 0)
combined_df['is_high_rating'] = (combined_df['rate'] > 4.0).astype(int)

# Show the new super table!
combined_df[['name', 'rate', 'total_orders', 'avg_rating', 'avg_delivery_time', 'cuisine_count', 'cost_category']].head(10)

Unnamed: 0,name,rate,total_orders,avg_rating,avg_delivery_time,cuisine_count,cost_category
0,Jalsa,4.1,0.0,0.0,0.0,3.0,High
1,Spice Elephant,4.1,0.0,0.0,0.0,3.0,High
2,San Churro Cafe,3.8,0.0,0.0,0.0,3.0,High
3,Addhuri Udupi Bhojana,3.7,0.0,0.0,0.0,2.0,Low
4,Grand Village,3.8,0.0,0.0,0.0,2.0,Medium
5,Timepass Dinner,3.8,0.0,0.0,0.0,1.0,Medium
6,Rosewood International Hotel - Bar & Restaurant,3.6,0.0,0.0,0.0,4.0,High
7,Onesta,4.6,2.0,3.7,50.0,3.0,Medium
8,Penthouse Cafe,4.0,0.0,0.0,0.0,3.0,High
9,Smacznego,4.2,1.0,2.7,38.0,5.0,Medium


In [9]:
# Persist the enriched restaurant table (row index is not written to the file)
enriched_csv_path = '../data/restaurants_enriched.csv'
combined_df.to_csv(enriched_csv_path, index=False)

print(" Combined data saved to data/restaurants_enriched.csv")

 Combined data saved to data/restaurants_enriched.csv


In [10]:
# Re-read the enriched file from disk and keep only its first 100 rows
enriched_from_disk = pd.read_csv('../data/restaurants_enriched.csv')
df_sample = enriched_from_disk.head(100)

# Write the small sample next to the full file (row index is not written)
df_sample.to_csv('../data/restaurants_enriched_sample.csv', index=False)