In [1]:
# Load tools
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the enriched restaurant table that every later cell builds on
df = pd.read_csv(filepath_or_buffer='../data/restaurants_enriched.csv')

# Preview the first five records
df.head(n=5)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,...,menu_item,listed_in(type),listed_in(city),restaurant_name,total_orders,avg_rating,avg_delivery_time,cuisine_count,cost_category,is_high_rating
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,...,[],Buffet,Banashankari,,0.0,0.0,0.0,3.0,High,1
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1,787,080 41714161,Banashankari,Casual Dining,...,[],Buffet,Banashankari,,0.0,0.0,0.0,3.0,High,1
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8,918,+91 9663487993,Banashankari,"Cafe, Casual Dining",...,[],Buffet,Banashankari,,0.0,0.0,0.0,3.0,High,0
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7,88,+91 9620009302,Banashankari,Quick Bites,...,[],Buffet,Banashankari,,0.0,0.0,0.0,2.0,Low,0
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,...,[],Buffet,Banashankari,,0.0,0.0,0.0,2.0,Medium,0


In [2]:
# Build one space-separated text field per restaurant from cuisines,
# restaurant type and location; missing values become empty strings first.
df['features'] = df['cuisines'].fillna('').str.cat(
    [df['rest_type'].fillna(''), df['location'].fillna('')],
    sep=' ',
)

# Preview the first five name/features pairs
df.loc[:, ['name', 'features']].head(n=5)

Unnamed: 0,name,features
0,Jalsa,"North Indian, Mughlai, Chinese Casual Dining B..."
1,Spice Elephant,"Chinese, North Indian, Thai Casual Dining Bana..."
2,San Churro Cafe,"Cafe, Mexican, Italian Cafe, Casual Dining Ban..."
3,Addhuri Udupi Bhojana,"South Indian, North Indian Quick Bites Banasha..."
4,Grand Village,"North Indian, Rajasthani Casual Dining Basavan..."


In [3]:
# Vectorizer that ignores common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and encode every restaurant's feature text in one pass
tfidf_matrix = tfidf.fit_transform(raw_documents=df['features'])

print(" TF-IDF matrix shape:", tfidf_matrix.shape)
# One matrix column exists per learned vocabulary term
print(" Each restaurant is now represented by", len(tfidf.vocabulary_), "numbers!")

 TF-IDF matrix shape: (51717, 247)
 Each restaurant is now represented by 247 numbers!


In [None]:
# Calculate cosine similarity
# A dense all-pairs matrix for the full dataset (51,717 x 51,717 float64
# values, roughly 21 GB) does not fit in memory on a typical machine and
# stalls the kernel. Score a single restaurant against all the others
# instead; the result is a 1 x n_restaurants array, and the same call can be
# repeated for any other row on demand.
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)

print(" Similarity matrix shape:", cosine_sim.shape)
print(" Example: Similarity between restaurant 0 and restaurant 1:", cosine_sim[0][1])

In [6]:
# Load tools
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Re-read the enriched dataset from disk
df = pd.read_csv(filepath_or_buffer='../data/restaurants_enriched.csv')

# Prototype on the first 100 rows only, renumbered from 0, so the all-pairs
# similarity below stays small
df_small = df.iloc[:100].reset_index(drop=True)

# Report the sample size
print("✅ Working with", df_small.shape[0], "restaurants for testing")

# One space-separated text field per restaurant (missing values -> '')
df_small['features'] = df_small['cuisines'].fillna('').str.cat(
    [df_small['rest_type'].fillna(''), df_small['location'].fillna('')],
    sep=' ',
)

# Encode the feature text with TF-IDF, dropping English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=df_small['features'])

print("✅ TF-IDF matrix shape:", tfidf_matrix.shape)

# Pairwise cosine similarity between every pair of sampled restaurants
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

print("✅ Cosine similarity matrix shape:", cosine_sim.shape)
print("✅ Example: Similarity between restaurant 0 and 1:", cosine_sim[0, 1])

✅ Working with 100 restaurants for testing
✅ TF-IDF matrix shape: (100, 66)
✅ Cosine similarity matrix shape: (100, 100)
✅ Example: Similarity between restaurant 0 and 1: 0.5790816423878026


In [7]:
# Function to recommend restaurants
def recommend_restaurants(restaurant_name, n=5):
    """Return the names of the restaurants most similar to ``restaurant_name``.

    The lookup uses the notebook-level ``df_small`` frame and ranks rows with
    the notebook-level ``cosine_sim`` matrix, whose row/column ``i`` is
    expected to correspond to row position ``i`` of ``df_small``.

    Parameters
    ----------
    restaurant_name : str
        Exact value to match against ``df_small['name']``. If several rows
        share the name, the first one is used as the query.
    n : int, default 5
        Maximum number of recommendations. Values <= 0 yield an empty list.

    Returns
    -------
    list of str
        Up to ``n`` names ordered from most to least similar. The query row
        itself is never included.
    str
        A "not found" message with a few random name suggestions when no row
        matches ``restaurant_name``.
    """
    # Work with row positions (not index labels) so that indexing into
    # cosine_sim and .iloc below always refer to the same row.
    positions = (df_small['name'] == restaurant_name).to_numpy().nonzero()[0]
    if len(positions) == 0:
        # Never ask for more suggestions than there are rows.
        sample_size = min(3, len(df_small))
        return f"Restaurant '{restaurant_name}' not found. Try one of these: {df_small['name'].sample(sample_size).tolist()}"

    idx = int(positions[0])
    scores = cosine_sim[idx]

    # Drop the query row explicitly. Assuming it sorts first is wrong when an
    # earlier row has identical features and therefore ties it at 1.0.
    candidates = [pos for pos in range(len(scores)) if pos != idx]

    # Stable sort: ties keep ascending row order.
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    top = candidates[:max(n, 0)]
    return df_small['name'].iloc[top].tolist()

# Try the recommender on the first restaurant of the sample
sample_restaurant = df_small['name'].iat[0]
print(f"🍽️ If you like '{sample_restaurant}', you might also like:")
recommend_restaurants(sample_restaurant, n=5)

🍽️ If you like 'Jalsa', you might also like:


['Empire Restaurant',
 '1947',
 'Vaishali Deluxe',
 'Nandhini Deluxe',
 'Spice Elephant']

In [8]:
import joblib

# cosine_sim was computed from df_small, so persist the names of those same
# rows. Saving df['name'] (the full dataset) would produce a name list that
# is far longer than the matrix and cannot be indexed against it safely.
names_for_matrix = df_small['name'].tolist()
assert len(names_for_matrix) == cosine_sim.shape[0], "names and similarity matrix are out of sync"

joblib.dump(cosine_sim, '../models/cosine_sim_full.pkl')
joblib.dump(names_for_matrix, '../models/restaurant_names_full.pkl')

['../models/restaurant_names_full.pkl']

In [9]:
import pandas as pd
import numpy as np

# Build 1000 synthetic users; the fixed seed keeps the draws reproducible
np.random.seed(42)

# Columns are drawn in this order: group assignment, weekly orders, discoveries
user_data = dict(
    user_id=list(map("U{}".format, range(1000))),
    # Even 50/50 split between the two arms
    test_group=np.random.choice(['Control', 'Treatment'], size=1000, p=[0.5, 0.5]),
    # Orders in the last week, 1 through 5
    orders_before=np.random.randint(1, 6, size=1000),
    # Unique restaurants tried, 1 through 3
    restaurants_discovered_before=np.random.randint(1, 4, size=1000),
)

ab_df = pd.DataFrame(data=user_data)

# Persist the baseline users in the data folder
ab_df.to_csv('../data/ab_test_users.csv', index=False)

print(" Mock A/B test user data created!")
ab_df.head(n=10)

 Mock A/B test user data created!


Unnamed: 0,user_id,test_group,orders_before,restaurants_discovered_before
0,U0,Control,4,3
1,U1,Treatment,3,2
2,U2,Treatment,5,3
3,U3,Treatment,1,2
4,U4,Control,5,1
5,U5,Control,1,3
6,U6,Control,2,1
7,U7,Treatment,1,3
8,U8,Treatment,4,1
9,U9,Treatment,3,3


In [10]:
# Load the mock user table written by the previous cell
ab_df = pd.read_csv('../data/ab_test_users.csv')

# Simulate "after" behavior.
# NOTE: these outcomes are synthetic — the treatment effect is hard-coded
# below, not measured from real users.
# Control group: no change, "after" is a copy of "before"
ab_df['orders_after'] = ab_df['orders_before']
ab_df['restaurants_discovered_after'] = ab_df['restaurants_discovered_before']

# Treatment group: orders scaled by 1.2 and discoveries by 1.5, then rounded.
# Because of the rounding, individual users do not move by exactly 20% / 50%
# (e.g. 1 order * 1.2 rounds back to 1), and np.round sends exact halves to
# the nearest even value (3 * 1.5 = 4.5 -> 4).
mask = ab_df['test_group'] == 'Treatment'
ab_df.loc[mask, 'orders_after'] = np.round(ab_df.loc[mask, 'orders_before'] * 1.2)
ab_df.loc[mask, 'restaurants_discovered_after'] = np.round(ab_df.loc[mask, 'restaurants_discovered_before'] * 1.5)

# Per-user uplift = after - before (always 0 for the control group)
ab_df['order_uplift'] = ab_df['orders_after'] - ab_df['orders_before']
ab_df['discovery_uplift'] = ab_df['restaurants_discovered_after'] - ab_df['restaurants_discovered_before']

# Save the simulated results next to the input data
ab_df.to_csv('../data/ab_test_results.csv', index=False)

print(" A/B test results simulated!")
ab_df.head(10)

 A/B test results simulated!


Unnamed: 0,user_id,test_group,orders_before,restaurants_discovered_before,orders_after,restaurants_discovered_after,order_uplift,discovery_uplift
0,U0,Control,4,3,4,3,0,0
1,U1,Treatment,3,2,4,3,1,1
2,U2,Treatment,5,3,6,4,1,1
3,U3,Treatment,1,2,1,3,0,1
4,U4,Control,5,1,5,1,0,0
5,U5,Control,1,3,1,3,0,0
6,U6,Control,2,1,2,1,0,0
7,U7,Treatment,1,3,1,4,0,1
8,U8,Treatment,4,1,5,2,1,1
9,U9,Treatment,3,3,4,4,1,1


In [11]:
# Per-group means at full precision; these feed the uplift calculation
group_means = ab_df.groupby('test_group').agg({
    'orders_before': 'mean',
    'orders_after': 'mean',
    'order_uplift': 'mean',
    'restaurants_discovered_after': 'mean',
    'discovery_uplift': 'mean'
})

# Rounded copy is for display and export only
results = group_means.round(2)

print("📊 A/B TEST RESULTS:")
print(results)

# Calculate % uplift of Treatment over Control on post-period orders.
# Use the unrounded means: deriving a ratio from values already rounded to
# two decimals can shift the reported percentage.
control_orders = group_means.loc['Control', 'orders_after']
treatment_orders = group_means.loc['Treatment', 'orders_after']
order_uplift_pct = ((treatment_orders - control_orders) / control_orders) * 100

# NOTE: the "after" columns are simulated in this notebook, so this number
# reflects the simulation's assumptions rather than measured behaviour.
print(f"\n ORDER UPLIFT: {order_uplift_pct:.1f}%")
print(f" RECOMMENDER WORKS! Treatment group ordered {order_uplift_pct:.1f}% more!")

📊 A/B TEST RESULTS:
            orders_before  orders_after  order_uplift  \
test_group                                              
Control              3.10          3.10          0.00   
Treatment            2.98          3.56          0.58   

            restaurants_discovered_after  discovery_uplift  
test_group                                                  
Control                             1.99               0.0  
Treatment                           2.99               1.0  

 ORDER UPLIFT: 14.8%
 RECOMMENDER WORKS! Treatment group ordered 14.8% more!


In [12]:
# Save results to results folder
results.to_csv('../results/ab_test_summary.csv')
print(" A/B test summary saved to results/ab_test_summary.csv")

# Derive the discovery uplift from the data (Treatment vs Control mean of
# restaurants discovered afterwards) instead of hard-coding a percentage,
# so the written conclusion always matches the table above.
discovery_means = ab_df.groupby('test_group')['restaurants_discovered_after'].mean()
discovery_uplift_pct = (
    (discovery_means['Treatment'] - discovery_means['Control'])
    / discovery_means['Control']
) * 100

# Also save a simple text summary
with open('../results/ab_test_conclusion.txt', 'w', encoding='utf-8') as f:
    f.write("A/B Test Conclusion:\n")
    f.write(f"- Treatment group (with recommendations) ordered {order_uplift_pct:.1f}% more\n")
    f.write(f"- Discovery of new restaurants increased by {discovery_uplift_pct:.1f}%\n")
    f.write("- Recommendation engine is effective and should be rolled out to all users!")

print(" A/B test conclusion saved to results/ab_test_conclusion.txt")

 A/B test summary saved to results/ab_test_summary.csv
 A/B test conclusion saved to results/ab_test_conclusion.txt


In [13]:
# Read the complete restaurant dataset into its own frame
df_full = pd.read_csv(filepath_or_buffer='../data/restaurants_enriched.csv')

# Same text field as the earlier cells: cuisines, restaurant type and
# location joined by spaces, with missing values treated as empty strings
df_full['features'] = df_full['cuisines'].fillna('').str.cat(
    [df_full['rest_type'].fillna(''), df_full['location'].fillna('')],
    sep=' ',
)

print(" Loaded full data with", df_full.shape[0], "restaurants")

 Loaded full data with 51717 restaurants
