<a href="https://colab.research.google.com/github/ekang100/taste-twin/blob/main/taste_twin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#pip install pandas scikit-learn sentence-transformers

# Playing with Mock Data

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [None]:
## mock data

# restaurants
# Keyed by lowercase restaurant name. Every entry carries the four fields that
# the build_*_vector helpers look up by name: "cuisines" and "tags" are lists
# of strings, "price" and "location" are single strings.
restaurant_metadata = {
    "leo": {
        "cuisines": ["Italian"],
        "tags": ["date night", "natural wine", "cozy"],
        "price": "$$$",
        "location": "SoHo"
    },
    "ugly baby": {
        "cuisines": ["Thai"],
        "tags": ["spicy", "authentic", "no reservations"],
        "price": "$$",
        "location": "Carroll Gardens"
    },
    "kiki": {
        "cuisines": ["Greek"],
        "tags": ["casual", "cheap eats", "group-friendly"],
        "price": "$",
        "location": "LES"
    },
    "roscioli": {
        "cuisines": ["Italian"],
        "tags": ["pasta", "tasting menu", "famous"],
        "price": "$$$$",
        "location": "Rome"  # could be normalized to "International"
    },
    "jeju noodle bar": {
        "cuisines": ["Korean"],
        "tags": ["spicy", "trendy", "noodle-forward"],
        "price": "$$$",
        "location": "West Village"
    },
    # NOTE: this key uses a typographic apostrophe (’); the user dicts must
    # spell it the same way for lookups to succeed.
    "cervo’s": {
        "cuisines": ["Spanish", "Seafood"],
        "tags": ["natural wine", "cozy", "date night"],
        "price": "$$$",
        "location": "Chinatown"
    }
}

# users
# "been" maps restaurant name -> a number that build_vector uses as a rank
# (each visit is weighted by 1 / value, so smaller values contribute more).
# "want_to_try" lists restaurant names that receive a smaller fixed weight,
# and "reviews" holds free-text strings consumed by get_review_embedding.
userA = {
    "username": "jaysen",
    "been": {
        "leo": 1.0,
        "ugly baby": 2.0,
        "kiki": 4.5
    },
    "want_to_try": ["roscioli", "jeju noodle bar"],
    "reviews": [
        "vibes and wine were immaculate at leo",
        "flavors at ugly baby were wild — spicy and rich",
        "kiki is super casual, went with friends after work"
    ]
}

# Second mock user, same schema as userA: "been" (name -> rank-like number),
# "want_to_try" (list of names) and "reviews" (list of free-text strings).
# Overlaps with userA on "ugly baby" (been) and "roscioli" (want_to_try).
userB = {
    "username": "alex",
    "been": {
        "ugly baby": 1.2,
        "jeju noodle bar": 2.5,
        "cervo’s": 3.7
    },
    "want_to_try": ["leo", "roscioli"],
    "reviews": [
        "jeju was clean, spicy, modern — loved it",
        "cervo’s was cozy and great with natural wine",
        "ugly baby is always my go-to for heat"
    ]
}

In [None]:
# build "taste" vectors

from collections import defaultdict
# Sentence-embedding model shared by the whole notebook; get_review_embedding
# reads this module-level name to encode a user's review texts.
model = SentenceTransformer('all-MiniLM-L6-v2')

def normalize_dict(d):
    """Scale the values of ``d`` so they sum to 1.

    Args:
        d: Mapping of key -> non-negative number.

    Returns:
        A new plain dict with each value divided by the total. When the total
        is not positive (e.g. ``d`` is empty) ``d`` itself is returned
        unchanged, since there is nothing meaningful to divide by.
    """
    total = sum(d.values())
    return {k: v / total for k, v in d.items()} if total > 0 else d


def _add_field_values(vec, values, weight):
    """Add ``weight`` to ``vec`` for a single value or for each value in a list."""
    if isinstance(values, list):
        for val in values:
            vec[val] += weight
    else:
        vec[values] += weight


def build_vector(user, metadata, field, want_weight=0.3):
    """Build a normalized preference vector over one metadata field.

    Args:
        user: Dict with a "been" mapping (restaurant name -> rank) and an
            optional "want_to_try" list of restaurant names.
        metadata: Mapping of restaurant name -> dict of fields.
        field: Name of the metadata field to aggregate (e.g. "cuisines").
            The field value may be a list of labels or a single label.
        want_weight: Fixed weight added for every "want_to_try" restaurant.

    Returns:
        Dict of label -> share of total weight (values sum to 1), or an empty
        mapping when no restaurant contributed any weight.
    """
    vec = defaultdict(float)

    # Strong signal: Been list. Each visit is weighted by 1 / rank.
    for r, rank in user["been"].items():
        # Restaurants without metadata are skipped, exactly as the want-to-try
        # loop below already does, instead of raising KeyError.
        if r not in metadata:
            continue
        # A rank of 0 cannot be inverted and a negative rank would subtract
        # weight, so non-positive ranks carry no signal.
        if rank <= 0:
            continue
        _add_field_values(vec, metadata[r][field], 1 / rank)

    # Weak signal: Want to try list
    for r in user.get("want_to_try", []):
        if r in metadata:
            _add_field_values(vec, metadata[r][field], want_weight)

    return normalize_dict(vec)


def build_price_vector(user, metadata):
    """Return the user's normalized preference vector over the "price" field."""
    price_field = "price"
    return build_vector(user, metadata, price_field)

def build_cuisine_vector(user, metadata):
    """Return the user's normalized preference vector over the "cuisines" field."""
    cuisine_field = "cuisines"
    return build_vector(user, metadata, cuisine_field)

def build_tag_vector(user, metadata):
    """Return the user's normalized preference vector over the "tags" field."""
    tag_field = "tags"
    return build_vector(user, metadata, tag_field)

def build_location_vector(user, metadata):
    """Return the user's normalized preference vector over the "location" field."""
    location_field = "location"
    return build_vector(user, metadata, location_field)

def get_review_embedding(user):
    """Return the mean sentence embedding of a user's review texts.

    Args:
        user: Dict that may contain a "reviews" list of strings.

    Returns:
        A 1-D numpy array: the element-wise mean of the embeddings produced by
        the module-level ``model``, or an all-zero vector of the model's
        embedding size when the user has no reviews.
    """
    # .get() so a user record without a "reviews" key is treated like one with
    # an empty list, matching how "want_to_try" is read elsewhere.
    reviews = user.get("reviews")
    if not reviews:
        # Ask the model for its dimension instead of hard-coding it, so the
        # fallback stays the same length as real embeddings if the model is
        # ever swapped.
        return np.zeros(model.get_sentence_embedding_dimension())
    embeddings = model.encode(reviews)
    return np.mean(embeddings, axis=0)

In [None]:
# taste vectors for mock users
def _mock_taste_vectors(user):
    """Assemble the five taste vectors for one mock user from restaurant_metadata."""
    return {
        "cuisine_vector": build_cuisine_vector(user, restaurant_metadata),
        "tag_vector": build_tag_vector(user, restaurant_metadata),
        "price_vector": build_price_vector(user, restaurant_metadata),
        "location_vector": build_location_vector(user, restaurant_metadata),
        "review_vector": get_review_embedding(user),
    }


userA_vectors = _mock_taste_vectors(userA)
userB_vectors = _mock_taste_vectors(userB)

In [None]:
# compatibility with cosine
def dict_cosine(d1, d2):
    """Cosine similarity between two sparse vectors stored as dicts.

    Keys missing from one dict count as 0 in that vector.

    Args:
        d1: Mapping of label -> numeric weight.
        d2: Mapping of label -> numeric weight.

    Returns:
        The cosine similarity as a numpy float64. Returns 0.0 when either
        vector has zero length, including the case where both dicts are
        empty (which previously produced zero-feature arrays that the
        pairwise helper rejects).
    """
    keys = list(set(d1) | set(d2))
    if not keys:
        # Two users with no signal at all share nothing measurable.
        return np.float64(0.0)

    v1 = np.array([d1.get(k, 0) for k in keys], dtype=float)
    v2 = np.array([d2.get(k, 0) for k in keys], dtype=float)

    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # Avoid 0 / 0 when one side is all zeros.
        return np.float64(0.0)
    return np.dot(v1, v2) / norm_product

def compute_compatibility(u1, u2):
    """Blend the per-facet similarities of two users into a 0-100 style score.

    Both arguments are dicts holding "cuisine_vector", "tag_vector",
    "price_vector", "location_vector" (label -> weight dicts) and
    "review_vector" (a dense embedding). The result is the weighted sum of the
    five cosine similarities, multiplied by 100 and rounded to two decimals.
    """
    cuisine_sim = dict_cosine(u1["cuisine_vector"], u2["cuisine_vector"])
    tag_sim = dict_cosine(u1["tag_vector"], u2["tag_vector"])
    price_sim = dict_cosine(u1["price_vector"], u2["price_vector"])
    location_sim = dict_cosine(u1["location_vector"], u2["location_vector"])
    review_sim = cosine_similarity([u1["review_vector"]], [u2["review_vector"]])[0][0]

    # Facet weights sum to 1.0: cuisine 0.3, tags 0.2, price 0.15,
    # location 0.15, reviews 0.2.
    blended = (
        0.3 * cuisine_sim +
        0.2 * tag_sim +
        0.15 * price_sim +
        0.15 * location_sim +
        0.2 * review_sim
    )
    return round(100 * blended, 2)

In [None]:
# Score the two mock users. compute_compatibility already multiplies by 100
# and rounds to two decimals, so the value is printed directly as a percentage.
score = compute_compatibility(userA_vectors, userB_vectors)
print(f"🧠 Jaysen and Alex's taste compatibility: {score}%")

# Yelp Data

In [None]:
import json
import pandas as pd
import os
import tarfile

In [None]:
# Colab-only: mount Google Drive at /content/drive. Every dataset path used
# below lives under /content/drive/MyDrive/data.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# extract data
import tarfile

tar_path = '/content/drive/MyDrive/data/yelp_dataset.tar'
extract_path = '/content/drive/MyDrive/data/yelp_data'

# Unpack every member of the archive into extract_path.
with tarfile.open(tar_path, 'r') as tar:
    tar.extractall(path=extract_path)

# This didn't extract all of the JSON files, so they had to be extracted individually.

In [None]:
# Installs the system `tar` package with apt.
# NOTE(review): the visible extraction cells use Python's tarfile module rather
# than the tar CLI — confirm whether this install is still needed.
!apt-get install -y tar

In [None]:
# get business data - ONLY NEED TO DO ONCE
tar_path = '/content/drive/MyDrive/data/yelp_dataset.tar'
output_dir = '/content/drive/MyDrive/data/yelp_data'

with tarfile.open(tar_path, 'r') as tar:
    members = [m for m in tar.getmembers() if 'yelp_academic_dataset_business.json' in m.name]
    # Only extract members that are missing on disk or whose size differs from
    # the archive entry (e.g. an interrupted earlier run), so re-running the
    # notebook does not rewrite a file that is already complete.
    pending = [
        m for m in members
        if not os.path.isfile(os.path.join(output_dir, m.name))
        or os.path.getsize(os.path.join(output_dir, m.name)) != m.size
    ]
    if pending:
        tar.extractall(path=output_dir, members=pending)

# Report what actually happened instead of always claiming success.
if not members:
    print(f"⚠️ No business JSON found in {tar_path}")
elif pending:
    print("✅ Extracted business JSON only")
else:
    print("✅ Business JSON already extracted; skipped")

In [None]:
# get review data - ONLY NEED TO DO ONCE
# Relies on tar_path and output_dir defined in the business-extraction cell.
with tarfile.open(tar_path, 'r') as tar:
    members = [m for m in tar.getmembers() if 'yelp_academic_dataset_review.json' in m.name]
    # Only extract members that are missing on disk or whose size differs from
    # the archive entry (e.g. an interrupted earlier run), so re-running the
    # notebook does not rewrite a file that is already complete.
    pending = [
        m for m in members
        if not os.path.isfile(os.path.join(output_dir, m.name))
        or os.path.getsize(os.path.join(output_dir, m.name)) != m.size
    ]
    if pending:
        tar.extractall(path=output_dir, members=pending)

# Report what actually happened instead of always claiming success.
if not members:
    print(f"⚠️ No review JSON found in {tar_path}")
elif pending:
    print("✅ Extracted review JSON only")
else:
    print("✅ Review JSON already extracted; skipped")

In [None]:
# filtering rules
# whitelist beli-relevant keywords, blacklist extremely irrelevant keywords
csv_path = '/content/drive/MyDrive/data/Updated_Whitelist_and_Blacklist.csv'
lists_df = pd.read_csv(csv_path)

# Normalize list entries the same way is_food_related / is_blacklisted
# normalize each category (strip + lowercase); otherwise an entry with stray
# whitespace in the CSV could never match.
whitelist = set(lists_df['Whitelist'].dropna().str.strip().str.lower())
blacklist = set(lists_df['Blacklist'].dropna().str.strip().str.lower())

# A whitespace-only CSV cell strips down to '' and must not match an empty
# category segment (e.g. from a trailing comma).
whitelist.discard('')
blacklist.discard('')

def is_food_related(categories):
    """Return True when any comma-separated category is in the whitelist.

    ``categories`` is expected to be a comma-separated string; any non-string
    value (None, NaN, ...) is treated as "not food related". Each category is
    stripped and lowercased before the lookup.
    """
    if isinstance(categories, str):
        return any(part.strip().lower() in whitelist for part in categories.split(','))
    return False

def is_blacklisted(categories):
    """Return True when any comma-separated category is in the blacklist.

    ``categories`` is expected to be a comma-separated string; any non-string
    value (None, NaN, ...) is treated as "not blacklisted". Each category is
    stripped and lowercased before the lookup.
    """
    if isinstance(categories, str):
        return any(part.strip().lower() in blacklist for part in categories.split(','))
    return False

In [None]:
# make a new json with filtered businesses (beli-relevant)
# ONLY NEED TO DO ONCE BC WE STORED NEW JSON

# base_dir = '/content/drive/MyDrive/data/yelp_data'
# original_file = os.path.join(base_dir, 'yelp_academic_dataset_business.json')
# filtered_file = os.path.join(base_dir, 'filtered_businesses.jsonl')


# with open(original_file, 'r', encoding='utf-8') as infile, \
#      open(filtered_file, 'w', encoding='utf-8') as outfile:

#     for line in infile:
#         try:
#             business = json.loads(line)
#             cats = business.get('categories')
#             if is_food_related(cats) and not is_blacklisted(cats):
#                 outfile.write(json.dumps(business) + '\n')
#         except json.JSONDecodeError:
#             continue


In [None]:
# possibly delete the original file for space
# os.remove(original_file)
# print(f"Deleted original file: {original_file}")

In [None]:
# Turn the filtered business JSON Lines file into a DataFrame.

filtered_path = '/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl'

# Load into DataFrame (lines=True: one JSON object per line)
filtered_businesses_df = pd.read_json(filtered_path, lines=True)

# Get business ids to use for matching/filtering other datasets; kept as a set
# for membership tests.
filtered_business_ids = set(filtered_businesses_df['business_id'])

print(f"✅ Loaded {len(filtered_businesses_df)} businesses")

In [None]:
# filter reviews to match businesses
# ONLY NEED TO DO ONCE BC WE STORED NEW JSON

# review_input_path = '/content/drive/MyDrive/data/yelp_data/yelp_academic_dataset_review.json'
# filtered_review_output_path = '/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl'

# # Stream and filter
# with open(review_input_path, 'r', encoding='utf-8') as infile, \
#      open(filtered_review_output_path, 'w', encoding='utf-8') as outfile:

#     for line in infile:
#         try:
#             review = json.loads(line)
#             if review['business_id'] in filtered_business_ids:
#                 outfile.write(json.dumps(review) + '\n')
#         except json.JSONDecodeError:
#             continue

# print("✅ Finished filtering reviews")

In [None]:
# Collect user ids from the filtered reviews by streaming the file line by
# line, without building a DataFrame, to keep RAM usage low.

filtered_reviews_path = '/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl'
relevant_user_ids = set()
skipped_review_lines = 0  # lines that were not valid JSON

with open(filtered_reviews_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            review = json.loads(line)
        except json.JSONDecodeError:
            # Still best-effort, but counted so data loss is visible.
            skipped_review_lines += 1
            continue
        relevant_user_ids.add(review['user_id'])

print(f"✅ Found {len(relevant_user_ids):,} unique user IDs")
if skipped_review_lines:
    print(f"⚠️ Skipped {skipped_review_lines:,} review lines that were not valid JSON")

In [None]:
# filter users to match reviews
# ONLY DO ONCE

# user_json_path = '/content/drive/MyDrive/data/yelp_data/yelp_academic_dataset_user.json'
# filtered_users_path = '/content/drive/MyDrive/data/yelp_data/filtered_users.jsonl'

# # Filter and write
# with open(user_json_path, 'r', encoding='utf-8') as infile, \
#      open(filtered_users_path, 'w', encoding='utf-8') as outfile:

#     for line in infile:
#         try:
#             user = json.loads(line)
#             if user['user_id'] in relevant_user_ids:
#                 outfile.write(json.dumps(user) + '\n')
#         except json.JSONDecodeError:
#             continue

# print(f"✅ Done! Filtered users saved to: {filtered_users_path}")

**Look at data**

In [None]:
# Preview the first 10 records of the filtered business file.
with open('/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl', 'r', encoding='utf-8') as f:
    # zip() stops after range(10) is exhausted, so at most 10 lines are parsed.
    sampled_businesses = [json.loads(line) for _, line in zip(range(10), f)]

# Display
import pandas as pd
pd.DataFrame(sampled_businesses)

In [None]:
# Preview the first 10 records of the filtered review file.
with open('/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl', 'r', encoding='utf-8') as f:
    # zip() stops after range(10) is exhausted, so at most 10 lines are parsed.
    sampled_reviews = [json.loads(line) for _, line in zip(range(10), f)]

pd.DataFrame(sampled_reviews)

In [None]:
# Preview the first 10 records of the filtered user file.
with open('/content/drive/MyDrive/data/yelp_data/filtered_users.jsonl', 'r', encoding='utf-8') as f:
    # zip() stops after range(10) is exhausted, so at most 10 lines are parsed.
    sampled_users = [json.loads(line) for _, line in zip(range(10), f)]

pd.DataFrame(sampled_users)

In [None]:
# TODO: change this so that all of the preprocessing happens in one pass,
# instead of rewriting the business file in a separate step.

input_path = '/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl'
# NOTE(review): this is written to data/ rather than data/yelp_data/, where the
# other filtered files are read from — confirm the location is intentional.
output_path = '/content/drive/MyDrive/data/filtered_businesses_with_price.jsonl'

def extract_price_range(business):
    """Copy the nested price attribute to a top-level "price_range" key.

    Mutates ``business`` in place and returns the same dict. "price_range" is
    set to ``attributes["RestaurantsPriceRange2"]`` when "attributes" is a
    dict (None if that key is absent), and to None otherwise.
    """
    attributes = business.get('attributes')
    has_attributes = isinstance(attributes, dict)
    business['price_range'] = attributes.get('RestaurantsPriceRange2') if has_attributes else None
    return business

# Stream and write: read one business per line, add "price_range", and write
# it back out as JSON Lines without holding the whole file in memory.
skipped_business_lines = 0  # lines that were not valid JSON

with open(input_path, 'r', encoding='utf-8') as infile, \
     open(output_path, 'w', encoding='utf-8') as outfile:

    for line in infile:
        try:
            business = json.loads(line)
        except json.JSONDecodeError:
            # Still best-effort, but counted so dropped records are visible.
            skipped_business_lines += 1
            continue
        business = extract_price_range(business)
        outfile.write(json.dumps(business) + '\n')

print("✅ Saved businesses with price_range extracted.")
if skipped_business_lines:
    print(f"⚠️ Skipped {skipped_business_lines:,} business lines that were not valid JSON")
