<a href="https://colab.research.google.com/github/ekang100/taste-twin/blob/main/taste_twin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#pip install pandas scikit-learn sentence-transformers

# Playing with Mock Data

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [None]:
## mock data

# restaurants
restaurant_metadata = {
    "leo": {
        "cuisines": ["Italian"],
        "tags": ["date night", "natural wine", "cozy"],
        "price": "$$$",
        "location": "SoHo"
    },
    "ugly baby": {
        "cuisines": ["Thai"],
        "tags": ["spicy", "authentic", "no reservations"],
        "price": "$$",
        "location": "Carroll Gardens"
    },
    "kiki": {
        "cuisines": ["Greek"],
        "tags": ["casual", "cheap eats", "group-friendly"],
        "price": "$",
        "location": "LES"
    },
    "roscioli": {
        "cuisines": ["Italian"],
        "tags": ["pasta", "tasting menu", "famous"],
        "price": "$$$$",
        "location": "Rome"  # could be normalized to "International"
    },
    "jeju noodle bar": {
        "cuisines": ["Korean"],
        "tags": ["spicy", "trendy", "noodle-forward"],
        "price": "$$$",
        "location": "West Village"
    },
    "cervo’s": {
        "cuisines": ["Spanish", "Seafood"],
        "tags": ["natural wine", "cozy", "date night"],
        "price": "$$$",
        "location": "Chinatown"
    }
}

# users
userA = {
    "username": "jaysen",
    "been": {
        "leo": 1.0,
        "ugly baby": 2.0,
        "kiki": 4.5
    },
    "want_to_try": ["roscioli", "jeju noodle bar"],
    "reviews": [
        "vibes and wine were immaculate at leo",
        "flavors at ugly baby were wild — spicy and rich",
        "kiki is super casual, went with friends after work"
    ]
}

userB = {
    "username": "alex",
    "been": {
        "ugly baby": 1.2,
        "jeju noodle bar": 2.5,
        "cervo’s": 3.7
    },
    "want_to_try": ["leo", "roscioli"],
    "reviews": [
        "jeju was clean, spicy, modern — loved it",
        "cervo’s was cozy and great with natural wine",
        "ugly baby is always my go-to for heat"
    ]
}

In [None]:
# build "taste" vectors

from collections import defaultdict
model = SentenceTransformer('all-MiniLM-L6-v2')

def normalize_dict(d):
    total = sum(d.values())
    return {k: v / total for k, v in d.items()} if total > 0 else d

def build_vector(user, metadata, field, want_weight=0.3):
    vec = defaultdict(float)

    # Strong signal: Been list
    for r, rank in user["been"].items():
        weight = 1 / rank
        values = metadata[r][field]
        if isinstance(values, list):
            for val in values:
                vec[val] += weight
        else:
            vec[values] += weight

    # Weak signal: Want to try list
    for r in user.get("want_to_try", []):
        if r in metadata:
            values = metadata[r][field]
            if isinstance(values, list):
                for val in values:
                    vec[val] += want_weight
            else:
                vec[values] += want_weight

    return normalize_dict(vec)


def build_price_vector(user, metadata):
    return build_vector(user, metadata, field="price")

def build_cuisine_vector(user, metadata):
    return build_vector(user, metadata, field="cuisines")

def build_tag_vector(user, metadata):
    return build_vector(user, metadata, field="tags")

def build_location_vector(user, metadata):
    return build_vector(user, metadata, field="location")

def get_review_embedding(user):
    if not user["reviews"]:
        return np.zeros(384)
    embeddings = model.encode(user["reviews"])
    return np.mean(embeddings, axis=0)

In [None]:
# taste vectors for mock users
userA_vectors = {
    "cuisine_vector": build_cuisine_vector(userA, restaurant_metadata),
    "tag_vector": build_tag_vector(userA, restaurant_metadata),
    "price_vector": build_price_vector(userA, restaurant_metadata),
    "location_vector": build_location_vector(userA, restaurant_metadata),
    "review_vector": get_review_embedding(userA)
}

userB_vectors = {
    "cuisine_vector": build_cuisine_vector(userB, restaurant_metadata),
    "tag_vector": build_tag_vector(userB, restaurant_metadata),
    "price_vector": build_price_vector(userB, restaurant_metadata),
    "location_vector": build_location_vector(userB, restaurant_metadata),
    "review_vector": get_review_embedding(userB)
}

In [None]:
# compatibility with cosine
def dict_cosine(d1, d2):
    keys = set(d1.keys()).union(d2.keys())
    v1 = np.array([d1.get(k, 0) for k in keys])
    v2 = np.array([d2.get(k, 0) for k in keys])
    return cosine_similarity([v1], [v2])[0][0]

def compute_compatibility(u1, u2):
    return round(100 * (
        0.3 * dict_cosine(u1["cuisine_vector"], u2["cuisine_vector"]) +
        0.2 * dict_cosine(u1["tag_vector"], u2["tag_vector"]) +
        0.15 * dict_cosine(u1["price_vector"], u2["price_vector"]) +
        0.15 * dict_cosine(u1["location_vector"], u2["location_vector"]) +
        0.2 * cosine_similarity([u1["review_vector"]], [u2["review_vector"]])[0][0]
    ), 2)

In [None]:
score = compute_compatibility(userA_vectors, userB_vectors)
print(f"🧠 Jaysen and Alex's taste compatibility: {score}%")

# Yelp Data

In [1]:
import json
import pandas as pd
import os
import tarfile

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# extact data
import tarfile

tar_path = '/content/drive/MyDrive/data/yelp_dataset.tar'
extract_path = '/content/drive/MyDrive/data/yelp_data'

with tarfile.open(tar_path, 'r') as tar:
    tar.extractall(path=extract_path)

# this didnt get all of the jsons so had to individually extract..

In [None]:
!apt-get install -y tar

In [None]:
# get business data - ONLY NEED TO DO ONCE
tar_path = '/content/drive/MyDrive/data/yelp_dataset.tar'
output_dir = '/content/drive/MyDrive/data/yelp_data'

with tarfile.open(tar_path, 'r') as tar:
    members = [m for m in tar.getmembers() if 'yelp_academic_dataset_business.json' in m.name]
    tar.extractall(path=output_dir, members=members)

print("✅ Extracted business JSON only")

In [None]:
# get review data - ONLY NEED TO DO ONCE
with tarfile.open(tar_path, 'r') as tar:
    members = [m for m in tar.getmembers() if 'yelp_academic_dataset_review.json' in m.name]
    tar.extractall(path=output_dir, members=members)

print("✅ Extracted review JSON only")

In [None]:
# filtering rules
# whitelist beli-relevant keywords, blacklist extremely irrelevant keywords
csv_path = '/content/drive/MyDrive/data/Updated_Whitelist_and_Blacklist.csv'
lists_df = pd.read_csv(csv_path)

whitelist = set(lists_df['Whitelist'].dropna().str.lower())
blacklist = set(lists_df['Blacklist'].dropna().str.lower())

def is_food_related(categories):
    if not isinstance(categories, str):
        return False
    category_list = [cat.strip().lower() for cat in categories.split(',')]
    return any(cat in whitelist for cat in category_list)

def is_blacklisted(categories):
    if not isinstance(categories, str):
        return False
    category_list = [cat.strip().lower() for cat in categories.split(',')]
    return any(cat in blacklist for cat in category_list)

In [None]:
# make a new json with filtered businesses (beli-relevant)
# ONLY NEED TO DO ONCE BC WE STORED NEW JSON

# base_dir = '/content/drive/MyDrive/data/yelp_data'
# original_file = os.path.join(base_dir, 'yelp_academic_dataset_business.json')
# filtered_file = os.path.join(base_dir, 'filtered_businesses.jsonl')


# with open(original_file, 'r', encoding='utf-8') as infile, \
#      open(filtered_file, 'w', encoding='utf-8') as outfile:

#     for line in infile:
#         try:
#             business = json.loads(line)
#             cats = business.get('categories')
#             if is_food_related(cats) and not is_blacklisted(cats):
#                 outfile.write(json.dumps(business) + '\n')
#         except json.JSONDecodeError:
#             continue


In [None]:
# possibly delete the original file for space
# os.remove(original_file)
# print(f"Deleted original file: {original_file}")

In [None]:
# turn filtered business json into df

filtered_path = '/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl'

# Load into DataFrame
filtered_businesses_df = pd.read_json(filtered_path, lines=True)

# get business ids to use for matching/fitlering other datasets
filtered_business_ids = set(filtered_businesses_df['business_id'])

print(f"✅ Loaded {len(filtered_businesses_df)} businesses")

In [None]:
# filter reviews to match businesses
# ONLY NEED TO DO ONCE BC WE STORED NEW JSON

# review_input_path = '/content/drive/MyDrive/data/yelp_data/yelp_academic_dataset_review.json'
# filtered_review_output_path = '/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl'

# # Stream and filter
# with open(review_input_path, 'r', encoding='utf-8') as infile, \
#      open(filtered_review_output_path, 'w', encoding='utf-8') as outfile:

#     for line in infile:
#         try:
#             review = json.loads(line)
#             if review['business_id'] in filtered_business_ids:
#                 outfile.write(json.dumps(review) + '\n')
#         except json.JSONDecodeError:
#             continue

# print("✅ Finished filtering reviews")

In [None]:
# get user ids from reviews without making a df bc ram is cooked

filtered_reviews_path = '/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl'
relevant_user_ids = set()

with open(filtered_reviews_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            review = json.loads(line)
            relevant_user_ids.add(review['user_id'])
        except json.JSONDecodeError:
            continue

print(f"✅ Found {len(relevant_user_ids):,} unique user IDs")

In [None]:
# filter users to match reviews
# ONLY DO ONCE

# user_json_path = '/content/drive/MyDrive/data/yelp_data/yelp_academic_dataset_user.json'
# filtered_users_path = '/content/drive/MyDrive/data/yelp_data/filtered_users.jsonl'

# # Filter and write
# with open(user_json_path, 'r', encoding='utf-8') as infile, \
#      open(filtered_users_path, 'w', encoding='utf-8') as outfile:

#     for line in infile:
#         try:
#             user = json.loads(line)
#             if user['user_id'] in relevant_user_ids:
#                 outfile.write(json.dumps(user) + '\n')
#         except json.JSONDecodeError:
#             continue

# print(f"✅ Done! Filtered users saved to: {filtered_users_path}")

**Look at data**

In [3]:
sampled_businesses = []

with open('/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 10: break
        sampled_businesses.append(json.loads(line))

# Display
import pandas as pd
pd.DataFrame(sampled_businesses)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
1,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
2,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
3,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
4,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
5,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."
6,il_Ro8jwPlHresjw9EGmBg,Denny's,8901 US 31 S,Indianapolis,IN,46227,39.637133,-86.127217,2.5,28,1,"{'RestaurantsReservations': 'False', 'Restaura...","American (Traditional), Restaurants, Diners, B...","{'Monday': '6:0-22:0', 'Tuesday': '6:0-22:0', ..."
7,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,2575 E Bay Dr,Largo,FL,33771,27.916116,-82.760461,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants","{'Monday': '10:0-18:0', 'Tuesday': '10:0-20:0'..."
8,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,4.0,245,1,"{'RestaurantsReservations': 'True', 'Restauran...","Sushi Bars, Restaurants, Japanese","{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-..."
9,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,PA,19147,39.943223,-75.162568,4.5,205,1,"{'NoiseLevel': 'u'quiet'', 'GoodForMeal': '{'d...","Korean, Restaurants","{'Monday': '11:30-20:30', 'Tuesday': '11:30-20..."


In [None]:
sampled_reviews = []

with open('/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 10: break
        sampled_reviews.append(json.loads(line))

pd.DataFrame(sampled_reviews)

In [None]:
sampled_users = []

with open('/content/drive/MyDrive/data/yelp_data/filtered_users.jsonl', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 10: break
        sampled_users.append(json.loads(line))

pd.DataFrame(sampled_users)

In [4]:
# get categories so we can label cuisines and categorize further

input_path = '/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl'

unique_categories = set()

with open(input_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            business = json.loads(line)
            cats = business.get('categories')
            if isinstance(cats, str):
                tags = [tag.strip().lower() for tag in cats.split(',')]
                unique_categories.update(tags)
        except json.JSONDecodeError:
            continue

# Display sorted unique categories
print(f"Total unique tags: {len(unique_categories)}")
for tag in sorted(unique_categories):
    print(tag)


Total unique tags: 270
acai bowls
afghan
african
american (new)
american (traditional)
arabic
argentine
armenian
asian fusion
australian
austrian
bagels
bakeries
bangladeshi
bar crawl
barbeque
bars
basque
beach bars
beer
beer bar
beer gardens
beer hall
belgian
bistros
brazilian
breakfast & brunch
breweries
brewing supplies
brewpubs
british
bubble tea
buffets
burgers
burmese
cafes
cafeteria
cajun/creole
calabrian
cambodian
canadian (new)
candy stores
cantonese
caribbean
champagne bars
cheese shops
cheesesteaks
chicken shop
chicken wings
chinese
chocolatiers & shops
cideries
cigar bars
club crawl
cocktail bars
coffee & tea
coffee & tea supplies
coffee roasteries
coffeeshops
colombian
comedy clubs
comfort food
convenience stores
conveyor belt sushi
creperies
cuban
cucina campana
cupcakes
custom cakes
czech
dance clubs
delicatessen
delis
desserts
dim sum
diners
dinner theater
discount store
distilleries
dive bars
do-it-yourself food
dominican
donairs
donburi
donuts
drive-thru bars
dumpling

Preproccesing for features

In [13]:
# keywords
cuisine_keywords = {
    'afghan', 'african', 'american', 'arabic', 'argentine', 'armenian', 'asian',
    'australian', 'austrian', 'bangladeshi', 'basque', 'belgian', 'brazilian',
    'british', 'burmese', 'cajun', 'cajun/creole', 'calabrian', 'cambodian', 'canadian',
    'cantonese', 'caribbean', 'chinese', 'colombian', 'cuban', 'czech',
    'dominican', 'eastern european', 'egyptian', 'ethiopian', 'filipino',
    'french', 'fuzhou',
    'georgian', 'german', 'greek', 'guamanian', 'hainan', 'haitian', 'hakka',
    'hawaiian', 'himalayan', 'honduran', 'hungarian', 'iberian', 'indian',
    'indonesian', 'international', 'irish', 'israeli', 'italian', 'japanese',
    'korean', 'laotian', 'latin american', 'lebanese', 'malaysian',
    'mediterranean', 'mexican', 'middle eastern', 'modern european',
    'mongolian', 'moroccan', 'new mexican', 'nicaraguan', 'oriental',
    'pakistani', 'pan asian', 'persian', 'peruvian', 'polish', 'portuguese',
    'puerto rican', 'roman', 'russian', 'salvadoran', 'sardinian',
    'scandinavian', 'scottish', 'senegalese', 'serbo croatian', 'shanghainese',
    'sicilian', 'singaporean','somali', 'south african', 'southern', 'spanish',
    'sri lankan', 'syrian', 'szechuan', 'taiwanese', 'thai', 'trinidadian',
    'turkish', 'tuscan', 'ukrainian', 'uzbek', 'venezuelan', 'vietnamese'
}

type_map = {
    'bar': {'bars', 'bar crawl', 'beer bar', 'cocktail bars', 'champagne bars',
            'cigar bars','comedy clubs', 'dance clubs', 'dive bars',
            'drive-thru bars', 'gay bars', 'hookah bars', 'hotel bar',
            'irish pub', 'karaoke', 'lounges', 'nightlife', 'piano bars',
            'pool halls','speakeasies', 'sports bars', 'tiki bars',
            'vermouth bars', 'whiskey bars', 'wine bars'},

    'coffee & tea': {'coffee & tea', 'coffee roasteries', 'coffeeshops',
                     'tea rooms', 'juice bars & smoothies', 'kombucha',
                     'bubble tea', 'parent cafes', 'internet cafes'},

    'ice cream & dessert': {'ice cream & frozen yogurt', 'desserts', 'gelato',
                            'shaved ice', 'shaved snow', 'cupcakes', 'macarons',
                            'popcorn shops', 'chocolatiers & shops',
                            'acai bowls'},

    'bakery': {'bakeries', 'bagels', 'cheese shops', 'candy stores',
               'patisserie/cake shop', 'donuts','pretzels', 'custom cakes',
               'cake shop'},

    'restaurant': {'restaurants', 'food', 'cafes', 'bistros', 'diners',
                   'steakhouses', 'delis', 'pizza','fast food', 'buffets',
                   'sushi bars', 'ramen', 'hot pot', 'noodles', 'kebab',
                   'sandwiches', 'burgers', 'tacos', 'poke', 'salad', 'wraps',
                   'pasta shops', 'conveyor belt sushi', 'fondue', 'creperies',
                   'falafel', 'ethnic food','vegan', 'vegetarian',
                   'comfort food', 'soul food', 'halal', 'gluten-free'}
}

type_priority = ['bar', 'coffee & tea', 'ice cream & dessert', 'bakery',
                 'restaurant']

In [14]:
#preprocessing
# extract price, cuisine, type of restaurant

input_path = '/content/drive/MyDrive/data/yelp_data/filtered_businesses.jsonl'
output_path = '/content/drive/MyDrive/data/filtered_businesses_with_features.jsonl'

def extract_price_range(business):
    attr = business.get('attributes')
    if isinstance(attr, dict):
        business['price_range'] = attr.get('RestaurantsPriceRange2')
    else:
        business['price_range'] = None
    return business

def extract_cuisines(business):
    cats = business.get('categories')
    if not isinstance(cats, str):
        business['cuisines'] = []
        return business

    tokens = [cat.strip().lower() for cat in cats.split(',')]
    cuisines = list(set(token for token in tokens if token in cuisine_keywords))
    business['cuisines'] = cuisines
    return business

def assign_type(business):
    cats = business.get('categories')
    if not isinstance(cats, str):
        business['type_tags'] = []
        business['primary_type'] = 'restaurant'
        return business

    tokens = set(cat.strip().lower() for cat in cats.split(','))
    type_tags = []

    for label, keywords in type_map.items():
        if tokens & keywords:
            type_tags.append(label)

    primary = next((t for t in type_priority if t in type_tags), 'restaurant')

    business['type_tags'] = type_tags
    business['primary_type'] = primary
    return business

# Stream and write
with open(input_path, 'r', encoding='utf-8') as infile, \
     open(output_path, 'w', encoding='utf-8') as outfile:

    for line in infile:
        try:
            business = json.loads(line)
            business = extract_price_range(business)
            business = extract_cuisines(business)
            business = assign_type(business)
            outfile.write(json.dumps(business) + '\n')
        except json.JSONDecodeError:
            continue

print("✅ Saved businesses with features extracted.")


✅ Saved businesses with features extracted.


In [None]:
# look over what we're working with now

file_path = '/content/drive/MyDrive/data/filtered_businesses_with_features.jsonl'

businesses = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            businesses.append(json.loads(line))
        except json.JSONDecodeError:
            continue

# Dataset length
print(f"✅ Total businesses loaded: {len(businesses)}") #56287

# businesses with labeled cuisines
num_with_cuisines = sum(1 for b in businesses if b.get('cuisines')) #20501
print(f"✅ Businesses with at least one cuisine: {num_with_cuisines}")

# samples
df = pd.DataFrame(businesses)
df[['name', 'categories', 'cuisines', 'price_range', 'type_tags', 'primary_type']].sample(10)

Restaurant-only data with extracted price range, cuisine, type: /content/drive/MyDrive/data/filtered_businesses_with_features.jsonl

Join Restaurants with Reviews (by user)

In [6]:
import json
from collections import defaultdict

# Load all businesses into a dictionary
business_metadata = {}
with open('/content/drive/MyDrive/data/filtered_businesses_with_features.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        try:
            biz = json.loads(line)
            business_metadata[biz['business_id']] = biz
        except json.JSONDecodeError:
            continue

# Group reviews by user_id with business metadata
user_reviews = defaultdict(list)

with open('/content/drive/MyDrive/data/yelp_data/filtered_reviews.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        try:
            review = json.loads(line)
            business_id = review['business_id']
            user_id = review['user_id']

            if business_id not in business_metadata:
                continue  # skip if we don't have metadata

            biz_meta = business_metadata[business_id]
            user_reviews[user_id].append({
                'business_id': business_id,
                'stars': review['stars'],
                'text': review['text'],
                'price_range': biz_meta.get('price_range'),
                'cuisines': biz_meta.get('cuisines'),
                'categories': biz_meta.get('categories'),
                'types': biz_meta.get('type_tags'),
                'primary_category': biz_meta.get('primary_type'),
                'address': biz_meta.get('address'),
                'city': biz_meta.get('city'),
                'state': biz_meta.get('state'),
                'postal_code': biz_meta.get('postal_code'),
                'latitude': biz_meta.get('latitude'),
                'longitude': biz_meta.get('longitude')
            })
        except json.JSONDecodeError:
            continue

# Save the grouped result
output_path = '/content/drive/MyDrive/data/user_reviews_with_metadata.jsonl'
with open(output_path, 'w', encoding='utf-8') as f:
    for user_id, reviews in user_reviews.items():
        f.write(json.dumps({'user_id': user_id, 'reviews': reviews}) + '\n')

print("✅ Saved user-level reviews with business metadata.")

✅ Saved user-level reviews with business metadata.


In [None]:
# TEST AND FIX THIS

file_path = '/content/drive/MyDrive/data/user_reviews_with_metadata.jsonl'

businesses_with_reviews = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            businesses_with_reviews.append(json.loads(line))
        except json.JSONDecodeError:
            continue

# samples
df = pd.DataFrame(businesses_with_reviews)
df.head()