# Harvesting restaurant details from Yelp API

In [None]:
import requests
import json
import time
import os
import numpy as np

API_KEY = 'FoF_VUDAtcQlIg24PGQbjXTS1rE16Gtg5OOqEQgpsuaxyxdB1VJqRfYNI0MHoQUl14-KWITKu0FRp1AI3lJpcXFFWYigCMGpv73nkeps2EigzA67DUZbg8UzCwaSZnYx'
HEADERS = {'Authorization': f'Bearer {API_KEY}'}
URL = 'https://api.yelp.com/v3/businesses/search'
DATABASE_FILE = 'restaurants_data.json'

def fetch_yelp_data(location, offset):
    params = {
        'term': 'restaurants',
        'location': location,
        'limit': 50,
        'offset': offset
    }
    response = requests.get(URL, headers=HEADERS, params=params)
    return response.json()

def write_to_file(data):

    if os.path.exists(DATABASE_FILE):
        with open(DATABASE_FILE, 'r') as file:
            file_data = json.load(file)
            existing_business_ids = {business['id'] for business in file_data['businesses']}
    else:
        file_data = {'businesses': []}
        existing_business_ids = set()

    new_businesses = [business for business in data['businesses'] if business['id'] not in existing_business_ids]

    if new_businesses:
        # Append new businesses to the existing data
        file_data['businesses'].extend(new_businesses)

        # Write updated data back to the file
        with open(DATABASE_FILE, 'w') as file:
            json.dump(file_data, file, indent=4)

locations = [
    "Los Angeles, CA", "San Diego, CA", "San Jose, CA", "San Francisco, CA",
    "Fresno, CA", "Sacramento, CA", "Long Beach, CA", "Oakland, CA",
    "Bakersfield, CA", "Anaheim, CA", "Santa Ana, CA", "Riverside, CA",
    "Stockton, CA", "Chula Vista, CA", "Irvine, CA", "Fremont, CA",
    "San Bernardino, CA", "Modesto, CA", "Oxnard, CA", "Fontana, CA",
    "Moreno Valley, CA", "Huntington Beach, CA", "Glendale, CA",
    "Santa Clarita, CA", "Garden Grove, CA", "Oceanside, CA",
    "Rancho Cucamonga, CA", "Santa Rosa, CA", "Ontario, CA",
    "Elk Grove, CA", "Corona, CA", "Lancaster, CA", "Palmdale, CA",
    "Hayward, CA", "Salinas, CA", "Pomona, CA", "Escondido, CA",
    "Torrance, CA", "Pasadena, CA", "Orange, CA", "Fullerton, CA",
    "Roseville, CA", "Visalia, CA", "Concord, CA", "Thousand Oaks, CA",
    "Simi Valley, CA", "Victorville, CA", "Vallejo, CA", "Berkeley, CA"
]


for location in locations:
    offset = 0

    while True:
        data = fetch_yelp_data(location, offset)
        if not data.get('businesses'):
            print("Not valid data")
            print(data)
            offset += 50
            break

        write_to_file(data)
        offset += 50
        time.sleep(1)

        if offset >= data['total']:
            print("No more data")
            break

    print("Moving to next location")
print("Finished fetching data")

# Storing image links in JSON file

In [12]:
IMAGES_FILE = "restaurant_images.json"

with open(DATABASE_FILE, 'r') as file:
    data = json.load(file)

image_links = []

for restaurant in data['businesses']:
    image_links.append({
                    "id" : restaurant['id'],
                    "name" : restaurant['name'],
                    "image_url" : restaurant["image_url"]
                })

with open(IMAGES_FILE, 'w') as file:
    json.dump(image_links, file, indent=4)


# Importing data from JSON to SQLite DB

In [15]:
import sqlite3

IMAGES_FILE = "restaurant_images.json"
DATABASE_FILE = "restaurants_data.json"
DB_FILE = "restaurants.db"

with open(DATABASE_FILE, 'r') as file:
    data = json.load(file)

with open(IMAGES_FILE, 'r') as file:
    images = json.load(file)

conn = sqlite3.connect(DB_FILE)
cursor = conn.cursor()

cursor.execute('''
CREATE TABLE IF NOT EXISTS restaurants (
    id TEXT PRIMARY KEY,
    name TEXT,
    cuisine TEXT,
    review_count INTEGER,
    price TEXT,
    phone TEXT,
    image_url TEXT,
    rating REAL,
    address1 TEXT,
    city TEXT,
    state TEXT,
    zip_code TEXT
)
''')

for restaurant in data['businesses']:
    cursor.execute('''
    INSERT OR REPLACE INTO restaurants (id, name, cuisine, review_count, price, phone, image_url, rating, address1, city, state, zip_code)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        restaurant['id'],
        restaurant['name'],
        restaurant['categories'][0]['title'],
        restaurant['review_count'],
        restaurant.get('price'),
        restaurant['phone'],
        restaurant['image_url'],
        restaurant['rating'],
        restaurant['location']['address1'],
        restaurant['location']['city'],
        restaurant['location']['state'],
        restaurant['location']['zip_code']
    ))
    
conn.commit()

print("imported data to sqlite db")

cursor.execute('SELECT image_url, name FROM restaurants LIMIT 5')
rows = cursor.fetchall()

print("Sample data from the database:")
for row in rows:
    print(row)

# Close the connection
conn.close()

imported data to sqlite db
Sample data from the database:
('https://s3-media2.fl.yelpcdn.com/bphoto/IawDcF1QmHSzUQDczHYVuw/o.jpg', 'Bottega')
('https://s3-media2.fl.yelpcdn.com/bphoto/SwPweYT6eqLJxgE_588q2A/o.jpg', 'Bettola')
('https://s3-media4.fl.yelpcdn.com/bphoto/kpN_8IX_eSVwOETLJu-8pQ/o.jpg', 'Blind Butcher')
('https://s3-media2.fl.yelpcdn.com/bphoto/by8Hh63BLPv_HUqRUdsp_w/o.jpg', 'Fog Harbor Fish House')
('https://s3-media3.fl.yelpcdn.com/bphoto/PlZQ7Lm6giFXmp2TUKGVVw/o.jpg', 'The Snug')


# Form pandas DF

In [16]:
import pandas as pd
import json
import os

DATABASE_FILE = 'restaurants_data.json'

def parse_yelp_data(json_data):
    restaurants = []
    for business in json_data['businesses']:
        if 'price' in business  and business['location']['address1']:
            restaurant = {
                'id' : business['id'],
                'name': business['name'],
                'cuisine': ', '.join([category['title'] for category in business['categories']]),
                'city': business['location']['city'],
                'address': business['location']['address1'],
                'price_range': business.get('price', 'N/A'),
                'average_rating': business['rating'],
                'review_count': business['review_count']
            }
            restaurants.append(restaurant)
    return restaurants


def load_json_file(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)



json_data = load_json_file(DATABASE_FILE)
parsed_data = parse_yelp_data(json_data)


restaurants_df = pd.DataFrame(parsed_data)
print(restaurants_df.shape)
restaurants_df.head()
    


(9356, 8)


Unnamed: 0,id,name,cuisine,city,address,price_range,average_rating,review_count
0,QueFVMcMlT-6aZFv2M47mg,Bottega,"Italian, Pasta Shops, Pizza",San Francisco,1132 Valencia St,$$,4.3,1119
1,ZSzXw0NgJTyOzcHwKY5eMA,Blind Butcher,"New American, Wine Bars",San Francisco,4058 18th St,$$,4.1,376
2,f-m7-hyFzkf0HSEeQ2s-9A,Fog Harbor Fish House,"Seafood, Wine Bars, Cocktail Bars",San Francisco,39 Pier,$$,4.4,10499
3,_hOVIgjVRl_HzvLaa65KJg,The Snug,"Cocktail Bars, New American",San Francisco,2301 Fillmore St,$$,4.0,439
4,lUUQi1b2rV3glIn4t2I1Iw,Horsefeather,"Cocktail Bars, New American",San Francisco,528 Divisadero St,$$,4.0,606


# Vectorize features and create similarity matrix

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

restaurants_df['combined_features'] = (
    restaurants_df['cuisine'] + ' ' +
    restaurants_df['price_range'] + ' ' +
    restaurants_df['city'] + ' ' +
    restaurants_df['average_rating'].astype(str) + ' ' +
    restaurants_df['review_count'].astype(str)
)

vectorizer = TfidfVectorizer()
feature_matrix = vectorizer.fit_transform(restaurants_df['combined_features'])
similarity_matrix = cosine_similarity(feature_matrix)


# Use cosine similarity to find n most similar restaurants to single given restaurant

In [20]:
def get_recommendations(restaurant_name, df, similarity_matrix, top_n=10):
    try:
        idx = df[df['name'].str.contains(restaurant_name, case=False, na=False)].index[0]
        sim_scores = list(enumerate(similarity_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:top_n+1]
        restaurant_indices = [i[0] for i in sim_scores]
        return df.iloc[restaurant_indices][['name', 'cuisine', 'city', 'average_rating', 'price_range']]
    except IndexError:
        return pd.DataFrame(columns=['name', 'cuisine', 'city', 'average_rating', 'price_range'])


In [21]:
recommendations = get_recommendations('Marufuku Ramen', restaurants_df, similarity_matrix, top_n=5)
recommendations.head()

Unnamed: 0,name,cuisine,city,average_rating,price_range
102,Ushi Taro Ramen,Ramen,San Francisco,4.3,$$
44,Noodle in a Haystack,"Ramen, New American",San Francisco,4.8,$$$$
5123,Sun and Moon,"Thai, Ramen",San Francisco,4.2,$$
85,Hinodeya Japantown,Ramen,San Francisco,4.2,$$
345,Mangrove Kitchen,"Thai, Ramen",San Francisco,4.0,$$


# Create preference vector for user based on initial ratings

In [22]:
def get_user_pref_vector(user_ratings, df, vectorizer):
    rated_restaurants = df[df['name'].isin(user_ratings.keys())]
    ratings = [user_ratings[name] for name in rated_restaurants['name']]
    rated_features = vectorizer.transform(rated_restaurants['combined_features'])
    weighted_average = sum(r * f for r, f in zip(ratings, rated_features)) / sum(ratings)
    
    return weighted_average


# Returns top n most similar restaurants based on user preference vector

In [23]:
def get_user_recommendations(user_ratings, df, vectorizer, similarity_matrix, top_n=10):
    user_pref_vector = get_user_pref_vector(user_ratings, df, vectorizer)
    sim_scores = cosine_similarity(user_pref_vector, vectorizer.transform(df['combined_features'])).flatten()
    sim_scores = sorted(enumerate(sim_scores), key=lambda x: x[1], reverse=True)
    rated_restaurant_indices = df[df['name'].isin(user_ratings.keys())].index
    sim_scores = [(i, score) for i, score in sim_scores if i not in rated_restaurant_indices]
    sim_scores = sim_scores[:top_n]
    restaurant_indices = [i[0] for i in sim_scores]
    return df.iloc[restaurant_indices][['name', 'cuisine', 'city', 'average_rating', 'price_range']]

    

In [24]:
user_ratings = {
    "Donato & Co" : 1,
    "Masa Ramen Bistro" : 5,
    "Fog Harbor Fish House": 1,
    "The Snug" : 1,
    "Horsefeather" : 1,
    "Santeria" : 1
}

recommendations = get_user_recommendations(user_ratings, restaurants_df, vectorizer, similarity_matrix, top_n=5)
recommendations.head()

Unnamed: 0,name,cuisine,city,average_rating,price_range
619,Hinodeya Ramen Bar,"Ramen, Japanese Curry",San Francisco,4.3,$$
1409,Muracci's Berkeley,Japanese Curry,Berkeley,3.5,$$
198,Nippon Curry,Japanese Curry,San Francisco,4.5,$$
1369,Dela Curo Curry,"Sandwiches, Japanese Curry, Burgers",Berkeley,4.2,$$
1172,Champion's Curry,"Japanese Curry, Salad, Sandwiches",Berkeley,3.9,$$
