In [1]:
import os
from dotenv import load_dotenv


load_dotenv()  # take environment variables from .env.

YELP_CLIENT_ID = os.getenv("YELP_CLIENT_ID")
YELP_API_KEY = os.getenv("YELP_API_KEY")


In [2]:
import requests
import pandas as pd


def yelp_business_search_by_location(
    location: str = "Paris", count: int = 200
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Get Yelp data from API.

    Params:
        location: str
        count: int

    Returns:
        json
    """
    headers = {
        "Authorization": f"Bearer {YELP_API_KEY}",
    }
    limit = 50

    businesses = pd.DataFrame()
    reviews = pd.DataFrame()
    photos = pd.DataFrame()
    for offset in range(0, count, limit):
        businesses_request = requests.get(
            "https://api.yelp.com/v3/businesses/search",
            headers=headers,
            params={
                "location": location,
                "limit": limit,
                "offset": offset,
            },
        )

        if businesses_request.status_code == 200:
            businesses_data = businesses_request.json()
            businesses = businesses.append(
                pd.DataFrame(businesses_data["businesses"]), ignore_index=True
            )

            for business in businesses_data["businesses"]:
                business_detail_request = requests.get(
                    f"https://api.yelp.com/v3/businesses/{business['id']}",
                    headers=headers,
                )
                if business_detail_request.status_code == 200:
                    business_detail_data = business_detail_request.json()
                    photos = photos.append(
                        pd.DataFrame(business_detail_data["photos"]),
                        ignore_index=True,
                    )
                else:
                    raise Exception(
                        f"Yelp API request failed with status code \
                             { business_detail_request.status_code }. \
                                 Response text: { business_detail_request.text }"
                    )

            for business in businesses_data["businesses"]:
                business_reviews_request = requests.get(
                    f"https://api.yelp.com/v3/businesses/{business['id']}/reviews",
                    headers=headers,
                )
                if business_reviews_request.status_code == 200:
                    business_reviews_data = business_reviews_request.json()
                    reviews = reviews.append(
                        pd.DataFrame(business_reviews_data["reviews"]),
                        ignore_index=True,
                    )
                else:
                    raise Exception(
                        f"Yelp API request failed with status code { business_reviews_request.status_code }. Response text: { business_reviews_request.text }"
                    )

        else:
            raise Exception(
                f"Yelp API request failed with status code { businesses_request.status_code }. Response text: { businesses_request.text }"
            )

    return businesses, reviews, photos


In [3]:
import logging
import requests
from hashlib import md5

import pandas as pd


def get_yelp_data(
    location: str = "Paris", category: str = "restaurants", count: int = 200
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Get Yelp data from API.

    - build a GraphQL query to get the data
    - send the query to the Yelp API
    - parse the response
    - return the dataframes

    Params:
        location: str (default: "Paris") - Yelp location to search
        category: str (default: "restaurants") - Yelp category (see https://www.yelp.com/developers/documentation/v3/all_category_list)
        count: int (default: 200) - Yelp count of businesses to retrieve

    Returns:
        businesses: pd.DataFrame - businesses data from Yelp API request
        reviews: pd.DataFrame - reviews data from Yelp API request
        photos: pd.DataFrame - photos data from Yelp API request
    """
    # businesses data (see https://www.yelp.com/developers/graphql/objects/business)
    businesses = pd.DataFrame(
        columns=[
            "business_alias",  # Unique Yelp alias of this business.
            "business_review_count",  # Total number of reviews for this business.
            "business_rating",  # Rating of the business, which is an average of the ratings of all reviews.
            "business_price",  # Price range of the business, from "$" to "$$$$" (inclusive).
            "business_latitude",  # Latitude of the business.
            "business_longitude",  # Longitude of the business.
            "business_categories",  # List of categories the business belongs to.
            "business_parent_categories",  # List of parent categories the business belongs to.
        ]
    )
    reviews = pd.DataFrame(
        columns=[
            "business_alias",  # Unique Yelp alias of the business.
            "review_text",  # Text excerpt of this review.
            "review_rating",  # Rating of this review.
        ]
    )
    photos = pd.DataFrame(
        columns=[
            "business_alias",  # Unique Yelp alias of the business.
            "photo_url",  # URL of the photo.
        ]
    )

    # Yelp's GraphQL endpoint
    url = "https://api.yelp.com/v3/graphql"
    # Request headers
    headers = {
        "Authorization": f"Bearer {YELP_API_KEY}",
        "Content-Type": "application/graphql",
    }

    # Yelp's GraphQL API returns a maximum of 50 results per request
    for offset in range(0, count, 50):
        # Build the GraphQL query
        query = f'{{\n\
    search(categories: "{ category }", location: "{ location }", offset: { offset }, limit: 50) {{\n\
        business {{\n\
            alias\n\
            review_count\n\
            rating\n\
            price\n\
            coordinates {{\n\
                latitude\n\
                longitude\n\
            }}\n\
            categories {{\n\
                alias\n\
                parent_categories {{\n\
                    alias\n\
                }}\n\
            }}\n\
            photos\n\
            reviews {{\n\
                text\n\
                rating\n\
            }}\n\
        }}\n\
    }}\n\
}}'
        # Send the query to the Yelp API
        response = requests.post(url, headers=headers, data=query)
        # Parse the response
        if not response.status_code == 200:
            raise Exception(
                f"Yelp API request failed with status code { response.status_code }. Response text: { response.text }"
            )

        # Parse the response
        data = response.json()

        for business in (
            data.get("data", {}).get("search", {}).get("business", [])
        ):
            # Add the business data to the dataframe
            businesses = businesses.append(
                {
                    "business_alias": business.get("alias"),
                    "business_review_count": business.get("review_count"),
                    "business_rating": business.get("rating"),
                    "business_price": business.get("price"),
                    "business_latitude": business.get("coordinates", {}).get(
                        "latitude"
                    ),
                    "business_longitude": business.get("coordinates", {}).get(
                        "longitude"
                    ),
                    "business_categories": [
                        cat.get("alias")
                        for cat in business.get("categories", [])
                    ],
                    "business_parent_categories": [
                        parent_cat.get("alias")
                        for cat in business.get("categories", [])
                        for parent_cat in cat.get("parent_categories", [])
                    ],
                },
                ignore_index=True,
            )

            for review in business.get("reviews", []) or []:
                # Add the review data to the dataframe
                reviews = reviews.append(
                    {
                        "business_alias": business.get("alias"),
                        "review_text": review.get("text"),
                        "review_rating": review.get("rating"),
                    },
                    ignore_index=True,
                )

            for photo in business.get("photos", []) or []:
                # Add the photo data to the dataframe
                photos = photos.append(
                    {
                        "business_alias": business.get("alias"),
                        "photo_url": photo,
                    },
                    ignore_index=True,
                )

    # Return the dataframes
    return businesses, reviews, photos


def download_photos(
    photos: pd.DataFrame,
    target_path: str,
) -> None:
    # Check if content path exists
    if not os.path.exists(target_path):
        logging.info(f"Creating {target_path}")
        os.makedirs(target_path)

    for photo in photos.itertuples(index=False):
        file_name = f"{ photo.business_alias }_{ md5(photo.photo_url.encode('utf-8')).hexdigest() }.jpg"
        file_path = os.path.join(target_path, file_name)

        if not os.path.exists(file_path):
            request = requests.get(photo.photo_url)
            if not request.status_code == 200:
                raise Exception(
                    f"Yelp API request failed with status code { request.status_code }. Response text: { request.text }"
                )

            photo_data = request.content
            with open(file_path, "wb") as f:
                f.write(photo_data)


In [4]:
businesses_df, reviews_df, photos_df = get_yelp_data()


In [5]:
businesses_df.head()


Unnamed: 0,business_alias,business_review_count,business_rating,business_price,business_latitude,business_longitude,business_categories,business_parent_categories
0,le-comptoir-de-la-gastronomie-paris,1105,4.5,€€,48.864516,2.345402,[french],[restaurants]
1,l-as-du-fallafel-paris,1810,4.5,€,48.857498,2.35908,"[kosher, sandwiches, falafel]","[restaurants, restaurants, mediterranean]"
2,angelina-paris,1345,4.0,€€€,48.865092,2.328464,"[breakfast_brunch, tea, cakeshop]","[restaurants, food, food]"
3,l-avant-comptoir-paris-3,612,4.5,€€,48.85202,2.3388,"[tapas, wine_bars]","[restaurants, bars]"
4,la-coïncidence-paris-4,493,4.5,€€,48.868105,2.284365,[french],[restaurants]


In [6]:
businesses_df.describe(include="all")


Unnamed: 0,business_alias,business_review_count,business_rating,business_price,business_latitude,business_longitude,business_categories,business_parent_categories
count,200,200.0,200.0,200,200.0,200.0,200,200
unique,200,152.0,,4,,,105,31
top,le-comptoir-de-la-gastronomie-paris,74.0,,€€,,,[french],[restaurants]
freq,1,4.0,,99,,,49,101
mean,,,4.28,,48.861413,2.338457,,
std,,,0.315591,,0.011731,0.022675,,
min,,,3.5,,48.8256,2.250765,,
25%,,,4.0,,48.853565,2.329561,,
50%,,,4.5,,48.859895,2.340317,,
75%,,,4.5,,48.867828,2.35423,,


In [7]:
reviews_df.head()


Unnamed: 0,business_alias,review_text,review_rating
0,le-comptoir-de-la-gastronomie-paris,This review is from our 2019 trip. Shame on m...,5
1,le-comptoir-de-la-gastronomie-paris,This place def lives up the hype. Best French...,5
2,le-comptoir-de-la-gastronomie-paris,"While planning a friends trip to Paris, I came...",5
3,l-as-du-fallafel-paris,This is the best falafel sandwich I have ever ...,5
4,l-as-du-fallafel-paris,IMO this is a must try in Paris. \n\nLocated i...,5


In [8]:
reviews_df.describe(include="all")


Unnamed: 0,business_alias,review_text,review_rating
count,600,600,600
unique,200,600,5
top,le-comptoir-de-la-gastronomie-paris,This review is from our 2019 trip. Shame on m...,5
freq,3,1,399


In [9]:
photos_df.head()


Unnamed: 0,business_alias,photo_url
0,le-comptoir-de-la-gastronomie-paris,https://s3-media2.fl.yelpcdn.com/bphoto/Je6THJ...
1,l-as-du-fallafel-paris,https://s3-media2.fl.yelpcdn.com/bphoto/wdIhzK...
2,angelina-paris,https://s3-media3.fl.yelpcdn.com/bphoto/DPM5TB...
3,l-avant-comptoir-paris-3,https://s3-media3.fl.yelpcdn.com/bphoto/mVwgxg...
4,la-coïncidence-paris-4,https://s3-media1.fl.yelpcdn.com/bphoto/QdrAgE...


In [10]:
photos_df.describe(include="all")


Unnamed: 0,business_alias,photo_url
count,200,200
unique,200,200
top,le-comptoir-de-la-gastronomie-paris,https://s3-media2.fl.yelpcdn.com/bphoto/Je6THJ...
freq,1,1


In [11]:
download_photos(photos_df, target_path="../data/raw/photos/")
