In [1]:
import os
from dotenv import load_dotenv


# Pull the variables declared in the local .env file into the environment
# before the credentials are read.
load_dotenv()

# Both values are None when the corresponding variable is not set.
YELP_CLIENT_ID = os.environ.get("YELP_CLIENT_ID")
YELP_API_KEY = os.environ.get("YELP_API_KEY")



In [2]:
import requests
import json
import pandas as pd


def _yelp_get(url: str, headers: dict, params=None, timeout: float = 30.0) -> dict:
    """
    Send a GET request to a Yelp API URL and return the decoded JSON body.

    Params:
        url: str, full endpoint URL
        headers: dict, HTTP headers sent with the request
        params: dict or None, query-string parameters
        timeout: float, seconds to wait before giving up on the request

    Returns:
        the decoded JSON body of the response

    Raises:
        RuntimeError: if the response status code is not 200
    """
    # An explicit timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Yelp API request failed with status code {response.status_code}. "
            f"Response text: {response.text}"
        )
    return response.json()


def yelp_business_search_by_location(
    location: str = "Paris", count: int = 200
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Get Yelp businesses, reviews and photos for a location from the REST API.

    The business search endpoint is paged through until `count` businesses
    have been requested (or a page comes back empty). For every business
    returned, the detail endpoint and the reviews endpoint are queried too.

    Params:
        location: str, value of the `location` search parameter
        count: int, maximum number of businesses to fetch

    Returns:
        tuple (businesses, reviews, photos) of DataFrames:
            businesses: one row per entry of the search responses
            reviews: one row per entry of the `reviews` lists
            photos: one row per entry of the `photos` lists of the detail
                responses, in a single column named 0 (no business id)

    Raises:
        RuntimeError: if any request returns a status code other than 200
    """
    base_url = "https://api.yelp.com/v3/businesses"
    headers = {
        "Authorization": f"Bearer {YELP_API_KEY}",
    }
    page_limit = 50  # largest page requested from the search endpoint

    # Accumulate plain Python records and build each frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # inside a loop copies it on every iteration.
    business_records = []
    review_records = []
    photo_entries = []
    for offset in range(0, count, page_limit):
        # Shrink the last page so no more than `count` businesses are fetched.
        page_size = min(page_limit, count - offset)
        search_data = _yelp_get(
            f"{base_url}/search",
            headers,
            params={
                "location": location,
                "limit": page_size,
                "offset": offset,
            },
        )
        page = search_data.get("businesses") or []
        if not page:
            # Nothing left to page through; skip the remaining offsets.
            break
        business_records.extend(page)

        for business in page:
            detail_data = _yelp_get(f"{base_url}/{business['id']}", headers)
            photo_entries.extend(detail_data.get("photos") or [])

            reviews_data = _yelp_get(
                f"{base_url}/{business['id']}/reviews", headers
            )
            review_records.extend(reviews_data.get("reviews") or [])

    businesses = pd.DataFrame(business_records)
    reviews = pd.DataFrame(review_records)
    photos = pd.DataFrame(photo_entries)
    return businesses, reviews, photos


In [3]:
# Fetch businesses, reviews and photos through the REST helper using its
# default arguments (location="Paris", count=200).
# NOTE: the recorded run of this cell ended with an HTTP 503 from the Yelp API,
# and a later cell rebinds these same three names from get_yelp_data().
businesses, reviews, photos = yelp_business_search_by_location()

# Summarise every column of the photos frame, non-numeric ones included.
photos.describe(include="all")


Exception: Yelp API request failed with status code 503. Response text: upstream connect error or disconnect/reset before headers. reset reason: connection termination

In [None]:
import requests
import json
import pandas as pd


def get_yelp_data(
    location: str = "Paris", category: str = "restaurants", count: int = 200
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Get Yelp businesses, reviews and photos from the GraphQL API.

    The `search` query is paged through until `count` businesses have been
    requested (or a page comes back empty).

    Params:
        location: str, value of the `location` search argument
        category: str, value of the `categories` search argument
        count: int, maximum number of businesses to fetch

    Returns:
        tuple (businesses, reviews, photos) of DataFrames:
            businesses: columns alias, review_count, rating, price, latitude,
                longitude, categories, parent_categories (the last two hold
                lists of aliases)
            reviews: columns alias, text, rating (alias is the business alias)
            photos: columns alias, url (alias is the business alias)

    Raises:
        RuntimeError: if a request returns a status code other than 200, or
            if a response carries `errors` and no `data`
    """
    business_columns = [
        "alias",
        "review_count",
        "rating",
        "price",
        "latitude",
        "longitude",
        "categories",
        "parent_categories",
    ]
    review_columns = ["alias", "text", "rating"]
    photo_columns = ["alias", "url"]

    url = "https://api.yelp.com/v3/graphql"
    headers = {
        "Authorization": f"Bearer {YELP_API_KEY}",
        "Content-Type": "application/graphql",
    }
    # Selection set requested for every business; kept as a plain string so the
    # GraphQL braces need no escaping.
    business_fields = """
        business {
            alias
            review_count
            rating
            price
            coordinates {
                latitude
                longitude
            }
            categories {
                alias
                parent_categories {
                    alias
                }
            }
            photos
            reviews {
                text
                rating
            }
        }"""
    page_limit = 50  # largest page requested per query
    timeout_seconds = 30.0  # keeps a stalled connection from hanging the kernel

    # Accumulate plain Python records and build each frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # inside a loop copies it on every iteration.
    business_rows = []
    review_rows = []
    photo_rows = []
    for offset in range(0, count, page_limit):
        # Shrink the last page so no more than `count` businesses are fetched.
        page_size = min(page_limit, count - offset)

        # json.dumps emits a double-quoted, backslash-escaped, ASCII-only
        # literal, so quotes or accented characters in the arguments cannot
        # break out of the query string.
        search_args = (
            f"categories: {json.dumps(category)}, "
            f"location: {json.dumps(location)}, "
            f"limit: {page_size}, offset: {offset}"
        )
        query = "{\n    search(" + search_args + ") {" + business_fields + "\n    }\n}"

        response = requests.post(
            url, headers=headers, data=query, timeout=timeout_seconds
        )
        if response.status_code != 200:
            raise RuntimeError(
                f"Yelp API request failed with status code {response.status_code}. "
                f"Response text: {response.text}"
            )
        payload = response.json()
        if payload.get("errors") and not payload.get("data"):
            raise RuntimeError(f"Yelp GraphQL query failed: {payload['errors']}")

        # `or {}` / `or []` also cover keys that are present but null, which a
        # plain .get(key, default) would pass through as None.
        search_result = (payload.get("data") or {}).get("search") or {}
        page = search_result.get("business") or []
        if not page:
            # Nothing left to page through; skip the remaining offsets.
            break

        for business in page:
            if not business:
                continue
            alias = business.get("alias")
            coordinates = business.get("coordinates") or {}
            categories = business.get("categories") or []

            business_rows.append(
                {
                    "alias": alias,
                    "review_count": business.get("review_count"),
                    "rating": business.get("rating"),
                    "price": business.get("price"),
                    "latitude": coordinates.get("latitude"),
                    "longitude": coordinates.get("longitude"),
                    "categories": [cat.get("alias") for cat in categories],
                    "parent_categories": [
                        parent_cat.get("alias")
                        for cat in categories
                        for parent_cat in (cat.get("parent_categories") or [])
                    ],
                }
            )
            review_rows.extend(
                {
                    "alias": alias,
                    "text": review.get("text"),
                    "rating": review.get("rating"),
                }
                for review in (business.get("reviews") or [])
            )
            photo_rows.extend(
                {"alias": alias, "url": photo}
                for photo in (business.get("photos") or [])
            )

    # Passing `columns` keeps the schema stable even when nothing was fetched.
    businesses = pd.DataFrame(business_rows, columns=business_columns)
    reviews = pd.DataFrame(review_rows, columns=review_columns)
    photos = pd.DataFrame(photo_rows, columns=photo_columns)
    return businesses, reviews, photos


In [None]:
# Fetch businesses, reviews and photos through the GraphQL helper using its
# default arguments (location="Paris", category="restaurants", count=200).
# This rebinds the three names the earlier REST cell also assigns.
businesses, reviews, photos = get_yelp_data()

In [None]:
# Preview the first rows of the businesses frame.
businesses.head()


In [None]:
# Summary statistics for every column of the businesses frame.
# NOTE(review): when this frame comes from get_yelp_data(), its "categories"
# and "parent_categories" cells hold Python lists; describe(include="all")
# counts distinct values of object columns and may raise on unhashable lists
# — confirm, and drop or convert those two columns first if so.
businesses.describe(include="all")


In [None]:
# Preview the first rows of the reviews frame.
reviews.head()


In [None]:
# Summary statistics for every column of the reviews frame, non-numeric ones
# included.
reviews.describe(include="all")


In [None]:
# Preview the first rows of the photos frame.
photos.head()


In [None]:
# Summary statistics for every column of the photos frame, non-numeric ones
# included.
photos.describe(include="all")
