In [0]:
# Imports
import requests
import json
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# Date/time, timezone and ID helpers used to stamp each ingest run (NYC timezone)
from datetime import datetime, timezone
from zoneinfo import ZoneInfo
import time
import uuid

In [0]:
# Read the Places API key from the notebook widget named 'PLACE_API_KEY'
# so the credential is supplied at run time rather than hardcoded here.
API_KEY = dbutils.widgets.get('PLACE_API_KEY')

In [0]:
class AttractionFinder:
    """Small client for the Google Places `places:searchNearby` endpoint.

    Attributes:
        API_KEY: Key sent in the ``X-Goog-Api-Key`` request header.
        url: Endpoint that the search request is POSTed to.
    """

    # Response fields requested through the X-Goog-FieldMask header.
    FIELD_MASK = ",".join((
        "places.id",
        "places.displayName.text",
        "places.primaryType",
        "places.formattedAddress",
        "places.generativeSummary.overview.text",
        "places.rating",
        "places.userRatingCount",
        "places.priceLevel",
        "places.businessStatus",
        "places.currentOpeningHours.openNow",
        "places.parkingOptions",
        "places.reviewSummary",
    ))

    def __init__(self, API_KEY):
        """Store the API key and the endpoint URL."""
        self.API_KEY = API_KEY
        self.url = "https://places.googleapis.com/v1/places:searchNearby"

    def get_attraction(self, latitude=40.7128, longitude=-74.0060,
                       radius=10000, max_result_count=20, timeout=30):
        """Search for attractions inside a circle and return the parsed JSON.

        Called with no arguments, the request body is the same as before:
        a circle centred on (40.7128, -74.0060) with a radius of 10000 and
        at most 20 results.

        Args:
            latitude: Latitude of the circle centre.
            longitude: Longitude of the circle centre.
            radius: Circle radius sent as ``radius`` in the request body.
            max_result_count: Value sent as ``maxResultCount``.
            timeout: Seconds passed to ``requests.post`` so a stalled
                connection cannot hang the notebook.

        Returns:
            The decoded JSON response body.

        Raises:
            requests.HTTPError: If the service answers with an error status.
        """
        headers = {
            "Content-Type": "application/json",
            "X-Goog-Api-Key": self.API_KEY,
            "X-Goog-FieldMask": self.FIELD_MASK,
        }

        data = {
            "includedTypes": ["tourist_attraction", "historical_place", "monument"],
            "maxResultCount": max_result_count,
            "locationRestriction": {
                "circle": {
                    "center": {
                        "latitude": latitude,
                        "longitude": longitude,
                    },
                    "radius": radius,
                },
            },
        }

        r = requests.post(self.url, headers=headers, json=data, timeout=timeout)
        # Fail loudly on an error status instead of handing an error body to
        # callers that expect search results.
        r.raise_for_status()
        return r.json()

In [0]:
# Build the client with the widget-supplied key and run one nearby search.
a = AttractionFinder(API_KEY)
# Parsed JSON response; the flattening cell below reads its "places" entry.
attraction = a.get_attraction()
# Bare expression so the notebook renders the raw response for inspection.
attraction

In [0]:
# Capture the clock once so every row of this run carries the same timestamp,
# expressed both in UTC and in New York local time.
_now_utc = datetime.now(timezone.utc)
ingest_ts = _now_utc.isoformat()
target_tz = ZoneInfo("America/New_York")
ingest_ts_nyc = _now_utc.astimezone(target_tz).isoformat()
run_id = str(uuid.uuid4())

# Surface an API error payload explicitly rather than as a KeyError on "places".
# NOTE(review): assumes errors arrive under a top-level "error" key - confirm.
if "error" in attraction:
    raise RuntimeError(f"Places API returned an error: {attraction['error']}")

rows = []
# A response without "places" is treated as an empty result set.
for p in attraction.get("places", []):
    place_id = p.get("id")
    name = p.get("displayName", {}).get("text")
    overview = p.get("generativeSummary", {}).get("overview", {}).get("text")
    address = p.get("formattedAddress")
    rating = p.get("rating")
    # Normalise to float so a whole-number rating still fits the double column.
    rating = float(rating) if rating is not None else None
    user_rating_count = p.get("userRatingCount")
    price_level = p.get("priceLevel")
    business_status = p.get("businessStatus")
    primary_type = p.get("primaryType")
    open_now = p.get("currentOpeningHours", {}).get("openNow")
    parking_options = p.get("parkingOptions")
    review_summary = p.get("reviewSummary", {}).get("text", {}).get("text")

    rows.append([
        run_id, ingest_ts_nyc,
        place_id, name, primary_type,
        rating, user_rating_count, overview, review_summary, price_level,
        business_status, open_now,
        address, parking_options
    ])

# Column names paired with explicit Spark SQL types. An explicit schema keeps
# createDataFrame working when the result is empty or a column is entirely None.
# NOTE(review): types mirror what inference yields for typical values (string,
# double, bigint, boolean, map of booleans) - confirm against the existing
# silver table before appending.
columns = [
    ("run_id", "string"), ("ingest_ts_nyc", "string"),
    ("place_id", "string"), ("name", "string"), ("primary_type", "string"),
    ("rating", "double"), ("user_rating_count", "bigint"),
    ("overview", "string"), ("review_summary", "string"), ("price_level", "string"),
    ("business_status", "string"), ("open_now", "boolean"),
    ("address", "string"), ("parking_options", "map<string,boolean>"),
]

# Plain list of column names, kept under its original variable name.
schema = [column_name for column_name, _ in columns]
schema_ddl = ", ".join(f"{column_name} {column_type}" for column_name, column_type in columns)

df_places = spark.createDataFrame(rows, schema=schema_ddl)
display(df_places)

In [0]:
%sql
-- Create the raw-data volume for the attraction pipeline if it does not already exist
CREATE VOLUME IF NOT EXISTS travel_planning.lakehouse.raw_data_attraction
COMMENT 'This is the raw data volume for the attraction pipeline';

In [0]:

# Timestamp suffix that distinguishes this run's raw export
stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Destination under the raw-data volume for this run
json_path = f'/Volumes/travel_planning/lakehouse/raw_data_attraction/attraction_{stamp}.json'

# Persist the flattened places as JSON at that destination
(
    df_places
    .write
    .mode('append')
    .json(json_path)
)

In [0]:
# Configure a Delta writer in append mode for this run's rows
silver_writer = df_places.write.format('delta').mode("append")

# Append into the silver attraction table
silver_writer.saveAsTable('travel_planning.silver.attraction')

In [0]:
%sql
-- Inspect every row currently stored in the silver attraction table
SELECT *
FROM travel_planning.silver.attraction