In [None]:
import os
import time
from datetime import datetime, timezone
import re
from bs4 import BeautifulSoup
import pandas as pd
import requests
from urllib.parse import quote
import logging
import sys
import json
from apify_client import ApifyClient

In [None]:
# Secrets come from the environment; a missing variable raises KeyError here,
# before any network call is attempted.
WEBHOOK_URL = os.environ["DISCORD_WEBHOOK_URL"]
APIFY_TOKEN = os.environ["APIFY_TOKEN"]

# Root logger setup: every record is appended to 'pipeline.log' (a persistent
# history of the script's runs) and echoed to stdout so it shows in the cell output.
log_to_file = logging.FileHandler("pipeline.log")
log_to_screen = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[log_to_file, log_to_screen],
)

In [None]:
# Functions

# Messages are posted through a Discord webhook rather than by self-botting
# (automating a user account), which is against Discord's Terms of Service (ToS).
# The webhook address is read from the WEBHOOK_URL secret loaded earlier.
def post_to_discord(content):
    """
    Post a message to Discord through the configured webhook.

    Args:
        content: Text of the message. A string longer than Discord's
            2000-character limit for ``content`` is truncated so the
            request is not rejected outright.

    Returns:
        bool: True if Discord accepted the message, False otherwise.
        Failures are logged and never raised, so one bad post does not
        abort the caller's loop.
    """
    max_chars = 2000  # Discord's documented limit for the "content" field
    if isinstance(content, str) and len(content) > max_chars:
        logging.warning(
            "Message is %d characters long; truncating to %d.",
            len(content), max_chars,
        )
        content = content[:max_chars]

    data = {
        "content": content
    }

    try:
        result = requests.post(WEBHOOK_URL, json=data, timeout=10) # Always add a timeout!
        result.raise_for_status()
    except requests.exceptions.Timeout:
        logging.error("Request timed out. Discord might be down.")
    except requests.exceptions.ConnectionError:
        logging.error("Connection error. Check your internet.")
    except requests.exceptions.HTTPError as err:
        # str(err) embeds the full request URL, i.e. the secret webhook token.
        # Log only the status so the secret never lands in pipeline.log.
        response = err.response
        status = response.status_code if response is not None else "unknown"
        reason = response.reason if response is not None else ""
        logging.error(f"HTTP Error: {status} {reason}".rstrip())
    except requests.exceptions.RequestException as err:
        # Anything else requests can raise (invalid URL, too many redirects, ...).
        # Only the exception type is logged, for the same secrecy reason.
        logging.error(f"Request failed: {type(err).__name__}")
    else:
        # This runs only if NO exception occurred
        logging.info(f"Success! Status Code: {result.status_code}")
        return True
    return False

In [None]:
# Search parameters.
# NOTE: the extraction timestamp would ideally be taken right before the
# Actor run is created; it is captured here for simplicity.
extracted_time = int(time.time())
city = 'taguig'
radius = '10' # in KM
product = 'arctis nova pro'
sort_by = 'best_match'
exact = 'false'

# Percent-encode each value, then assemble the Marketplace search URL.
query_string = "&".join(
    f"{key}={quote(value)}"
    for key, value in (
        ("sortBy", sort_by),
        ("query", product),
        ("exact", exact),
        ("radius_in_km", radius),
    )
)
url = f"https://www.facebook.com/marketplace/{quote(city)}/search?{query_string}"

In [None]:
# Initialize the ApifyClient with your API token
client = ApifyClient(APIFY_TOKEN)

# Prepare the Actor input
run_input = {
    "startUrls": [
        { "url": url },
    ],
    "resultsLimit": 10,
    "includeListingDetails": False,
}

# Run the Actor and wait for it to finish
run = client.actor("U5DUNxhH3qKt5PnCf").call(run_input=run_input)
if run is None:
    # call() can come back empty; fail here with a clear message instead of a
    # TypeError on run['id'] a few lines below.
    raise RuntimeError("Apify Actor call returned no run object.")
if run.get('status') != 'SUCCEEDED':
    # A finished run is not necessarily a successful one; its dataset may be
    # empty or partial.
    logging.warning(f"Apify run {run.get('id')} ended with status {run.get('status')}")

# Fetch the Run details specifically for usage (fall back to the object
# returned by call() if the lookup yields nothing)
run_details = client.run(run['id']).get() or run

# Extract key metrics. In the Apify run object, computeUnits and runTimeSecs
# are nested under 'stats'; the top-level lookup is kept only as a fallback.
run_stats = run_details.get('stats') or {}
compute_units = run_stats.get('computeUnits', run_details.get('computeUnits'))
total_cost_usd = run_details.get('usageTotalUsd') # This is the "usage" in dollars
duration_secs = run_stats.get('runTimeSecs', run_details.get('runTimeSecs'))

print(f"Run completed in {duration_secs}s")
print(f"Compute Units used: {compute_units}")
print(f"Total Cost: ${total_cost_usd}")

# Fetch and print Actor results from the run's dataset (if there are any)
for item in client.dataset(run["defaultDatasetId"]).iterate_items():
    print(item)

In [None]:
# Ask the Apify API for the account's monthly usage limits and print what is left.
# The token is sent in an Authorization header instead of the query string so
# it cannot leak through logged URLs, proxies, or exception messages.
limits_url = "https://api.apify.com/v2/users/me/limits"

response = None
try:
    response = requests.get(
        limits_url,
        headers={"Authorization": f"Bearer {APIFY_TOKEN}"},
        timeout=10,  # never hang the notebook on an informational check
    )
except requests.exceptions.RequestException as err:
    logging.warning(f"Could not reach the Apify limits endpoint: {type(err).__name__}")

if response is not None and response.status_code == 200:
    stats = response.json()['data']
    remaining = stats['limits']['maxMonthlyUsageUsd'] - stats['current']['monthlyUsageUsd']
    print(f"Verified Balance: ${remaining:.2f}")
elif response is not None:
    # Previously a non-200 answer was ignored silently.
    logging.warning(f"Apify limits request returned HTTP {response.status_code}; balance not verified.")

In [None]:
# Snapshot the run's dataset to disk as JSON Lines (one item per line) for sampling
sample_data = "./sample_data.jsonl"
with open(sample_data, "w") as jsonl_out:
    jsonl_out.writelines(
        json.dumps(item) + '\n'
        for item in client.dataset(run["defaultDatasetId"]).iterate_items()
    )

In [None]:
# Flatten each saved listing into one record per row, tagged with the search
# parameters and the extraction timestamp.
sample_data = "./sample_data.jsonl"

# The timestamp is identical for every row, so format it once.
# tz=timezone.utc matters: without it fromtimestamp() returns *local* time,
# which contradicts the extracted_utc_* column names.
extracted_dt_utc = datetime.fromtimestamp(extracted_time, tz=timezone.utc)
extracted_utc_date = extracted_dt_utc.strftime("%Y-%m-%d")
extracted_utc_time = extracted_dt_utc.strftime("%H:%M:%S")

result = []
with open(sample_data, "r") as file:
    for line in file:
        s = line.strip()
        if not s:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing in json.loads.
            continue
        d = json.loads(s)
        # Nested objects may be missing or null; substitute empty dicts so the
        # lookups below yield None rather than raising AttributeError.
        listing_price = d.get('listing_price') or {}
        reverse_geocode = (d.get('location') or {}).get('reverse_geocode') or {}
        result.append({
            'search_product': product,
            'search_city': city,
            'search_radius': radius,
            'search_sort_by': sort_by,
            'search_exact': exact,
            'listing_url': d.get('listingUrl'),
            'listing_id': d.get('id'),
            'amount': listing_price.get('amount'),
            'item_location': reverse_geocode.get('city'),
            'item_region': reverse_geocode.get('state'),
            'is_sold': d.get('is_sold'),
            'marketplace_listing_title': d.get('marketplace_listing_title'),
            'marketplace_listing_category_id': d.get('marketplace_listing_category_id'),
            'extracted_at': extracted_time,
            'extracted_utc_date': extracted_utc_date,
            'extracted_utc_time': extracted_utc_time
        })

In [None]:
# Load the list of dictionaries into a dataframe
items_df = pd.DataFrame(result)

# Start the sending of data to Discord
logging.info("--- Pipeline Started ---")
# Holds the most recently built message; stays empty if there are no listings
message = ""

if items_df.empty:
    logging.info("No listings to send.")

# One Discord message per listing, in the order the rows were loaded
for index, row in items_df.iterrows():
    # City and/or state can be missing (None/NaN); join only what is present
    # instead of concatenating blindly, which raises TypeError on None.
    location = ', '.join(
        str(part)
        for part in (row['item_location'], row['item_region'])
        if pd.notna(part)
    ) or 'Unknown'
    # Build the message from the title, price, location, and URL of the item
    message = f"Title: {row['marketplace_listing_title']}\nPrice: {row['amount']}\nLocation: {location}\nURL: {row['listing_url']}\n\n"
    post_to_discord(message)
    # CRITICAL: pause between posts to stay under Discord's webhook rate limits
    time.sleep(2)

logging.info("--- Pipeline Finished ---")