In [1]:
import requests
from bs4 import BeautifulSoup
import psycopg2
import os
from datetime import datetime
from dateutil.rrule import rrule, MONTHLY
import re
import time
import random
import logging

# Configure the root logger for the whole notebook: INFO level and above,
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [2]:
def connect_db(dbname="postgres", user="postgres", host="localhost", password=None):
    """Open a PostgreSQL connection with psycopg2.

    Args:
        dbname: Name of the database to connect to.
        user: Database role used for authentication.
        host: Host name or address of the database server.
        password: Password for ``user``. When ``None`` (the default) the value
            of the ``DB_PASSWORD`` environment variable is used, so the secret
            never has to be written into the notebook.

    Returns:
        The connection object returned by ``psycopg2.connect``, or ``None`` if
        the connection attempt raised ``psycopg2.Error`` (the error is logged).
    """
    if password is None:
        password = os.getenv("DB_PASSWORD")
    try:
        return psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host
        )
    except psycopg2.Error as e:
        logging.error("Failed to connect to the database: %s", e)
        return None

In [3]:
def extract_unique_id_from_url(url):
    """Extract a unique product identifier from the URL.

    The identifier is the run of digits that starts the last path segment,
    i.e. the digits right after the final ``/`` (anything else may follow the
    digits as long as it contains no further ``/``).

    Args:
        url: The product URL as a string.

    Returns:
        The digits as a string, or ``None`` when ``url`` is not a string or its
        last segment does not start with digits. Failures are logged.
    """
    # re.search raises TypeError on None/bytes/ints; treat those like any other
    # URL we cannot parse so callers only ever have to check for None.
    if not isinstance(url, str):
        logging.error("Failed to extract product ID from URL: %s", url)
        return None

    match = re.search(r'/(\d+)[^/]*$', url)
    if match:
        return match.group(1)

    logging.error("Failed to extract product ID from URL: %s", url)
    return None

In [4]:
def get_headers():
    """Build HTTP request headers whose User-Agent is drawn at random from a fixed pool.

    Returns:
        A dict with the single key ``'User-Agent'``.
    """
    agent_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Linux; Android 10; SM-A505FN) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36",
    )
    chosen_agent = random.choice(agent_pool)
    return {'User-Agent': chosen_agent}

In [5]:
def make_request(url, retries=5, backoff_factor=1):
    """Make an HTTP GET request with retries and exponential backoff.

    Each attempt uses fresh headers from ``get_headers()`` and a 10 second
    timeout. A response with an error status (``raise_for_status``) counts as
    a failed attempt.

    Args:
        url: The URL to fetch.
        retries: Maximum number of attempts. ``0`` or less means no attempt is
            made and ``None`` is returned.
        backoff_factor: Multiplier for the wait between attempts; the wait
            after attempt ``k`` (0-based) is ``backoff_factor * 2 ** k`` seconds.

    Returns:
        The successful ``requests.Response``, or ``None`` if every attempt
        raised ``requests.exceptions.RequestException``.
    """
    with requests.Session() as session:
        for attempt in range(retries):
            try:
                response = session.get(url, headers=get_headers(), timeout=10)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                # On the last attempt there is nothing left to retry, so do not
                # claim a retry in the log or waste the longest backoff sleeping.
                if attempt == retries - 1:
                    logging.error("Request failed: %s. Giving up after %d attempts.", e, retries)
                    break
                wait_time = backoff_factor * (2 ** attempt)
                logging.error("Request failed: %s. Retrying in %d seconds.", e, wait_time)
                time.sleep(wait_time)
    return None

In [None]:
def main():
    """Fetch one archived snapshot of the Luisaviaroma en-gb page per month.

    Opens the database connection, walks the months from 2019-02 through
    2024-06, requests the Wayback Machine capture for each month, parses it
    with BeautifulSoup (the extraction itself is still a placeholder), and
    always closes the database connection before returning.

    Returns:
        None. Exits early, after logging an error, if the database connection
        cannot be established.
    """
    logging.info("Starting the script...")
    conn = connect_db()
    if not conn:
        logging.error("Exiting due to database connection failure.")
        return
    logging.info("Database connection successful.")

    try:
        # {timestamp} is filled in per month. The Wayback Machine redirects a
        # yyyymmddhhmmss timestamp to the closest capture it holds, so the
        # first of each month is enough to select that month's snapshot.
        base_url = "https://web.archive.org/web/{timestamp}/https://www.luisaviaroma.com/en-gb/"
        logging.info("Base URL: %s", base_url)
        start_date = datetime(2019, 2, 1)
        end_date = datetime(2024, 6, 30)
        dates = list(rrule(MONTHLY, dtstart=start_date, until=end_date))
        logging.info("Dates to process: %s", dates)

        last_index = len(dates) - 1
        for index, date in enumerate(dates):
            logging.info("Processing date: %s", date)
            # Build the URL from the loop date; a fixed URL would fetch the
            # same snapshot on every iteration.
            snapshot_url = base_url.format(timestamp=date.strftime("%Y%m%d%H%M%S"))
            response = make_request(snapshot_url)
            if response:
                logging.info("Request successful for date: %s", date)
                soup = BeautifulSoup(response.text, 'html.parser')
                # Implement your specific data extraction logic here
                logging.info("Data processed for date: %s", date)
            else:
                logging.error("Failed to make a request for date: %s", date)
            # Pause between requests only; nothing follows the last one.
            if index < last_index:
                time.sleep(random.randint(10, 60))  # Adjust timing based on scraping policy
    finally:
        # Release the connection even if a request or parse step raises.
        conn.close()
        logging.info("Database connection closed.")

# Runs the scraper as soon as this cell executes. Comment out the call below to
# only define main() without running it (e.g. when testing in the PCI).
main()