In [9]:
import re
import requests
from bs4 import BeautifulSoup




# Function to clean the services description
def clean_services(services, words_to_remove):
    cleaned_services = []
    for service in services:
        for word in words_to_remove:
            service = re.sub(r'\b{}\b'.format(re.escape(word)), '', service, flags=re.IGNORECASE)
        cleaned_services.append(service.strip())
    return ', '.join(cleaned_services).replace(' ,', ',').strip()

def extract_comments(link):
    try:
        response = requests.get(link)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        comments = soup.find_all('li', class_='comment')

        for comment in comments:
            spa_client = comment.find('b', class_='fn').get_text()
            spa_client_comment = comment.find('div', class_='comment-content').get_text(strip=True)
            print(f"Spa Client: {spa_client}")
            print(f"Spa Client Comment: {spa_client_comment}")
            print("------------------------------------------------------")

    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")


# URL of the page to scrape
url = 'https://g1298.com/category/central-area/'

try:
    # Send a request to the website
    response = requests.get(url)
    response.raise_for_status()  # Raises an HTTPError for unsuccessful status codes
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all articles in the Central Area category
    articles = soup.find_all('article', class_='category-central-area')

    # Loop through each article and extract information
    for article in articles:
        try:
            # Extract the spa name
            spa_name = article.find('h2', class_='entry-title').get_text()

            # Extract the date and datetime
            time_element = article.find('time', class_='entry-date')
            date = time_element.get_text()
            datetime = time_element['datetime']

            # Extract the shop image source and dimensions
            shop_image_tag = article.find('div', class_='post-thumbnail').find('img')
            shop_image_src = shop_image_tag['src']
            shop_image_width = shop_image_tag['width']
            shop_image_height = shop_image_tag['height']

            # Extract the shop description
            description = article.find('h2', class_='wp-block-heading').get_text()

            # Extract the service description
            service_description = article.find('p', class_='has-text-align-center').get_text()

            # Regular expression to capture 'Opening Hours' or 'Operating Hours'
            opening_hours_match = re.search(r'(Opening|Operating) Hours:?\s*(\d{1,2}\.\d{2}(?:am|pm)\s+to\s+\d{1,2}\.\d{2}(?:am|pm))', service_description, re.IGNORECASE)
            opening_hours = opening_hours_match.group(2) if opening_hours_match else "Not available"
            service_description = service_description.replace(opening_hours_match.group(0), '').strip() if opening_hours_match else service_description

            # Extract contact information
            contact_tag = article.find('a', href=re.compile(r'^tel:'))
            if contact_tag:
                contact = contact_tag['href'][4:]  # Extracts the phone number
                contact_format = {
                    "classes": " ".join(contact_tag.get('class', [])),
                    "style": contact_tag.get('style', '')
                }
            else:
                contact = "Not available"
                contact_format = {"classes": "", "style": ""}

            # Extract WhatsApp contact information
            whatsapp_tag = article.find('a', href=re.compile(r'https://api.whatsapp.com/send\?phone='))
            if whatsapp_tag:
                whatsapp_contact = re.search(r'phone=(\d+)', whatsapp_tag['href']).group(1)
                whatsapp_img_tag = whatsapp_tag.find('img')
                whatsapp_format = {
                    "classes": " ".join(whatsapp_img_tag.get('class', [])),
                    "style": whatsapp_img_tag.get('style', '')
                }
            else:
                whatsapp_contact = "Not available"
                whatsapp_format = {"classes": "", "style": ""}

 
            # Extract the spa name
            spa_name = article.find('h2', class_='entry-title').get_text()

            # Extract the service description
            service_element = article.find('p', class_='has-text-align-center')
            services_lines = service_element.get_text(separator='\n').split('\n')

            # Identify the line containing the location
            location = "Not available"
            services = []
            for line in services_lines:
                if 'Singapore' in line:
                    location = line
                elif 'Opening Hours' not in line:
                    services.append(line)

            # Prepare words to be removed from services
            words_to_remove = set(re.findall(r'\b\w+\b', spa_name + ' ' + location))

            # Clean the services description
            service_description = clean_services(services, words_to_remove)
    
            # Print the extracted information
            print(f"url: {url}")
            print(f"Spa Name: {spa_name}")
            print(f"Date: {date}")
            print(f"DateTime published: {datetime}")
            print(f"Shop Image: {shop_image_src} ({shop_image_width} x {shop_image_height})")


            print(f"Spa Name: {spa_name}")
            print(f"Location: {location}")
            print(f"Services: {service_description}")
            
            #print(f"Description: {description}")
           
            print(f"Opening Hours: {opening_hours}")
            print(f"Contact: {contact}")
            print(f"Contact Format: {contact_format}")
            print(f"WhatsApp Contact: {whatsapp_contact}")
            print(f"WhatsApp Format: {whatsapp_format}")

            # Print or process the extracted link and text
            print(f"Continue Reading Link: {continue_reading_link}")
            print(f"Continue Reading Text: {continue_reading_text}")


           # Find the 'Continue reading' anchor tag


            
            continue_reading_tag = article.find('a', class_='more-link')
            
            if continue_reading_tag:
                continue_reading_link = continue_reading_tag['href']
                continue_reading_text = continue_reading_tag.get_text(strip=True).split('“')[0].strip()  # Gets the text 'Continue reading'
                # Extract comments from the spa's page
                extract_comments(continue_reading_link)
            else:
                continue_reading_link = "Not available"
                continue_reading_text = ""


            
            
            print("------------------------------------------------------")

        
        except Exception as e:
            print(f"An error occurred while processing an article: {e}")

except requests.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except Exception as err:
    print(f"An error occurred: {err}")


url: https://g1298.com/category/central-area/
Spa Name: Seasons Wellness Spa 43 Beach Road
Date: 20/11/2023
DateTime published: 2023-11-20T12:46:19+08:00
Shop Image: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E (500 x 300)
Spa Name: Seasons Wellness Spa 43 Beach Road
Location: 43 Beach Road Singapore 189681
Services: Massages, to Restore the Harmony in Your Body., 60min Body Massage $68, 90min Body Massage $98, 120min Body Massage $116
Opening Hours: 9.00am to 10.30pm
Contact: +65 6336 8105 
Contact Format: {'classes': 'wp-block-button__link has-white-color has-vivid-green-cyan-background-color has-text-color has-background wp-element-button', 'style': 'border-radius:10px'}
WhatsApp Contact: 6594496956
WhatsApp Format: {'classes': 'wp-image-6080', 'style': 'width:225px;height:78px'}
Continue Reading Link: https://g1298.com/mango-spa-3-greendale-avenue/#more-3462
Continue Reading Text: Continue reading
Spa Client: Andrew
Spa C