In [31]:
import re
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
# url = 'https://g1298.com/category/west-area/'
url = 'https://g1298.com/category/central-area/'

# Send a request to the website
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find all articles in the West Area category
articles = soup.find_all('article', class_='category-central-area')


# Loop through each article and extract information
for article in articles:


    # Ensure there are enough classes in the list to avoid IndexError
    if len(article['class']) >7:
        region = article['class'][7]  # Access the 7th element in the list (index 6)
        print(f"region: {region}")
    else:
        print("Class list is too short.")

    
    # Extract the spa name
    spa_name = article.find('h2', class_='entry-title').get_text()

    # Extract the date
    # date = article.find('time', class_='entry-date').get_text()
    
    time_element = article.find('time', class_='entry-date')
    date = time_element.get_text()
    datetime = time_element['datetime']  # This extracts the full datetime string, e.g., "2023-11-02T08:46:59+08:00"


    # Extract the shop image source and dimensions
    shop_image_tag = article.find('div', class_='post-thumbnail').find('img')
    shop_image_src = shop_image_tag['src']
    shop_image_width = shop_image_tag['width']
    shop_image_height = shop_image_tag['height']



   # Find all staff images and extract their names from the alt attribute
    staff_images = article.find_all('img', alt=True)
    staff_names = [img['alt'] for img in staff_images if 'Therapist' in img['alt']]


    
    # Extract all staff images
    staff_images = article.find_all('figure', class_='wp-block-image')
    staff_info = []
    for img in staff_images:
        staff_image_src = img.find('img')['src']
        staff_image_width = img.find('img')['width']
        staff_image_height = img.find('img')['height']
        staff_image_alt = img.find('img')['alt']
        staff_info.append({
            'source': staff_image_src,
            'width': staff_image_width,
            'height': staff_image_height,
            'alt': staff_image_alt
        })

    # Extract the shop description
    description = article.find('h2', class_='wp-block-heading').get_text()


    # Extract the service description
    service_description = article.find('p', class_='has-text-align-center').get_text()

    
    # Using regular expression to find 'Opening Hours'
    #opening_hours_match = re.search(r'Opening Hours\s+\d{1,2}\.\d{2}am\s+to\s+\d{1,2}\.\d{2}pm', service_description)
    #opening_hours_match = re.search(r'Operating Hours\s+(\d{1,2}\.\d{2}am\s+to\s+\d{1,2}\.\d{2}pm)', service_description)
    opening_hours_match = re.search(r'Opening Hours\s*(\d{1,2}\.\d{2}am\s+to\s+\d{1,2}\.\d{2}(?:am|pm))', service_description, re.IGNORECASE)


    if opening_hours_match:
        opening_hours = opening_hours_match.group()
    else:
        opening_hours = "Not available"
    
    # Update service_description to exclude 'Opening Hours'
    service_description = service_description.replace(opening_hours, '').strip()


    
    # Find the a tag with href attribute starting with "tel:"
    contact_tag = article.find('a', href=re.compile(r'^tel:'))
    contact = contact_tag['href'][4:] if contact_tag else "Not available"  # Extracts the phone number part
    
    # Extract contact and WhatsApp information
    contact_info = service_description  # Assuming the contact info is in the same paragraph as the service description
    whatsapp_image = article.find('img', alt=True, src=True)  # Assuming the first image with alt and src is the WhatsApp icon
    whatsapp_link = whatsapp_image['src'] if whatsapp_image else None

    # Print or process the extracted information
    # Extract category

    print(f"url: {url}")
    print(f"Spa Name: {spa_name}")
    print(f"Date: {date}")
    print(f"DateTime published: {datetime}")
    print(f"Shop Image: {shop_image_src} ({shop_image_width} x {shop_image_height})")
#   print(f"Staff Names: {staff_names}")
#   print(f"Staff Images: {staff_info}")
    print(f"Description: {description}")
    print(f"Services: {service_description}")
    print(f"Opening Hours: {opening_hours}")
#   print(f"Contact: {contact_info}")
    print(f"Contact: {contact}")
    print(f"WhatsApp Link: {whatsapp_link}")
    print("------------------------------------------------------")

region: category-central-area
url: https://g1298.com/category/central-area/
Spa Name: Seasons Wellness Spa 43 Beach Road
Date: 20/11/2023
DateTime published: 2023-11-20T12:46:19+08:00
Shop Image: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E (500 x 300)
Description: Seasons Wellness Spa located in 43 Beach Road By G1298.com – The Massage Directory
Services: Massages to Restore the Harmony in Your Body.60min Body Massage $6890min Body Massage $98120min Body Massage $11643 Beach Road Singapore 189681
Opening Hours: Opening Hours 9.00am to 10.30pm
Contact: +65 6336 8105 
WhatsApp Link: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E
------------------------------------------------------
region: category-central-area
url: https://g1298.com/category/central-area/
Spa Name: Ichiban The Spa 9 Selegie Road
Date: 19/11/2023
DateTime published: 2023-11-19T15:39:57+08:00
Shop Imag

In [58]:
import re
import requests
from bs4 import BeautifulSoup
    
    # URL of the page to scrape
    # url = 'https://g1298.com/category/west-area/'
url = 'https://g1298.com/category/central-area/'


try:
    # Send a request to the website
    response = requests.get(url)
    response.raise_for_status()  # This will raise an HTTPError if the HTTP request returned an unsuccessful status code
   
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all articles in the West Area category
    articles = soup.find_all('article', class_='category-central-area')
    
    
    # Loop through each article and extract information
    for article in articles:
        try:
            # Ensure there are enough classes in the list to avoid IndexError
            if len(article['class']) >7:
                region = article['class'][7]  # Access the 7th element in the list (index 6)
                print(f"region: {region}")
            else:
                print("Class list is too short.")
        
            
            # Extract the spa name
            spa_name = article.find('h2', class_='entry-title').get_text()
        
            # Extract the date
            # date = article.find('time', class_='entry-date').get_text()
            
            time_element = article.find('time', class_='entry-date')
            date = time_element.get_text()
            datetime = time_element['datetime']  # This extracts the full datetime string, e.g., "2023-11-02T08:46:59+08:00"
        
        
            # Extract the shop image source and dimensions
            shop_image_tag = article.find('div', class_='post-thumbnail').find('img')
            shop_image_src = shop_image_tag['src']
            shop_image_width = shop_image_tag['width']
            shop_image_height = shop_image_tag['height']
    
        
           # Find all staff images and extract their names from the alt attribute
            staff_images = article.find_all('img', alt=True)
            staff_names = [img['alt'] for img in staff_images if 'Therapist' in img['alt']]
        
        
            
            # Extract all staff images
            staff_images = article.find_all('figure', class_='wp-block-image')
            staff_info = []
            for img in staff_images:
                staff_image_src = img.find('img')['src']
                staff_image_width = img.find('img')['width']
                staff_image_height = img.find('img')['height']
                staff_image_alt = img.find('img')['alt']
                staff_info.append({
                    'source': staff_image_src,
                    'width': staff_image_width,
                    'height': staff_image_height,
                    'alt': staff_image_alt
                })
        
            # Extract the shop description
            description = article.find('h2', class_='wp-block-heading').get_text()
        
        
            # Extract the service description
            service_description = article.find('p', class_='has-text-align-center').get_text()
        
            # Regular expression to capture either 'Opening Hours' or 'Operating Hours'
            opening_hours_match = re.search(r'(Opening|Operating) Hours:?\s*(\d{1,2}\.\d{2}(?:am|pm)\s+to\s+\d{1,2}\.\d{2}(?:am|pm))', service_description, re.IGNORECASE)
            
            if opening_hours_match:
                opening_hours = opening_hours_match.group(2)  # Group 2 contains the time portion
            else:
                opening_hours = "Not available"
        
            # Update service_description to exclude the matched hours text
            service_description = service_description.replace(opening_hours_match.group(0), '').strip() if opening_hours_match else service_description
        
        # Find the a tag with href attribute starting with "tel:"
        #    contact_tag = article.find('a', href=re.compile(r'^tel:'))
        #    contact = contact_tag['href'][4:] if contact_tag else "Not available"  # Extracts the phone number part
        
            # Find the a tag with href attribute starting with "tel:"
            contact_tag = article.find('a', href=re.compile(r'^tel:'))
            
            if contact_tag:
                contact = contact_tag.get_text(strip=True)  # Extracts the phone number text
                contact = contact_tag['href'][4:] if contact_tag else "Not available"  # Extracts the phone number part
                contact_format = {
                    "classes": " ".join(contact_tag.get('class', [])),  # Join all classes into a single string
                    "style": contact_tag.get('style', '')  # Get the style attribute
                }
            else:
                contact = "Not available"
                contact_format = {"classes": "", "style": ""}
          
            
            # Extract contact and WhatsApp information
            # Find the WhatsApp link and extract the phone number
            whatsapp_tag = article.find('a', href=re.compile(r'https://api.whatsapp.com/send\?phone='))
            if whatsapp_tag:
                whatsapp_contact = re.search(r'phone=(\d+)', whatsapp_tag['href']).group(1)
                whatsapp_img_tag = whatsapp_tag.find('img')
                whatsapp_format = {
                    "classes": " ".join(whatsapp_img_tag.get('class', [])),  # Join all classes into a single string
                    "style": whatsapp_img_tag.get('style', '')  # Get the style attribute
                }
            else:
                whatsapp_contact = "Not available"
                whatsapp_format = {"classes": "", "style": ""}
             
        
            # Print or process the extracted information
            # Extract category
        
            print(f"url: {url}")
            print(f"Spa Name: {spa_name}")
            print(f"Date: {date}")
            print(f"DateTime published: {datetime}")
            print(f"Shop Image: {shop_image_src} ({shop_image_width} x {shop_image_height})")
        #   print(f"Staff Names: {staff_names}")
        #   print(f"Staff Images: {staff_info}")
            print(f"Description: {description}")
            print(f"Services: {service_description}")
            print(f"Opening Hours: {opening_hours}")
            print(f"Contact: {contact}")
            print(f"Contact Format: {contact_format}")
            print(f"WhatsApp Contact: {whatsapp_contact}")
            print(f"WhatsApp Format: {whatsapp_format}")
            print("------------------------------------------------------")

        except Exception as e:
            print(f"An error occurred while processing an article: {e}")

except requests.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except Exception as err:
    print(f"An error occurred: {err}")


region: category-central-area
url: https://g1298.com/category/central-area/
Spa Name: Seasons Wellness Spa 43 Beach Road
Date: 20/11/2023
DateTime published: 2023-11-20T12:46:19+08:00
Shop Image: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E (500 x 300)
Description: Seasons Wellness Spa located in 43 Beach Road By G1298.com – The Massage Directory
Services: Massages to Restore the Harmony in Your Body.60min Body Massage $6890min Body Massage $98120min Body Massage $11643 Beach Road Singapore 189681
Opening Hours: 9.00am to 10.30pm
Contact: +65 6336 8105 
Contact Format: {'classes': 'wp-block-button__link has-white-color has-vivid-green-cyan-background-color has-text-color has-background wp-element-button', 'style': 'border-radius:10px'}
WhatsApp Contact: 6594496956
WhatsApp Format: {'classes': 'wp-image-6080', 'style': 'width:225px;height:78px'}
------------------------------------------------------
region: category-central-ar

In [71]:
import re
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = 'https://g1298.com/category/central-area/'


# Function to clean the services description
def clean_services(services, words_to_remove):
    cleaned_services = []
    for service in services:
        for word in words_to_remove:
            service = re.sub(r'\b{}\b'.format(re.escape(word)), '', service, flags=re.IGNORECASE)
        cleaned_services.append(service.strip())
    return ', '.join(cleaned_services).replace(' ,', ',').strip()



try:
    # Send a request to the website
    response = requests.get(url)
    response.raise_for_status()  # Raises an HTTPError for unsuccessful status codes

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all articles in the Central Area category
    articles = soup.find_all('article', class_='category-central-area')

    # Loop through each article and extract information
    for article in articles:
        try:
            # Extract the spa name
            spa_name = article.find('h2', class_='entry-title').get_text()

            # Extract the date and datetime
            time_element = article.find('time', class_='entry-date')
            date = time_element.get_text()
            datetime = time_element['datetime']

            # Extract the shop image source and dimensions
            shop_image_tag = article.find('div', class_='post-thumbnail').find('img')
            shop_image_src = shop_image_tag['src']
            shop_image_width = shop_image_tag['width']
            shop_image_height = shop_image_tag['height']

            # Extract the shop description
            description = article.find('h2', class_='wp-block-heading').get_text()

            # Extract the service description
            service_description = article.find('p', class_='has-text-align-center').get_text()

            # Regular expression to capture 'Opening Hours' or 'Operating Hours'
            opening_hours_match = re.search(r'(Opening|Operating) Hours:?\s*(\d{1,2}\.\d{2}(?:am|pm)\s+to\s+\d{1,2}\.\d{2}(?:am|pm))', service_description, re.IGNORECASE)
            opening_hours = opening_hours_match.group(2) if opening_hours_match else "Not available"
            service_description = service_description.replace(opening_hours_match.group(0), '').strip() if opening_hours_match else service_description

            # Extract contact information
            contact_tag = article.find('a', href=re.compile(r'^tel:'))
            if contact_tag:
                contact = contact_tag['href'][4:]  # Extracts the phone number
                contact_format = {
                    "classes": " ".join(contact_tag.get('class', [])),
                    "style": contact_tag.get('style', '')
                }
            else:
                contact = "Not available"
                contact_format = {"classes": "", "style": ""}

            # Extract WhatsApp contact information
            whatsapp_tag = article.find('a', href=re.compile(r'https://api.whatsapp.com/send\?phone='))
            if whatsapp_tag:
                whatsapp_contact = re.search(r'phone=(\d+)', whatsapp_tag['href']).group(1)
                whatsapp_img_tag = whatsapp_tag.find('img')
                whatsapp_format = {
                    "classes": " ".join(whatsapp_img_tag.get('class', [])),
                    "style": whatsapp_img_tag.get('style', '')
                }
            else:
                whatsapp_contact = "Not available"
                whatsapp_format = {"classes": "", "style": ""}

 
            # Extract the spa name
            spa_name = article.find('h2', class_='entry-title').get_text()

            # Extract the service description
            service_element = article.find('p', class_='has-text-align-center')
            services_lines = service_element.get_text(separator='\n').split('\n')

            # Identify the line containing the location
            location = "Not available"
            services = []
            for line in services_lines:
                if 'Singapore' in line:
                    location = line
                elif 'Opening Hours' not in line:
                    services.append(line)

            # Prepare words to be removed from services
            words_to_remove = set(re.findall(r'\b\w+\b', spa_name + ' ' + location))

            # Clean the services description
            service_description = clean_services(services, words_to_remove)

            
            # Print the extracted information
            print(f"url: {url}")
            print(f"Spa Name: {spa_name}")
            print(f"Date: {date}")
            print(f"DateTime published: {datetime}")
            print(f"Shop Image: {shop_image_src} ({shop_image_width} x {shop_image_height})")


            print(f"Spa Name: {spa_name}")
            print(f"Location: {location}")
            print(f"Services: {service_description}")
            
            #print(f"Description: {description}")
           
            print(f"Opening Hours: {opening_hours}")
            print(f"Contact: {contact}")
            print(f"Contact Format: {contact_format}")
            print(f"WhatsApp Contact: {whatsapp_contact}")
            print(f"WhatsApp Format: {whatsapp_format}")
            print("------------------------------------------------------")

        except Exception as e:
            print(f"An error occurred while processing an article: {e}")

except requests.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except Exception as err:
    print(f"An error occurred: {err}")


url: https://g1298.com/category/central-area/
Spa Name: Seasons Wellness Spa 43 Beach Road
Date: 20/11/2023
DateTime published: 2023-11-20T12:46:19+08:00
Shop Image: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E (500 x 300)
Spa Name: Seasons Wellness Spa 43 Beach Road
Location: 43 Beach Road Singapore 189681
Services: Massages, to Restore the Harmony in Your Body., 60min Body Massage $68, 90min Body Massage $98, 120min Body Massage $116
Opening Hours: 9.00am to 10.30pm
Contact: +65 6336 8105 
Contact Format: {'classes': 'wp-block-button__link has-white-color has-vivid-green-cyan-background-color has-text-color has-background wp-element-button', 'style': 'border-radius:10px'}
WhatsApp Contact: 6594496956
WhatsApp Format: {'classes': 'wp-image-6080', 'style': 'width:225px;height:78px'}
------------------------------------------------------
url: https://g1298.com/category/central-area/
Spa Name: Ichiban The Spa 9 Selegie Road
Dat

In [72]:
import re
import requests
from bs4 import BeautifulSoup

# Function to clean the services description
def clean_services(service_description, opening_hours):
    if opening_hours:
        return service_description.replace(opening_hours, '').strip()
    return service_description

# URL of the page to scrape
url = 'https://g1298.com/category/central-area/'

try:
    # Send a request to the website
    response = requests.get(url)
    response.raise_for_status()

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all articles in the Central Area category
    articles = soup.find_all('article', class_='category-central-area')

    for article in articles:
        try:
            spa_name = article.find('h2', class_='entry-title').get_text()
            time_element = article.find('time', class_='entry-date')
            date, datetime = time_element.get_text(), time_element['datetime']

            # Image extraction
            shop_image_tag = article.find('div', class_='post-thumbnail').find('img')
            shop_image_src, shop_image_width, shop_image_height = shop_image_tag['src'], shop_image_tag['width'], shop_image_tag['height']

            # Staff information extraction
            staff_images = article.find_all('figure', class_='wp-block-image')
            staff_info = [{'source': img.find('img')['src'], 'width': img.find('img')['width'], 'height': img.find('img')['height']} for img in staff_images]

            description = article.find('h2', class_='wp-block-heading').get_text()
            service_description = article.find('p', class_='has-text-align-center').get_text()
            opening_hours_match = re.search(r'Opening Hours\s*(\d{1,2}\.\d{2}am\s+to\s+\d{1,2}\.\d{2}(?:am|pm))', service_description, re.IGNORECASE)
            opening_hours = opening_hours_match.group() if opening_hours_match else "Not available"
            service_description = clean_services(service_description, opening_hours)

            # Contact information extraction
            contact_tag = article.find('a', href=re.compile(r'^tel:'))
            contact = contact_tag['href'][4:] if contact_tag else "Not available"

            # WhatsApp information extraction
            whatsapp_image = article.find('img', alt=True, src=True)
            whatsapp_link = whatsapp_image['src'] if whatsapp_image else None

            # Output the extracted information
            print(f"URL: {url}")
            print(f"Spa Name: {spa_name}")
            print(f"Date: {date}")
            print(f"DateTime Published: {datetime}")
            print(f"Shop Image: {shop_image_src} ({shop_image_width} x {shop_image_height})")
            print(f"Description: {description}")
            print(f"Services: {service_description}")
            print(f"Opening Hours: {opening_hours}")
            print(f"Contact: {contact}")
            print(f"WhatsApp Link: {whatsapp_link}")
            print("------------------------------------------------------")

        except Exception as e:
            print(f"An error occurred while processing an article: {e}")

except requests.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
except Exception as err:
    print(f"An error occurred: {err}")


URL: https://g1298.com/category/central-area/
Spa Name: Seasons Wellness Spa 43 Beach Road
Date: 20/11/2023
DateTime Published: 2023-11-20T12:46:19+08:00
Shop Image: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E (500 x 300)
Description: Seasons Wellness Spa located in 43 Beach Road By G1298.com – The Massage Directory
Services: Massages to Restore the Harmony in Your Body.60min Body Massage $6890min Body Massage $98120min Body Massage $11643 Beach Road Singapore 189681
Opening Hours: Opening Hours 9.00am to 10.30pm
Contact: +65 6336 8105 
WhatsApp Link: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000/svg'%20viewBox='0%200%20500%20300'%3E%3C/svg%3E
------------------------------------------------------
URL: https://g1298.com/category/central-area/
Spa Name: Ichiban The Spa 9 Selegie Road
Date: 19/11/2023
DateTime Published: 2023-11-19T15:39:57+08:00
Shop Image: data:image/svg+xml,%3Csvg%20xmlns='http://www.w3.org/2000