<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Harmony
## 1.2 Web scraping - Hafary
> Authors: Eugene Matthew Cheong
---

## Table of Contents ##

#### 1. Web Scraping

- [1.1 Scraping Lian Seng Hin Website](1.1_web_scraping_liansenghin.ipynb)
- [1.2 Scraping Hafary Website](1.2_web_scraping_hafary.ipynb)
- [1.3 Scraping Lamitak Website](1.3_web_scraping_lamitak.ipynb)
- [1.4 Scraping Nippon Website](1.4_web_scraping_nippon.ipynb)
- [1.5 Consolidate All Product Database](1.5_consolidate_product_database.ipynb)

#### 2. Preprocessing

- [2.1 Processing Canva Palettes](2.1_processing_canva_palette.ipynb)

#### 3. Modelling

- [3.1 Matching Input Photo to Products](3.1_matching_input_photo_to_products.ipynb)
- [3.2 Recommending Canva Palette to Products](3.2_recommending_canva_palette_to_product.ipynb)
- [3.3 Recommending Colours and Colour Palettes with Llama3](3.3_recommending_colours_and_colour_palettes_with_llama3.ipynb)

---

# Import Module

In [None]:
import os
import re
import time
import shutil

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Website to scrape

- https://www.hafary.com.sg/collections/Tiles

In [None]:
# Root folder for every scraped image; Hafary images get their own subfolder.
data_img_folder = "../datasets/images"
hafary_img_folder = os.path.join(data_img_folder, "hafary")

# Scraping for Hafary tile products

### Function to scrape information required per Hafary page

In [None]:
# Function to scrape images and labels from a single page
def scrape_page(url, input_folder, typeofproduct):
    """Scrape one Hafary collection page and download each product image on it.

    Args:
        url: Address of the collection page to fetch.
        input_folder: Directory the images are saved into (created if missing).
        typeofproduct: Value stored under the "Type" key of every record.

    Returns:
        A list with one dict per image found, with the keys "Model Name",
        "Product URL", "Filename", "Company" and "Type". The list is empty
        when the page cannot be fetched or contains no product grid.
    """
    # Create a directory to store images
    os.makedirs(input_folder, exist_ok=True)

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        # Return a list (not None) so callers can always extend with the result.
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # The product tiles are the anchors inside this container.
    grid = soup.find('div', class_="div_global_right_content collection")
    if grid is None:
        print(f"No product grid found on {url}")
        return []

    product_list = []
    for container in grid.find_all('a'):
        title = container.find('div', class_="div_global_grid_title")
        href = container.get('href')
        if title is None or href is None:
            # An anchor without a title or a link cannot be turned into a record.
            continue

        product_site = f"https://www.hafary.com.sg{href}"
        image_label = title.text.strip()
        # The cleaned label only depends on the anchor, so build it once.
        image_label_clean = image_label.replace('.', '-').replace(' ', '-').replace('/', '-').lower()

        for image in container.find_all('div', class_="div_global_grid_image_collection"):
            img_tag = image.find('img')
            image_url = img_tag.get('src') if img_tag is not None else None
            if not image_url:
                continue
            image_url_correct = f"https://www.hafary.com.sg{image_url}"

            # Remove file extension; every image is stored with a .jpg suffix.
            image_basename = os.path.basename(image_url)
            image_name = os.path.splitext(image_basename)[0]
            image_filename = f"{image_label_clean}_{image_name}"

            image_dict = {"Model Name": image_label,
                          "Product URL": product_site,
                          "Filename": f"{image_filename}.jpg",
                          "Company": "Hafary",
                          "Type": typeofproduct}

            # Download image and save with label
            download_image(image_url_correct, image_filename, input_folder)
            product_list.append(image_dict)

            print(f"Image Label Found: {image_label}")
            print(f"Image Product Site Found: {product_site}")
            print(f"Image URL Found: {image_url_correct}")

    return product_list

### Function to download images with given URL

In [None]:
# Function to download image and save with label
def download_image(url, label, input_folder):
    """Download the image at *url* and save it as ``<label>.jpg`` in *input_folder*.

    Nothing is written when the server does not answer with HTTP 200, so an
    error page is never stored under an image filename.

    Args:
        url: Address of the image to download.
        label: Filename (without extension) to store the image under.
        input_folder: Existing directory the file is written into.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to download image: {url}")
        return

    filename = f"{label}.jpg"
    image_filepath = os.path.join(input_folder, filename)
    with open(image_filepath, 'wb') as f:
        f.write(response.content)
    print(f"Image saved: {image_filepath}")

### Function to scrape Hafary pages

In [None]:
# Main function to iterate through pages and scrape
def hafary_main(base_url, typeofproduct):
    """Walk the numbered pages of a Hafary collection and scrape each one.

    Pages are requested as ``<base_url>?Page=<n>/`` starting at 1. The walk
    stops at the first page that does not return HTTP 200 or that contains no
    product anchors. Images are saved into the module-level
    ``hafary_img_folder``.

    Args:
        base_url: Address of the collection, without the page query.
        typeofproduct: Value stored under the "Type" key of every record.

    Returns:
        A list with the records of all scraped pages, in page order.
    """
    page_number = 1
    all_product_list = []

    while True:
        page_url = f"{base_url}?Page={page_number}/"
        response = requests.get(page_url)
        if response.status_code != 200:
            print("No more pages. Exiting.")
            break

        # Find if there is any more content of interest in the page, else break.
        soup = BeautifulSoup(response.content, 'html.parser')
        grid = soup.find('div', class_="div_global_right_content collection")
        # A page without the grid container is treated like an empty page
        # instead of failing on None.find_all(...).
        if grid is None or not grid.find_all('a'):
            print("No more pages. Exiting.")
            break

        print(f"Scraping page {page_number}...")
        product_list_page = scrape_page(page_url, hafary_img_folder, typeofproduct)
        # scrape_page fetches the page again on its own; if that second fetch
        # yields nothing, keep going with what has been collected so far.
        all_product_list += product_list_page or []
        page_number += 1

    return all_product_list

In [None]:
# Scrape the whole tile collection and report how long it took.
base_url = "https://www.hafary.com.sg/collections/Tiles"

start_time = time.time()
all_product_list = hafary_main(base_url, "Tiles")
end_time = time.time()

runtime = end_time - start_time
print("Scraping Runtime:", runtime, "seconds")

Converting to Dataframe

In [None]:
# Turn the list of scraped records into a DataFrame (one row per record) and display it.
product_df = pd.DataFrame(all_product_list)
product_df

Checking whether there are any null values in the 'Product URL' column

In [None]:
# Number of rows that have no product URL.
product_df['Product URL'].isna().sum()

### Function to populate tile details

In [None]:
def populate_tile_details(row):
    """Fetch a product page and add its specification fields to *row*.

    The page's "more info" section is read as header/description pairs. From
    those pairs the row receives 'Category Tags' (material, features and
    variation joined by ", "), 'Application', 'Width (cm)', 'Height (cm)' and
    'Origin Country'. A field that is absent from the page is reported with a
    printed message and left unset instead of aborting the run. 'Type' is
    always set to "Tiles".

    Args:
        row: A DataFrame row that has a 'Product URL' entry.

    Returns:
        The updated row, or None when the product page cannot be fetched.
    """
    print("Processing row:", row.name)
    product_url = row['Product URL']

    response = requests.get(product_url)
    if response.status_code != 200:
        print(f"Failed to fetch {product_url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the columns of the specification section; a page without the
    # section simply yields no fields.
    info_section = soup.find('div', "div_more_info")
    if info_section is None:
        print(f"Error: No Product Details Found for {product_url}")
        info_containers = []
    else:
        info_containers = info_section.find_all('div', class_='column')

    # Pair every header with the description at the same position in its column.
    all_dict = {}
    for container in info_containers:
        product_headers = container.find_all("div", class_="div_microsite_sub_title_header")
        product_descriptions = container.find_all("div", class_="div_microsite_sub_title")
        all_dict.update(zip(
            (header.text.strip() for header in product_headers),
            (description.text.strip() for description in product_descriptions),
        ))

    # Collect the available parts first so a missing leading part does not
    # leave a stray ", " at the start of the tags.
    description_parts = []
    for key, label in (('MATERIAL', 'Materials'),
                       ('FEATURES', 'Features'),
                       ('VARIATION', 'Variations')):
        if key in all_dict:
            description_parts.append(all_dict[key])
        else:
            print(f"Error: No {label} Found for {product_url}")

    # NOTE(review): "mm" is stripped from the text, yet the values are stored
    # in columns labelled "(cm)" - confirm the intended unit.
    dimension_list = []
    if 'AVAILABLE DIMENSIONS' in all_dict:
        dimension_text = all_dict['AVAILABLE DIMENSIONS']
        for token in ("(L)", "(W)", "mm", " "):
            dimension_text = dimension_text.replace(token, "")
        dimension_list = dimension_text.split("×")
    if len(dimension_list) >= 2:
        row['Width (cm)'] = dimension_list[0]
        row['Height (cm)'] = dimension_list[1]
    else:
        print(f"Error: No Dimensions Found for {product_url}")

    if 'COUNTRY OF ORIGIN' in all_dict:
        row['Origin Country'] = all_dict['COUNTRY OF ORIGIN']
    else:
        print(f"Error: No Country of Origin Found for {product_url}")

    if 'APPLICATION' in all_dict:
        # The applications are run together in the text; split them in front
        # of every capital letter.
        application_split_text_list = re.findall('[A-Z][^A-Z]*', all_dict['APPLICATION'])
        row['Application'] = ", ".join(application_split_text_list)
    else:
        print(f"Error: No Application Found for {product_url}")

    row['Category Tags'] = ", ".join(description_parts)
    row['Type'] = "Tiles"

    return row
          
          

In [None]:
# Enrich every product row from its own product page and time the run.
start_time = time.time()

detailed_df = product_df.apply(populate_tile_details, axis=1)
# dropna() with its defaults removes every row that has at least one missing value.
product_df = detailed_df.dropna()

end_time = time.time()

runtime = end_time - start_time
print("Scraping Runtime:", runtime, "seconds")

In [None]:
# Display the product table after the detail columns have been added.
product_df


Some filenames contain special characters such as `*`, so the affected files need to be renamed.

In [None]:
def rename_files(df, folder):
    """Replace '*' and backslashes in the 'Filename' column and on disk.

    Every filename containing one of the two characters gets them replaced
    by '-'. The DataFrame is updated in place and the matching file inside
    *folder* is renamed; a file that is not on disk is only reported.

    Args:
        df: DataFrame with a 'Filename' column.
        folder: Directory that holds the files named in that column.

    Returns:
        The same DataFrame object, with the cleaned filenames.
    """
    # Work on a snapshot of the names so the updates below cannot interfere
    # with the iteration.
    for index, original_filename in list(df['Filename'].items()):
        if '*' not in original_filename and '\\' not in original_filename:
            continue

        new_filename = original_filename.replace('*', '-').replace('\\', '-')
        df.at[index, 'Filename'] = new_filename

        original_filepath = os.path.join(folder, original_filename)
        if not os.path.exists(original_filepath):
            print(f"File does not exist: {original_filepath}")
            continue
        os.rename(original_filepath, os.path.join(folder, new_filename))

    return df

In [None]:
# Clean '*' and '\' out of the stored filenames and rename the matching image files.
product_df = rename_files(product_df, hafary_img_folder)

In [None]:
# Count the filenames that still contain a literal '*' (expected to be 0 after renaming).
product_df['Filename'].str.contains('*', regex=False).sum()

# Export Dataframe to CSV

In [None]:
# Location of the exported CSV, and the folder that keeps superseded exports.
file_path = '../datasets/hafary_df.csv'
archive_dataset_path = "../datasets/archive_dataset/"

Archives the old csv and updates with the current list

In [None]:
# Create the archive folder if it doesn't exist
if not os.path.exists(archive_dataset_path):
    os.makedirs(archive_dataset_path)

# An earlier export is moved into the archive under a timestamped name
# before the new one is written.
if os.path.isfile(file_path):
    archive_timestamp = pd.Timestamp.now().strftime('%Y%m%d%H%M%S')
    archived_csv_path = os.path.join(
        archive_dataset_path, f"hafary_df_archived_{archive_timestamp}.csv"
    )
    shutil.move(file_path, archived_csv_path)

In [None]:
# Write to the same path the archive step checks, so the two cannot drift apart.
product_df.to_csv(file_path)

After scraping, I noticed that some of the downloaded images are not correct: instead of showing the tile itself, they show a room. I will update these images later.

# Find missing files and update to the correct image

In [None]:
# The CSV was written with its index, so read the first column back as the
# index instead of getting an extra "Unnamed: 0" column.
product_df = pd.read_csv("../datasets/hafary_df.csv", index_col=0)

In [None]:
# Collect the path of every catalogue image that is absent from the image folder.
missing_image_list = []

for filename in product_df['Filename']:
    full_image_filepath = os.path.join(hafary_img_folder, filename)
    # Only images that cannot be found belong in the "missing" list.
    if not os.path.exists(full_image_filepath):
        print(f"Error finding image path: {full_image_filepath}")
        missing_image_list.append(full_image_filepath)

# Moving old product image to archive when it is no longer in the CSV

When the catalogue is updated, images of products that are no longer listed in the CSV are moved to an archive folder so that they are not included in the recommendations.

In [None]:
# Names of all entries in the Hafary image folder (bare names, without the folder path).
listdir = os.listdir(hafary_img_folder)

In [None]:
# Create the archive folder if it doesn't exist
archive_img_path = os.path.join(hafary_img_folder, "archived")
os.makedirs(archive_img_path, exist_ok=True)

# Exact-name lookup: a substring or regex match would let "a.jpg" count as
# present just because "aa.jpg" is in the catalogue.
catalogue_filenames = set(product_df['Filename'].astype(str))

# Iterate over all files in the image folder
for image in listdir:
    image_path = os.path.join(hafary_img_folder, image)
    # The listing holds bare names, so the folder has to be joined back on
    # before testing; this also skips sub-folders such as the archive itself.
    if not os.path.isfile(image_path):
        continue

    image_name = os.path.basename(image)
    if image_name in catalogue_filenames:
        continue

    # The image is no longer part of the catalogue: move it to the archive.
    try:
        shutil.move(image_path, os.path.join(archive_img_path, image))
        print(f'Image moved to archived: {image_path}')
    except (OSError, shutil.Error):
        print(f'Error: Image not found: {image_path}')

    print(image_name)


---

### Next Notebook: [1.3 Scraping Lamitak Website](1.3_web_scraping_lamitak.ipynb)