<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Harmony
## 1.3 Web scraping - Lamitak
> Authors: Eugene Matthew Cheong
---

## Table of Contents ##

#### 1. Web Scraping

- [1.1 Scraping Lian Seng Hin Website](1.1_web_scraping_liansenghin.ipynb)
- [1.2 Scraping Hafary Website](1.2_web_scraping_hafary.ipynb)
- [1.3 Scraping Lamitak Website](1.3_web_scraping_lamitak.ipynb)
- [1.4 Scraping Nippon Website](1.4_web_scraping_nippon.ipynb)
- [1.5 Consolidate All Product Database](1.5_consolidate_product_database.ipynb)

#### 2. Preprocessing

- [2.1 Processing Canva Palettes](2.1_processing_canva_palette.ipynb)

#### 3. Modelling

- [3.1 Matching Input Photo to Products](3.1_matching_input_photo_to_products.ipynb)
- [3.2 Recommending Canva Palette to Products](3.2_recommending_canva_palette_to_product.ipynb)
- [3.3 Recommending Colours and Colour Palettes with Llama3](3.3_recommending_colours_and_colour_palettes_with_llama3.ipynb)

---

# Import Modules

In [None]:
import os
import re
import time
import shutil

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Websites to scrape
- https://sg.lamitak.com/collections/woods
- https://sg.lamitak.com/collections/solids
- https://sg.lamitak.com/collections/patterns
- https://sg.lamitak.com/collections/specialities

In [None]:
# Root folder for scraped product images; Lamitak images go into their own subfolder
data_img_folder = "../datasets/images"
lamitak_img_folder =  os.path.join(data_img_folder,"lamitak")

## Function to scrape information required per Lamitak page

In [None]:
# Function to scrape images and labels from a single page
def scrape_page(url, design, input_folder):
    """Scrape one Lamitak collection page and download every product image on it.

    Args:
        url: Collection page URL to fetch.
        design: Design name stored in each product's "Category Tags".
        input_folder: Directory the images are saved into (created if missing).

    Returns:
        A list of product dicts with the keys "Model Name", "Product URL",
        "Filename", "Company", "Type" and "Category Tags". The list is empty
        when the page cannot be fetched, so callers can always concatenate it.
    """
    # Create a directory to store images
    os.makedirs(input_folder, exist_ok=True)

    # A timeout keeps a stalled connection from hanging the whole scrape
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find image containers within the specified class
    image_containers = soup.find_all('li', class_="col-md-3 col-sm-6 col-xs-6 product-item")

    # The image URL sits inside the anchor's inline style: url('...')
    pattern = r"(?<=url\(').*?(?='\))"

    product_list = []

    # Extract image URLs and labels
    for container in image_containers:
        try:
            image = container.find('a')['style']
            image_label = container.find('span').text.strip()
            product_site = container.find("a", class_="product-image")['href']
        except (TypeError, KeyError, AttributeError):
            # A missing tag (None) or attribute: skip this item, keep the rest of the page
            print(f"Skipping product item with unexpected markup on {url}")
            continue

        url_match = re.search(pattern, image)
        if url_match is None:
            print(f"Skipping product item without an image URL on {url}")
            continue
        image_url = url_match.group()

        image_filename = os.path.splitext(os.path.basename(image_url))[0]

        image_dict = {"Model Name": image_label,
                      "Product URL": product_site,
                      "Filename": f"{image_filename}.jpg",
                      "Company": "lamitak",
                      "Type": "Laminate",
                      "Category Tags": f"{design},"}

        # Download image and save with label
        download_image(image_url, image_filename, input_folder)
        product_list.append(image_dict)
        print(f"Image Label Found: {image_label}")
        print(f"Image Product Site Found: {product_site}")
        print(f"Image URL Found: {image_url}")

    return product_list



## Function to download images with given URL

In [None]:
# Function to download image and save with label
def download_image(url, label, input_folder):
    """Download the image at *url* and save it as ``<label>.jpg`` in *input_folder*.

    Args:
        url: Direct URL of the image.
        label: Filename (without extension) to save the image under.
        input_folder: Existing directory to write the file into.

    Returns:
        None. Nothing is written when the server does not answer with HTTP 200.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        # Do not save an error page under an image filename
        print(f"Failed to download image {url}")
        return

    filename = f"{label}.jpg"
    image_filepath = os.path.join(input_folder, filename)
    with open(image_filepath, 'wb') as f:
        f.write(response.content)
    print(f"Image saved: {image_filepath}")

## Function to scrape all Lamitak collection pages

In [None]:
# Main function to iterate through pages and scrape
def lamitak_scrape(base_url_list):
    """Scrape every Lamitak collection page listed in *base_url_list*.

    Args:
        base_url_list: Dict mapping a design name to its collection page URL.
            Pages are scraped in the dict's insertion order and the design
            name is passed on to ``scrape_page``.

    Returns:
        One flat list with the product dicts of all pages.
    """
    all_product_list = []

    # Take the designs from the dict itself so they cannot drift from a separate list
    for design, selectedurl in base_url_list.items():
        print(f"Scraping page {selectedurl}...")
        product_list_page = scrape_page(selectedurl, design, lamitak_img_folder)
        # Guard against an empty result (e.g. None) from a page that failed to load
        all_product_list.extend(product_list_page or [])

    return all_product_list

# Launch scraping for Lamitak laminate products

In [None]:
# Collection page URL per design; the keys are the design names that
# lamitak_scrape looks up and that end up in each product's "Category Tags"
base_url_list = {"wood": "https://sg.lamitak.com/collections/woods",
                 "solid-colors": "https://sg.lamitak.com/collections/solids",
                 "patterns": "https://sg.lamitak.com/collections/patterns",
                 "specialities": "https://sg.lamitak.com/collections/specialities"
                 }

In [None]:
# Launch scraping for Lamitak laminate products and time the whole run
start_time = time.time()

all_product_list = lamitak_scrape(base_url_list)

end_time = time.time()

# Wall-clock duration of the scrape, in seconds
runtime = end_time - start_time
print("Scraping Runtime:", runtime, "seconds")


Converting to Dataframe

In [None]:
# One row per scraped product; the columns come from the product dict keys
product_df = pd.DataFrame(all_product_list)
product_df

In [None]:
# Number of products scraped per design tag
product_df['Category Tags'].value_counts()

# Populate Laminate details

In [None]:
def populate_tile_details(row):
    """Enrich one product row with details scraped from its Lamitak product page.

    Written for ``DataFrame.apply(populate_tile_details, axis=1)``.

    Args:
        row: Row holding at least a 'Product URL' entry.

    Returns:
        The row with 'Width (cm)', 'Height (cm)', 'Origin Country' and
        'Application' filled in, and 'Category Tags' replaced by a description
        built from the page's Type / Finish / Grains details. When the page
        cannot be fetched or has no details section, the row is still returned
        (with "None" dimensions and its existing 'Category Tags') so that every
        row yields the same columns.
    """
    product_url = row['Product URL']
    print(product_url)

    # Fill the new columns up front so that early exits still return a complete row
    row['Width (cm)'] = "None"
    row['Height (cm)'] = "None"
    row['Origin Country'] = "None"
    row['Application'] = "Carpentry"

    # A timeout keeps a stalled connection from hanging the whole apply()
    response = requests.get(product_url, timeout=30)
    if response.status_code != 200:
        print(f"Failed to fetch {product_url}")
        return row

    soup = BeautifulSoup(response.content, 'html.parser')

    # The product details live in <dl> tables inside the product info container
    info_root = soup.find('div', "col-md-6 col-xs-12 product-info-container")
    if info_root is None:
        print(f"Error: No Product Details Found for {product_url}")
        return row
    info_containers = info_root.find_all('dl', class_='product-details-table')

    # Pair every <dt> header with its <dd> description across all tables
    all_dict = {}
    for container in info_containers:
        product_headers_list = [header.text.strip() for header in container.find_all("dt")]
        product_descriptions_list = [description.text.strip() for description in container.find_all("dd")]
        all_dict.update(zip(product_headers_list, product_descriptions_list))

    # Build the description from whichever details are present; joining the
    # parts avoids a leading ", " when the Type is missing
    description_parts = []
    for detail, suffix in (("Type", ""), ("Finish", " Finish"), ("Grains", " Grains")):
        if detail in all_dict:
            description_parts.append(f"{all_dict[detail]}{suffix}")
        else:
            print(f"Error: No {detail} Found for {product_url}")

    # Processing dimensions: the size text is split on "/" after dropping "W"
    try:
        width, height = all_dict['Newedge Size'].replace("W", "").split("/")[:2]
    except (KeyError, ValueError):
        # Size missing or not in the "<width>/<height>" form: keep the "None" placeholders
        print(f"Error: No Measurements Found for {product_url}")
    else:
        row['Width (cm)'] = width
        row['Height (cm)'] = height

    row['Category Tags'] = ", ".join(description_parts)

    print(row)
    return row

          
          

In [None]:
# Fetch the detail page of every product (one HTTP request per row) and time the run
start_time = time.time()

product_df =  product_df.apply(populate_tile_details, axis=1)

end_time = time.time()

# Wall-clock duration of the detail scrape, in seconds
runtime = end_time - start_time
print("Scraping Runtime:", runtime, "seconds")

In [None]:
# Display the DataFrame after the product details were populated
product_df

# Export Dataframe to CSV

In [None]:
# Folder that receives timestamped copies of previous exports
archive_dataset_path = "../datasets/archive_dataset/"
# Location of the current Lamitak product CSV
file_path = '../datasets/lamitak_df.csv'

Archive the old CSV (if one exists) under a timestamped name so that it can be replaced with the current product list.

In [None]:
# Create the archive folder if it doesn't exist
os.makedirs(archive_dataset_path, exist_ok=True)

# Move an existing export into the archive folder under a timestamped name
if os.path.isfile(file_path):
    archived_csv_name = f"lamitak_df_archived_{pd.Timestamp.now().strftime('%Y%m%d%H%M%S')}.csv"
    shutil.move(file_path, os.path.join(archive_dataset_path, archived_csv_name))

In [None]:
# Write the current product list to the CSV path.
# NOTE(review): the row index is written as well (no index=False), so reading
# the file back yields an extra unnamed first column - confirm this is wanted.
product_df.to_csv(file_path)

After scraping, I noticed that some of the downloaded images are incorrect: they show a room scene instead of the laminate itself. I will replace these images later.

# Find missing files and update to the correct image

In [None]:
# Reload the exported product list from disk
product_df = pd.read_csv(file_path)

In [None]:
# Collect the image paths listed in the CSV that do not exist in the Lamitak image folder
missing_image_list = []

for filename in product_df['Filename']:
    full_image_filepath = os.path.join(lamitak_img_folder, filename)
    # Only paths that are absent belong in the "missing" list
    if not os.path.exists(full_image_filepath):
        missing_image_list.append(full_image_filepath)
        print(f"Error finding image path: {full_image_filepath}")

# Moving old product image to archive when it is no longer in the CSV

When the catalogue is updated, images of products that are no longer listed in the CSV are moved to an archive folder so that they are not included in the recommendations.

In [None]:
# Bare entry names (not full paths) currently in the Lamitak image folder
listdir = os.listdir(lamitak_img_folder)

In [None]:
archive_img_path = os.path.join(lamitak_img_folder, "archived")
os.makedirs(archive_img_path, exist_ok=True)  # Create the archive folder if it doesn't exist

# Exact filenames still referenced by the CSV; a set gives exact, O(1) matching
# instead of a regex substring search per image
catalogue_filenames = set(product_df['Filename'].astype(str))

# Iterate over all entries in the image folder
for image in listdir:
    # listdir holds bare names, so resolve them against the image folder;
    # this also skips sub-folders such as "archived"
    image_path = os.path.join(lamitak_img_folder, image)
    if not os.path.isfile(image_path):
        continue

    # Images still listed in the CSV stay where they are
    if image in catalogue_filenames:
        continue

    # Move the file to the archive folder
    try:
        shutil.move(image_path, os.path.join(archive_img_path, image))
        print(f'Image moved to archived: {image_path}')
    except OSError as err:
        print(f'Error: Could not archive image: {image_path} ({err})')

    print(image)


---

### Next Notebook: [1.4 Scraping Nippon Website](1.4_web_scraping_nippon.ipynb)