<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Harmony
## 1.4 Web scraping - Nippon
> Authors: Eugene Matthew Cheong
---

## Table of Contents ##

#### 1. Web Scraping

- [1.1 Scraping Lian Seng Hin Website](1.1_web_scraping_liansenghin.ipynb)
- [1.2 Scraping Hafary Website](1.2_web_scraping_hafary.ipynb)
- [1.3 Scraping Lamitak Website](1.3_web_scraping_lamitak.ipynb)
- [1.4 Scraping Nippon Website](1.4_web_scraping_nippon.ipynb)
- [1.5 Consolidate All Product Database](1.5_consolidate_product_database.ipynb)

#### 2. Preprocessing

- [2.1 Processing Canva Palettes](2.1_processing_canva_palette.ipynb)

#### 3. Modelling

- [3.1 Matching Input Photo to Products](3.1_matching_input_photo_to_products.ipynb)
- [3.2 Recommending Canva Palette to Products](3.2_recommending_canva_palette_to_product.ipynb)
- [3.3 Recommending Colours and Colour Palettes with Llama3](3.3_recommending_colours_and_colour_palettes_with_llama3.ipynb)

---

# Import Modules

In [None]:
import os
import shutil
import time

import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Website to scrape
- https://nipponpaint.com.sg/colours/find-your-colour/

In [None]:
# Root folder that holds the scraped product images.
data_img_folder = "../datasets/images"
# Sub-folder of the image root used for the Nippon colour images.
nippon_img_folder =  os.path.join(data_img_folder,"nippon")

# Gathering the "Colour Family" links, as each one is treated as a "page" on this website.

In [None]:
# Landing page from which the "Colour Family" links are collected.
main_web_color_page = 'https://nipponpaint.com.sg/colours/find-your-colour/'

In [None]:
# Fetch the landing page and collect the link of every "Colour Family" page.
# The list is created up front so that it always exists (possibly empty),
# even when the request fails or the expected markup is missing.
colour_links_list = []

response = requests.get(main_web_color_page)
if response.status_code != 200:
    print(f"Failed to fetch {main_web_color_page}")
else:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find colour containers within the specified class
    colour_family_container = soup.find('div', class_="colour-family")
    if colour_family_container is None:
        # soup.find returns None when no matching element is present.
        print(f"No colour family container found on {main_web_color_page}")
    else:
        # Only anchors that actually carry an href can be followed.
        colour_link_found = colour_family_container.find_all('a', href=True)
        for colour in colour_link_found:
            colour_links_list.append(colour['href'])
            print(f'Colour Link Found: {colour["href"]}')

## Function to scrape information required per Nippon Colour Family page

In [None]:
# Function to scrape paint labels and product links from a single page
def scrape_page(url, input_folder):
    """Scrape every paint listed on one colour-family page.

    Args:
        url: Address of the colour-family page to fetch.
        input_folder: Not used by this function; kept so that existing
            callers keep working.

    Returns:
        A list with one dict per paint found, using the keys "Model Name",
        "Product URL", "Type", "Application", "Company", "Origin Country"
        and "Category Tags". The list is empty when the page cannot be
        fetched or does not contain the expected markup, so callers can
        always extend another list with the result.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print(f"Failed to fetch {url} ({err})")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the box that wraps all colour cards on the page
    item_containers = soup.find('div', class_="colours-item-box")
    if item_containers is None:
        print(f"No colour items found on {url}")
        return []

    product_list = []

    for image in item_containers.find_all('div', class_='col-6 col-md-3'):
        for colour_info in image.find_all('h5', class_='card-title'):
            link = colour_info.find('a')
            if link is None or not link.get('href'):
                # A title without a link has no product page to visit later.
                continue

            paint_label = colour_info.text.strip()
            paint_url = link['href']

            print(f"Paint Label Found: {paint_label}")
            print(f"Paint URL Found: {paint_url}")

            product_list.append({
                "Model Name": paint_label,
                "Product URL": paint_url,
                "Type": "Paint",
                "Application": "Wall",
                "Company": "Nippon",
                "Origin Country": "None",
                "Category Tags": "",
            })

    return product_list

## Function to download images with given URL

In [None]:
# Function to download image and save with label
def download_image(url, label, input_folder):
    """Download the image at ``url`` and save it as ``<label>.jpg``.

    Args:
        url: Address of the image to download.
        label: Base name (without extension) for the saved file.
        input_folder: Folder the file is written to; created if missing.

    Returns:
        None. When the server does not answer with status 200 nothing is
        written, so an error page is never stored as an image.
    """
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return

    os.makedirs(input_folder, exist_ok=True)
    image_filepath = os.path.join(input_folder, f"{label}.jpg")
    with open(image_filepath, 'wb') as f:
        f.write(response.content)
    print(f"Image saved: {image_filepath}")

# Launch scraping for Nippon products

In [None]:
# Launch scraping for Nippon paint products
start_time = time.time()

all_product_list = []

for link in colour_links_list:
    product_list_page = scrape_page(link, nippon_img_folder)
    # Treat a missing or empty page result as "no products" so that one
    # failed page does not abort the whole run with a TypeError.
    all_product_list += product_list_page or []

end_time = time.time()

runtime = end_time - start_time
print("Scraping Runtime:", runtime, "seconds")


# Converting to DataFrame

In [None]:
# Build the product table from the list of scraped product entries.
product_df = pd.DataFrame(all_product_list)
# Bare expression: shown as the cell output when run in a notebook.
product_df

# Populate Paint details

In [None]:
def convert_hex_to_rgb(input_hex):
    """Convert a hex colour string into an ``(R, G, B)`` tuple of ints.

    Leading ``#`` characters and surrounding whitespace are ignored, and the
    three-digit shorthand is expanded (``#abc`` is read as ``#aabbcc``).

    Args:
        input_hex: Colour such as ``"#ff8800"``, ``"ff8800"`` or ``"#f80"``.

    Returns:
        Tuple of three integers, each in the range 0-255.

    Raises:
        ValueError: If the string does not hold enough valid hex digits.
    """
    h = input_hex.strip().lstrip('#')
    if len(h) == 3:
        # Shorthand form: every digit stands for a doubled pair.
        h = ''.join(digit * 2 for digit in h)
    return tuple(int(h[i:i+2], 16) for i in (0, 2, 4))

In [None]:
def populate_paint_details(row):
    """Fetch one paint's product page and add its colour details to the row.

    Written to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Row holding at least a 'Product URL' entry.

    Returns:
        The same row. On success it gains 'Color Code', 'Color R',
        'Color G', 'Color B' and 'Model Number'. On any failure the row is
        returned unchanged, so every call yields a row.
    """
    print("Processing row:", row.name)
    product_url = row['Product URL']
    print(product_url)
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(product_url, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch {product_url}")
            return row

        soup = BeautifulSoup(response.content, 'html.parser')

        # Block that holds both the colour swatch and the model heading
        info_containers = soup.find('div', class_="row my-4")

        # The hex code is read from the swatch's inline background-color style.
        swatch_style = info_containers.find('div', class_="color-box")['style']
        color_code = swatch_style.replace("background-color: ", "").replace(";", "")
        color_code_rgb = convert_hex_to_rgb(color_code)

        model_number = info_containers.find('div', class_="col-lg-12 col-12").find('h2').text.strip()

        print(f"Paint Color Code Found: {color_code}")
        print(f"Paint Color RGB Found: {color_code_rgb}")
        print(f"Paint Model Number Found: {model_number}")

        # All lookups are done before the first assignment, so a row is
        # either fully populated or left untouched.
        row['Color Code'] = color_code
        row['Color R'] = color_code_rgb[0]
        row['Color G'] = color_code_rgb[1]
        row['Color B'] = color_code_rgb[2]
        row['Model Number'] = model_number
    except Exception as err:
        # Best effort per row: report the cause and carry on with the next
        # row. KeyboardInterrupt is not caught, so the run can be stopped.
        print(f"Unable to process row: {product_url}. Please try again.")
        print(f"Reason: {type(err).__name__}: {err}")

    return row
          

In [None]:
# Time the per-row detail scrape.
start_time = time.time()

# axis=1 calls populate_paint_details once per row; the result replaces product_df.
product_df =  product_df.apply(populate_paint_details, axis=1)

end_time = time.time()

runtime = end_time - start_time
print("Scraping Runtime:", runtime, "seconds")

In [None]:
# Inspect the dataframe after the detail scrape.
product_df

# Generate a color PNG for each model and update the dataframe with the product filename

In [None]:
# Inspect the dataframe before generating the color images.
product_df

In [None]:
def save_color_images(row):
    """Save the row's RGB colour as a solid PNG and record the filename.

    Args:
        row: Row holding 'Color R', 'Color G', 'Color B' and 'Model Number'.

    Returns:
        The same row with 'Filename' set to the base name of the PNG that
        was written into ``nippon_img_folder``.
    """
    print("Processing row:", row.name)
    # Ensure the folder exists; exist_ok makes a separate existence check unnecessary.
    os.makedirs(nippon_img_folder, exist_ok=True)

    height, width, channel = 300, 300, 3

    # Red, green and blue components of the colour
    # NOTE(review): presumably every row carries valid 0-255 values here;
    # confirm that rows whose detail scrape failed are filtered out first.
    red, green, blue = row['Color R'], row['Color G'], row['Color B']

    # Array of the given height and width, filled with the one colour
    image_data = np.full((height, width, channel), [red, green, blue], dtype='uint8')

    plt.imshow(image_data)
    plt.axis('off')

    # Save the figure, named after the model number
    output_img_filename = os.path.join(nippon_img_folder, f"{row['Model Number']}.png")
    print(f"Saving image: {output_img_filename}")
    plt.savefig(output_img_filename, bbox_inches='tight', pad_inches=0)
    plt.close()
    row['Filename'] = os.path.basename(output_img_filename)
    print(row)

    return row

In [None]:
# Time the color image generation.
start_time = time.time()

# axis=1 calls save_color_images once per row; the result replaces product_df.
product_df = product_df.apply(save_color_images, axis=1)

end_time = time.time()

runtime = end_time - start_time
print("Generating Color PNG:", runtime, "seconds")

In [None]:
# Inspect the dataframe after the image generation step.
product_df

# Export Dataframe to CSV

In [None]:
# NOTE(review): archive_dataset_path is not used by any other cell visible in this notebook.
archive_dataset_path = "../datasets/archive_dataset/"
# Destination CSV for the product table.
file_path = '../datasets/nippon_df.csv'

In [None]:
# Written with pandas' default options, so the row index is saved as the first column.
product_df.to_csv(file_path)

# Find missing files and update to the correct image

In [None]:
# Collect the paths of expected images that are absent from the image folder.
missing_image_list = []

for image_filename in product_df['Filename']:
    full_image_filepath = os.path.join(nippon_img_folder, image_filename)
    # Only files that do NOT exist belong in the "missing" list.
    if not os.path.exists(full_image_filepath):
        missing_image_list.append(full_image_filepath)
        print(f"Error finding image path: {full_image_filepath}")

# Moving old product image to archive when it is no longer in the CSV

When the catalogue is updated, images of products that are no longer listed in the CSV are moved to an archive folder so that they are not included in the recommendations.

In [None]:
# Entry names (not full paths) currently present in the Nippon image folder.
listdir = os.listdir(nippon_img_folder)

In [None]:
archive_img_path = os.path.join(nippon_img_folder, "archived")
# Create the archive folder if it doesn't exist
os.makedirs(archive_img_path, exist_ok=True)

# Filenames still referenced by the dataframe. A set gives exact-match
# lookups, so a name such as "1.png" is not mistaken for part of "11.png"
# and the dots are not treated as regex wildcards.
catalogue_filenames = set(product_df['Filename'].astype(str))

# Iterate over all entries in the image folder
for image in listdir:
    source_path = os.path.join(nippon_img_folder, image)

    # listdir holds bare names, so the folder has to be prepended before the
    # file test; this also skips sub-folders such as the archive itself.
    if not os.path.isfile(source_path):
        continue

    image_name = os.path.basename(image)

    # Images that are still in the dataframe stay where they are.
    if image_name in catalogue_filenames:
        continue

    # Move the file to the archive folder
    try:
        shutil.move(source_path, os.path.join(archive_img_path, image))
        print(f'Image moved to archived: {source_path}')
    except OSError as err:
        print(f'Error: Could not archive image: {source_path} ({err})')

    print(image_name)


---

### Next Notebook: [1.5 Consolidate All Product Database](1.5_consolidate_product_database.ipynb)