### 1. Import Needed Libraries
<pre>
Input   : -
Output  : -
Process :
Import the Python libraries and modules this notebook needs:
    1. os : Directory management and path manipulation.
    2. csv : Reading and writing CSV data.
    3. requests : Downloading images from the website.
    4. urllib.parse : Breaking URLs into components and parsing query strings into dictionaries.
</pre>

In [None]:
import os
import csv
import requests
from urllib.parse import urlparse

### 2. Run Image Downloader
<pre>
Input   : CSV file named image_link_cleaned.csv (Product image links and Alt)
Output  : Product images in folders.
Process :
    1. The program first identifies itself to the server with a User-Agent header.
    2. It creates one image storage folder per product, named after the product identifier in the link.
    3. It downloads each image and names the file after the image's Alt text (which contains the variant name).
    4. It reports (prints) which images were downloaded successfully and which failed.
</pre>

In [None]:
# HTTP headers sent with every image request; the User-Agent string makes the
# script present itself as a desktop browser (Chrome/Edge on Windows).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
}

# Root directory under which one sub-folder per product link is created.
IMAGE_DIR = r"D:\Kerja\recommendation_skin_analyst\28-07-2023_ProductScraping\data\Sephora_EN\product_images"

def create_folder(folder_name):
    """Create ``folder_name`` (including missing parents) unless the path exists.

    Nothing happens when something already exists at that path.
    """
    if os.path.exists(folder_name):
        return
    os.makedirs(folder_name)

def download_image(image_url, folder_name, file_name, timeout=30):
    """Download one image and save it inside ``folder_name``.

    If ``file_name`` is already taken in that folder, the image is saved as
    ``"<base> (2)<ext>"``, ``"<base> (3)<ext>"``, ... using the first counter
    that is still free, so an earlier download is never overwritten.

    Args:
        image_url: URL of the image to fetch.
        folder_name: Directory to write into; it is not created here.
        file_name: Preferred file name, including the extension.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a single stalled connection would hang the whole batch.

    Returns:
        True if the image was written to disk, False otherwise. Failures are
        printed rather than raised so that a batch run keeps going.
    """
    try:
        response = requests.get(image_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download: {file_name}. Response code: {response.status_code}")
            return False

        # Pick the first free name: "name.jpg", "name (2).jpg", "name (3).jpg", ...
        base, ext = os.path.splitext(file_name)
        count = 2
        while os.path.exists(os.path.join(folder_name, file_name)):
            file_name = f"{base} ({count}){ext}"
            count += 1

        with open(os.path.join(folder_name, file_name), 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {file_name} in folder {folder_name}")
        return True
    except Exception as e:
        # Deliberately broad: one bad link or unwritable file must not abort
        # the remaining downloads; the failure is reported instead.
        print(f"Failed to download: {file_name}. Error: {e}")
        return False

# Maps each character that is unwanted in a file name to its replacement:
# spaces, path separators and '?' become underscores, the rest are dropped.
_FILENAME_TRANSLATION = str.maketrans({
    ' ': '_',
    '/': '_',
    '\\': '_',
    '?': '_',
    ',': None,
    '<': None,
    '>': None,
    '|': None,
    '*': None,
    '"': None,
    ':': None,
})


def _image_name_from_alt(alt):
    """Turn an image's alt text into a ``.jpg`` file name without unwanted characters."""
    return alt.translate(_FILENAME_TRANSLATION) + '.jpg'


def main():
    """Download every image listed in the cleaned CSV into per-product folders.

    Each CSV row must provide the columns ``web_link``, ``image_link`` and
    ``alt``. The last path segment of ``web_link`` names the product folder
    (created under ``IMAGE_DIR``), and the sanitized ``alt`` text names the
    image file.
    """
    csv_file = r"D:\Kerja\recommendation_skin_analyst\28-07-2023_ProductScraping\data\Sephora_EN\image_link_cleaned.csv"

    with open(csv_file, 'r', newline='', encoding='latin-1') as file:
        reader = csv.DictReader(file)
        for row in reader:
            web_link = row['web_link']
            image_link = row['image_link']
            alt = row['alt']

            # The product identifier is the last segment of the page URL's path.
            folder_name = urlparse(web_link).path.strip('/').split('/')[-1]
            folder_path = os.path.join(IMAGE_DIR, folder_name)
            create_folder(folder_path)

            image_name = _image_name_from_alt(alt)
            download_image(image_link, folder_path, image_name)


# Start the download run only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


### 3. Adding "Image 1" to the main product image's file name
<pre>
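Input   : Files in the product_images directory.
Output  : Product images in folders, with the main image of each product renamed.
Process :
    1. Search the folders for image files whose names do not yet contain an "Image" label (such as Image_2).
    2. Rename each such file by appending _Image_1 to its name, so that it follows the same naming pattern as the Image_2 files.
    3. The program can be interrupted or re-run at any time.
</pre>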

In [None]:
import os
from datetime import datetime

# Root folder whose files (including those in sub-folders) are scanned.
directory = r"D:\Kerja\recommendation_skin_analyst\28-07-2023_ProductScraping\data\Sephora_EN\product_images"

# Only files last modified before this date are eligible for renaming.
cutoff_date = datetime(2024, 4, 15)

def rename_file(file_path, new_file_path):
    """Rename ``file_path`` to ``new_file_path`` without overwriting anything.

    The outcome is printed; neither a missing source nor an existing
    destination raises.

    Args:
        file_path: Current path of the file.
        new_file_path: Desired path; left untouched if it already exists.
    """
    # os.rename raises FileExistsError only on Windows; on POSIX it silently
    # replaces an existing file. Check explicitly so the "skip" behaviour is
    # the same on every platform.
    if os.path.exists(new_file_path):
        print(f"File {new_file_path} already exists, skipped.")
        return

    try:
        os.rename(file_path, new_file_path)
    except FileExistsError:
        # The destination appeared between the check above and the rename.
        print(f"File {new_file_path} already exists, skipped.")
    except FileNotFoundError:
        print(f"File {file_path} not found, skipped.")
    else:
        print(f"File {file_path} has been renamed to {new_file_path}")

def traverse_directory(directory):
    """Append ``_Image_1`` to the name of every old, unlabelled file under ``directory``.

    A file is renamed when it was last modified before ``cutoff_date`` and its
    name does not contain "image" (case-insensitive). Files that vanish while
    being processed are reported and skipped.
    """
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            current_path = os.path.join(folder, name)
            try:
                modified = datetime.fromtimestamp(os.path.getmtime(current_path))
                if modified >= cutoff_date:
                    continue
                if "image" in name.lower():
                    continue
                stem, extension = os.path.splitext(name)
                target_path = os.path.join(folder, stem + "_Image_1" + extension)
                rename_file(current_path, target_path)
            except FileNotFoundError:
                print(f"File {current_path} not found, skipped.")

# Run the renaming pass over the configured image directory.
traverse_directory(directory)