In [14]:
from PIL import Image, ImageOps
import os
import hashlib
import requests
import csv
from datetime import datetime
from io import BytesIO
import pandas as pd


MAX_IMAGES = 500  # Maximum number of images to download
downloaded_count = 0  # Counter to track the number of downloaded images
MAX_IMAGES_PER_CATEGORY = 500  # Maximum number of images to download per category
downloaded_count_per_category = {}  # Dictionary to track the number of downloaded images per category
CSV_LINK_FILE = 'image_links.csv'
CSV_METADATA_FILE = 'image_metadata.csv'


In [20]:
def clean_and_update_metadata(csv_path, image_dir):
    valid_paths_with_url = set()

    # Load metadata entries that have valid file_path and url
    if os.path.exists(csv_path):
        with open(csv_path, newline='', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            metadata_rows = list(reader)  # Load all rows into memory for modification

        # Check for valid file paths and URLs
        updated_rows = []
        for row in metadata_rows:
            file_path = row.get('file_path', '').strip()
            url = row.get('url', '').strip()

            # If the file exists, keep the row and add the file path to the valid set
            if file_path and url and os.path.exists(file_path):
                valid_paths_with_url.add(os.path.abspath(file_path))
                updated_rows.append(row)
            else:
                # If the file does not exist, remove the row from metadata
                print(f"File not found for row: {file_path}. Removing metadata entry.")

        # Save the updated metadata back to the CSV file
        with open(csv_path, mode='w', newline='', encoding='utf-8') as outfile:
            writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
            writer.writeheader()
            writer.writerows(updated_rows)




In [21]:
clean_and_update_metadata('image_metadata.csv', 'images')

In [None]:
def update_download_status(metadata_csv, links_csv):
    downloaded_urls = set()
    if os.path.exists(metadata_csv):
        with open(metadata_csv, newline='', encoding='utf-8') as metafile:
            reader = csv.DictReader(metafile)
            for row in reader:
                if 'url' in row:
                    downloaded_urls.add(row['url'].strip())

    if not os.path.exists(links_csv):
        return

    updated_rows = []
    with open(links_csv, newline='', encoding='utf-8') as linkfile:
        reader = csv.DictReader(linkfile)
        fieldnames = reader.fieldnames if 'downloaded' in reader.fieldnames else reader.fieldnames + ['downloaded']
        for row in reader:
            row_url = row['url'].strip()
            if row_url in downloaded_urls:
                print(row)
                row['downloaded'] = True
                
            else:
                row['downloaded'] = False
            updated_rows.append(row)

    with open(links_csv, mode='w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(updated_rows)

In [5]:
df = pd.read_csv(CSV_LINK_FILE, encoding='utf-8')
print(df[df['downloaded'] == True][::-1])

NameError: name 'pd' is not defined

In [None]:
update_download_status(CSV_METADATA_FILE, CSV_LINK_FILE)


{'category': 'bicycle', 'url': 'https://ir.ebaystatic.com/rs/v/fxxj3ttftm5ltcqnto1o4baovyl.png', 'source': 'ebay', 'timestamp': '03:10.', 'downloaded': True}
{'category': 'bicycle', 'url': 'https://i.ebayimg.com/images/g/tF0AAOSwo6lnTVw~/s-l500.webp', 'source': 'ebay', 'timestamp': '03:10.', 'downloaded': True}
{'category': 'bicycle', 'url': 'https://i.ebayimg.com/images/g/jDYAAOSw89FnwjxI/s-l500.webp', 'source': 'ebay', 'timestamp': '03:10.', 'downloaded': True}
{'category': 'bicycle', 'url': 'https://i.ebayimg.com/images/g/UI4AAOSwsnJnqr36/s-l500.webp', 'source': 'ebay', 'timestamp': '03:10.', 'downloaded': True}
{'category': 'bicycle', 'url': 'https://i.ebayimg.com/images/g/MJMAAOSwEiZnwOX7/s-l500.webp', 'source': 'ebay', 'timestamp': '03:10.', 'downloaded': True}
{'category': 'bicycle', 'url': 'https://i.ebayimg.com/images/g/-~kAAOSw0-Jl1bA4/s-l500.webp', 'source': 'ebay', 'timestamp': '03:10.', 'downloaded': True}
{'category': 'bicycle', 'url': 'https://i.ebayimg.com/images/g/gLoA

In [None]:
def get_aspect_ratio(image):
    width, height = image.size
    return width / height

def find_closest_aspect_ratio(image):
    aspect_ratios = {
        "1:1": 1.0,
        "4:3": 4 / 3,
        "16:9": 16 / 9
    }
    original_aspect_ratio = get_aspect_ratio(image)
    return min(aspect_ratios, key=lambda r: abs(aspect_ratios[r] - original_aspect_ratio))

def resize_image(image, target_size):
    image = ImageOps.exif_transpose(image)
    new_image = Image.new("RGB", target_size, (255, 255, 255))
    image.thumbnail(target_size)
    x_offset = (target_size[0] - image.size[0]) // 2
    y_offset = (target_size[1] - image.size[1]) // 2
    new_image.paste(image, (x_offset, y_offset))
    return new_image

def resize_and_save(image, file_path):
    closest_ratio = find_closest_aspect_ratio(image)
    if closest_ratio == "1:1":
        resized_img = resize_image(image, (640, 640))
    elif closest_ratio == "4:3":
        resized_img = resize_image(image, (1024, 768))
    elif closest_ratio == "16:9":
        resized_img = resize_image(image, (1280, 720))
    resized_img.save(file_path.replace('.jpg', f'_{closest_ratio.replace(":", "-")}.jpg'))

def download_image_from_csv_row(category, url, output_dir):
    global downloaded_count_per_category
    if downloaded_count_per_category.get(category, 0) >= MAX_IMAGES_PER_CATEGORY:
        print(f"Reached the maximum download limit of 500 images for category '{category}'.")
        return

    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # print(f"Downloading image from {url}...")
            image_content = response.content
            file_hash = hashlib.md5(image_content).hexdigest()
            image = Image.open(BytesIO(image_content))
            image_format = image.format

            if image_format == 'PNG':
                file_ext = 'png'
            elif image_format in ['JPEG', 'JPG', 'WEBP']:
                file_ext = 'jpg'
            else:
                print(f"Unsupported format: {image_format}")
                return

            category_path = os.path.join(output_dir, category)
            os.makedirs(category_path, exist_ok=True)
            file_path = os.path.join(category_path, f"{file_hash}.{file_ext}")
            # print(f"File path: {file_path}")
            if not os.path.exists(file_path) and 'plus.unsplash' not in url:
                resize_and_save(image, file_path)
                print(f"Saved image: {file_path} from {url}")
                downloaded_count_per_category[category] = downloaded_count_per_category.get(category, 0) + 1  # Increment the counter for the category

                already_logged = False
                if os.path.exists(CSV_METADATA_FILE):
                    with open(CSV_METADATA_FILE, newline='', encoding='utf-8') as meta_file:
                        reader = csv.DictReader(meta_file)
                        for row in reader:
                            if row['file_path'] == file_path:
                                already_logged = True
                                break

                if not already_logged:
                    with open(CSV_METADATA_FILE, mode='a', newline='', encoding='utf-8') as metafile:
                        meta_writer = csv.writer(metafile)
                        if os.stat(CSV_METADATA_FILE).st_size == 0:
                            meta_writer.writerow(['category', 'url', 'download_time', 'file_path'])
                        meta_writer.writerow([category, url, datetime.now(), file_path])
            # else:
            #     print(f"Image already exists: {file_path}")
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")

def download_images_from_csv(csv_path=CSV_LINK_FILE, output_dir=OUTPUT_DIR, df=df):
    global downloaded_count_per_category
    for _, row in df.iterrows():
        category = row['category']
        url = row['url']
        if downloaded_count_per_category.get(category, 0) >= MAX_IMAGES_PER_CATEGORY:
            print(f"Reached the maximum download limit of 500 images for category '{category}'.")
            continue
        download_image_from_csv_row(category, url, output_dir)


In [None]:
download_images_from_csv()

Saved image: images\baseball glove\c93ffe99ada0318f41a3820265d26a3a.jpg from https://i.ebayimg.com/images/g/oUwAAOSwOrBnGJaZ/s-l500.webp
Saved image: images\baseball glove\6d9a37c8e940c5008be1526e4c47a98c.jpg from https://i.ebayimg.com/images/g/OYgAAOSwVcZmqel0/s-l500.webp
Saved image: images\baseball glove\cb837bc8da878c38a6c90c5d4bf5d7f8.jpg from https://i.ebayimg.com/images/g/eU8AAOSwhsBmbzMq/s-l500.webp
Saved image: images\baseball glove\928c6ac51f977e302043cc6fc1a528ca.jpg from https://i.ebayimg.com/images/g/OocAAOSwzZpngWr~/s-l500.webp
Saved image: images\baseball glove\fed6f7ff8c90eca9042d50e14cd123c9.jpg from https://i.ebayimg.com/images/g/QUMAAeSwF61n71uw/s-l500.webp
Saved image: images\baseball glove\8ef86dbd6bc802abf11d7d740b227b99.jpg from https://i.ebayimg.com/images/g/x5gAAOSwdr5n709f/s-l500.webp
Saved image: images\baseball glove\9835bd73ae1a4af03bbf4c4c99211135.jpg from https://i.ebayimg.com/images/g/4BcAAeSwUGtn6VBk/s-l500.webp
Saved image: images\baseball glove\53d7ac

KeyboardInterrupt: 

In [10]:
import os

# Define the root directory and the target file name
root_dir = 'images'
target_file = '989d155fe0261a9d9938549a3c2f8168.png'

# Walk through all subdirectories and files in the root directory
for root, dirs, files in os.walk(root_dir):
    for file in files:
        if file == target_file:  # Check if the file matches the target file name
            file_path = os.path.join(root, file)
            try:
                os.remove(file_path)  # Delete the file
                print(f"Deleted: {file_path}")
            except Exception as e:
                print(f"Failed to delete {file_path}: {e}")

Deleted: images\backpack\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\baseball bat\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\baseball glove\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\bed\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\bench\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\bicycle\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\boat\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\book\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\bus\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\cell phone\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\chair\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\clock\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\frisbee\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\handbag\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\keyboard\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\kite\989d155fe0261a9d9938549a3c2f8168.png
Deleted: images\laptop\989d155

In [1]:
import pandas as pd

csv_files = [
    'image_metadata0.csv',
    'image_metadata1.csv',
    'image_metadata2.csv',
    'image_metadata3.csv',
    'image_metadata4.csv'
]

dataframes = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df.to_csv('image_metadata.csv', index=False)

print("All CSV files have been combined into 'combined_image_metadata.csv'.")

All CSV files have been combined into 'combined_image_metadata.csv'.


In [2]:
import pandas as pd

# Load the CSV file into a DataFrame
csv_file = 'image_metadata_initial.csv'
df = pd.read_csv(csv_file)

# Filter out rows where 'file_path' ends with '989d155fe0261a9d9938549a3c2f8168.png'
df = df[~df['file_path'].str.endswith('989d155fe0261a9d9938549a3c2f8168.png')]

# Save the updated DataFrame back to the CSV file
df.to_csv('image_metadata.csv', index=False)

print("Rows with file_path ending in '989d155fe0261a9d9938549a3c2f8168.png' have been removed.")

Rows with file_path ending in '989d155fe0261a9d9938549a3c2f8168.png' have been removed.
