In [12]:
import json
import os
import re

# Path to input JSONL file
jsonl_file = "../data/initial_loads_product_meta_data_products_meta_Books.jsonl"

# List to store tuples of (parent_asin, image_url, parsed_code)
image_data = []

# Read JSONL file and extract image URLs with parent_asin
with open(jsonl_file, "r", encoding="utf-8") as file:
    for line in file:
        try:
            data = json.loads(line)
            parent_asin = data.get("parent_asin", "unknown_asin")  # Fallback if missing

            # Ensure "images" is a valid JSON string and parse it
            images_str = data.get("images", "[]")
            images = json.loads(images_str) if isinstance(images_str, str) else images_str

            for image in images:
                url = image.get("large")
                if url:
                    # Extract the unique image code from the URL
                    match = re.search(r"/I/([^._]+)", url)
                    parsed_code = match.group(1) if match else "unknown_code"

                    image_data.append((parent_asin, url, parsed_code))

        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {e}")

# Save ASIN, URL, and parsed_code to a text file
output_file = "../data/image_urls.txt"
os.makedirs(os.path.dirname(output_file), exist_ok=True)  # Ensure output directory exists

with open(output_file, "w", encoding="utf-8") as f:
    for asin, url, parsed_code in image_data:
        f.write(f"{asin},{url},{parsed_code}\n")  # Store as CSV format

print(f"Extracted {len(image_data)} image URLs with parent ASIN and parsed codes, saved to {output_file}")


AttributeError: 'str' object has no attribute 'get'

In [15]:
import json
import re

# Path to input JSONL file
jsonl_file = "../data/initial_loads_product_meta_data_products_meta_Books.jsonl"
output_file = "../data/book_image_urls.txt"

# List to store tuples of (parent_asin, image_url, parsed_code)
image_data = []

# Read JSONL file and extract image URLs with parent_asin
with open(jsonl_file, "r", encoding="utf-8") as file:
    for line in file:
        try:
            data = json.loads(line)
            parent_asin = data.get("parent_asin", "unknown_asin")  # Fallback if missing

            # Ensure "images" is properly parsed
            images_str = data.get("images", "[]")

            try:
                images = json.loads(images_str)  # First parsing attempt
                if isinstance(images, str):  
                    images = json.loads(images)  # Second parsing attempt (if needed)
            except json.JSONDecodeError:
                images = []  # If parsing fails, assume no images

            # Extract URLs from properly parsed image data
            for image in images:
                if isinstance(image, dict):  # Ensure it's a dictionary before accessing 'large'
                    url = image.get("large")
                    if url:
                        # Extract the unique image code from the URL
                        match = re.search(r"/I/([^._]+)", url)
                        parsed_code = match.group(1) if match else "unknown_code"

                        image_data.append((parent_asin, url, parsed_code))

        except json.JSONDecodeError as e:
            print(f"Skipping invalid JSON line: {e}")

# Save ASIN, URL, and parsed_code to a text file
with open(output_file, "w", encoding="utf-8") as f:
    for asin, url, parsed_code in image_data:
        f.write(f"{asin},{url},{parsed_code}\n")  # Store as CSV format

print(f"Extracted {len(image_data)} book image URLs with parent ASIN and parsed codes, saved to {output_file}")


Extracted 97284 book image URLs with parent ASIN and parsed codes, saved to ../data/book_image_urls.txt


In [10]:
import os
import requests

# Configuration
url_file = "../data/image_urls.txt"
output_folder = "../output_data/downloaded_images"
max_images = 20
# Create output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Read ASIN, URL, and parsed code
with open(url_file, "r") as file:
    asin_url_code_list = file.read().splitlines()

# Download images
downloaded_count = 0
for i, line in enumerate(asin_url_code_list):
    if downloaded_count >= max_images:
        break

    try:
        asin, url, parsed_code = line.strip().split(",", 2)

        response = requests.get(url, stream=True, timeout=10)
        if response.status_code == 200:
            image_filename = f"{parsed_code}.jpg"
            image_path = os.path.join(output_folder, image_filename)

            with open(image_path, "wb") as img_file:
                for chunk in response.iter_content(1024):
                    img_file.write(chunk)

            downloaded_count += 1
            print(f"({downloaded_count}/{max_images}) Downloaded: {image_path}")

        else:
            print(f"Failed to download {url} - Status Code: {response.status_code}")

    except ValueError:
        print(f"Skipping invalid line: {line}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")

print("Image download completed.")


(1/20) Downloaded: ../output_data/downloaded_images\51xAtwVr3ML.jpg
(2/20) Downloaded: ../output_data/downloaded_images\51DNo8piIsL.jpg
(3/20) Downloaded: ../output_data/downloaded_images\416SR3V057L.jpg
(4/20) Downloaded: ../output_data/downloaded_images\51ln3gTBmqL.jpg
(5/20) Downloaded: ../output_data/downloaded_images\51IWAe7gUoL.jpg
(6/20) Downloaded: ../output_data/downloaded_images\51bAmLm4wYL.jpg
(7/20) Downloaded: ../output_data/downloaded_images\51lfSt+1OwL.jpg
(8/20) Downloaded: ../output_data/downloaded_images\51nBMMqON7L.jpg
(9/20) Downloaded: ../output_data/downloaded_images\41VHYCN1KFL.jpg
(10/20) Downloaded: ../output_data/downloaded_images\51z7G1V8FrL.jpg
(11/20) Downloaded: ../output_data/downloaded_images\41Aty1S8HdL.jpg
(12/20) Downloaded: ../output_data/downloaded_images\41mZJB7WLaL.jpg
(13/20) Downloaded: ../output_data/downloaded_images\51jbPDwEeFL.jpg
(14/20) Downloaded: ../output_data/downloaded_images\41B-a8IMN1L.jpg
(15/20) Downloaded: ../output_data/download