In [1]:
import os
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Root folder holding the competition CSV files, relative to the notebook.
DATASET_FOLDER = '../dataset/'

# Load both splits in one pass: train.csv first, then test.csv.
train, test = (
    pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Preview the first rows of the training frame to check the columns loaded as expected.
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [12]:
# Summarise the training frame: row count, column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263859 entries, 0 to 263858
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   image_link    263859 non-null  object
 1   group_id      263859 non-null  int64 
 2   entity_name   263859 non-null  object
 3   entity_value  263859 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.1+ MB


In [8]:
import os
import time
import urllib.request
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import multiprocessing
from functools import partial

In [6]:
# def create_placeholder_image(image_save_path):
#     try:
#         placeholder_image = Image.new('RGB', (100, 100), color='black')  # Creating a black placeholder image
#         placeholder_image.save(image_save_path)  # Saving the image in the specified path
#     except Exception as e:
#         return

In [9]:
# def download_image(image_link, save_folder, retries=3, delay=3):
#     if not isinstance(image_link, str):
#         return

#     filename = Path(image_link).name
#     image_save_path = os.path.join(save_folder, filename)

#     if os.path.exists(image_save_path):
#         return

#     for _ in range(retries):
#         try:
#             urllib.request.urlretrieve(image_link, image_save_path)
#             return
#         except:
#             time.sleep(delay)
    
#     create_placeholder_image(image_save_path)

In [10]:
# def download_images(image_links, download_folder, allow_multiprocessing=True):
#     if not os.path.exists(download_folder):
#         os.makedirs(download_folder)

#     if allow_multiprocessing:
#         download_image_partial = partial(
#             download_image, save_folder=download_folder, retries=3, delay=3)

#         with multiprocessing.Pool(64) as pool:
#             list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
#             pool.close()
#             pool.join()
#     else:
#         for image_link in tqdm(image_links, total=len(image_links)):
#             download_image(image_link, save_folder=download_folder, retries=3, delay=3)

In [13]:
def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    ``download_image`` calls this as a stand-in when a link cannot be
    downloaded, so every link still has a file on disk.

    The function is best-effort: it never raises. If the image cannot be
    created or saved, the failure is reported on stdout and ``None`` is
    returned, exactly as on success.

    Args:
        image_save_path: Destination path of the placeholder file.
    """
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        # Stay best-effort, but do not hide the failure: a missing placeholder
        # means a missing file for whatever reads the folder afterwards.
        print(f'Error creating placeholder image: {image_save_path} | {e}')

# Function to download a single image
def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, falling back to a placeholder.

    The file is stored under the last path component of ``image_link``.
    Nothing happens when ``image_link`` is not a string or when the target
    file already exists. If every attempt fails, a black placeholder image is
    written instead via ``create_placeholder_image``.

    Args:
        image_link: URL of the image; non-string values are ignored.
        save_folder: Existing directory the image is written to.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two failed attempts.

    Returns:
        None.
    """
    # Local import: only needed to build a collision-free temporary file name.
    import threading

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # Download under a temporary name unique to this process/thread and only
    # move it into place once complete. Otherwise an interrupted transfer
    # leaves a truncated file that the `os.path.exists` check above would
    # treat as already downloaded on the next run.
    partial_path = f"{image_save_path}.{os.getpid()}-{threading.get_ident()}.part"

    try:
        for attempt in range(retries):
            try:
                urllib.request.urlretrieve(image_link, partial_path)
                os.replace(partial_path, image_save_path)
                return
            except Exception:
                # `Exception` (not a bare except) so that KeyboardInterrupt and
                # SystemExit still stop a long download run.
                if attempt < retries - 1:
                    # No point in waiting after the final attempt.
                    time.sleep(delay)
    finally:
        # Remove any leftover partial download; after a successful move the
        # temporary file no longer exists and the error is ignored.
        try:
            os.remove(partial_path)
        except OSError:
            pass

    create_placeholder_image(image_save_path) # Create a black placeholder image for invalid links/images

# Function to download images in parallel
def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in ``image_links`` into ``download_folder``.

    The folder is created if it does not exist. Each link is handled by
    ``download_image`` with 3 retries and a 3 second delay, and progress is
    shown with ``tqdm``.

    Args:
        image_links: Sized iterable of image URLs.
        download_folder: Directory the images are written to.
        allow_multiprocessing: When true, download concurrently with up to 64
            workers; otherwise download sequentially.

    Returns:
        None.
    """
    # Imported locally so this block does not rely on changes to the import cell.
    from multiprocessing.pool import ThreadPool

    # `exist_ok=True` avoids the check-then-create race of an explicit exists() test.
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(download_image, save_folder=download_folder, retries=3, delay=3)

        # A thread pool instead of a process pool: the work is network-bound,
        # and worker processes cannot import functions defined in a notebook
        # cell when the platform spawns (rather than forks) its children.
        worker_count = max(1, min(64, len(image_links)))
        with ThreadPool(worker_count) as pool:
            # list() drains the iterator so every download has finished before
            # the pool is torn down on leaving the `with` block.
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)

# Function to clear a folder after training
def clear_folder(folder_path):
    """Delete every regular file directly inside ``folder_path``.

    Sub-directories are left alone. A failure on one entry is printed and
    does not stop the remaining entries from being processed.

    Args:
        folder_path: Directory whose files are removed.
    """
    for entry_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, entry_name)
        try:
            # Skip anything that is not a regular file (e.g. directories).
            if not os.path.isfile(file_path):
                continue
            os.remove(file_path)
        except Exception as e:
            print(f'Error deleting file: {file_path} | {e}')

# Function to download images in batches and train the model
def download_in_batches(df, batch_size, download_folder, link_column='image_link'):
    """Download the images referenced by ``df`` batch by batch.

    For each consecutive slice of ``batch_size`` rows, the links are
    downloaded into ``download_folder``, a (still to be implemented) training
    step is announced, and the folder is emptied again so that only one batch
    of images is on disk at a time.

    Args:
        df: DataFrame with one image URL per row.
        batch_size: Number of rows per batch.
        download_folder: Working directory for the downloaded images.
        link_column: Name of the column holding the URLs. Defaults to
            ``'image_link'``, the column name of the loaded dataset.

    Returns:
        None.
    """
    # Earlier versions read the column 'image_links', which the dataset does
    # not have; keep accepting frames that really use that spelling.
    if link_column not in df.columns and 'image_links' in df.columns:
        link_column = 'image_links'

    total_images = len(df)

    for batch_number, start in enumerate(range(0, total_images, batch_size), start=1):
        batch_df = df.iloc[start:start + batch_size]
        image_links_list = batch_df[link_column].tolist()

        # Download current batch
        print(f"Downloading batch {batch_number}")
        download_images(image_links_list, download_folder)

        # Perform model training or evaluation here
        print(f"Training model on batch {batch_number}")
        # Call your model training function here (you need to implement this)
        # model_training_function(download_folder)

        # Clear downloaded images after training
        clear_folder(download_folder)
        print(f"Cleared batch {batch_number} images")

In [14]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Function to preprocess images
def preprocess_images(image_folder, target_size=(128, 128), batch_size=32):
    """Build a shuffled Keras image iterator over ``image_folder``.

    Pixel values are rescaled by 1/255 and labels are produced in
    ``'categorical'`` mode (multi-class classification is assumed).

    NOTE(review): ``flow_from_directory`` is documented to expect one
    sub-directory per class — confirm the folder layout before use.

    Args:
        image_folder: Directory to read images from.
        target_size: ``(height, width)`` every image is resized to.
        batch_size: Number of images per yielded batch.

    Returns:
        The iterator returned by ``ImageDataGenerator.flow_from_directory``.
    """
    # Rescale pixel values to [0, 1]
    rescaler = ImageDataGenerator(rescale=1./255)

    return rescaler.flow_from_directory(
        image_folder,
        target_size=target_size,
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=True,
    )

ImportError: Could not find the DLL(s) 'msvcp140_1.dll'. TensorFlow requires that these DLLs be installed in a directory that is named in your %PATH% environment variable. You may install these DLLs by downloading "Microsoft C++ Redistributable for Visual Studio 2015, 2017 and 2019" for your platform from this URL: https://support.microsoft.com/help/2977003/the-latest-supported-visual-c-downloads