## Requeriments

In [None]:
# Instal boto3 
%pip install boto3

# Install dotenv
%pip install python-dotenv

In [22]:
# Reload notebook to have saved changes of others modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# Import libraries
# Classic libraries
from dotenv import load_dotenv
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt
import numpy as np
import shutil
import os
load_dotenv()

# Custom functions
# from modules import plot_bounding_boxes
import prepare_dataset
from prepare_dataset import get_file_folders
from prepare_dataset import download_files
from prepare_dataset import concatenate_csv
from prepare_dataset import check_bad_files
from prepare_dataset import plot_bounding_boxes
from prepare_dataset import mov_images
# AWS
import boto3

# Open CV
import cv2

In [2]:
# Initiate AWS env variables
ACCESS_KEY = os.getenv('ACCESS_KEY')
SECRET_KEY = os.getenv('SECRET_KEY')
BUCKET_NAME = os.getenv('BUCKET_NAME')
BUCKET_PREFIX = os.getenv('BUCKET_PREFIX')

# Local paths
BASE_DIR= os.getenv('BASE_DIR')
PATH_TO_SAVE = os.getenv('PATH_TO_SAVE')
path_to_save = os.path.join(BASE_DIR, PATH_TO_SAVE)

# csv files directions
csv_paths  = os.path.join(BASE_DIR, os.getenv('csv_paths'))
txt_file   = os.path.join(csv_paths, os.getenv('txt_file'))
test_file  = os.path.join(csv_paths, os.getenv('test_file')) 
val_file   = os.path.join(csv_paths, os.getenv('val_file'))      
train_file = os.path.join(csv_paths, os.getenv('train_file')) 

# Important paths
final_images_dir = os.path.join(BASE_DIR, os.getenv('final_images_dir'))
images_dir = os.path.join(BASE_DIR, os.getenv('images_dir'))
split_dir = os.path.join(BASE_DIR, os.getenv('split_dir'))

## Download information from AWS S3

"wget http://trax-geometry.s3.amazonaws.com/cvpr_challenge/SKU110K_fixed.tar.gz"

## Evaluation EDA

##### csv
- Validad que no existan valores nulos.
- Validar valores enteros en coordendas.
- Validar que sean positivos.
- Validar que los xs y ys no sean iguales.
- Validar superposición de bounding boxes

##### Imagenes & csv
- Validar imagenes que no tengan bounding boxes con referencia con csv.

##### Imagenes
- Evaluar todas las imágenes con el fin de encontrar imágenes corruptas <br> o con formato no valido.


- pendientes: Validar que los bounding boxes esten en el rango correcto.

In [4]:
# Get concatenated dataframes
df = concatenate_csv(txt_file,
     test_file,
     train_file,
     val_file)

In [5]:
# Get the amount of classes
df['class'].value_counts()

object    1730996
Name: class, dtype: int64

In [5]:
# Check data info and check that coordinates and image size are int types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1730996 entries, 0 to 1730995
Data columns (total 9 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   image_name    object
 1   x1            int64 
 2   y1            int64 
 3   x2            int64 
 4   y2            int64 
 5   class         object
 6   image_width   int64 
 7   image_height  int64 
 8   set           object
dtypes: int64(6), object(3)
memory usage: 118.9+ MB


In [6]:
# Check stats about the info
df.describe().round(2)

Unnamed: 0,x1,y1,x2,y2,image_width,image_height
count,1730996.0,1730996.0,1730996.0,1730996.0,1730996.0,1730996.0
mean,1173.73,1531.98,1294.15,1721.8,2452.5,3220.0
std,709.15,824.72,711.42,813.46,425.52,677.16
min,0.0,0.0,21.0,21.0,480.0,640.0
25%,595.0,898.0,716.0,1098.0,2336.0,2560.0
50%,1153.0,1498.0,1270.0,1675.0,2448.0,3264.0
75%,1702.0,2106.0,1820.0,2286.0,2448.0,3264.0
max,4129.0,4730.0,4323.0,5168.0,4320.0,5312.0


In [7]:
# Check null values in dataframe
df.isnull().value_counts()

image_name  x1     y1     x2     y2     class  image_width  image_height  set  
False       False  False  False  False  False  False        False         False    1730996
dtype: int64

In [108]:
# Check for negative values
numeric_df = df.drop(['image_name', 'class', 'set'], axis=1)

negatives = numeric_df.apply(lambda x : x < 0)
negatives = numeric_df[negatives.any(1)]

print(f"There is {len(negatives)} negative values")

There is 0 negative values


In [106]:
# Check we have coordinates that creates an area
df["area"] = (df["x2"] - df["x1"]) * (df["y2"] - df["y1"])

df_without_area = df[df["area"] == 0]

print(f"{len(df_without_area)} coordinates don't create an area")

0 coordinates don't create an area


In [105]:
# Check every image has its own bounding boxes
# List every filename in the "data/images" folder
images_list = set(os.listdir(images_dir))

# List unique image names from csv
images_csv = set(df["image_name"])

# Find differences between sets
diff = (images_list - images_csv)

print(f"{len(diff)} images doesn't have bounding box")

0 images doesn't have bounding box


In [104]:
# Evaluate corrupt images
bad_files = check_bad_files(images_dir)

print(f"There are {len(bad_files)} corrupt images")

There are 0 corrupt images


In [23]:
# List of dirs to create
dir_keys = ["train", "val", "test"]

for key in dir_keys:
    key_labels_dir = os.path.join(split_dir,'labels',key)
    os.makedirs(key_labels_dir)

In [6]:
import pandas as pd
def box_normalization(csv_paths, subset, filename):

    df_annotations = pd.read_csv(f'{csv_paths}annotations_{subset}.csv', names=["image_name", "x1", "y1", "x2", "y2","class", "image_width", "image_height"])

    print(f'{csv_paths}annotations_{subset}.csv')
    normalized_coordinates = []
    starter = 0

    for i in df_annotations.loc[df_annotations['image_name'] == filename].values:
        
        b_center_x = (i[1] + i[3]) / 2 
        b_center_y = (i[2] + i[4]) / 2
        b_width    = (i[3] - i[1])
        b_height   = (i[4] - i[2])

        # Normalise the co-ordinates by the dimensions of the image
        image_w = i[6]
        image_h= i[7]
        image_c = i[5]
        b_center_x /= image_w 
        b_center_y /= image_h 
        b_width    /= image_w 
        b_height   /= image_h
        
        starter += 1

        # Create text
        normalized_coordinates.append("{} {:.3f} {:.3f} {:.3f} {:.3f}".format(starter, b_center_x, b_center_y, b_width, b_height))
    
    return normalized_coordinates

In [9]:
def get_images_list(csv_paths, subset):
    # List of image names
    df = pd.read_csv(f'{csv_paths}annotations_{subset}.csv', names=["image_name", "x1", "y1", "x2", "y2","class", "image_width", "image_height"])

    # Extract unique names
    images_list = list(df["image_name"].unique())

    return images_list

In [10]:
get_images_list(csv_paths=csv_paths, subset='train')

['train_0.jpg',
 'train_1.jpg',
 'train_10.jpg',
 'train_100.jpg',
 'train_1000.jpg',
 'train_1001.jpg',
 'train_1002.jpg',
 'train_1003.jpg',
 'train_1004.jpg',
 'train_1005.jpg',
 'train_1006.jpg',
 'train_1007.jpg',
 'train_1008.jpg',
 'train_1009.jpg',
 'train_101.jpg',
 'train_1010.jpg',
 'train_1011.jpg',
 'train_1012.jpg',
 'train_1013.jpg',
 'train_1014.jpg',
 'train_1015.jpg',
 'train_1016.jpg',
 'train_1017.jpg',
 'train_1018.jpg',
 'train_1019.jpg',
 'train_102.jpg',
 'train_1020.jpg',
 'train_1021.jpg',
 'train_1022.jpg',
 'train_1023.jpg',
 'train_1025.jpg',
 'train_1026.jpg',
 'train_1027.jpg',
 'train_1028.jpg',
 'train_1029.jpg',
 'train_103.jpg',
 'train_1030.jpg',
 'train_1031.jpg',
 'train_1032.jpg',
 'train_1033.jpg',
 'train_1034.jpg',
 'train_1035.jpg',
 'train_1036.jpg',
 'train_1037.jpg',
 'train_1038.jpg',
 'train_1039.jpg',
 'train_104.jpg',
 'train_1040.jpg',
 'train_1041.jpg',
 'train_1042.jpg',
 'train_1043.jpg',
 'train_1044.jpg',
 'train_1045.jpg',
 'trai

In [8]:
filename = 'train_8207.jpg'
box_normalization(csv_paths=csv_paths, subset='train', filename=filename)

/Users/fabioalvarez/Documents/1. Anyone AI/Final Project/retail_prediction/data/raw/annotations/annotations_train.csv


['1 0.318 0.542 0.063 0.054',
 '2 0.247 0.551 0.071 0.042',
 '3 0.542 0.446 0.039 0.046',
 '4 0.225 0.376 0.083 0.056',
 '5 0.243 0.494 0.083 0.048',
 '6 0.370 0.307 0.069 0.057',
 '7 0.298 0.308 0.072 0.062',
 '8 0.500 0.451 0.047 0.044',
 '9 0.443 0.466 0.055 0.051',
 '10 0.437 0.301 0.062 0.055',
 '11 0.383 0.471 0.058 0.043',
 '12 0.496 0.405 0.042 0.049',
 '13 0.314 0.486 0.059 0.054',
 '14 0.440 0.359 0.058 0.048',
 '15 0.767 0.616 0.029 0.025',
 '16 0.803 0.540 0.020 0.028',
 '17 0.492 0.355 0.046 0.051',
 '18 0.236 0.436 0.079 0.053',
 '19 0.310 0.428 0.066 0.054',
 '20 0.379 0.420 0.067 0.051',
 '21 0.445 0.413 0.053 0.055',
 '22 0.251 0.605 0.075 0.045',
 '23 0.208 0.244 0.087 0.071',
 '24 0.291 0.244 0.074 0.066',
 '25 0.365 0.245 0.069 0.061',
 '26 0.434 0.243 0.062 0.060',
 '27 0.216 0.313 0.090 0.062',
 '28 0.825 0.626 0.022 0.031',
 '29 0.799 0.634 0.022 0.026',
 '30 0.772 0.647 0.026 0.034',
 '31 0.740 0.659 0.027 0.027',
 '32 0.703 0.672 0.031 0.034',
 '33 0.669 0.682 

In [6]:
def normalized_text(csv_paths, split_dir, filename):
    sub_path="labels"


    # Get list of box coordinates by image
    normalized_coordinates = box_normalization(csv_paths= csv_paths, filename= filename)

    # Path to save images
    key_path = filename.split("_")[0]
    name = filename.split(".")[0] + ".txt"

    # Full path
    txt_path = os.path.join(split_dir,sub_path,key_path,name)

    # Append list and create txt
    with open(txt_path, 'w') as f:
        f.write("\n".join(normalized_coordinates))

In [7]:
normalized_text(csv_paths, split_dir, filename)

In [14]:
import multiprocessing
from multiprocessing import get_context
from functools import partial

In [11]:
# List of image names 
images_list = list(df["image_name"].unique())

In [None]:
images_list

In [None]:
# Instanciate multiprocessing pool
pool = multiprocessing.get_context("fork").Pool()

# Partial de function
func = partial(normalized_text,csv_paths ,split_dir)
pool.map(func, images_list)
pool.close()
pool.join()

#### Next steps

1. Investigar estructura de files para Yolo V5.
2. Realizar proceso de estructurar datos.
3. Realizar docker file, compose para montar microservicios y dependencias en aws.


In [6]:
p = get_context("fork").Pool(4)
results = p.map(multi, [1,2,3,4])
p.close()

In [13]:
import multiprocessing
from multiprocessing import get_context
from functools import partial
from dotenv import load_dotenv
import pandas as pd
import os
load_dotenv()

# Local paths
BASE_DIR= os.getenv('BASE_DIR')
PATH_TO_SAVE = os.getenv('PATH_TO_SAVE')
path_to_save = os.path.join(BASE_DIR, PATH_TO_SAVE)

# csv files directions
csv_paths  = os.path.join(BASE_DIR, os.getenv('csv_paths'))
txt_file   = os.path.join(csv_paths, os.getenv('txt_file'))

test_file  = os.path.join(csv_paths, os.getenv('test_file')) 
val_file   = os.path.join(csv_paths, os.getenv('val_file'))      
train_file = os.path.join(csv_paths, os.getenv('train_file')) 

# Important paths
final_images_dir = os.path.join(BASE_DIR, os.getenv('final_images_dir'))
images_dir = os.path.join(BASE_DIR, os.getenv('images_dir'))
split_dir = os.path.join(BASE_DIR, os.getenv('split_dir'))


def create_labels_path():

    # List of dirs to create
    dir_keys = ["train", "val", "test"]

    for key in dir_keys:
        key_labels_dir = os.path.join(split_dir,'labels',key)

        if not os.path.exists(key_labels_dir):
            os.makedirs(key_labels_dir)

def box_normalization(csv_paths, subset, filename):

    df_annotations = pd.read_csv(f'{csv_paths}annotations_{subset}.csv', names=["image_name", "x1", "y1", "x2", "y2","class", "image_width", "image_height"])

    normalized_coordinates = []
    obj_class = 0

    for i in df_annotations.loc[df_annotations['image_name'] == filename].values:
        
        b_center_x = (i[1] + i[3]) / 2 
        b_center_y = (i[2] + i[4]) / 2
        b_width    = (i[3] - i[1])
        b_height   = (i[4] - i[2])

        # Normalise the co-ordinates by the dimensions of the image
        image_w = i[6]
        image_h= i[7]
        image_c = i[5]
        b_center_x /= image_w 
        b_center_y /= image_h 
        b_width    /= image_w 
        b_height   /= image_h

        # Create text
        normalized_coordinates.append("{} {:.3f} {:.3f} {:.3f} {:.3f}".format(obj_class, b_center_x, b_center_y, b_width, b_height))
    
    return normalized_coordinates

def normalized_text(csv_paths, split_dir, subset, filename):
    sub_path="labels"

    # Get list of box coordinates by image
    normalized_coordinates = box_normalization(csv_paths= csv_paths, subset=subset, filename= filename)

    # Path to save images
    key_path = filename.split("_")[0]
    name = filename.split(".")[0] + ".txt"

    # Full path
    txt_path = os.path.join(split_dir,sub_path,key_path,name)

    # Append list and create txt
    with open(txt_path, 'w') as f:
        f.write("\n".join(normalized_coordinates))

def get_images_list(csv_paths, subset):
    # List of image names
    df = pd.read_csv(f'{csv_paths}annotations_{subset}.csv', names=["image_name", "x1", "y1", "x2", "y2","class", "image_width", "image_height"])

    # Extract unique names
    images_list = list(df["image_name"].unique())

    return images_list

In [14]:
subset = "test"
images_list = get_images_list(csv_paths=csv_paths, subset=subset)
# Instanciate multiprocessing pool
pool = multiprocessing.get_context("fork").Pool()
func = partial(normalized_text,csv_paths ,split_dir, subset)
pool.map(func, images_list)
pool.close()
pool.join()

Process ForkPoolWorker-13:
Process ForkPoolWorker-11:
Process ForkPoolWorker-10:
Process ForkPoolWorker-15:
Process ForkPoolWorker-9:
Process ForkPoolWorker-12:
Process ForkPoolWorker-16:
Process ForkPoolWorker-14:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):


KeyboardInterrupt: 

  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/miniconda3/envs/DataScience/lib/python3.9/multiprocessing/pool.py", line 125, in worker
    resu

## 