- [1. Importing Packages](#1)
- [2. Building Functions](#2)
- [3. Cleaning Data](#3)
    - [3.1 Select Images With Count > 100](#3_1)
    - [3.2 Move Images to Correct Locations](#3_2)
        - [3.2.1 For All](#3_2_1)
        - [3.2.2 For Top 100](#3_2_2)
        - [3.2.3 For Top 10](#3_2_3)

## 1. Importing Packages <a id='1'></a>

In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import os
import shutil

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve, TimeSeriesSplit

# For data augmentation
from PIL import Image
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## 2. Building Functions <a id='2'></a>

In [107]:
def load_images(folder_path, label):
    images = []
    labels = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg"):
            image = Image.open(os.path.join(folder_path, filename)).convert('RGB').resize((224, 224))
            images.append(np.array(image))
            labels.append(label)
    return images, labels

## 3. Cleaning Data <a id='3'></a>

### 3.1 Select Images With Count > 100 <a id='3_1'></a>

- We label data and leave them in different folders
- Then we use example code to augment data

In [50]:
label_path = './images.csv'
image_label = pd.read_csv(label_path)
image_label['label'].unique()

array(['Not sure', 'T-Shirt', 'Shoes', 'Shorts', 'Shirt', 'Pants',
       'Skirt', 'Other', 'Top', 'Outwear', 'Dress', 'Body', 'Longsleeve',
       'Undershirt', 'Hat', 'Polo', 'Blouse', 'Hoodie', 'Skip', 'Blazer'],
      dtype=object)

In [51]:
# We want to keep everything that's not "Not sure" and has a count more than 100
labels_count = image_label.groupby('label')['label'].count().sort_values(ascending=False)
labels_count = pd.DataFrame(labels_count).rename({'label':'count'}, axis=1).reset_index()
to_keep = [i for i in labels_count[labels_count['count'] >= 100].label.to_list() if i != 'Not sure']

In [52]:
image_label_to_keep = image_label[image_label['label'].isin(to_keep)]
image_label_to_keep.label.unique()

array(['T-Shirt', 'Shoes', 'Shorts', 'Shirt', 'Pants', 'Skirt', 'Outwear',
       'Dress', 'Longsleeve', 'Undershirt', 'Hat', 'Polo', 'Hoodie',
       'Blazer'], dtype=object)

In [53]:
# Lower the label
image_label_to_keep['label'] = image_label_to_keep['label'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_label_to_keep['label'] = image_label_to_keep['label'].str.lower()


### 3.2 Move Images to Correct Locations <a id='3_2'></a>

In [54]:
image_label_to_keep.shape

(4961, 4)

In [61]:
pwd

'/Users/liqingyang/Documents/GitHub/few_shot_clothing_detection/data'

In [69]:
### Make the folder
folder_names = list(image_label_to_keep['label'].unique())
base_path = './clean_data/top_100/'


for folder_name in folder_names:
    folder_path = os.path.join(base_path, folder_name)

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_name}' created.")
    else:
        print(f"Folder '{folder_name}' already exists.")

Folder 't-shirt' already exists.
Folder 'shoes' already exists.
Folder 'shorts' already exists.
Folder 'shirt' already exists.
Folder 'pants' already exists.
Folder 'skirt' already exists.
Folder 'outwear' already exists.
Folder 'dress' already exists.
Folder 'longsleeve' already exists.
Folder 'undershirt' already exists.
Folder 'hat' already exists.
Folder 'polo' already exists.
Folder 'hoodie' already exists.
Folder 'blazer' already exists.


#### 3.2.1 For All <a id='3_2_1'></a>

In [94]:
image_folder = '/Users/liqingyang/Documents/GitHub/few_shot_clothing_detection/data/raw_data/'  # Path to the raw images
output_folder = './clean_data/all_data/'  # Path to new location

In [95]:
for index, row in image_label_to_keep.iterrows():
    image_name = row['image']
    image_name += ".jpg"
    label = row['label']
    
    # Define the source and destination paths
    src = os.path.join(image_folder, image_name)
    dst_folder = os.path.join(output_folder, label)
    
    # Move the image to the destination folder
    if os.path.exists(src):  # Check if the source image exists
        shutil.copy(src, dst_folder)
    else:
        print(f"Image {image_name} not found in {image_folder}")

print("Images moved successfully.")

Images moved successfully.


#### 3.2.2 For Top 100 <a id='3_2_2'></a>

In [97]:
image_folder = '/Users/liqingyang/Documents/GitHub/few_shot_clothing_detection/data/raw_data/'  # Path to the raw images
output_folder = './clean_data/top_100/'  # Path to new location
image_label_top_100 = image_label_to_keep.groupby('label').head(100)

In [98]:
image_label_top_100.shape

(1400, 4)

In [99]:
for index, row in image_label_top_100.iterrows():
    image_name = row['image']
    image_name += ".jpg"
    label = row['label']
    
    # Define the source and destination paths
    src = os.path.join(image_folder, image_name)
    dst_folder = os.path.join(output_folder, label)
    
    # Move the image to the destination folder
    if os.path.exists(src):  # Check if the source image exists
        shutil.copy(src, dst_folder)
    else:
        print(f"Image {image_name} not found in {image_folder}")

print("Images moved successfully.")

Images moved successfully.


#### 3.2.3 For Top 10 <a id='3_2_3'></a>

In [102]:
image_folder = '/Users/liqingyang/Documents/GitHub/few_shot_clothing_detection/data/raw_data/'  # Path to the raw images
output_folder = './clean_data/top_10/'  # Path to new location
image_label_top_10 = image_label_to_keep.groupby('label').head(10)
image_label_top_10.shape

(140, 4)

In [103]:
for index, row in image_label_top_10.iterrows():
    image_name = row['image']
    image_name += ".jpg"
    label = row['label']
    
    # Define the source and destination paths
    src = os.path.join(image_folder, image_name)
    dst_folder = os.path.join(output_folder, label)
    
    # Move the image to the destination folder
    if os.path.exists(src):  # Check if the source image exists
        shutil.copy(src, dst_folder)
    else:
        print(f"Image {image_name} not found in {image_folder}")

print("Images moved successfully.")

Images moved successfully.


### 3.3 Data Augmentation <a id='3_3'></a>

In [114]:
folder_0 = './clean_data/top_10/blazer/'
folder_1 = './clean_data/top_10/t-shirt/'

# Mark non_head_and_shoulder as 0 and head_and_shoulder as 1
images_0, labels_0 = load_images(folder_0, 0)
images_1, labels_1 = load_images(folder_1, 1)

images = np.array(images_0 + images_1)
labels = np.array(labels_0 + labels_1)

# Split data into train and test sets, we also stratify them 
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42, stratify=labels)

# Data augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    rescale=1./255
)

datagen.fit(X_train)

In [115]:
X_train

array([[[[225, 216, 209],
         [231, 222, 215],
         [234, 225, 218],
         ...,
         [ 54,  54,  52],
         [ 52,  52,  50],
         [ 53,  53,  51]],

        [[229, 220, 213],
         [239, 230, 223],
         [233, 224, 217],
         ...,
         [ 55,  55,  51],
         [ 53,  52,  49],
         [ 53,  52,  48]],

        [[228, 219, 212],
         [237, 228, 221],
         [232, 223, 216],
         ...,
         [ 55,  53,  48],
         [ 54,  52,  47],
         [ 56,  54,  49]],

        ...,

        [[221, 212, 198],
         [228, 219, 205],
         [226, 217, 203],
         ...,
         [209, 198, 182],
         [211, 202, 185],
         [214, 205, 188]],

        [[224, 214, 202],
         [228, 218, 206],
         [233, 223, 211],
         ...,
         [217, 206, 189],
         [218, 209, 192],
         [218, 209, 192]],

        [[228, 218, 206],
         [234, 224, 212],
         [240, 230, 218],
         ...,
         [212, 202, 185],
        