In [1]:
import os
import pprint
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
# Pretty-printer configured with a 4-space indent, for readable display of nested structures.
pp = pprint.PrettyPrinter(indent=4)

In [None]:

def write_pkl(img_data, pklname, img_dir='./images_clean'):
    """
    Load the images listed in a CSV file and pickle them with their labels.

    Every CSV row whose file name has a supported image extension
    (.jpg, .jpeg or .png, matched case-insensitively) is read from ``img_dir``
    at its native resolution -- no resizing is applied. The image arrays are
    stored in a dictionary together with their category ids and file names,
    and the dictionary is written with joblib to '{pklname}.pkl'.

    Parameters
    ----------
    img_data: str
        CSV file with an 'img_file_name' and a 'category_id' column
    pklname: str
        path to the output file, without the '.pkl' extension
    img_dir: str
        directory that contains the image files (default './images_clean')

    Returns
    -------
    None
        The result is written to disk; nothing is returned.
    """
    supported_extensions = {'.jpg', '.jpeg', '.png'}

    # key order is kept stable because later cells print the keys
    data = {
        'description': 'product images in rgb',
        'label': [],
        'filename': [],
        'data': [],
    }

    pklname = f"{pklname}.pkl"

    df = pd.read_csv(img_data)
    for file_name, category_id in zip(df['img_file_name'], df['category_id']):
        # str() guards against missing file names, which pandas reads as NaN
        extension = os.path.splitext(str(file_name))[1].lower()
        if extension not in supported_extensions:
            continue

        im = imread(os.path.join(img_dir, file_name))
        data['label'].append(category_id)
        data['filename'].append(file_name)
        data['data'].append(im)

    joblib.dump(data, pklname)

In [None]:
# Build the pickled dataset 'image_cat.pkl' from the rows of 'image_cat.csv'.
write_pkl(img_data='image_cat.csv', pklname='image_cat')

In [None]:
# Reload the pickled dataset and report its size, keys, description,
# the shape of the first image and the distinct labels.
data = joblib.load('image_cat.pkl')

print('number of samples: ', len(data['data']))
print('keys: ', [*data])
print('description: ', data['description'])
print('image shape: ', data['data'][0].shape)
print('labels:', np.unique(data['label']))

# Last expression of the cell: number of samples per label.
Counter(data['label'])

In [None]:
# Show one example image per label: the first sample stored for each label.
labels = np.unique(data['label'])

# One subplot per label. squeeze=False always yields a 2-D array of axes, so a
# dataset with a single label does not break the loop below (by default
# plt.subplots returns a bare Axes object in that case).
fig, axes = plt.subplots(1, len(labels), squeeze=False)
fig.set_size_inches(15, 4)

# list.index returns the position of the first sample carrying the label
for ax, label in zip(axes.ravel(), labels):
    idx = data['label'].index(label)

    ax.imshow(data['data'][idx])
    ax.axis('off')
    ax.set_title(label)

# lay out the figure once the images and titles exist, so they are accounted for
fig.tight_layout()

In [2]:
# Reload the pickled dataset and convert the images and labels to numpy arrays.
data = joblib.load('image_cat.pkl')
X, y = np.array(data['data']), np.array(data['label'])

: 

: 

In [None]:
# Hold out 30% of the samples for testing. The shuffle is seeded so the split
# is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# sanity check: every sample ends up in exactly one of the two subsets
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

In [None]:
def plot_bar(y, loc='left', relative=True, ax=None):
    """
    Draw a bar chart of how often each distinct value occurs in ``y``.

    The bars are shifted half a bar width to the left or right of the tick
    position, so that two calls (one 'left', one 'right') on the same axes
    produce a side-by-side comparison.

    Parameters
    ----------
    y: array-like
        labels to count
    loc: str
        'left' or 'right'; side of the tick on which the bars are drawn
    relative: bool
        if True plot percentages of len(y), otherwise plot raw counts
    ax: matplotlib Axes, optional
        axes to draw on; defaults to the current pyplot axes

    Raises
    ------
    ValueError
        if ``loc`` is neither 'left' nor 'right'
    """
    # validate up front: an unknown loc used to surface later as an
    # UnboundLocalError, after part of the work had already been done
    offsets = {'left': -0.5, 'right': 0.5}
    if loc not in offsets:
        raise ValueError(f"loc must be 'left' or 'right', got {loc!r}")
    n = offsets[loc]
    width = 0.35

    # np.unique returns the distinct values sorted, with counts in matching
    # order, so no additional sorting is required
    unique, counts = np.unique(y, return_counts=True)

    if relative:
        # plot as a percentage
        counts = 100 * counts / len(y)
        ylabel_text = '% count'
    else:
        # plot counts
        ylabel_text = 'count'

    if ax is None:
        ax = plt.gca()

    xtemp = np.arange(len(unique))

    ax.bar(xtemp + n * width, counts, align='center', alpha=.7, width=width)
    ax.set_xticks(xtemp)
    ax.set_xticklabels(unique, rotation=45)
    ax.set_xlabel('equipment type')
    ax.set_ylabel(ylabel_text)
 
# Compare the label distribution of the train and test subsets side by side:
# train bars go to the left of each tick, test bars to the right.
plt.suptitle('relative amount of photos per type')
plot_bar(y_train, 'left')
plot_bar(y_test, 'right')

# legend entries follow the order in which the two bar series were drawn
plt.legend(
    ['train ({0} photos)'.format(len(y_train)), 'test ({0} photos)'.format(len(y_test))]
);