In [1]:
import os
import pandas as pd
from scipy.io import loadmat
from random import randrange
import random
import numpy as np
from matplotlib import pyplot as plt
import cv2
import imageio
import imgaug.augmenters as iaa
import glob
from pathlib import Path
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

%matplotlib inline

In [2]:
def getListOfFiles(dirName):
    """Recursively collect the paths of all ``.jpg`` files under ``dirName``.

    Args:
        dirName: Directory to scan. Sub-directories are descended into.

    Returns:
        list[str]: One path per matching file, built with ``os.path.join``
        from ``dirName`` downwards. Entries follow ``os.listdir`` order, with
        the contents of a sub-directory placed where that sub-directory
        appears in its parent's listing.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. ``dirName`` is missing).
    """
    allFiles = []
    for entry in os.listdir(dirName):
        fullPath = os.path.join(dirName, entry)

        if os.path.isdir(fullPath):
            # extend() grows the list in place; ``a = a + b`` re-copied the
            # whole accumulated list for every sub-directory.
            allFiles.extend(getListOfFiles(fullPath))
        elif entry.endswith('.jpg'):
            # Test the file name's suffix only. A substring test on the full
            # path also matched names such as ``x.jpg.bak`` and every file
            # inside a directory whose name contains ``.jpg``.
            allFiles.append(fullPath)

    return allFiles

def custom_image_generator(lists, batch_size, mode="train", aug=None):
    """Endlessly yield random image batches with their make and model labels.

    Every batch is assembled by drawing ``batch_size`` random indices (with
    replacement) and loading the corresponding image resized to
    ``IMAGE_SIZE x IMAGE_SIZE``. ``IMAGE_SIZE`` is read from module scope and
    must be defined before the generator is first advanced.

    Args:
        lists: Indexable holding three parallel sequences: ``lists[0]`` image
            file paths, ``lists[1]`` make labels, ``lists[2]`` model labels.
        batch_size: Number of samples per yielded batch.
        mode: Currently unused; kept so existing callers keep working.
        aug: Optional augmenter exposing a Keras ``ImageDataGenerator``-style
            ``flow(x, batch_size=..., shuffle=...)`` method. When given, it is
            applied to the images of every batch.

    Yields:
        tuple: ``(images, [make_ids, model_ids])`` where ``images`` is a numpy
        array of the batch and the labels are two numpy arrays in the same
        sample order.
    """
    while True:
        images = []
        make_ids = []
        model_ids = []

        while len(images) < batch_size:
            random_idx = randrange(len(lists[0]))
            img = load_img(lists[0][random_idx], target_size=(IMAGE_SIZE, IMAGE_SIZE))
            img = img_to_array(img)
            # TODO: Center crop instead of resize
            images.append(img)
            make_ids.append(lists[1][random_idx])
            model_ids.append(lists[2][random_idx])

        images = np.array(images)
        labels = [np.array(make_ids), np.array(model_ids)]

        if aug:
            # flow() insists on len(y) == len(x); the two-element label list
            # only satisfies that when batch_size == 2, so passing it raised.
            # Augment the images alone and disable shuffling so that the
            # returned images stay aligned with the labels built above.
            images = next(aug.flow(images, batch_size=batch_size, shuffle=False))

        yield images, labels

def _dense_dropout_stack(x, units, rate=0.25):
    """Apply one ``Dense(relu)`` + ``Dropout(rate)`` pair per entry of ``units``."""
    for n_units in units:
        x = Dense(n_units, activation='relu')(x)
        x = Dropout(rate)(x)
    return x


def build_model(losses, metrics, num_makes=163, num_models=1716):
    """Build and compile the two-headed make/model classifier.

    The network is a frozen ImageNet-initialised EfficientNetB7 feature
    extractor, global average pooling, a shared 256-256 dense trunk, and two
    128-64 dense branches ending in softmax outputs named ``make_id`` and
    ``model_id``. The input size is taken from the module-level
    ``IMAGE_SIZE``, which must be defined before this function is called.

    Args:
        losses: Passed to ``Model.compile(loss=...)``.
        metrics: Passed to ``Model.compile(metrics=...)``.
        num_makes: Width of the ``make_id`` softmax output.
        num_models: Width of the ``model_id`` softmax output.

    Returns:
        The compiled Keras ``Model`` with outputs ``[make_id, model_id]``.
    """
    base_model = EfficientNetB7(include_top=False, weights='imagenet')

    # Only the newly added dense layers are trained.
    for layer in base_model.layers:
        layer.trainable = False

    model_input = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3))
    x = base_model(model_input)
    x = GlobalAveragePooling2D()(x)

    # Shared trunk followed by one branch per task. Layers are created in the
    # same order as before so auto-generated layer names do not change.
    x = _dense_dropout_stack(x, (256, 256))
    y1 = _dense_dropout_stack(x, (128, 64))
    y2 = _dense_dropout_stack(x, (128, 64))

    y1 = Dense(num_makes, activation='softmax', name='make_id')(y1)
    y2 = Dense(num_models, activation='softmax', name='model_id')(y2)

    model = Model(inputs=model_input, outputs=[y1, y2])

    # ``lr`` is a deprecated alias that newer Keras releases reject;
    # ``learning_rate`` is the supported spelling.
    model.compile(
        loss=losses,
        optimizer=SGD(learning_rate=0.01, momentum=0.9),
        metrics=metrics,
    )

    return model

In [3]:
def plotImages(file_paths):
    """Display a 3x4 grid of randomly chosen images from ``file_paths``.

    Args:
        file_paths: Sequence of image file paths to sample from. If it holds
            fewer than 12 entries, all of them are shown and the remaining
            grid cells stay empty.

    Returns:
        None. The images are drawn on a new 10x10 inch matplotlib figure.
    """
    n_rows, n_cols = 3, 4
    n_shown = min(n_rows * n_cols, len(file_paths))
    chosen = random.sample(file_paths, n_shown)

    plt.figure(figsize=(10, 10))
    # One subplot per image, positions 1..12. The previous hand-written
    # version reused positions 2 and 3 for the last two images, so they were
    # drawn over earlier ones and the last two grid cells were never filled.
    for position, path in enumerate(chosen, start=1):
        plt.subplot(n_rows, n_cols, position)
        # cv2.imread returns channels in BGR order while imshow expects RGB;
        # reverse the channel axis so colours are displayed correctly.
        plt.imshow(cv2.imread(path)[:, :, ::-1])
        plt.axis('off')

In [4]:
# Index every image under the dataset root and derive its labels from the
# directory layout <root>/<make_id>/<model_id>/.../<file>.jpg.
image_root = Path('data/image')
file_paths = getListOfFiles('data/image')
df = pd.DataFrame(file_paths, columns=['filename'])

# Read the labels from the path components relative to the root instead of
# splitting on '/': this works with any OS path separator and does not
# depend on how many components the root itself has.
relative_parts = df['filename'].map(lambda p: Path(p).relative_to(image_root).parts)
df['make_id'] = relative_parts.map(lambda parts: parts[0])
df['model_id'] = relative_parts.map(lambda parts: parts[1])
df.to_csv('df.csv', index=False)

In [5]:
# Number of distinct make labels in the full image index.
df['make_id'].nunique()

163

In [6]:
# Number of distinct model labels in the full image index.
df['model_id'].nunique()

1716

In [7]:
# Set aside every image whose (make_id, model_id) pair has fewer than 6
# images. NOTE(review): presumably the threshold keeps the later stratified
# splits feasible — confirm.
pair_counts = df[['make_id', 'model_id']].value_counts()
drop_rows = pair_counts[pair_counts < 6].keys().tolist()

# Build the mask on df's own index so boolean selection stays aligned even
# if df does not carry a default RangeIndex.
is_rare_pair = pd.Series(
    list(zip(df['make_id'], df['model_id'])),
    index=df.index,
).isin(drop_rows)

# Select directly instead of appending to an empty frame:
# DataFrame.append was removed in pandas 2.0.
drop_table = df[is_rare_pair].drop_duplicates(keep=False)
drop_table

Unnamed: 0,filename,make_id,model_id
2965,data/image/59/358/2009/d163010a9407bd.jpg,59,358
2966,data/image/59/358/2009/f3e8b21e67f030.jpg,59,358
2967,data/image/59/358/2009/e4035be5bee1f4.jpg,59,358
2968,data/image/59/358/2009/7a60b138b980af.jpg,59,358
3891,data/image/66/1995/2012/0c54d61c25a44a.jpg,66,1995
...,...,...,...
135569,data/image/149/1804/2012/d8accda4213fd6.jpg,149,1804
135570,data/image/149/1804/2012/97ee687e784b99.jpg,149,1804
135571,data/image/149/1804/2012/f3b2f33ed7693e.jpg,149,1804
135572,data/image/149/1804/2012/d0effb5ce242c3.jpg,149,1804


In [8]:
# Keep only the images that were not set aside in drop_table.
is_dropped = df['filename'].isin(drop_table['filename'])
temp_df = df[~is_dropped]
temp_df

Unnamed: 0,filename,make_id,model_id
0,data/image/135/947/2009/deab76f22e8937.jpg,135,947
1,data/image/135/947/2009/b00fef3e12bdf0.jpg,135,947
2,data/image/135/947/2009/69ada64edf3ebb.jpg,135,947
3,data/image/135/947/2009/60b5fa727f82f2.jpg,135,947
4,data/image/135/947/2009/83ce148be0dcb5.jpg,135,947
...,...,...,...
136721,data/image/25/1757/2010/e43225a8f5170e.jpg,25,1757
136722,data/image/25/1757/2010/3283c7054ab6af.jpg,25,1757
136723,data/image/25/1757/2010/c03431b205c981.jpg,25,1757
136724,data/image/25/1757/2010/c0f30e5d156b4d.jpg,25,1757


In [9]:
# Images per (make_id, model_id) pair among the retained rows.
temp_df.loc[:, ['make_id', 'model_id']].value_counts()

make_id  model_id
54       196         303
157      1915        288
81       68          283
77       127         272
157      1917        237
                    ... 
27       1082          6
131      1011          6
138      1411          6
118      994           6
149      1808          6
Length: 1677, dtype: int64

In [10]:
# Hold out 10% of the retained images for testing, stratified on the
# (make_id, model_id) pair.
train, test = train_test_split(
    temp_df,
    test_size=0.10,
    shuffle=True,
    random_state=42,
    stratify=temp_df[['make_id', 'model_id']],
)

# The images set aside in drop_table are added to the test set only.
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the original
# row index exactly as append did.
test = pd.concat([test, drop_table])

In [11]:
# Carve a 25% validation set out of the training rows, stratified on the
# (make_id, model_id) pair.
# NOTE: this rebinds ``train``; running the cell a second time without
# re-running the previous split would shrink the training set again.
stratify_labels = train[['make_id', 'model_id']]
train, val = train_test_split(
    train,
    test_size=0.25,
    shuffle=True,
    random_state=42,
    stratify=stratify_labels,
)

In [15]:
# Distinct make labels present in the training split.
train['make_id'].nunique()

162

In [16]:
# Distinct model labels present in the validation split.
val['model_id'].nunique()

1677

In [341]:
# Persist the three splits as CSV files without the row index.
for csv_path, split_frame in (
    ('train.csv', train),
    ('val.csv', val),
    ('test.csv', test),
):
    split_frame.to_csv(csv_path, index=False)