<a href="https://colab.research.google.com/github/fabianmax/car-classification/blob/master/notebooks/prefilter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Select TF2.x as version (line magic; tensorflow itself is only imported in a later cell)
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import tensorflow as tf
import numpy as np
from keras.preprocessing.image import image
from keras.applications.resnet_v2 import preprocess_input
from tensorflow.keras.applications import ResNet50V2
import os
from shutil import copy2
from random import sample, seed
import gc

from google.colab import drive

In [0]:
# ImageNet class labels that are treated as "car" classes
# NOTE(review): 15 labels but only 14 indices below -- presumably 'jeep' and
# 'landrover' belong to the same ImageNet class; confirm.
CAR_CLASSES = [
    'minivan',
    'limousine',
    'sports_car',
    'convertible',
    'cab',
    'racer',
    'passenger_car',
    'recreational_vehicle',
    'pickup',
    'police_van',
    'minibus',
    'moving_van',
    'tow_truck',
    'jeep',
    'landrover',
]

# Column indices of the car classes in the 1000-class prediction vector
CAR_IDX = [
    656, 627, 817, 511, 468, 751, 705,
    757, 717, 734, 654, 675, 864, 609,
]

In [5]:
# Mount Google Drive under /content/drive; the raw data zip is copied from it
# and the filtered images are written back to it in later cells
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
!mkdir data
!mkdir data/raw
!cp drive/My\ Drive/Car-Classifier/data/zip/car-classifier-raw.zip data

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘data/raw’: File exists
^C


In [0]:
!mkdir data/filtered

!unzip -q data/car-classifier-raw.zip -d data/raw

In [0]:
# Accumulated car probability above which an image is considered a car.
# Good values are below 0.2; with higher values too many cars are filtered out.
THRESH = 0.1

# Directory images are stored in
IMAGE_DIR = "data/raw/car-classifier-raw/"

# Directory to store split images
STORAGE_DIR = "drive/My Drive/Car-Classifier/data/"

In [0]:
def is_car_acc_prob(predictions, thresh=THRESH, car_idx=CAR_IDX):
    """
    Flag images as cars when the summed probability of all car classes exceeds a threshold

    Args:
        predictions: (?, 1000) matrix of probability predictions resulting from ResNet with imagenet weights
        thresh: accumulated car probability that must be exceeded for an image to count as a car
        car_idx: column indices of the car classes within predictions

    Returns:
        np.array of booleans, one per row of predictions, True where the row is considered a car
    """
    prob_matrix = np.array(predictions, dtype=float)

    # Per row, add up the probability mass assigned to the car columns
    accumulated = prob_matrix[:, car_idx].sum(axis=1)

    is_car = accumulated > thresh
    return is_car

In [0]:
def load_images(filepath, filenames):
    """
    Load images given in filenames into one preprocessed array

    Every image is loaded with target size 224x224, converted to an array and
    written into a single batch array, which is finally passed through
    preprocess_input.

    Args:
        filepath: directory files are stored in. Needed since filenames are passed relative
        filenames: filenames of images of which array representation should be returned

    Returns:
        img_array: array of preprocessed images, one entry per filename
    """
    # Materialize so that len() works for any iterable of filenames
    filenames = list(filenames)

    img_array = None
    for i, file in enumerate(filenames):
        # Coarse progress indicator: one '#' per 1000 images
        if i % 1000 == 0:
            print("#", end="")

        # os.path.join works whether or not filepath ends with a path separator
        img = image.load_img(os.path.join(filepath, file), target_size=(224, 224))
        img = image.img_to_array(img)

        if img_array is None:
            # Allocate the whole batch once, using shape and dtype of the first image.
            # This avoids holding a list of all images plus a stacked copy at the same time.
            img_array = np.empty((len(filenames),) + img.shape, dtype=img.dtype)
        img_array[i] = img

    if img_array is None:
        # No filenames given: fall back to an empty array
        img_array = np.asarray([])

    return preprocess_input(img_array)

In [9]:
# Filenames of all images. os.listdir returns entries in arbitrary order, so the
# list is sorted to make runs (and the optional seeded subsample) reproducible.
# Only regular files are kept so sub-directories are never passed to the image loader.
files = sorted(
    name for name in os.listdir(IMAGE_DIR)
    if os.path.isfile(os.path.join(IMAGE_DIR, name))
)

# Uncomment to work on a random subset instead of all files
# seed(32)
# files = sample(files, 5000)

total_files = len(files)

print("There are {} files to be processed...".format(total_files))

There are 64467 files to be processed...


In [0]:
# Initialize ResNet50V2 model with ImageNet weights; its predictions are
# later passed to is_car_acc_prob to decide car / non-car
model = ResNet50V2(weights='imagenet')

In [20]:
# Predict in chunks of n files so that only one chunk of decoded images is held
# in memory; each chunk is released explicitly before the next one is loaded
n = 8000
pred_list = []
for i in range(0, len(files), n):
    print(i)
    # Slicing clamps at the end of the list, so the last chunk may simply be shorter
    img_array = load_images(IMAGE_DIR, files[i:i + n])
    preds = model.predict(img_array)
    pred_list.append(preds)
    del img_array
    gc.collect()

0
########8000
########16000
########24000
########32000
########40000
########48000
########56000
########64000
#

In [0]:
# Join the per-chunk predictions along the first axis (one row per image)
preds = np.concatenate(pred_list)

In [0]:
# Sanity check: there must be exactly one prediction row per input file
assert preds.shape[0] == len(files), (
    "Expected {} predictions but got {}".format(len(files), preds.shape[0])
)

In [25]:
# Boolean flag per image: True if the accumulated car probability exceeds the threshold
res = is_car_acc_prob(preds)

# Filenames of images flagged as cars
car_images = [name for name, flagged in zip(files, res) if flagged]

# Filenames of images not flagged as cars
non_car_images = [name for name, flagged in zip(files, res) if not flagged]

print(len(car_images), len(non_car_images))

52610 11857


In [0]:
# Target directory name encodes the threshold with '.' replaced by '_'
storage_dir_ext = STORAGE_DIR + 'thresh' + str(THRESH).replace('.', '_') + '/'
cars_dir = os.path.join(storage_dir_ext, 'cars')
non_cars_dir = os.path.join(storage_dir_ext, 'non_cars')

# makedirs with exist_ok=True also creates missing parent directories and
# does not fail when the directories already exist, so the cell can be re-run
for target_dir in (cars_dir, non_cars_dir):
    os.makedirs(target_dir, exist_ok=True)

# Copy every image into the directory matching its classification
for filename in car_images:
    copy2(os.path.join(IMAGE_DIR, filename), cars_dir)

for filename in non_car_images:
    copy2(os.path.join(IMAGE_DIR, filename), non_cars_dir)