# School mapping - Montevideo: Model

In [4]:
#@title Verificar presencia de placa GPU { display-mode: "form" }

import tensorflow as tf
print("GPU device name: {}".format(tf.test.gpu_device_name()))

GPU device name: /device:GPU:0


In [5]:
#@title Descargar datasets { display-mode: 'form' }
#@test {'output': 'ignore'}

#!mkdir -p 1/
#!gcloud config set project golden-system-178513
#!gsutil -m cp gs://dym-temp/school-mapping/datasets/1.tar.gz 1/
#!cd 1/ && tar xzf 1.tar.gz

In [6]:
import numpy as np
import os
import csv
import gc

from glob import glob
from keras.applications.resnet50 import ResNet50
from keras.layers import Flatten, Dense, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras import optimizers
from keras import backend as K 

In [7]:
DATASET_DIR = '../data/ds1/'

WIDTH = 300
HEIGHT = 300
CLASSES = 3

In [8]:
TRAIN_DIR = os.path.join(DATASET_DIR, 'train')
VAL_DIR = os.path.join(DATASET_DIR, 'test')

train_files = glob(os.path.join(TRAIN_DIR, '*.jpg'))
val_files = glob(os.path.join(VAL_DIR, '*.jpg'))

n_train_samples = len(train_files)
n_val_samples = len(val_files)

n_train_samples, n_val_samples

(39905, 9977)

In [36]:
model = ResNet50(weights='imagenet',
                 include_top=False,
                 input_shape=(WIDTH, HEIGHT, 3))



In [12]:
FC_SIZE = 1024
DROPOUT = 0.5

In [38]:
x = model.output
x = Flatten()(x)
x = Dense(FC_SIZE, activation='relu')(x)
x = Dropout(DROPOUT)(x)
predictions = Dense(CLASSES, activation='sigmoid')(x)

In [13]:
LR = 0.0001
#MOMENTUM = 0.9

In [40]:
model_final = Model(inputs=model.input, outputs=predictions)

model_final.compile(loss='binary_crossentropy',
                    optimizer=optimizers.RMSprop(lr=LR),
                    metrics=['accuracy'])

In [41]:
model_final.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 300, 300, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 306, 306, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 150, 150, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 150, 150, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation

## Data augmentation

In [42]:
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator

In [14]:
BATCH_SIZE = 40

In [44]:
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    horizontal_flip=True,
    fill_mode="nearest",
    zoom_range=0.3,
    width_shift_range=0.3,
    height_shift_range=0.3,
    rotation_range=30)

test_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input)

In [45]:
train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR,
    classes=[''],
    target_size=(HEIGHT, WIDTH),
    batch_size=BATCH_SIZE,
    class_mode="categorical")

validation_generator = test_datagen.flow_from_directory(
    VAL_DIR,
    classes=[''],    
    target_size=(HEIGHT, WIDTH),
    class_mode="categorical")

Found 39905 images belonging to 1 classes.
Found 9977 images belonging to 1 classes.


In [46]:
def parse_label_row(row):
    labels = list(row.values())[1:]
    labels = [int(label) for label in labels]
    return labels

def read_labels_dict(dataset_dir):
    with open(os.path.join(dataset_dir, 'labels.csv')) as csvfile:
        reader = csv.DictReader(csvfile)
        return {row['img']: parse_label_row(row) for row in reader}

In [47]:
labels_dict = read_labels_dict(DATASET_DIR)

In [48]:
def build_data_generator(gen, labels_dict):
    for x in gen:
        idx = gen.batch_index * gen.batch_size
        filenames = gen.filenames[idx : idx + gen.batch_size]
        labels = np.array([labels_dict[fname] for fname in filenames])
        
        if x[0].shape[0] == labels.shape[0]:
            yield x[0], labels

In [49]:
train_datagen = build_data_generator(train_generator, labels_dict)
val_datagen = build_data_generator(validation_generator, labels_dict)

## Training

In [50]:
CLASS_WEIGHT = {0: 0.42842266, 1: 1.51206851, 2: 221.69444444}

In [52]:
WEIGHTS_PATH = "weights-2.h5"

In [53]:
from keras.callbacks import Callback
import subprocess

class UploadToStorageCallback(Callback):
    def __init__(self, gspath):
        super(UploadToStorageCallback, self).__init__()
        self.gspath = gspath
        
    def on_epoch_end(self, epoch, logs=None):
        cmd = 'gsutil -m cp -n *.h5 {}'.format(self.gspath)
        print(cmd)
        subprocess.run(cmd, shell=True)

In [54]:
checkpoint = ModelCheckpoint(WEIGHTS_PATH,
    monitor = 'val_acc',
    verbose = 1,
    save_best_only = True,
    save_weights_only = False,
    mode = 'auto',
    period = 1)

upload_to_storage = UploadToStorageCallback(
    'gs://dym-temp/school-mapping/models/')

early = EarlyStopping(
    monitor = 'val_acc',
    min_delta = 0,
    patience = 10,
    verbose = 1,
    mode = 'auto')

reduce_lr = ReduceLROnPlateau(
    monitor = 'val_loss',
    factor = 0.2,
    patience = 5,
    min_lr = 0.0001)

In [55]:
SCHEDULE = [
    dict(epochs=5, lr=0.001, layers=4),
    dict(epochs=30, lr=0.001, layers=30),
]

In [56]:
def freeze_last_layers(model, n_layers):
    for layer in model.layers:
        layer.trainable = False
    for layer in model.layers[-n_layers:]:
        layer.trainable = True

In [57]:
model = ResNet50(weights='imagenet',
                 include_top=False,
                 input_shape=(WIDTH, HEIGHT, 3))

x = model.output
x = Flatten()(x)
x = Dense(FC_SIZE, activation='relu')(x)
x = Dropout(DROPOUT)(x)
predictions = Dense(CLASSES, activation='sigmoid')(x)

model_final = Model(inputs=model.input, outputs=predictions)

histories = []
for i, opts in enumerate(SCHEDULE):
    print("===== Schedule step {} =====".format(i))

    freeze_last_layers(model_final, opts['layers'])

    model_final.compile(loss='binary_crossentropy',
        optimizer=optimizers.SGD(lr=opts['lr']),
        metrics=['acc'])
    
    history = model_final.fit_generator(
        train_datagen,
        steps_per_epoch = n_train_samples // BATCH_SIZE,
        epochs = opts['epochs'], 
        validation_data = val_datagen,
        validation_steps = n_val_samples // BATCH_SIZE,
        class_weight = CLASS_WEIGHT,
        #callbacks = [checkpoint, early, reduce_lr])
        #callbacks = [checkpoint, early, upload_to_storage])
        callbacks = [checkpoint, early])
      
    histories.append(history)
    model_final.load_weights(WEIGHTS_PATH)
    
    #K.clear_session()
    
#model_final.save('final.h5')



===== Schedule step 0 =====
Epoch 1/5

Epoch 00001: val_acc improved from -inf to 0.48532, saving model to weights-2.h5
Epoch 2/5

Epoch 00002: val_acc improved from 0.48532 to 0.83049, saving model to weights-2.h5
Epoch 3/5

Epoch 00003: val_acc improved from 0.83049 to 0.86914, saving model to weights-2.h5
Epoch 4/5

Epoch 00004: val_acc did not improve from 0.86914
Epoch 5/5

Epoch 00005: val_acc did not improve from 0.86914
===== Schedule step 1 =====
Epoch 1/30

Epoch 00001: val_acc did not improve from 0.86914
Epoch 2/30

Epoch 00002: val_acc did not improve from 0.86914
Epoch 3/30

Epoch 00003: val_acc improved from 0.86914 to 0.87006, saving model to weights-2.h5
Epoch 4/30

Epoch 00004: val_acc did not improve from 0.87006
Epoch 5/30

Epoch 00005: val_acc did not improve from 0.87006
Epoch 6/30

Epoch 00006: val_acc did not improve from 0.87006
Epoch 7/30

Epoch 00007: val_acc did not improve from 0.87006
Epoch 8/30

Epoch 00008: val_acc did not improve from 0.87006
Epoch 9/30

In [None]:
#!gsutil -m cp -n *.h5 gs://dym-temp/school-mapping/models/

In [None]:
K.clear_session()
del model_final
del model
gc.collect()

## Evaluation

Now we evaluate model over validation set and only "schools" class

In [9]:
from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input
from sklearn.metrics import confusion_matrix
import numpy as np

In [10]:
#!gsutil cp gs://dym-temp/school-mapping/models/*.h5 .

In [15]:
WEIGHTS_PATH = 'weights-2.h5'

model = ResNet50(weights='imagenet',
                 include_top=False,
                 input_shape=(WIDTH, HEIGHT, 3))
x = model.output
x = Flatten()(x)
x = Dense(FC_SIZE, activation='relu')(x)
x = Dropout(DROPOUT)(x)
predictions = Dense(CLASSES, activation='sigmoid')(x)
model_final = Model(inputs=model.input, outputs=predictions)
model_final.load_weights(WEIGHTS_PATH)



In [16]:
model_final.output_shape

(None, 3)

In [17]:
target_size = model_final.input_shape[1:3]

In [18]:
img_path = os.path.join(DATASET_DIR, 'test', '0_93.jpg')
img = image.load_img(img_path, target_size=target_size)
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

preds = model_final.predict(x)
preds

array([[2.0344121e-19, 3.8215993e-20, 0.0000000e+00]], dtype=float32)

In [22]:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def predict_images(img_path, batch_size=40):
    images = glob(os.path.join(img_path, '*.jpg'))
    # XXX
    ones = np.ones((batch_size, 3))
    for g, img_group in enumerate(grouper(images, batch_size)):
        imgs = [image.load_img(path, target_size=target_size) for path in img_group]
        arrays = np.array([image.img_to_array(img) for img in imgs])
        pre_arrays = preprocess_input(arrays)
        preds = model_final.predict(pre_arrays)
        print(np.round(preds))
        return

In [23]:
y_pred = predict_images(os.path.join(DATASET_DIR, 'test'))
y_pred

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [13]:
def parse_label_row(row):
    labels = list(row.values())[1:]
    labels = [int(label) for label in labels]
    return labels

def get_labels_from_data_subset(dataset_dir, subset_name):
    labels_path = os.path.join(dataset_dir, 'labels.csv')
    with open(labels_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        rows_by_img = { row['img']: row for row in reader }
    images = glob(os.path.join(dataset_dir, subset_name, '*.jpg'))
    basenames = [os.path.basename(img) for img in images]
    instances_subset = [rows_by_img[img] for img in basenames]
    y = np.array([parse_label_row(instance) for instance in instances_subset])
    return y

In [14]:
y_true = get_labels_from_data_subset(DATASET_DIR, 'test')
y_true.shape, y_true

ValueError: invalid literal for int() with base 10: '99_249.jpg'