<a href="https://colab.research.google.com/github/dincbariscagri/cng562_project/blob/master/googleimage_562project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Google Drive Authorization for Storage
To get authorization for the Google Drive storage, contact dincbariscagri@gmail.com; I can send you the required authorization code directly.


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Data Downloading, Adjustments and Storing into Drive

**You do not need to run the code in this section.**

In [0]:
import logging 
import math
import os
import subprocess
from multiprocessing import Pool
from PIL import Image
import shutil
def create_logger(filename, 
                  logger_name='logger', 
                  file_fmt='%(asctime)s %(levelname)-8s: %(message)s',
                  console_fmt='%(asctime)s | %(message)s',
                  file_level=logging.DEBUG, 
                  console_level=logging.INFO):
    """Return a logger that writes to both a log file and the console.

    Args:
        filename: Path of the log file handed to ``logging.FileHandler``.
        logger_name: Name passed to ``logging.getLogger``.
        file_fmt: Format string for records written to the file.
        console_fmt: Format string for records written to the console.
        file_level: Minimum level of records written to the file.
        console_level: Minimum level of records written to the console.

    Returns:
        The configured ``logging.Logger``. Its own level is DEBUG, so the
        two handler levels alone decide what is emitted; propagation to the
        root logger is switched off.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    # logging.getLogger returns the same object for the same name, so a
    # second call (e.g. re-running a notebook cell) would otherwise stack
    # another pair of handlers and duplicate every message.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    log_file = logging.FileHandler(filename)
    log_file.setLevel(file_level)
    log_file.setFormatter(logging.Formatter(file_fmt))
    logger.addHandler(log_file)

    log_console = logging.StreamHandler()
    # Honour the caller's threshold; it used to be hard-wired to DEBUG.
    log_console.setLevel(console_level)
    log_console.setFormatter(logging.Formatter(console_fmt))
    logger.addHandler(log_console)

    return logger


def move_images_from_sub_to_root_folder(root_folder, subfolder):
    """Recursively move every non-directory entry under ``subfolder`` into ``root_folder``.

    Nested directories are visited first (depth-first); afterwards the plain
    entries that were directly inside ``subfolder`` are moved. The emptied
    directories themselves are left in place.

    Args:
        root_folder: Directory that receives the files.
        subfolder: Directory whose tree is flattened; it may be the same
            directory as ``root_folder``.
    """
    # Listed once, before recursing, so files that the recursion drops into
    # this directory are not picked up a second time.
    subfolder_content = os.listdir(subfolder)
    folders_in_subfolder = [
        i for i in subfolder_content if os.path.isdir(os.path.join(subfolder, i))
    ]
    for folder_in_subfolder in folders_in_subfolder:
        move_images_from_sub_to_root_folder(
            root_folder, os.path.join(subfolder, folder_in_subfolder)
        )

    nested_names = set(folders_in_subfolder)
    for image in subfolder_content:
        if image in nested_names:
            continue
        path_to_image = os.path.join(subfolder, image)
        destination = os.path.join(root_folder, image)
        # A file that already sits in the root folder needs no move.
        if os.path.abspath(path_to_image) == os.path.abspath(destination):
            continue
        # shutil.move instead of a shell "mv" string: names containing spaces
        # or shell metacharacters are handled and no shell is spawned per file.
        shutil.move(path_to_image, destination)
        
        
def remove_all_subfolders_inside_folder(folder):
    """Delete every directory located directly inside ``folder``.

    Plain files in ``folder`` are left untouched.

    Args:
        folder: Directory whose immediate sub-directories are removed,
            together with everything they contain.
    """
    for entry in os.listdir(folder):
        path_to_subfolder = os.path.join(folder, entry)
        if not os.path.isdir(path_to_subfolder):
            continue
        if os.path.islink(path_to_subfolder):
            # shutil.rmtree refuses symlinks; drop the link itself, which is
            # what "rm -r" did.
            os.remove(path_to_subfolder)
        else:
            # No shell involved, so names with spaces or metacharacters work.
            shutil.rmtree(path_to_subfolder)
        
        
def resize_folder_images(src_dir, dst_dir, size=224):
    """Resize every readable image in ``src_dir`` and save it under ``dst_dir``.

    Each entry of ``src_dir`` is passed to ``read_and_resize_image``; entries
    for which that returns ``None`` are skipped. The rest are converted to
    RGB and saved under the same file name in ``dst_dir``, which is created
    when missing. The number of saved images is logged at DEBUG level.

    Args:
        src_dir: Directory containing the source images.
        dst_dir: Directory receiving the resized copies.
        size: Target size forwarded to ``read_and_resize_image``.
    """
    if not os.path.isdir(dst_dir):
        logger.info("destination directory does not exist, creating destination directory.")
        os.makedirs(dst_dir)

    resized_total = 0
    for name in os.listdir(src_dir):
        resized = read_and_resize_image(os.path.join(src_dir, name), size)
        if resized is None:
            continue
        resized.convert("RGB").save(os.path.join(dst_dir, name))
        resized_total += 1
    logger.debug(f'{src_dir} files resized: {resized_total}')
    
    
def read_and_resize_image(filepath, size):
    """Open the image at ``filepath`` and return it resized to ``size``.

    Whatever falsy value ``read_image`` yields (``None`` when the file
    cannot be opened) is returned unchanged instead of being resized.
    """
    loaded = read_image(filepath)
    return resize_image(loaded, size) if loaded else loaded


def resize_image(img, size):
    """Resize ``img`` to ``size`` via ``resize_contain``.

    Args:
        img: Image object forwarded to ``resize_contain``.
        size: Either a single int (used for both width and height) or a
            two-element ``(width, height)`` sequence.

    Returns:
        Whatever ``resize_contain`` returns for the normalised size.

    Raises:
        ValueError: If ``size`` is a sequence whose length is not exactly 2.
    """
    if isinstance(size, int):
        size = (size, size)
    # Exactly two components are required. Rejecting only len > 2 let empty
    # or one-element sizes through to an IndexError inside resize_contain.
    if len(size) != 2:
        raise ValueError("Size needs to be specified as Width, Height")
    return resize_contain(img, size)


def read_image(filepath):
    """Open ``filepath`` with ``Image.open``; return ``None`` on any failure.

    A failure is reported through ``logger.debug`` and never raised to the
    caller.
    """
    try:
        return Image.open(filepath)
    except Exception:
        # OSError is a subclass of Exception, so one clause covers both.
        logger.debug("Can't read file {}".format(filepath))
    return None


def resize_contain(image, size, resample=Image.LANCZOS, bg_color=(255, 255, 255, 0)):
    """Fit ``image`` inside a ``size`` canvas and return the canvas as RGB.

    A copy of the image is reduced with ``thumbnail`` so that it fits within
    ``size``, then pasted at the centre of a new RGBA canvas filled with
    ``bg_color``. The input image is not modified.

    Args:
        image: Source image object.
        size: Indexable ``(width, height)``; only elements 0 and 1 are read.
        resample: Resampling filter passed to ``thumbnail``.
        bg_color: RGBA fill colour of the canvas.

    Returns:
        The composed canvas converted to mode ``'RGB'``.
    """
    canvas_w, canvas_h = size[0], size[1]
    source_format = image.format

    shrunk = image.copy()
    shrunk.thumbnail((canvas_w, canvas_h), resample)

    canvas = Image.new('RGBA', (canvas_w, canvas_h), bg_color)
    # Centre the thumbnail; ceil pushes an odd leftover pixel to the left/top
    # margin.
    offset_x = int(math.ceil((canvas_w - shrunk.size[0]) / 2))
    offset_y = int(math.ceil((canvas_h - shrunk.size[1]) / 2))
    canvas.paste(shrunk, (offset_x, offset_y))
    canvas.format = source_format
    return canvas.convert('RGB')
    
    
def download_resize_clean(index):
    """Download one image archive, verify it, and store resized images in ``train``.

    Steps for archive number ``index`` (zero-padded to three digits):

    1. download ``images_XXX.tar`` and ``md5.images_XXX.txt`` with ``wget``;
    2. compare the ``md5sum`` of the archive with the control value;
    3. on a match: unpack into ``images_XXX/``, flatten that folder, resize
       every image into ``train/`` at 224 px, then delete the unpacked
       folder, the archive and the md5 file;
    4. on a mismatch: log an error and leave the downloaded files in place.

    Any exception is logged with its traceback and swallowed, so one broken
    archive does not abort a ``Pool.map`` over many indices.

    Args:
        index: Archive number; formatted as ``{0:0>3}``.
    """
    # Placeholder so the failure log below never touches an unbound name,
    # even if building the real file name is what failed.
    images_file_name = f'<index {index!r}>'
    try:
        os.makedirs('train', exist_ok=True)

        file_index = '{0:0>3}'.format(index)
        images_file_name = f'images_{file_index}.tar'
        images_folder = images_file_name.split('.')[0]
        images_md5_file_name = f'md5.images_{file_index}.txt'
        images_tar_url = f'https://s3.amazonaws.com/google-landmark/train/{images_file_name}'
        images_md5_url = f'https://s3.amazonaws.com/google-landmark/md5sum/train/{images_md5_file_name}'

        logger.info(f'Downloading: {images_file_name} and {images_md5_file_name}')
        # Argument lists instead of shell strings: no shell parsing involved.
        subprocess.run(['wget', images_tar_url])
        subprocess.run(['wget', images_md5_url])

        logger.debug('Checking file md5 and control md5')
        md5_run = subprocess.run(
            ["md5sum", images_file_name],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT
        )
        md5_images = md5_run.stdout.decode("utf-8").split(' ')[0]
        # A missing md5 file (failed download) raises here and is reported by
        # the handler at the bottom.
        with open(images_md5_file_name) as md5_file:
            md5_control = md5_file.read().split(' ')[0]

        if md5_images == md5_control:
            logger.debug(f'MD5 are the same: {md5_images}, {md5_control}')
            logger.debug(f'Unarchiving images into: {images_folder}')
            os.makedirs(images_folder, exist_ok=True)
            subprocess.run(['tar', '-xf', images_file_name, '-C', images_folder])

            logger.debug('Moving images into root folder')
            move_images_from_sub_to_root_folder(images_folder, images_folder)
            remove_all_subfolders_inside_folder(images_folder)

            logger.debug('Resizing images')
            resize_folder_images(
                src_dir=images_folder,
                dst_dir='train',
                size=224
            )
            shutil.rmtree(images_folder)
            os.remove(images_file_name)
            os.remove(images_md5_file_name)
        else:
            logger.error(f'{images_file_name} was not processed due to md5 missmatch')
    except Exception:
        # "except Exception" lets KeyboardInterrupt/SystemExit through, and
        # logger.exception records the traceback that was previously lost.
        logger.exception(f'FAILED TO PROCESS {images_file_name}')

In [0]:
logger = create_logger('download.log')
p = Pool(processes=6)
try:
  # Ten archives per batch; batch i covers archive indices 10*(i-1) .. 10*i-1
  # and is uploaded as train{i}.zip. The extraction cell reads
  # train1.zip .. train50.zip, so 50 batches are required. range(1, 50)
  # stopped after train49.zip and never fetched archives 490-499.
  for i in range(1, 51):
    p.map(download_resize_clean, range(10*(i-1), 10*i))
    shutil.make_archive(f'train{i}', 'zip', "train")
    # Absolute path, matching the '/content/gdrive' mount point, so the upload
    # does not depend on the current working directory.
    shutil.copy(f'train{i}.zip', '/content/gdrive/My Drive/CNG562_Images/')
    # Empty the staging folder so the next batch's zip holds only its own images.
    shutil.rmtree('train', ignore_errors=True)
finally:
  # Release the worker processes even when a batch fails.
  p.close()
  p.join()

## Unzipping from Drive and Other Settings (Class and Data Reduction to Lower the Computation Cost)

In [0]:
import zipfile
# Unpack train1.zip .. train50.zip from Drive into train/.
for i in range(1,51):
  # The context manager closes the archive even if extraction fails part-way.
  with zipfile.ZipFile(f'/content/gdrive/My Drive/CNG562_Images/train{i}.zip', 'r') as zip_ref:
    zip_ref.extractall("train/")

In [0]:
import zipfile

# Unpack the reduced image set (train2test.zip) from Drive into train/.
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile('/content/gdrive/My Drive/CNG562_Images/train2test.zip', 'r') as zip_ref:
  zip_ref.extractall("train/")

In [0]:
!wget https://s3.amazonaws.com/google-landmark/metadata/train.csv

--2019-06-14 18:16:40--  https://s3.amazonaws.com/google-landmark/metadata/train.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.138.245
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.138.245|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 525832518 (501M) [text/csv]
Saving to: ‘train.csv’


2019-06-14 18:16:51 (49.7 MB/s) - ‘train.csv’ saved [525832518/525832518]



In [0]:
import pandas as pd
# Load the metadata table. Ids get a ".jpg" suffix so they can be used as
# file names, and landmark ids are cast to str so they act as class labels.
train = pd.read_csv('train.csv')
train['id'] = train['id'].astype(str) + ".jpg"
train['landmark_id'] = train['landmark_id'].astype(str)

Class Count Reduction

In [0]:
from collections import Counter
# Minimum number of rows a landmark needs in train.csv to be kept as a class.
NUM_THRESHOLD = 250

# Rows per landmark id, then the landmarks that reach the threshold (in order
# of first appearance in the table).
counts = dict(Counter(train['landmark_id']))
landmarks_dict = {
    landmark: []
    for landmark in train.landmark_id.unique()
    if counts[landmark] >= NUM_THRESHOLD
}
NUM_CLASSES = len(landmarks_dict)
print("Total number of valid classes: {}".format(NUM_CLASSES))

# Two-way mapping between landmark ids and dense class indices 0..NUM_CLASSES-1.
idx_to_landmark = list(landmarks_dict)
landmark_to_idx = {
    landmark: position for position, landmark in enumerate(idx_to_landmark)
}

all_urls = train['url'].tolist()
all_landmarks = train['landmark_id'].tolist()

# (url, landmark) pairs restricted to the retained classes.
valid_pairs = [
    (url, landmark)
    for url, landmark in zip(all_urls, all_landmarks)
    if landmark in landmarks_dict
]
# Last path component of the URL -> class index; and the plain URL list.
valid_urls_dict = {
    url.split("/")[-1]: landmark_to_idx[landmark] for url, landmark in valid_pairs
}
valid_urls_list = [url for url, _ in valid_pairs]

NUM_EXAMPLES = len(valid_urls_list)
print("Total number of valid examples: {}".format(NUM_EXAMPLES))

Total number of valid classes: 1067
Total number of valid examples: 478577


In [0]:
# Keep only the rows whose URL belongs to one of the retained classes.
# Series.isin builds the boolean mask directly on train's own index; the former
# detour through a temporary DataFrame and ".any(1)" relied on a positional
# axis argument that pandas >= 2.0 no longer accepts.
train2 = train[train['url'].isin(valid_urls_list)]

In [0]:
!mkdir train2

In [0]:
import shutil
# Copy every image listed in train2 from train/ to train2/.
# The first column holds the file name; iloc selects it by position explicitly
# (row[0] relied on the deprecated positional fallback of label-indexed Series)
# and avoids building a Series per row with iterrows().
for filename in train2.iloc[:, 0]:
  shutil.copyfile(f'train/{filename}', f'train2/{filename}')

In [0]:
import os, os.path
# Number of entries currently in train/ (shown as the cell result).
len(os.listdir('train/'))

478577

In [0]:
# Zip the contents of train2/ into train2test.zip in the working directory.
shutil.make_archive(f'train2test', 'zip', "train2")

'/content/train2test.zip'

In [0]:
# Copy the archive into the Drive folder; the relative path resolves against
# the current working directory.
shutil.copy(f'train2test.zip',f'gdrive/My Drive/CNG562_Images/')

'gdrive/My Drive/CNG562_Images/train2test.zip'

In [0]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Training images are rescaled to [0, 1] and augmented (shear, zoom,
# horizontal flip); evaluation images are only rescaled.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
test_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by both iterators: files come from train/, the file name is in
# column 'id', the label in column 'landmark_id'.
flow_options = dict(
    directory="train/",
    x_col='id',
    y_col='landmark_id',
    class_mode='categorical',
    batch_size=1000,
    target_size=(32,32),
)

# First 430720 rows of train2 for training, the remaining rows for evaluation.
train_set = train_datagen.flow_from_dataframe(dataframe=train2[:430720], **flow_options)
test_set = test_datagen.flow_from_dataframe(dataframe=train2[430720:], **flow_options)

Found 430720 validated image filenames belonging to 1067 classes.
Found 47857 validated image filenames belonging to 1067 classes.


Tensor Core adjustments

In [0]:
import os
import pprint
import tensorflow as tf

# List the devices of the attached TPU runtime, or report that none is attached.
if 'COLAB_TPU_ADDR' not in os.environ:
  print('ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!')
else:
  tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  print ('TPU address is', tpu_address)

  with tf.Session(tpu_address) as session:
    devices = session.list_devices()

  print('TPU devices:')
  # Kept inside the else branch: "devices" only exists when a TPU was found, so
  # printing it unconditionally raised NameError on non-TPU runtimes.
  pprint.pprint(devices)

TPU address is grpc://10.125.175.82:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 7882111324951740436),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 6528557291878862604),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 14726621810661986602),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 15900451039166030568),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 12377266099146206416),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 4369043022606031793),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 12756309355972463154),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 8227873495543562732),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 57112215582

## Model Build

In [0]:
#Convolutional Neural Network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

#Intialization
#Initialization
classifier = Sequential()

#Convolution
# The input shape matches the generators' target_size=(32,32); the previous
# (64,64,3) could not accept the 32x32 batches they yield.
classifier.add(Conv2D(32,(3,3),input_shape=(32,32,3),activation='relu'))

#Max Pooling / Downsampling
classifier.add(MaxPooling2D(pool_size=(2,2)))

#Dropout
classifier.add(Dropout(rate=0.25))

#2nd Convolution 
classifier.add(Conv2D(64,(3,3),activation='relu'))

#2nd Max Pooling / Downsampling
classifier.add(MaxPooling2D(pool_size=(2,2)))

#Flatten
classifier.add(Flatten())

#Full Connection
classifier.add(Dense(256, activation='relu'))
classifier.add(Dropout(rate=0.5))
# One softmax unit per class produced by the generator. The model is compiled
# with categorical_crossentropy on one-hot targets, which a single sigmoid unit
# cannot represent.
classifier.add(Dense(len(train_set.class_indices), activation='softmax'))

In [0]:
# Print the layer-by-layer summary of the CNN.
classifier.summary()

In [0]:
# Adam (learning rate 0.001) with categorical cross-entropy; accuracy is tracked.
opt = tf.train.AdamOptimizer(0.001)
classifier.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
# Wrap the compiled Keras model for execution on the Colab TPU whose address is
# published in the COLAB_TPU_ADDR environment variable.
TPU_ADDRESS = f"grpc://{os.environ['COLAB_TPU_ADDR']}"
resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
dist_strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
tpu_model = tf.contrib.tpu.keras_to_tpu_model(
    classifier,
    strategy=dist_strategy,
)

In [0]:
# steps_per_epoch / validation_steps count batches, not samples. len() of a
# Keras iterator is its number of batches per pass, so each epoch sees every
# image once. The former sample counts (430720 / 47857) combined with
# batch_size=1000 made one "epoch" loop over the data about 1000 times.
history = tpu_model.fit_generator(
    train_set,
    steps_per_epoch=len(train_set),
    epochs=10,
    validation_data=test_set,
    validation_steps=len(test_set),
)

Epoch 1/10
INFO:tensorflow:New input shapes; (re-)compiling: mode=train (# of cores 8), [TensorSpec(shape=(64,), dtype=tf.int32, name='core_id_80'), TensorSpec(shape=(64, 32, 32, 3), dtype=tf.float32, name='conv2d_8_input_10'), TensorSpec(shape=(64, 213), dtype=tf.float32, name='dense_9_target_30')]
INFO:tensorflow:Overriding default placeholder.
INFO:tensorflow:Remapping placeholder for conv2d_8_input
INFO:tensorflow:Started compiling
INFO:tensorflow:Finished compiling. Time elapsed: 4.658758163452148 secs
INFO:tensorflow:Setting weights on TPU model.
INFO:tensorflow:Overriding default placeholder.
INFO:tensorflow:Remapping placeholder for conv2d_8_input
INFO:tensorflow:Started compiling
INFO:tensorflow:Finished compiling. Time elapsed: 3.591972827911377 secs
INFO:tensorflow:Overriding default placeholder.
INFO:tensorflow:Remapping placeholder for conv2d_8_input
INFO:tensorflow:Started compiling
INFO:tensorflow:Finished compiling. Time elapsed: 2.9195921421051025 secs
INFO:tensorflow:

In [0]:
## Transfer learning with ResNet50
from tensorflow.python.keras.applications import ResNet50

In [0]:
## Two-layer model: 1st ResNet50, 2nd Dense softmax

#Initialize 
# Transfer-learning model: an ImageNet-pretrained ResNet50 without its top
# classifier (global-average-pooled features) followed by a 1067-way softmax.
model = Sequential([
    ResNet50(include_top=False, pooling='avg', weights='imagenet'),
    Dense(1067, activation='softmax'),
])

# Freeze the pretrained backbone so that only the final Dense layer is trained.
model.layers[0].trainable = False


Instructions for updating:
Colocations handled automatically by placer.


In [0]:
# Print the layer-by-layer summary of the transfer-learning model.
model.summary()

In [0]:
#Compiled
# Adam (learning rate 0.001) with categorical cross-entropy; accuracy is tracked.
opt = tf.train.AdamOptimizer(0.001)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [0]:
#Early stopping and check-point
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once val_loss has not improved for 3 consecutive epochs.
cb_early_stopper = EarlyStopping(monitor = 'val_loss', patience = 3)

# Keep only the weights of the best epoch (lowest val_loss).
# The checkpoint is written to ../working/, which is created here if missing so
# that the first save does not fail on a non-existent directory.
os.makedirs('../working', exist_ok=True)
cb_checkpointer = ModelCheckpoint(filepath = '../working/best.hdf5', monitor = 'val_loss', save_best_only = True, mode = 'auto')

In [0]:
#Fit the model


# steps_per_epoch / validation_steps count batches, not samples. len() of a
# Keras iterator is its number of batches per pass, so each epoch sees every
# image once. The former sample counts (430720 / 47857) combined with
# batch_size=1000 made one "epoch" loop over the data about 1000 times.
fit_history = model.fit_generator(
        train_set,
        steps_per_epoch=len(train_set),
        epochs = 10,
        validation_data=test_set,
        validation_steps=len(test_set),
        callbacks=[cb_checkpointer, cb_early_stopper]
)
# Restore the best weights saved by the checkpoint callback.
model.load_weights("../working/best.hdf5")

In [0]:
# Delete every file in train/ that is not listed in train2.
# The ids are collected into a set once, for exact constant-time lookups.
# str.contains(filename) rescanned the whole column for each file and treated
# the file name as a regular expression (e.g. "." matching any character).
wanted_files = set(train2['id'])
for filename in os.listdir('train/'):
  if filename not in wanted_files:
    os.remove(f'train/{filename}')

In [0]:
# Copy every file of train/ that is listed in train2 into /content/train2/.
# The ids are collected into a set once, for exact constant-time lookups.
# str.contains(filename) rescanned the whole column for each file and treated
# the file name as a regular expression (e.g. "." matching any character).
wanted_files = set(train2['id'])
for filename in os.listdir('train/'):
  if filename in wanted_files:
    shutil.copyfile(f'train/{filename}', f'/content/train2/{filename}')