In [1]:
# All imports for ease here
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from keras.utils import np_utils
import numpy as np
import pandas as pd
import os, os.path

# init local path constants
raid_dir = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/'
raid_train_dir = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/train/'
raid_valid_dir = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/valid/'
raid_test_dir = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRetrieval/test/'
train_csv = '~/Documents/Kaggle/GoogleLandmarkRecognition/train.csv'
test_csv = '~/Documents/Kaggle/GoogleLandmarkRecognition/test.csv'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# use this function to load the train and test data
def load_dataset(path):
    data = load_files(path)
    image_files = np.array(data['filenames'])
    train_targets = np_utils.to_categorical(np.array(data['target']), 133)
    return image_files, train_targets

# point this to the train.csv file
def load_variable_names(path):
    # use dtype=None because we have strings and ints
    data = pd.read_csv(path, quotechar='"')
    return data

# use this to count the images available to learn from
def get_total_files(path):
    return len([name for name in os.listdir(path) if os.path.isfile(os.path.join(path, name))])

In [3]:

#train_files, train_targets = load_dataset('/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition')
train_csv_pd = load_variable_names(train_csv)
test_csv_pd = load_variable_names(test_csv)

total_files_train = get_total_files(raid_train_dir)
total_files_test = get_total_files(raid_test_dir)

print('There are %d files in the train/ dir' % total_files_train)
print('There are %d files in the train.csv file' % len(train_csv_pd['id']))
print('There are %d unique landmarks in the train.csv file' % len(train_csv_pd['landmark_id'].unique()))
print('\nThere are %d files in the test/ dir' % total_files_test)
print('There are %d files in the test.csv file' % len(test_csv_pd['id']))

print('\nMissing %d training files!' % (len(train_csv_pd['id']) - total_files_train))
print('Missing %d testing files!' % (len(test_csv_pd['id']) - total_files_test))
    

There are 0 files in the train/ dir
There are 1225029 files in the train.csv file
There are 14951 unique landmarks in the train.csv file

There are 117697 files in the test/ dir
There are 117703 files in the test.csv file

Missing 1225029 training files!
Missing 6 testing files!


In [4]:
# RESULT : Wound up with a 77-23 split because of missing files. The first time I ran it I got a 84-16 split.
#          I tried running it again to get closer to 80 - 20 but because of missing files I ran up with 77-23.

# Uncomment to split the training data
#raid_practice = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/'
#raid_practice_train = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/train/'
#raid_practice_valid = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/valid/'
#csv_file = '/mnt/raid0/Projects/Kaggle/GoogleLandmarkRecognition/test_valid_split.csv'

#csv_file_data = load_variable_names(csv_file)
#train, valid = train_test_split(csv_file_data, test_size=0.2)
#train, valid = train_test_split(train_csv_pd, test_size=0.04)
#print(train)
#print('\n')
#print(valid)

#for row in valid.itertuples():
#    filename = str(row[3]) + '/' + str(row[1]) + '.jpg'
#    dirname = str(row[3])
#    if os.path.isfile(os.path.join(raid_practice_train, filename)):
#        if not os.path.isdir(raid_practice_valid + dirname + '/'):
#            os.makedirs(raid_practice_valid + dirname)
#        os.rename(raid_practice_train + filename, raid_practice_valid + filename)

In [5]:
# Change this to True and uncomment if you need to process the data into subdirectories
#FORMAT_INTO_SUBDIRECTORY = False
    
#if FORMAT_INTO_SUBDIRECTORY:
#    img_count = 0
#    dir_count = 0
#    for row in file_info_train.itertuples():
#        filename = row[1]+'.jpg'
#        dirname = row[3]
#        if os.path.isfile(os.path.join(raid_train_dir,filename)):
#            if not os.path.isdir(raid_train_dir+str(dirname)+'/'):
#                os.makedirs(raid_train_dir+str(dirname))
#                dir_count += 1
#            os.rename(raid_train_dir+filename, raid_train_dir+str(dirname)+'/'+filename)
#            img_count += 1
#    print('Moved {0} Files into {1} new directories.'.format(img_count, dir_count))


In [6]:
# Need to test and configure this code!!
from keras.preprocessing.image import ImageDataGenerator

train_datagen = ImageDataGenerator(rescale=1./255)

test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
        raid_train_dir,
        target_size=(150, 150),
        batch_size=1000,
        class_mode='categorical')

valid_generator = test_datagen.flow_from_directory(
        raid_valid_dir,
        target_size=(150,150),
        batch_size=1000,
        class_mode='categorical')

Found 940338 images belonging to 14951 classes.
Found 284000 images belonging to 13168 classes.


In [None]:
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D
from keras.layers import Dropout, Flatten, Dense, Activation
from keras.models import Sequential

from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input, decode_predictions


def create_model():
    model = Sequential()
    model.add(Flatten(input_shape=model.output_shape[1:]))
    model.add(Dense(14951, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(14951, activation='softmax'))
    return model

In [None]:
from keras.callbacks import ModelCheckpoint 

resnet = ResNet50(include_top=False, weights='imagenet')

bottleneck_features_train = resnet.predict_generator(train_generator)
# save the output as a Numpy array
np.save(open('bottleneck_features/bottleneck_features_train.npy', 'w'), bottleneck_features_train)

bottleneck_features_validation = resnet.predict_generator(valid_generator)
np.save(open('bottleneck_features/bottleneck_features_validation.npy', 'w'), bottleneck_features_validation)

#model = create_model()
#model.summary()
#model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

#checkpointer = ModelCheckpoint(filepath=raid_dir + 'saved_models/weights.best.from_scratch.hdf5', 
#                           verbose=1, save_best_only=True)

#model.fit_generator(
#    train_generator,
#    steps_per_epoch=940,
#    epochs=25,
#    validation_data=valid_generator,
#    validation_steps=284,
#    use_multiprocessing=True,
#    callbacks=[checkpointer], verbose=1)