<a href="https://colab.research.google.com/github/emmanuelorji2013/Data-Science-Internship_Hamoye/blob/master/Stage_D_Project_Planet_Understanding_Amazon_from_Space.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Planet: Understanding Amazon from Space
Implemented using K-fold cross-validation with a pretrained VGG19 network as the base model.

In [None]:
# Mount Google Drive at /content/drive; the final submission.csv is written
# to a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Upload the Kaggle account credentials file (kaggle.json).
# The following cell copies kaggle.json from the working directory into ~/.kaggle.
from google.colab import files
files.upload()

In [None]:
# The Kaggle API client expects kaggle.json to be in ~/.kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permission change avoids a warning on Kaggle tool startup
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the planets dataset with the Kaggle CLI; the next cell unzips the
# resulting *.zip archive(s) from the working directory.
!kaggle datasets download  nikitarom/planets-dataset/ 

In [None]:
# Unzip every downloaded archive, then delete the archives.
# `\*.zip` is escaped so unzip (not the shell) expands the pattern, and
# `&&` makes sure the archives are only removed when unzip succeeded.
!unzip \*.zip  && rm *.zip


In [None]:
!mkdir weights # Weights will be saved here for model checkpoint operation

In [None]:
import os, glob, shutil

# Merge the additional test images into the main test folder (test-jpg)
# so that all test images are read from a single directory.
srcDir = "test-jpg-additional/test-jpg-additional"
destDir = "planet/planet/test-jpg"
additional_test_files = glob.glob(srcDir + '/*')
for filePath in additional_test_files:
  # shutil.move places the file inside destDir, keeping its file name
  shutil.move(filePath, destDir)

## Import Necessary Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score

from keras.applications.vgg19 import VGG19

from collections import Counter

from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

from keras.models import Sequential
from keras.layers import Dense, Flatten, BatchNormalization

import cv2
from tqdm import tqdm


# Initializations
epochs = 50  # Upper bound on epochs per fold (passed to model.fit)
batch_size = 128  # Number of images per batch yielded by the generators

input_size = 128  # Images are resized to input_size x input_size
input_channels = 3  # Channel count used for the network input shape

n_folds = 5  # Number of KFold splits

training = True  # When False, model.fit is skipped and saved weights are loaded

ensemble_voting = False  # If True, use voting for model ensemble, otherwise use averaging

In [None]:
# Test image names are the file names in test-jpg without their extension
test_im_names = [os.path.splitext(filename)[0] for filename in os.listdir('planet/planet/test-jpg')]
df_train_data = pd.read_csv('planet/planet/train_classes.csv')
# Test frame mirrors the training frame layout; tags are filled in after prediction
df_test_data = pd.DataFrame({ 'image_name': sorted(test_im_names), 'tags': ""})

# Make a list of all possible labels
flatten = lambda l: [item for sublist in l for item in sublist]
# Sorted so the label order (and therefore the label -> column mapping) is
# the same on every kernel restart. Iterating a bare set of strings can give
# a different order per Python process, which would silently scramble the
# outputs of weights saved in an earlier session.
labels = sorted(set(flatten([l.split(' ') for l in df_train_data['tags'].values])))

# Dictionary mapping labels to integer values 0-16
map_labels = {l: i for i, l in enumerate(labels)}
inv_map_labels = {i: l for l, i in map_labels.items()} # Inversion between keys and values in map_labels

# Fixed random_state keeps the fold assignment reproducible
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=1)

fold_count = 0

# Per-fold averaged test predictions and the running sum of per-fold thresholds
y_full_test = []
thres_sum = np.zeros(17, np.float32)

In [None]:
# Report how many distinct labels were found, then display the list itself
print("There're {} labels\n".format(len(labels)))
labels

In [None]:
# Display the test frame (sorted image names with empty tag placeholders)
df_test_data

In [None]:
# Display the training frame loaded from train_classes.csv
df_train_data

In [None]:
# Display the label -> integer index mapping used to build the target vectors
map_labels

## Split KFolds and train each fold

In [None]:
# enumerate restarts the fold numbering at 1 every time this cell is run,
# so re-running it does not write/load checkpoints for folds 6, 7, ...
for fold_count, (train_index, test_index) in enumerate(kfold.split(df_train_data), start=1):

    print('Fold ', fold_count)

    # KFold.split yields positional row indices, so select with .iloc
    # (.loc would only be correct while the frame keeps its default index).
    df_train = df_train_data.iloc[train_index]
    if training:
        print('----Will trian with {} samples ----'.format(len(df_train)))

    df_valid = df_train_data.iloc[test_index]
    print('----Will validate with {} samples '.format(len(df_valid)))

    #Custom train data generator
    def train_generator():
        """Yield (images, targets) batches from df_train in an endless loop.

        Each image is read from train-jpg, resized to input_size x input_size
        and passed through one randomly chosen transformation (0-5). Targets
        are 17-element multi-hot vectors built from the space-separated tags.
        Images are returned as float32, targets as uint8.
        """
        while True:
            for first in range(0, len(df_train), batch_size):
                last = min(first + batch_size, len(df_train))
                images, encoded_tags = [], []
                for name, tag_string in df_train.iloc[first:last].values:
                    picture = cv2.imread('planet/planet/train-jpg/{}.jpg'.format(name))
                    picture = cv2.resize(picture, (input_size, input_size))
                    picture = transformations(picture, np.random.randint(6))
                    # Multi-hot encode the tags of this image
                    multi_hot = np.zeros(17)
                    for tag in tag_string.split(' '):
                        multi_hot[map_labels[tag]] = 1
                    images.append(picture)
                    encoded_tags.append(multi_hot)
                yield (np.asarray(images, dtype=np.float32),
                       np.asarray(encoded_tags, dtype=np.uint8))

    #Custom validation data generator
    def valid_generator():
        """Yield (images, targets) batches from df_valid in an endless loop.

        Mirrors the training generator: images come from train-jpg, are
        resized to input_size x input_size and receive one randomly chosen
        transformation (0-5), so validation batches are not deterministic.
        Targets are 17-element multi-hot vectors; images are float32 and
        targets uint8.
        """
        while True:
            for first in range(0, len(df_valid), batch_size):
                last = min(first + batch_size, len(df_valid))
                images, encoded_tags = [], []
                for name, tag_string in df_valid.iloc[first:last].values:
                    picture = cv2.imread('planet/planet/train-jpg/{}.jpg'.format(name))
                    picture = cv2.resize(picture, (input_size, input_size))
                    picture = transformations(picture, np.random.randint(6))
                    # Multi-hot encode the tags of this image
                    multi_hot = np.zeros(17)
                    for tag in tag_string.split(' '):
                        multi_hot[map_labels[tag]] = 1
                    images.append(picture)
                    encoded_tags.append(multi_hot)
                yield (np.asarray(images, dtype=np.float32),
                       np.asarray(encoded_tags, dtype=np.uint8))


    #Transformations for data augmentation
    def transformations(src, choice):
        """Apply one of six rotate / rotate-and-flip transformations to an image.

        choice 0/1: rotate 90 degrees clockwise (1 also flips horizontally)
        choice 2/3: rotate 180 degrees (3 also flips horizontally)
        choice 4/5: rotate 90 degrees counter-clockwise (5 also flips horizontally)
        Any other choice returns src unchanged.
        """
        # choice -> (cv2 rotation code, mirror horizontally afterwards?)
        recipes = {
            0: (cv2.ROTATE_90_CLOCKWISE, False),
            1: (cv2.ROTATE_90_CLOCKWISE, True),
            2: (cv2.ROTATE_180, False),
            3: (cv2.ROTATE_180, True),
            4: (cv2.ROTATE_90_COUNTERCLOCKWISE, False),
            5: (cv2.ROTATE_90_COUNTERCLOCKWISE, True),
        }
        recipe = recipes.get(choice)
        if recipe is None:
            return src
        rotation, mirror = recipe
        rotated = cv2.rotate(src, rotateCode=rotation)
        if mirror:
            rotated = cv2.flip(rotated, flipCode=1)
        return rotated



    # Pretrained VGG19 convolutional base (ImageNet weights, no classifier head)
    base_model = VGG19(include_top=False,
                       weights='imagenet',
                       input_shape=(input_size, input_size, input_channels))

    model = Sequential()
    # Batchnorm input
    model.add(BatchNormalization(input_shape=(input_size, input_size, input_channels)))
    # Base model
    model.add(base_model)
    # Classifier: one independent sigmoid output per label (multi-label problem)
    model.add(Flatten())
    model.add(Dense(17, activation='sigmoid'))

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    opt = Adam(learning_rate=1e-4)

    model.compile(loss='binary_crossentropy',
                  # We NEED binary here, since categorical_crossentropy l1 norms the output before calculating loss.
                  optimizer=opt,
                  metrics=['accuracy'])

    print("CallBacks")
    # Stop early on stalled val_loss, lower the learning rate on plateaus and
    # keep only the best weights of this fold on disk.
    callbacks = [EarlyStopping(monitor='val_loss',
                               patience=4,
                               verbose=1,
                               min_delta=1e-4),
                 ReduceLROnPlateau(monitor='val_loss',
                                   factor=0.1,
                                   patience=2,
                                   cooldown=2,
                                   verbose=1),
                 ModelCheckpoint(filepath='weights/best_weights.fold_' + str(fold_count) + '.hdf5',
                                 save_best_only=True,
                                 save_weights_only=True)]

    print("Training")
    if training:
        # Ceiling division: exactly one pass over the data per epoch.
        # `len // batch_size + 1` would add a surplus step whenever the sample
        # count is an exact multiple of batch_size.
        train_steps = (len(df_train) + batch_size - 1) // batch_size
        valid_steps = (len(df_valid) + batch_size - 1) // batch_size
        model.fit(x=train_generator(),
                  steps_per_epoch=train_steps,
                  epochs=epochs,
                  verbose=2,
                  callbacks=callbacks,
                  validation_data=valid_generator(),
                  validation_steps=valid_steps)


    def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
        """Greedy per-class search for thresholds that maximise the F2 score.

        Starting from 0.2 for every class, each class threshold in turn is
        swept over i / resolution for i in range(resolution) while the other
        thresholds stay fixed; the value with the best samples-averaged F2
        score is kept before moving on to the next class.

        Args:
            y: 2-D array of true multi-hot labels (samples x classes).
            p: 2-D NumPy array of predicted scores with the same shape as y.
            verbose: if True, print class index, chosen threshold and score.
            resolution: number of candidate thresholds tried per class.

        Returns:
            List with one threshold per class (column of p).
        """
        n_classes = p.shape[1]

        def mf(x):
            # Binarise every class at once. Thresholds are cast to p's dtype
            # so the comparison matches a per-column `p[:, i] > x[i]`.
            # Plain `int` replaces the `np.int` alias removed in NumPy 1.24.
            p2 = (p > np.asarray(x, dtype=p.dtype)).astype(int)
            return fbeta_score(y, p2, beta=2, average='samples')

        x = [0.2] * n_classes
        for i in range(n_classes):
            best_i2 = 0
            best_score = 0
            for i2 in range(resolution):
                i2 /= float(resolution)
                x[i] = i2
                score = mf(x)
                if score > best_score:
                    best_i2 = i2
                    best_score = score
            # Freeze the best threshold for class i before tuning the next one
            x[i] = best_i2
            if verbose:
                print(i, best_i2, best_score)
        return x

    print("Load Weights")
    # Load best weights
    model.load_weights(filepath='weights/best_weights.fold_' + str(fold_count) + '.hdf5')

    # Ceiling division so exactly len(df_valid) rows are predicted. With
    # `len // batch_size + 1` the endless generator would wrap around and add
    # a surplus batch whenever len(df_valid) is a multiple of batch_size,
    # leaving valid_pred longer than valid_y.
    valid_steps = (len(df_valid) + batch_size - 1) // batch_size
    valid_pred = model.predict(x=valid_generator(), steps=valid_steps)

    # Ground-truth multi-hot targets in the same row order as df_valid
    valid_y = []
    for f, tags in df_valid.values:
        targets = np.zeros(17)
        for t in tags.split(' '):
            targets[map_labels[t]] = 1
        valid_y.append(targets)
    valid_y = np.array(valid_y, np.uint8)

    # Find optimal f2 thresholds for local validation set
    thres = optimise_f2_thresholds(valid_y, valid_pred, verbose=False)

    print('F2 = {}'.format(fbeta_score(valid_y, np.array(valid_pred) > thres, beta=2, average='samples')))

    # Accumulate so the thresholds can be averaged over all folds later
    thres_sum += np.array(thres, np.float32)


    def test_generator(transformation):
        """Yield float32 image batches from df_test_data in an endless loop.

        Every image is read from test-jpg, resized to input_size x input_size
        and passed through the single fixed `transformation` code, so one
        generator corresponds to one test-time-augmentation variant.
        """
        while True:
            for first in range(0, len(df_test_data), batch_size):
                last = min(first + batch_size, len(df_test_data))
                images = []
                # The tags column is only an empty placeholder for test images
                for name, _placeholder in df_test_data.iloc[first:last].values:
                    picture = cv2.imread('planet/planet/test-jpg/{}.jpg'.format(name))
                    picture = cv2.resize(picture, (input_size, input_size))
                    images.append(transformations(picture, transformation))
                yield np.asarray(images, dtype=np.float32)

    # 6-fold Test Time Augmentation
    # Ceiling division so exactly len(df_test_data) rows are predicted; a
    # surplus wrapped-around batch would misalign predictions with the test
    # frame when its length is a multiple of batch_size.
    test_steps = (len(df_test_data) + batch_size - 1) // batch_size
    p_full_test = []
    for i in range(6):
        p_test = model.predict(x=test_generator(transformation=i),
                               steps=test_steps)
        p_full_test.append(p_test)

    # Average the predictions of the six transformed views
    p_test = np.array(p_full_test[0])
    for i in range(1, 6):
        p_test += np.array(p_full_test[i])
    p_test /= 6

    # Keep this fold's averaged test predictions for the model ensemble
    y_full_test.append(p_test)

# Seed the ensemble result with a copy of the first fold's test predictions
# (np.array copies, so later in-place updates leave y_full_test[0] untouched).
result = np.array(y_full_test[0])

## Model Ensemble

In [None]:
# Start from a fresh copy of the first fold's predictions so this cell can be
# re-run without compounding an earlier result.
result = np.array(y_full_test[0])

# The configuration flag is named `ensemble_voting`; the previous
# `voting_ensemble` was never defined and raised a NameError.
if ensemble_voting:
    # NOTE(review): y_full_test holds averaged sigmoid outputs (floats), so
    # "most common" is computed over raw float values, not over thresholded
    # 0/1 tags - confirm this is the intended voting scheme.
    for f in range(len(y_full_test[0])):  # For each file
        for tag in range(17):  # For each tag
            preds = []
            for fold in range(n_folds):  # For each fold
                preds.append(y_full_test[fold][f][tag])
            pred = Counter(preds).most_common(1)[0][0]  # Most common tag prediction among folds
            result[f][tag] = pred
else:
    # Plain average of the per-fold predictions
    for fold in range(1, n_folds):
        result += np.array(y_full_test[fold])
    result /= n_folds
# One column per label, in the same order used to build the target vectors
result = pd.DataFrame(result, columns=labels)

In [None]:
# Average the per-fold thresholds into one threshold per label
thres = (thres_sum / n_folds).tolist()

# Compare every prediction with its label threshold in one vectorised step
# instead of a pandas apply per row; row and column order are unchanged.
above_thres = result.values > np.asarray(thres)
tag_names = np.asarray(result.columns)

# Space-separated tag string for each test image
preds = [' '.join(tag_names[row_mask]) for row_mask in tqdm(above_thres, miniters=1000)]

df_test_data['tags'] = preds
submsnDest = r"/content/drive/My Drive/Colab Notebooks/Hamoye Internship/"
# os.path.join avoids the doubled slash produced by "{}/submission.csv"
df_test_data.to_csv(os.path.join(submsnDest, "submission.csv"), index=False)