In [3]:
import numpy as np
import pandas as pd
import os
from random import shuffle
from tqdm import tqdm

DATA_DIR = '../input/amazon/'
TRAIN_TIF_DIR = DATA_DIR + 'train-tif-v2/'
TRAIN_CSV = DATA_DIR + 'train.csv'
TEST_TIF_DIR = DATA_DIR + 'test-tif/'

IMG_SIZE = 100
LR = 1e-3

MODEL_NAME = 'amazon=-{}-{}.model'.format(LR, '2conv-basic')

In [4]:
CLOUD_COVER_LABELS = [
    'clear', 
    'cloudy', 
    'haze', 
    'partly_cloudy']

# read our data and take a look at what we are dealing with
train_csv = pd.read_csv(TRAIN_CSV)
train_csv.head()

tags = pd.DataFrame()

for label in CLOUD_COVER_LABELS:
    tags[label] = train_csv.tags.apply(lambda x: np.where(label in x, 1, 0))
    
train_csv = pd.concat([train_csv, tags], axis=1)
train_csv.head(n=2)

Unnamed: 0,image_name,tags,clear,cloudy,haze,partly_cloudy
0,train_0,haze primary,0,0,1,0
1,train_1,agriculture clear primary water,1,0,0,0


In [6]:
# need to fix class imbalance
train_csv[['clear', 'cloudy', 'haze', 'partly_cloudy']].sum()

# add class weights

clear            28203
cloudy            9581
haze              2695
partly_cloudy     7251
dtype: int64

In [4]:
# limit to 1000, remove this in real script
train = train_csv[0:1000]

In [5]:
from skimage import io
from scipy.misc import imresize
import cv2
import tifffile as tiff

# convert cloud cover labels to array [clear, cloudy, haze, partly_cloudy]
def get_cloud_cover_labels(row):
    labels = np.array([row.clear, row.cloudy, row.haze, row.partly_cloudy])
    return labels

# load image
# reduce image from 255,255,4 to 100,100,4
# flatten out to 1-D array in order R,G,B,NIR (should we use greyscale instead, ignore NIR?)
def load_image(filename):
    path = os.path.abspath(os.path.join(TRAIN_TIF_DIR, filename))
    if os.path.exists(path):
        img = tiff.imread(path)[:,:,:3]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        return img
    # if you reach this line, you didn't find the image you're looking for
    print('Load failed: could not find image {}'.format(path))
    
# create training data from train.csv DataFrame
def create_training_data():
    train_images = []

    for index, row in tqdm(train.iterrows()):
        grey_image = load_image(row.image_name + '.tif')
        train_images.append([grey_image, 
                             get_cloud_cover_labels(row),
                             row.image_name])

    np.save('training_images.npy', train_images)
    return train_images

# load test data from test data folder
# reduce image to 100,100,4, flatten etc as above
def create_test_data():
    test_images = []
    
    for image_name in os.listdir(TRAIN_TIF_DIR):
        grey_image = load_image(row.image_name + '.tif')
        test_images.append([grey_image, image_name.split('.')[0]])
        
    return test_images

In [6]:
# at this point we have our training data in a list
# [0] - greyscale rgbn image
# [1] - array of labels where clear, cloudy, haze, partly_cloudy
# [2] - name of image, for reference

train_images = create_training_data()
len(train_images)

1000it [00:10, 92.99it/s]


1000

In [None]:
# import cv2
# import matplotlib.pyplot as plt
# import tifffile as tiff

# # path = os.path.abspath(os.path.join(TRAIN_TIF_DIR, 'train_3675.tif'))
# path = os.path.abspath(os.path.join(TRAIN_TIF_DIR, 'train_19.tif'))
# img = tiff.imread(path)[:,:,:3]
# img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
# plt.imshow(img, cmap='gray')
# plt.show()

In [7]:
# if you need to load the training data
# train_images = np.load('training_images.npy')

In [None]:
import tensorflow as tf
tf.reset_default_graph()

In [8]:
import tflearn
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression

convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input')

convnet = conv_2d(convnet, 32, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)

convnet = conv_2d(convnet, 64, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)

convnet = conv_2d(convnet, 128, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)

convnet = conv_2d(convnet, 64, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)

convnet = conv_2d(convnet, 32, 5, activation='relu')
convnet = max_pool_2d(convnet, 5)

convnet = fully_connected(convnet, 1024, activation='relu')
convnet = dropout(convnet, 0.8)

convnet = fully_connected(convnet, 4, activation='softmax')
convnet = regression(convnet, optimizer='adam', learning_rate=LR, loss='categorical_crossentropy', name='targets')

model = tflearn.DNN(convnet)

In [9]:
if os.path.exists('{}.meta'.format(MODEL_NAME)):
    model.load(MODEL_NAME)
    print('model loaded!')

In [10]:
train_data = train_images[:-8000]
# need a cross validation set
cv_data = train_images[-8000:-4000]
test_data = train_images[-4000:]

In [22]:
X = np.array([i[0] for i in train_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
y = [i[1] for i in train_data]

X_test = np.array([i[0] for i in test_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
y_test = [i[1] for i in test_data]

In [None]:
model.fit({'input': X}, {'targets': Y}, n_epoch=3, validation_set=({'input': X_test}, {'targets': y_test}), 
    snapshot_step=500, show_metric=True, run_id=MODEL_NAME)

Training Step: 11  | total loss: [1m[32m11.64486[0m[0m | time: 28.780s
[2K| Adam | epoch: 001 | loss: 11.64486 - acc: 0.6889 -- iter: 704/900


In [None]:
model.save('/output/' + MODEL_NAME)

In [None]:
# need to measure F2 score instead of accuracy