In [15]:
%matplotlib inline

from bs4 import BeautifulSoup
import urllib
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pickle
from tqdm import tqdm,tqdm_notebook
import os

import keras
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Input, concatenate
from keras.layers import Conv2D, MaxPooling2D,UpSampling2D,Lambda, ZeroPadding2D
from keras import backend as K
from sklearn.model_selection import train_test_split

class MyException(Exception):
    """Signal raised by doSomeDeepLearning when there is no usable cached
    train/test split and the caller did not pass X and Y either."""

In [2]:
# --- Run configuration -------------------------------------------------------
HOWMANY = 50             # number of source images to use
MAXLINKS = 1109          # downloading is skipped once this many images are cached
DEBUG = True             # when True the main cell always runs the load/preprocess path
TMP_DIR = 'tmp'          # directory holding the cached .npy chunks
FORCE_RELOAD = False#True   # when True, cached raw images are ignored and re-downloaded
LOAD = True              # run the download step in the main cell
PREPROCESS = True#False  # run the preprocessing step in the main cell
batch_size = 128   # how many images are processed at once (weights are updated once per batch)
epochs = 12         # how many epochs to train for
SIZE = (750,750)         # every downloaded image is resized to this size
SIDE = 75                # side of one patch before the overlap is added
IMPOSITION = 5           # overlap/padding width handed to get_patches
# Number of patches produced per image: one per SIDE x SIDE tile.
HOWMANYPERIMAGE = (SIZE[0]//SIDE)*(SIZE[1]//SIDE)
IMAGESPERFILE = 100      # number of images stored per cache chunk file
# The tiling only covers the whole image when SIDE divides both dimensions.
assert SIZE[0] % SIDE == 0 and SIZE[1] % SIDE == 0, "SIZE must be a multiple of SIDE in both dimensions"

In [3]:
def loadImage(url):
    """Download the resource at `url` and decode it with OpenCV.

    Parameters:
        url: address passed to urllib.request.urlopen.

    Returns:
        Whatever cv2.imdecode produces for the downloaded bytes with flag -1
        (image decoded "as is", keeping an alpha channel if there is one).
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # frombuffer wraps the bytes without the extra bytearray copy.
    npraw = np.frombuffer(raw, dtype=np.uint8)
    return cv2.imdecode(npraw, -1)  # -1 -> as is (with the alpha channel)

def getImageName(url):
    """Return the last '/'-separated segment of `url`, cut at its first '.'."""
    last_segment = url.rsplit('/', 1)[-1]
    stem, _, _ = last_segment.partition('.')
    return stem

def pickleBigDataset(prefix,dataset,size,directory=None):
    """Save `dataset` as numbered .npy chunks of at most `size` items each.

    Chunk number k (counting from 1) holds dataset[size*(k-1):size*k] and is
    written to <directory>/<prefix><k>.npy.

    Parameters:
        prefix: file-name prefix of the chunk files.
        dataset: sliceable sequence (list or array) to store.
        size: maximum number of items per chunk file.
        directory: target directory; defaults to TMP_DIR. Created if missing.
    """
    if directory is None:
        directory = TMP_DIR
    os.makedirs(directory, exist_ok=True)
    chunk_count = int(np.ceil(len(dataset)/size))
    for chunk_no in range(1, chunk_count+1):
        chunk = dataset[size*(chunk_no-1):size*chunk_no]
        # The file is named after the chunk index, not the chunk count;
        # otherwise every chunk would overwrite the same file.
        np.save(os.path.join(directory, prefix+str(chunk_no)), np.array(chunk))

def unpickleBigDataset(prefix,directory=None):
    """Load and concatenate the chunk files written by pickleBigDataset.

    Only files named exactly <prefix><number>.npy are read, so prefix 'x'
    does not pick up 'xain1.npy' or 'xest1.npy'. Chunks are joined along
    axis 0 in ascending numeric order.

    Parameters:
        prefix: file-name prefix of the chunk files.
        directory: directory to scan; defaults to TMP_DIR.

    Returns:
        The concatenated numpy array, or an empty list when no chunk file
        matches.

    Raises:
        OSError: propagated from os.listdir when the directory is missing.
    """
    if directory is None:
        directory = TMP_DIR
    suffix = '.npy'
    numbered = []
    for name in os.listdir(directory):
        index = name[len(prefix):-len(suffix)]
        if (name.startswith(prefix) and name.endswith(suffix) and index.isdigit()
                and os.path.isfile(os.path.join(directory, name))):
            numbered.append((int(index), name))
    if not numbered:
        return []
    # os.listdir order is arbitrary; sort numerically so that 2 precedes 10.
    numbered.sort()
    chunks = [np.load(os.path.join(directory, name)) for _, name in numbered]
    # A single concatenate avoids re-copying the growing array per chunk.
    return np.concatenate(chunks, axis=0)
    
            
def loadImagesFromSite(url,prefix):
    """Return up to HOWMANY images linked from the index page at `url`.

    Images already cached in TMP_DIR under `prefix` are reused unless
    FORCE_RELOAD is set. Missing images are downloaded from the page's <a>
    links (continuing after the cached ones), resized to SIZE and written
    back to the cache.

    Parameters:
        url: address of an HTML page whose <a href> entries point at images.
        prefix: cache file prefix used with pickleBigDataset/unpickleBigDataset.

    Returns:
        numpy array built from the list of loaded images.
    """
    # A bare `import urllib` does not guarantee that `urllib.request` is loaded.
    import urllib.request

    cached_files = [f for f in os.listdir(TMP_DIR)
                    if os.path.isfile(os.path.join(TMP_DIR, f)) and f.startswith(prefix)]
    if len(cached_files) == 0 or FORCE_RELOAD:
        imgs = []
    else:
        imgs = list(unpickleBigDataset(prefix)[:HOWMANY])
    done = len(imgs)
    print("Cached images {}.".format(done))
    if done < HOWMANY and done < MAXLINKS:
        print("Loading images from {}".format(url))
        print("Proceeding from {} image.".format(done))

        with urllib.request.urlopen(url) as response:
            html = BeautifulSoup(response.read(), "lxml")
        # Skip the links whose images are already cached.
        links = html.find_all('a')[done:HOWMANY]
        for link in tqdm_notebook(links):
            # Resize once; a second resize to the same size adds nothing.
            imgs.append(cv2.resize(loadImage(link.get('href')), SIZE))
            # Checkpoint each time a full chunk of images is available.
            if len(imgs) % IMAGESPERFILE == 0:
                pickleBigDataset(prefix, imgs, IMAGESPERFILE)
        pickleBigDataset(prefix, imgs, IMAGESPERFILE)

    return np.array(imgs)

def saveDataset(X,Y,prefix=""):
    """Pickle X into 'pickledDatasetX'+prefix and Y into 'pickledDatasetY'+prefix.

    Both files are written relative to the current working directory.
    """
    targets = (('pickledDatasetX', X), ('pickledDatasetY', Y))
    for base_name, payload in targets:
        with open(base_name + prefix, 'wb') as handle:
            pickle.dump(payload, handle)
        
def loadDataset(prefix=""):
    """Load the cached preprocessed patches (chunk prefixes 'x' and 'y').

    The cache is accepted only when both sets hold exactly
    HOWMANY*HOWMANYPERIMAGE entries: the 'x'/'y' chunks store patches, not
    whole images, so comparing against HOWMANY alone would always fail.

    Parameters:
        prefix: unused; kept for signature compatibility.

    Returns:
        (X, Y) on success, (None, None) when the cache is missing, unreadable
        or incomplete.
    """
    expected = HOWMANY * HOWMANYPERIMAGE
    try:
        X = unpickleBigDataset('x')
        Y = unpickleBigDataset('y')
    except Exception:
        # Missing directory or unreadable chunk: treat as "no cache", but let
        # KeyboardInterrupt/SystemExit through.
        print("Failed loading dataset from file system")
        return None,None
    if len(X) == len(Y) == expected:
        return X,Y
    print("Failed loading dataset from file system")
    return None,None
    
def display(X,Y,howmany=None):
    """Show the first `howmany` X/Y pairs side by side, one figure per pair.

    For each pair the max and min of the X item are printed first. When
    `howmany` is None, X.shape[0] pairs are shown.
    """
    count = X.shape[0] if howmany is None else howmany

    for idx in range(count):
        sample = X[idx]
        print(sample.max(), sample.min())
        plt.figure()
        # left panel: input
        plt.subplot(1, 2, 1)
        plt.imshow(sample)
        # right panel: target
        plt.subplot(1, 2, 2)
        plt.imshow(Y[idx])
        

In [4]:
# def get_patches(image,size,side,imposition):
#     patches = []
    
    
#     for i in range(int(size[0]/side)):
#         for j in range(int(size[1]/side)):
#             patches += [image[i*side:(i+1)*side,j*side:(j+1)*side]]
#     return patches

def get_patches(image,size,side,imposition):
    """Cut `image` into a grid of overlapping patches.

    The image is first reflect-padded by `imposition` pixels on its top and
    left edges (3-D input is padded channel by channel into a new float array
    with 3 channels). One patch is then sliced out of the padded image for
    every (row, column) cell of the int(size[0]/side) x int(size[1]/side)
    grid.

    Along each axis the slice for cell k starts at max(k*side - imposition, 0)
    and ends at (k+1)*side + imposition, or (k+1)*side + 2*imposition when the
    start was clamped to 0.

    Returns:
        List of patches in row-major order.
    """
    pad_spec = ((imposition, 0), (imposition, 0))
    if image.ndim == 3:
        padded = np.zeros((image.shape[0] + imposition, image.shape[1] + imposition, 3))
        for channel in range(3):
            padded[..., channel] = np.pad(image[..., channel], pad_spec, 'reflect')
    else:
        padded = np.pad(image, pad_spec, 'reflect')

    def _bounds(cell):
        # Slice limits of one grid cell along a single axis.
        start = max(cell * side - imposition, 0)
        extra = imposition if start != 0 else imposition * 2
        return start, (cell + 1) * side + extra

    row_bounds = [_bounds(r) for r in range(int(size[0] / side))]
    col_bounds = [_bounds(c) for c in range(int(size[1] / side))]
    return [padded[top:bottom, left:right]
            for top, bottom in row_bounds
            for left, right in col_bounds]

def preprocessorX(image):
    """Normalise one input image and cut it into patches.

    Steps: BGR -> HSV, cast to float32, divide by 255 when values exceed 1,
    standardise each of the three channels (zero mean, unit std), clamp to
    [-3, 3], rescale each channel into [-1, 1], then split with get_patches
    using SIZE, SIDE and IMPOSITION.

    Constant channels (zero std / zero peak) are left at their centred value
    instead of being divided by zero, which would fill them with NaN/inf.

    Returns:
        List of patches as returned by get_patches.
    """
    size,side,imposition = SIZE,SIDE,IMPOSITION
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    image = image.astype(np.float32)

    if image.max() > 1:
        image /= 255

    for c in range(3):
        channel = image[...,c]          # view: in-place ops update `image`
        channel -= channel.mean()
        spread = channel.std()
        if spread > 0:
            channel /= spread

    # remove outliers
    np.clip(image, -3, 3, out=image)

    # between -1,1
    for c in range(3):
        channel = image[...,c]
        peak = max(abs(channel.min()), abs(channel.max()))
        if peak > 0:
            channel /= peak

    return get_patches(image,size,side,imposition)
    
def preprocessorY(image):
    """Normalise one target (mask) image and cut it into patches.

    The image is cast to float32, divided by 255 when values exceed 1, and
    min-max scaled to [0, 1]: per channel for a 3-D image, over the whole
    image for a 2-D one. It is then split with get_patches using SIZE, SIDE
    and IMPOSITION.

    On a 2-D mask, indexing `image[..., i]` would select columns 0..2 rather
    than channels, so the 2-D case is handled separately. A constant plane
    (max == min) is left unchanged instead of being divided by zero.

    Returns:
        List of patches as returned by get_patches.
    """
    size,side,imposition = SIZE,SIDE,IMPOSITION

    image = image.astype(np.float32)
    if image.max() > 1:
        image /= 255

    if image.ndim == 3:
        planes = [image[...,c] for c in range(image.shape[-1])]   # views into `image`
    else:
        planes = [image]
    for plane in planes:
        low, high = plane.min(), plane.max()
        if high > low:
            plane -= low
            plane /= (high - low)
    return get_patches(image,size,side,imposition)
    
def getRoadStats(arr,mask):
    """Summarise the entries of `arr` selected by the non-zero cells of `mask`.

    Parameters:
        arr: numpy array indexed by the boolean version of `mask`.
        mask: numpy array; cast to bool and used as a boolean index into `arr`.

    Returns:
        [max, min, mean, std, median], each taken along axis 0 of the selected
        entries, or None when the mask selects nothing.
    """
    # The builtin bool replaces the np.bool alias removed in NumPy 1.24.
    selected = arr[mask.astype(bool)]
    if len(selected) == 0:
        return None
    return [selected.max(0), selected.min(0), selected.mean(0),
            selected.std(0), np.median(selected, axis=0)]

def preprocessXY(X,Y):
    """Average getRoadStats(X[k], Y[k]) over every index k of X.

    Pairs for which getRoadStats returns None are skipped; the remaining
    results are stacked into an array and averaged along axis 0.
    """
    per_pair = (getRoadStats(X[k], Y[k]) for k in range(len(X)))
    kept = [stats for stats in per_pair if stats is not None]
    return np.array(kept).mean(0)
    

def preprocess(images,preprocessor,prefix):
    """Run `preprocessor` over `images`, resuming from patches cached under `prefix`.

    Parameters:
        images: sequence of images (sliceable).
        preprocessor: callable mapping one image to a list of patches.
        prefix: cache file prefix used with pickleBigDataset/unpickleBigDataset.

    Returns:
        numpy array of all patches (cached ones first, then newly computed).

    Notes:
        Resuming assumes every image yields HOWMANYPERIMAGE patches; a trailing
        partial image in the cache is dropped and recomputed.
    """
    cached_files = [f for f in os.listdir(TMP_DIR)
                    if os.path.isfile(os.path.join(TMP_DIR, f)) and f.startswith(prefix)]
    if len(cached_files) == 0:
        result = []
    else:
        # Keep the cache as a list: `+=` on an ndarray would add element-wise
        # instead of appending the new patches.
        result = list(unpickleBigDataset(prefix)[:HOWMANY*HOWMANYPERIMAGE])
    print("Cached images {}.".format(len(result)))

    chunk_size = IMAGESPERFILE * HOWMANYPERIMAGE
    if len(result) < HOWMANY*HOWMANYPERIMAGE:
        # The cache counts patches; convert to whole images to find where to resume.
        done_images = len(result) // HOWMANYPERIMAGE
        del result[done_images*HOWMANYPERIMAGE:]
        print("Preprocessing images.")
        print("Proceeding from {} image.".format(done_images))
        for offset, image in enumerate(tqdm_notebook(images[done_images:]), start=1):
            result.extend(preprocessor(image))
            # Checkpoint each time another full chunk of images is complete.
            if (done_images + offset) % IMAGESPERFILE == 0:
                pickleBigDataset(prefix, result, chunk_size)
        pickleBigDataset(prefix, result, chunk_size)

    return np.array(result)

In [18]:
def doSomeDeepLearning(X=None,Y=None,side=85):
    """Train (or resume training) the convolutional model and report test metrics.

    A cached train/test split (chunk prefixes 'xain', 'yain', 'xest', 'yest')
    is used when all four parts are present and non-empty. Otherwise X and Y
    are split 70/30, reshaped and cached.

    The model is restored from the newest 'moj_ulubiony_model_epoch<N>.h5' in
    the current directory and training continues at epoch N+1. With no saved
    model a fresh network is built. The model is saved after every epoch until
    `epochs` epochs have been run, then test loss and accuracy are printed.

    Parameters:
        X: input patches, required only when there is no cached split.
        Y: target patches, required only when there is no cached split.
        side: side length of the square patches fed to the network.

    Raises:
        MyException: no cached split is available and X or Y is None.
    """
    # input patch dimensions
    img_rows, img_cols = side,side

    # Try the cached split first; any failure simply means "no cache".
    try:
        x_train = unpickleBigDataset('xain')
        y_train = unpickleBigDataset('yain')
        x_test = unpickleBigDataset('xest')
        y_test = unpickleBigDataset('yest')
        have_split = all(len(part) > 0 for part in (x_train, y_train, x_test, y_test))
    except Exception:
        have_split = False

    if not have_split:
        if X is None or Y is None:
            raise MyException
        # the data, shuffled and split between train and test sets
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
        if K.image_data_format() == 'channels_first':
            # NOTE(review): reshape does not move the channel axis; confirm
            # whether a transpose is needed for channels-first backends.
            x_train = x_train.reshape(x_train.shape[0], 3, img_rows, img_cols)
            x_test = x_test.reshape(x_test.shape[0], 3, img_rows, img_cols)
            y_train = y_train.reshape(y_train.shape[0], 1, img_rows, img_cols)
            y_test = y_test.reshape(y_test.shape[0], 1, img_rows, img_cols)
        else:
            x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 3)
            x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 3)
            y_train = y_train.reshape(y_train.shape[0], img_rows, img_cols, 1)
            y_test = y_test.reshape(y_test.shape[0], img_rows, img_cols, 1)
        s = IMAGESPERFILE * HOWMANYPERIMAGE
        pickleBigDataset('xain',x_train,s)
        pickleBigDataset('yain',y_train,s)
        pickleBigDataset('xest',x_test,s)
        pickleBigDataset('yest',y_test,s)

    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    curr_epoch = -1
    model_prefix = 'moj_ulubiony_model'
    epoch_prefix = model_prefix + '_epoch'
    saved_models = [f for f in os.listdir('.')
                    if os.path.isfile(os.path.join('.', f))
                    and f.startswith(model_prefix) and f.endswith('.h5')]
    # Epoch numbers of checkpoints named moj_ulubiony_model_epoch<N>.h5.
    saved_epochs = []
    for f in saved_models:
        tag = f[len(epoch_prefix):-len('.h5')]
        if f.startswith(epoch_prefix) and tag.isdigit():
            saved_epochs.append(int(tag))

    if len(saved_models) == 0:
        print("No saved model. Preparing model.")
        imput = Input(shape=(side,side,3))
        conv1 = Conv2D(32, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(imput)
        conv1 = Conv2D(32, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(conv1)
        conv1 = Conv2D(32, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(conv1)
        dropout1 = Dropout(0.2)(conv1)
        maxpool1 = MaxPooling2D(pool_size=(2, 2))(dropout1)
        conv2 = Conv2D(64, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(maxpool1)
        conv2 = Conv2D(64, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(conv2)
        conv2 = Conv2D(64, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(conv2)
        dropout2 = Dropout(0.25)(conv2)
        maxpool2 = MaxPooling2D(pool_size=(2, 2))(dropout2)
        conv3 = Conv2D(128, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(maxpool2)
        conv3 = Conv2D(128, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(conv3)
        conv3 = Conv2D(128, kernel_size=(3,3),padding="same", activation='relu',kernel_initializer='he_normal')(conv3)
        dropout3 = Dropout(0.25)(conv3)
        upsample1 = UpSampling2D(size=(2,2))(dropout3)

        # skip connection from the second encoder stage
        concat1 = concatenate([upsample1,conv2,])#lambda1])
        conv4 = Conv2D(64, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(concat1)
        conv4 = Conv2D(64, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(conv4)
        conv4 = Conv2D(64, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(conv4)
        dropout4 = Dropout(0.25)(conv4)
        upsample2 = UpSampling2D(size=(2,2))(dropout4)
#         lambda1 = Lambda(lambda image: K.resize_images(image,84/85, 84/85, K.image_data_format()))(conv2)
#         crop1 = Cropping2D(((1,0),(1,0)))(conv1)
        # With the default side=85, two 2x2 poolings followed by two 2x
        # upsamplings give 84, so pad one row/column to match conv1 again.
        zpad1 = ZeroPadding2D(((1,0),(1,0)))(upsample2)
        concat2 = concatenate([zpad1,conv1])
        conv5 = Conv2D(32, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(concat2)
        conv5 = Conv2D(32, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(conv5)
        conv5 = Conv2D(1, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(conv5)
        model = Model(inputs=imput, outputs=conv5)

        model.compile(loss=keras.losses.mean_squared_error,
                  optimizer=keras.optimizers.Adam(),
                  metrics=['accuracy'])
    elif saved_epochs:
        # Resume from the newest per-epoch checkpoint, however many exist.
        curr_epoch = max(saved_epochs)
        model_path = "moj_ulubiony_model_epoch{}.h5".format(curr_epoch)
        model = keras.models.load_model(model_path)
        print("Saved model:\"{}\"".format(model_path))
    else:
        # Saved model(s) without an epoch tag: load one and train from epoch 0.
        print("Saved model:\"{}\"".format(saved_models[0]))
        model = keras.models.load_model(saved_models[0])
    curr_epoch += 1
    print("Current epoch:{}".format(curr_epoch))
    model.summary()
    for layer in model.layers:
        print(layer.get_config())

    # One fit() call per epoch so a checkpoint can be written after each one.
    for i in range(curr_epoch,epochs):
        model.fit(x_train, y_train,
                      batch_size=batch_size,
                      epochs=1,
                      verbose=1,
                      validation_data=(x_test, y_test))
        model.save("moj_ulubiony_model_epoch{}.h5".format(i))

    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

In [19]:
# Entry point: try to train straight from the cached train/test split; if that
# is not possible, build the dataset (download + preprocess) and train on it.
if __name__=="__main__":
    try:
        # With no arguments this raises MyException when no cached split is usable.
        doSomeDeepLearning()
    except MyException as e:
        # Index pages listing the input images (sat) and the target images (map).
        urlX = "https://www.cs.toronto.edu/~vmnih/data/mass_roads/train/sat/index.html"
        urlY = "https://www.cs.toronto.edu/~vmnih/data/mass_roads/train/map/index.html"

        X,Y = loadDataset()
        # Parsed as `(X is None and Y is None) or DEBUG`: DEBUG forces this path.
        if X is None and Y is None or DEBUG:
            if LOAD:
                print("Loading images (X)")
                X = loadImagesFromSite(urlX,'f')
            if PREPROCESS:
                print("Preprocessing images (X)")
                X = preprocess(X,preprocessorX,'x')
            if LOAD:
                print("Loading images (Y)")
                Y = loadImagesFromSite(urlY,'z')
            if PREPROCESS:
                print("Preprocessing images (Y)")
                Y = preprocess(Y,preprocessorY,'y')

                # Print the averaged statistics returned by preprocessXY as a
                # table with one row per statistic and one column per channel.
                r = preprocessXY(X,Y)
                print("\n\t| {}\t\t\t| {}\t\t| {}".format('Hue', 'Saturation', 'Value'))
                l = ['max','min','avg','std','median']
                for i,(c1, c2, c3) in enumerate(r):  
                    print("{}\t| {}\t| {}\t| {}".format(l[i],c1, c2, c3))

        doSomeDeepLearning(X,Y)

Failed loading dataset from file system
Loading images (X)
Cached images 50.
Preprocessing images (X)
Cached images 5000.
Loading images (Y)
Cached images 50.
Preprocessing images (Y)
Cached images 5000.

	| Hue			| Saturation		| Value
max	| 0.7700667221219258	| 0.3697963440155764	| 0.5840015321515193
min	| -0.3213364080848538	| -0.521440711287964	| -0.3414744058013497
avg	| 0.25378781994227284	| -0.2957984856996971	| 0.15707026548488026
std	| 0.2547130276408191	| 0.1482575176135678	| 0.1729013025156554
median	| 0.28455884604208853	| -0.3250359185296453	| 0.17102898929846352
x_train shape: (3500, 85, 85, 3)
3500 train samples
1500 test samples
No saved model. Preparing model.
Current epoch:0
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 85, 85, 3)    0                                            
__

KeyboardInterrupt: 