In [25]:
%matplotlib inline

import os
import pickle
import urllib
import urllib.request

import cv2
import keras
import matplotlib.pyplot as plt
import numpy as np
from bs4 import BeautifulSoup
from keras import backend as K
from keras.layers import Dense, Dropout, Flatten, Input, concatenate
from keras.layers import Conv2D, MaxPooling2D,UpSampling2D,Lambda
from keras.models import Model
from sklearn.model_selection import train_test_split
from tqdm import tqdm,tqdm_notebook

In [26]:
# Notebook-wide configuration read by the functions and the main cell below.
HOWMANY = 400  # upper bound of the link slice taken from each index page when downloading
DEBUG = True  # when True, the main cell runs the load/preprocess branch even if loadDataset() succeeded
TMP_DIR = 'tmp'  # directory that holds the .npy cache files
FORCE_RELOAD = False  # set to True to ignore cached downloads and start from an empty image list
LOAD = True  # enables the download step in the main cell
PREPROCESS = True  # enables the preprocessing step in the main cell (set to False to skip it)
batch_size = 128   # how many images are processed at once (the network weights are updated once per whole batch)
epochs = 12         # how many epochs the model is trained for
SIZE = (750,750)  # size every downloaded image is resized to with cv2.resize

In [27]:
def loadImage(url):
    """Download the file at ``url`` and decode it as an image with OpenCV.

    Args:
        url: Address of the image file to fetch.

    Returns:
        Whatever ``cv2.imdecode`` yields for the downloaded bytes with flag
        ``-1`` (decode the image as stored, keeping an alpha channel if the
        file has one).
    """
    # Use the response as a context manager so the connection is closed as
    # soon as the payload has been read, instead of whenever it is collected.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # Expose the bytes as a uint8 vector without the intermediate bytearray copy.
    npraw = np.frombuffer(raw, dtype=np.uint8)
    return cv2.imdecode(npraw, -1)  # -1 -> as is (with the alpha channel)

def getImageName(url):
    """Return the last path segment of ``url`` up to (not including) its first dot."""
    # Everything after the final '/' is the file name ...
    filename = url.rsplit('/', 1)[-1]
    # ... and everything before its first '.' is the image name.
    stem, _, _ = filename.partition('.')
    return stem

def pickleBigDataset(prefix,dataset,size):
    """Store ``dataset`` in the cache directory with ``np.save``.

    The data is written to ``os.path.join(TMP_DIR, prefix)``; ``np.save``
    appends the ``.npy`` extension when the name does not already end with it.

    Args:
        prefix: Base name of the cache file inside ``TMP_DIR``.
        dataset: Array (or array-like) to store.
        size: Unused. Kept so existing call sites that pass a chunk size
            keep working.
    """
    # np.save does not create missing directories, so make sure the cache
    # directory exists before the first write.
    os.makedirs(TMP_DIR, exist_ok=True)
    np.save(os.path.join(TMP_DIR, prefix),dataset)

def unpickleBigDataset(prefix):
    """Load and return the array stored as ``<prefix>.npy`` inside ``TMP_DIR``."""
    filename = "{}.npy".format(prefix)
    cache_path = os.path.join(TMP_DIR, filename)
    return np.load(cache_path)
    
            
def loadImagesFromSite(url,prefix):
    """Download the images linked from an index page, resuming from the cache.

    The page at ``url`` is parsed with BeautifulSoup; every ``<a>`` element in
    the slice ``[already_cached:HOWMANY]`` is downloaded with ``loadImage`` and
    resized to ``SIZE``. Progress is written to ``<TMP_DIR>/<prefix>.npy``
    through ``pickleBigDataset`` every 100 images and once more at the end.

    Args:
        url: Address of the HTML index page.
        prefix: Base name of the cache file for this image set.

    Returns:
        ``np.array`` built from the cached and the newly downloaded images.
    """
    # Image count at which the original code treated the cache as complete.
    complete_count = 1109
    checkpoint_every = 100

    # The cache directory may not exist on a first run.
    os.makedirs(TMP_DIR, exist_ok=True)

    # Look for exactly the file unpickleBigDataset() reads; a startswith()
    # test would also match unrelated caches sharing the first letters.
    cache_path = os.path.join(TMP_DIR, "{}.npy".format(prefix))
    if FORCE_RELOAD or not os.path.isfile(cache_path):
        imgs = []
    else:
        # Turn the cached ndarray back into a list: `+=`/append on an ndarray
        # would add pixel values element-wise instead of appending an image.
        imgs = list(unpickleBigDataset(prefix))
    already_cached = len(imgs)

    # Nothing to fetch once the cache is complete or already holds HOWMANY
    # images (the link slice below would be empty in that case).
    if already_cached != complete_count and already_cached < HOWMANY:
        print("Loading images from {}".format(url))
        print("Cached images {}. Proceeding from {} image.".format(len(imgs),already_cached))

        with urllib.request.urlopen(url) as response:
            html = BeautifulSoup(response.read(),"lxml")
        links = html.find_all('a')[already_cached:HOWMANY]

        for count, link in enumerate(tqdm_notebook(links), start=already_cached + 1):
            img = loadImage(link.get('href'))
            imgs.append(cv2.resize(img,SIZE))
            # Periodic checkpoint so an interrupted download can be resumed.
            if count % checkpoint_every == 0:
                pickleBigDataset(prefix,imgs,checkpoint_every)
        pickleBigDataset(prefix,imgs,checkpoint_every)

    return np.array(imgs)

def saveDataset(X,Y,prefix=""):
    """Pickle ``X`` and ``Y`` into the working directory.

    ``X`` goes to ``'pickledDatasetX' + prefix`` and ``Y`` to
    ``'pickledDatasetY' + prefix``.
    """
    targets = (('pickledDatasetX', X), ('pickledDatasetY', Y))
    for base_name, payload in targets:
        with open(base_name+prefix,'wb') as handle:
            pickle.dump(payload,handle)
        
def loadDataset(prefix=""):
    """Load the cached ``'x'`` and ``'y'`` datasets.

    Args:
        prefix: Unused. Kept for interface compatibility.

    Returns:
        ``(X, Y)`` as returned by ``unpickleBigDataset`` on success, or
        ``(None, None)`` when loading fails.
    """
    try:
        X = unpickleBigDataset('x')
        Y = unpickleBigDataset('y')
        return X,Y
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate.
    except Exception:
        print("Failed loading dataset from file system")
        return None,None
    
def display(X,Y,howmany=None):
    """Show the first ``howmany`` pairs ``X[i]`` / ``Y[i]`` side by side.

    For every pair the maximum and minimum of ``X[i]`` are printed and a new
    figure with two subplots is created. When ``howmany`` is ``None`` all
    ``X.shape[0]`` pairs are shown.
    """
    count = X.shape[0] if howmany is None else howmany

    for idx in range(count):
        print(X[idx].max(),X[idx].min())
        plt.figure()
        # Left panel: X sample, right panel: the matching Y sample.
        for position, source in ((1, X), (2, Y)):
            plt.subplot(1,2,position)
            plt.imshow(source[idx])
        

In [28]:
# def get_patches(image,size,side,imposition):
#     patches = []
    
    
#     for i in range(int(size[0]/side)):
#         for j in range(int(size[1]/side)):
#             patches += [image[i*side:(i+1)*side,j*side:(j+1)*side]]
#     return patches

def get_patches(image,size,side,imposition):
    """Cut ``image`` into a grid of overlapping square patches.

    The first three channels of ``image`` are copied into a float array that
    is reflect-padded by ``imposition`` pixels at the top and on the left.
    The grid has ``int(size[0]/side)`` rows and ``int(size[1]/side)`` columns;
    patches are returned row by row as slices of the padded array.
    """
    height, width = image.shape[0], image.shape[1]
    padded = np.zeros((height+imposition,width+imposition,3))
    pad_spec = ((imposition,0),(imposition,0))
    for channel in range(3):
        padded[...,channel] = np.pad(image[...,channel],pad_spec,'reflect')

    def window(index):
        # Start `imposition` pixels before the cell, clamped at the border.
        start = max(index*side-imposition,0)
        # A window clamped at 0 is extended at its far end instead, so every
        # window spans side + 2*imposition pixels.
        margin = imposition if start != 0 else imposition*2
        return start, (index+1)*side+margin

    row_windows = [window(r) for r in range(int(size[0]/side))]
    col_windows = [window(c) for c in range(int(size[1]/side))]
    return [padded[top:bottom,left:right]
            for top, bottom in row_windows
            for left, right in col_windows]

def resize(image):
    """Return ``image`` at the working resolution plus the patch parameters.

    Returns:
        ``(image, size, side, imposition)`` with ``size == (750, 750)``,
        ``side == 78`` and ``imposition == 5``. The image is passed through
        ``cv2.resize`` only when the module-level ``SIZE`` differs from
        ``size``; otherwise it is returned as given.
    """
    target_size = (750,750)
    patch_side = 78
    overlap = 5
    resized = image if target_size == SIZE else cv2.resize(image,target_size)
    return resized,target_size,patch_side,overlap

def preprocessorX(image):
    """Normalise an input image and cut it into patches.

    Steps: ``resize`` -> BGR to HSV -> float32 -> divide by 255 when the
    maximum exceeds 1 -> per-channel standardisation (zero mean, unit std) ->
    clip to [-3, 3] -> per-channel division by the largest absolute value so
    the data lies in [-1, 1] -> ``get_patches``.

    Args:
        image: Image array, or ``None``.

    Returns:
        ``(patches, ignored)``: the list from ``get_patches`` and ``False``,
        or ``([], True)`` when ``resize`` yields no image.
    """
    image,size,side,imposition = resize(image)
    if image is None:
        return [],True

    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    image = image.astype(np.float32)

    if image.max() > 1:
        image /= 255

    for i in range(3):
        channel = image[...,i]  # view: in-place ops below modify `image`
        channel -= channel.mean()
        spread = channel.std()
        # A constant channel has std 0; leave it at zero instead of
        # dividing by zero and filling it with NaN.
        if spread > 0:
            channel /= spread

    #remove outliers
    np.clip(image, -3, 3, out=image)

    #between -1,1
    for i in range(3):
        channel = image[...,i]
        peak = max(abs(channel.min()), abs(channel.max()))
        # Same guard as above for an all-zero channel.
        if peak > 0:
            channel /= peak

    return get_patches(image,size,side,imposition),False
    

def preprocessorY(image):
    """Rescale a target image to [0, 1] per channel and cut it into patches.

    Steps: ``resize`` -> float32 -> divide by 255 when the maximum exceeds 1
    -> per-channel min-max scaling -> ``get_patches``.

    Args:
        image: Image array, or ``None``.

    Returns:
        ``(patches, ignored)``: the list from ``get_patches`` and ``False``,
        or ``([], True)`` when ``resize`` yields no image.
    """
    image,size,side,imposition = resize(image)
    if image is None:
        return [],True

    image = image.astype(np.float32)

    if image.max() > 1:
        image /= 255

    for i in range(3):
        channel = image[...,i]  # view: in-place ops below modify `image`
        low = channel.min()
        span = channel.max() - low
        channel -= low
        # A constant channel has span 0; it stays all-zero after the shift
        # instead of becoming NaN through 0/0.
        if span > 0:
            channel /= span

    return get_patches(image,size,side,imposition),False

def getRoadStats(arr,mask):
    """Summarise the elements of ``arr`` selected by ``mask``.

    Args:
        arr: Array to take values from.
        mask: Array used as a boolean index into ``arr`` after conversion to
            ``bool`` (non-zero entries select).

    Returns:
        ``[max, min, mean, std, median]`` of the selected values, each reduced
        along axis 0, or ``None`` when the mask selects nothing.
    """
    # The builtin `bool` replaces the `np.bool` alias, which was removed from
    # NumPy and raises AttributeError on current releases.
    selected = arr[mask.astype(bool)]
    if len(selected) == 0:
        return None
    return [selected.max(0),selected.min(0),selected.mean(0),selected.std(0),np.median(selected,axis=0)]

def preprocessXY(X,Y,i1,i2):
    """Average the ``getRoadStats`` summaries over all ``(X[i], Y[i])`` pairs.

    ``i1`` and ``i2`` are only printed. Pairs for which ``getRoadStats``
    returns ``None`` are left out of the average.
    """
    print(i1,i2)
    #TODO remove those not ignored in one, but ignored in other

    per_pair = (getRoadStats(X[idx],Y[idx]) for idx in range(len(X)))
    collected = [stats for stats in per_pair if stats is not None]
    return np.array(collected).mean(0)
    

def preprocess(images,preprocessor,prefix):
    """Run ``preprocessor`` over every image and collect the resulting patches.

    The patches are checkpointed to ``<TMP_DIR>/<prefix>.npy`` through
    ``pickleBigDataset`` every 400 images and once more when all images are
    done.

    Processing always starts from the first image. The cache file holds
    patches, not images, so its length cannot be used as an image offset to
    resume from; an existing cache file is simply overwritten.

    Args:
        images: Sequence of images to preprocess.
        preprocessor: Callable returning ``(patches, ignored)`` for an image.
        prefix: Base name of the cache file for the patches.

    Returns:
        ``(patches, ignoring)``: ``np.array`` of all patches and ``np.array``
        with the indices of the images the preprocessor flagged as ignored.
    """
    checkpoint_every = 400
    # pickleBigDataset writes into TMP_DIR; make sure it exists.
    os.makedirs(TMP_DIR, exist_ok=True)

    result = []
    ignoring = []
    print("Preprocessing images.")
    for count, image in enumerate(tqdm_notebook(images), start=1):
        patches,ignored = preprocessor(image)
        if ignored:
            ignoring.append(count - 1)
        result += patches

        # Checkpoint once per `checkpoint_every` images rather than on every
        # iteration; rewriting the whole array each step is very slow.
        if count % checkpoint_every == 0:
            pickleBigDataset(prefix,result,checkpoint_every)
    pickleBigDataset(prefix,result,checkpoint_every)

    return np.array(result),np.array(ignoring)

In [29]:
def _conv_stack(tensor, filters, depth=3):
    """Apply ``depth`` successive 3x3, same-padded, ReLU convolutions with ``filters`` channels."""
    for _ in range(depth):
        tensor = Conv2D(filters, kernel_size=(3,3), activation='relu',
                        padding="same", kernel_initializer='he_normal')(tensor)
    return tensor


def _build_segmentation_model(rows, cols):
    """Build and compile the encoder/decoder network for ``rows`` x ``cols`` 3-channel inputs."""
    # Two 2x2 poolings followed by two 2x upsamplings only line up with the
    # skip connections when both dimensions are multiples of 4.
    if rows % 4 != 0 or cols % 4 != 0:
        raise ValueError(
            "Patch size {}x{} is not divisible by 4, which the two pooling "
            "levels of the model require.".format(rows, cols))

    imput = Input(shape=(rows,cols,3))
    conv1 = _conv_stack(imput, 32)
    dropout1 = Dropout(0.25)(conv1)
    maxpool1 = MaxPooling2D(pool_size=(2, 2))(dropout1)
    conv2 = _conv_stack(maxpool1, 64)
    dropout2 = Dropout(0.25)(conv2)
    maxpool2 = MaxPooling2D(pool_size=(2, 2))(dropout2)
    conv3 = _conv_stack(maxpool2, 128)
    dropout3 = Dropout(0.25)(conv3)
    upsample1 = UpSampling2D(size=(2,2))(dropout3)
    # Skip connections take the encoder features from before the dropout.
    concat1 = concatenate([upsample1,conv2])
    conv4 = _conv_stack(concat1, 64)
    dropout4 = Dropout(0.25)(conv4)
    upsample2 = UpSampling2D(size=(2,2))(dropout4)
    concat2 = concatenate([upsample2,conv1])
    conv5 = _conv_stack(concat2, 32, depth=2)
    conv5 = Conv2D(1, kernel_size=(3,3), activation='relu',padding="same",kernel_initializer='he_normal')(conv5)
    model = Model(inputs=imput, outputs=conv5)

    model.compile(loss=keras.losses.mean_squared_error,
                  optimizer=keras.optimizers.Adam(),
                  metrics=['accuracy'])
    return model


def _find_saved_model(directory='.'):
    """Pick the saved model to resume from.

    Returns:
        ``(filename, epoch)``. With ``moj_ulubiony_model_epoch<N>.h5`` files
        present, the one with the highest ``N`` and that ``N``. With a single
        matching file that carries no epoch number, that file and ``-1``.
        ``(None, -1)`` when no ``moj_ulubiony_model*.h5`` file exists.

    Raises:
        ValueError: Several matching files exist and none has an epoch number.
    """
    stem, extension = 'moj_ulubiony_model', '.h5'
    epoch_tag = stem + '_epoch'
    candidates = [f for f in os.listdir(directory)
                  if os.path.isfile(os.path.join(directory, f))
                  and f.startswith(stem) and f.endswith(extension)]

    by_epoch = {}
    for name in candidates:
        number = name[len(epoch_tag):-len(extension)] if name.startswith(epoch_tag) else ''
        if number.isdigit():
            by_epoch[int(number)] = name

    if by_epoch:
        latest = max(by_epoch)
        return by_epoch[latest], latest
    if len(candidates) == 1:
        return candidates[0], -1
    if candidates:
        raise ValueError("Cannot choose between saved models without an epoch number: {}".format(sorted(candidates)))
    return None, -1


def doSomeDeepLearning(X,Y,side=85):
    """Train the segmentation network on ``X``/``Y`` and print the test score.

    A cached train/test split (``xain``, ``yain``, ``xest``, ``yest`` in
    ``TMP_DIR``) is used when all four files load and are non-empty;
    otherwise ``X`` and ``Y`` are split 70/30 with ``train_test_split``.
    Training runs one epoch per ``model.fit`` call up to the module-level
    ``epochs`` and saves ``moj_ulubiony_model_epoch<i>.h5`` after each one,
    so a later call resumes after the highest saved epoch.

    Args:
        X: Input patches, indexed ``[sample, row, column, channel]``.
        Y: Target patches aligned with ``X``.
        side: Nominal patch edge length. The model input size is taken from
            the data itself; a notice is printed when the two differ.
    """
    # Missing or unreadable cache files make np.load raise; treat that the
    # same as an empty cached split and fall back to a fresh split.
    try:
        x_train = unpickleBigDataset('xain')
        y_train = unpickleBigDataset('yain')
        x_test = unpickleBigDataset('xest')
        y_test = unpickleBigDataset('yest')
        cached_split = min(len(x_train), len(x_test), len(y_train), len(y_test)) > 0
    except (OSError, ValueError):
        cached_split = False
    if not cached_split:
        # NOTE(review): this split is random and not stored, so resuming from
        # a saved model after a restart evaluates on a different test set.
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)

    # The network takes 3-channel, channels-last patches, so the data is fed
    # as-is and the spatial size is read from it rather than reshaped to a
    # single channel.
    img_rows, img_cols = x_train.shape[1], x_train.shape[2]
    if (img_rows, img_cols) != (side, side):
        print("Patch size in data is {}x{} (side={} was requested); using the data size.".format(img_rows, img_cols, side))

    # NOTE(review): the model ends in a 1-filter convolution while
    # get_patches() builds 3-channel arrays for the targets as well - confirm
    # the targets are reduced to one channel before relying on the loss.

    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    saved_model, curr_epoch = _find_saved_model('.')
    if saved_model is None:
        print("No saved model. Preparing model.")
        model = _build_segmentation_model(img_rows, img_cols)
    else:
        print("Saved model:\"{}\"".format(saved_model))
        model = keras.models.load_model(saved_model)
    # Continue with the epoch after the last one that was saved.
    curr_epoch += 1
    print("Current epoch:{}".format(curr_epoch))
    model.summary()
    for layer in model.layers:
        print(layer.get_config())

    # One fit() call per epoch so a checkpoint is written after each of them.
    for i in range(curr_epoch,epochs):
        model.fit(x_train, y_train,
                      batch_size=batch_size,
                      epochs=1,
                      verbose=1,
                      validation_data=(x_test, y_test))
        model.save("moj_ulubiony_model_epoch{}.h5".format(i))

    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

In [None]:
# Notebook entry point: obtain the dataset (from the cache, or by downloading
# and preprocessing it) and pass it to the training routine.
if __name__=="__main__":
    # Index pages whose links loadImagesFromSite follows: urlX feeds X, urlY feeds Y.
    urlX = "https://www.cs.toronto.edu/~vmnih/data/mass_roads/train/sat/index.html"
    urlY = "https://www.cs.toronto.edu/~vmnih/data/mass_roads/train/map/index.html"
    
    X,Y = loadDataset()
    # `and` binds tighter than `or`: the test is (X is None and Y is None) or DEBUG,
    # so with DEBUG enabled this branch runs even when loadDataset() succeeded.
    if X is None and Y is None or DEBUG:
        if LOAD:
            print("Loading images")
            # 'z' and 'f' are the cache-file prefixes of the two downloaded image sets.
            Y = loadImagesFromSite(urlY,'z')
            X = loadImagesFromSite(urlX,'f')
        if PREPROCESS:
            print("Preprocessing images")
            # preprocess returns two arrays per call; 'x' and 'y' are the cache-file prefixes.
            X,i1 = preprocess(X,preprocessorX,'x')
            Y,i2 = preprocess(Y,preprocessorY,'y')
        
            # Prints the averaged statistics returned by preprocessXY.
            print(preprocessXY(X,Y,i1,i2))
        
    doSomeDeepLearning(X,Y)

Failed loading dataset from file system
Loading images
z []
A1
Loading images from https://www.cs.toronto.edu/~vmnih/data/mass_roads/train/map/index.html
Cached images 0. Proceeding from 0 image.
