# CNN for Urban Region

In [1]:
import os
from zipfile import ZipFile
import pandas as pd
import numpy as np
import rasterio
import random
import tensorflow
from sklearn.model_selection import train_test_split
import shutil
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import models

#to dos:
#generalize some code like pixel counts
#create a preprocessing script and outsource methods
#write different methods for normalization
#write some quality control methods, e.g. pdf (probability density functions, outlyer detection - implement
#into normalization methods)
#more to come

In [2]:
#Move Urban images to own directory
def get_urban(urban_dir, orig_dir, channels):

    img_list = os.listdir(orig_dir)
    survey_name = os.path.basename(orig_dir)
    for img_name in img_list:
        if img_name.endswith('.tif'):
            img_dir = os.path.join(orig_dir, img_name)
            with rasterio.open(img_dir) as img:
                if len(channels) == 0:
                    array = img.read()
                else:
                    array = img.read(channels)
            if array.shape[1]< 500:
                #Name tif data survey name + cluster
                img_survey_name = img_name.replace(img_name[:6], survey_name)
                urb_img = os.path.join(urban_dir, img_survey_name)
                os.rename(img_dir, urb_img)
                
    channel_size = array.shape[0]
    
    return channel_size

#Extract from each zip-file (each survey) all images which belongs to urban category (less than 500x500 pixels) and
#move them to own directory
def get_urban_img(base, channels):
    #Define folders
    urban_dir = os.path.join(base, 'urban')
    if not os.path.exists(urban_dir):
        os.mkdir(urban_dir)

    zip_files = os.listdir(base)
    for zip_file in zip_files:
        if zip_file.endswith('.zip'):
            zip_dir = os.path.join(base, zip_file)
            with ZipFile(zip_dir, 'r') as zipObj:
                # Extract all the contents of zip file in current directory
                zipObj.extractall(base)
                img_dir_name = os.path.splitext(zip_file)[0]
                img_dir = os.path.join(base, img_dir_name)
                channel_size = get_urban(urban_dir, img_dir, channels)
                shutil.rmtree(img_dir)

    return urban_dir, channel_size
    


In [3]:
#Create dataframe where the column source refers to the water source which was named the most
def get_cluster_num_from_img_name(img):
    cluster_num = None
    #Remove .tif (BFGE71FL0000203.tif )
    img = img[:img.rfind('.')]
    
    #Remove survey name (has always the length  8, BFGE71FL0000203)
    cluster_0 = img[8:]
    
    #Get cluster number by finding first entry in remaining string which is not 0 as they typically have the form
    #B0000203 (want 203)
    num_start = None
    for str_index in range(0, len(cluster_0)):
        if cluster_0[str_index] != '0':
            num_start = str_index
            break
    if num_start is None:
        cluster_num = 0
    else:
        cluster_num = cluster_0[num_start:]
    
    return cluster_num
    
    
def get_main_source_file(water_file):
   

    water_source = water_file.drop(labels=['ID','cluster', 'residence','year'], axis = 1)
    water_source = water_source.fillna(0)
    water_source=  water_source.idxmax(axis=1)
    water_source.name = 'source'


    df = pd.concat([water_file['ID'], water_file['cluster'], water_source], axis = 1)
    
    return df

#Create label dataframe where next to name (image name (Survey name+Cluster number)) the label in string format
#and categorial format (number) is provided, but only for sentinel images which main source belongs to the main label
#defined prior (which we want to classify); if they don't belong to one of those main labels thoses images are removed
def get_labels_df_for_img(csv_file, urban_dir, main_labels):
    
    img_list = os.listdir(urban_dir)
    water_file = pd.read_csv(csv_file)
    label_list = []
    column_names =  ["name", "source", "label"]
    
    water_file_max = get_main_source_file(water_file)
    
    for img in img_list:
        for index, survey_name in enumerate(water_file_max['ID']): 
                survey_name = survey_name.replace('HR', 'GE', 1)
                if survey_name in img:
                    #Check if cluster is fitting; note +.tif to ensure that e.g. 1 is not true for e.g. AOGe....2104.tif
                    cluster_img = int(get_cluster_num_from_img_name(img))
                    cluster_df = int(water_file_max.loc[index]['cluster'])
                    if cluster_img == cluster_df:
                        #Check if label is part of the main labels (if not remove the image)
                        source = water_file_max.loc[index]['source']
                        if source in main_labels:
                            img_label = [img, source, source]
                            label_list.append(img_label)
                        else:
                            img_dir = os.path.join(urban_dir,img)
                            os.remove(img_dir)
                            
    #Get data frame for label list                        
    label_array = np.array(label_list)
    label_df = pd.DataFrame(label_array, columns = column_names) 
    #Turn label column from string entries to categorical entries
    label_df.label = pd.Categorical(pd.factorize(label_df.label)[0] + 1)

    print(label_df)
    
    return label_df

In [4]:
#Split data set into training, validation, and testing; each subset gets its own folder

#Get ratio between validation to test set to be able to split the remaining data set properly (prior: Split into
#training set and remaining set)
def ratio_val_to_test(val, test):
    
    total = val + test
    one_perc = 100.00/total
    val_ratio = one_perc * val*0.01
    
    return val_ratio

def created_data_sets(urban_dir, split_size):

    img_list = os.listdir(urban_dir)

    #Create  validation, training und test folder

    train_dir = os.path.join(urban_dir,'training')
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    val_dir = os.path.join(urban_dir,'validation')
    if not os.path.exists(val_dir):
        os.mkdir(val_dir)

    test_dir = os.path.join(urban_dir,'test')
    if not os.path.exists(test_dir):
        os.mkdir(test_dir)

    #Split into the data sets and move them to their respective folder
    
    X_train, X_rem = train_test_split(img_list, train_size= split_size[0])
    
    X_val, X_test = train_test_split(X_rem, train_size = ratio_val_to_test(split_size[1], split_size[2]))

    for img in X_train:
            img_dir = os.path.join(urban_dir, img)
            train_img = os.path.join(train_dir, img)
            os.rename(img_dir, train_img)

    for img in X_val:
            img_dir = os.path.join(urban_dir, img)
            val_img = os.path.join(val_dir, img)
            os.rename(img_dir, val_img)

    for img in X_test:
            img_dir = os.path.join(urban_dir, img)
            test_img = os.path.join(test_dir, img)
            os.rename(img_dir, test_img)
    
    
    return train_dir, val_dir, test_dir

In [5]:
#Alternative calculation way of mean and std if dataset is not too big
def calc_mean_std(data_dir, input_height, input_width,clipping_values, channels, channel_size):
    
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    
    pixels = np.ndarray(shape(len(img_list), channel_size, input_height, input_width))
    
    #Create "array of all images"
    for i, img_name in enumerate(img_list):
        img_dir = os.path.join(data_dir, img_name)
        with rasterio.open(img_dir) as img:
            if len(channels) == 0:
                array = img.read()
            else:
                array = img.read(channels)

            
        array = array.astype('float32')
        #Clipping
        array = np.clip(array,a_min = clipping_values[0], a_max = clipping_values[1])
        #Ensure that that all arrays have the same size
        array = array[:,:input_height,:input_width]
        pixels[i] = array
        
    #Calculate Mean and Standard deviation along images (axis 0), width & heigth(axis:2,3) for each channel (axis:1)       
    means = pixels.mean(axis=(0,2,3), dtype='float64')
    stds = pixels.std(axis=(0,2,3), dtype='float64')

    
    return means, stds

In [6]:
#Calculate mean for each channel over all pixels for training set; for validation and test set you need to take
#mean and std of training set as well as in real case scenarios you don't know them beforehand to calculate them
def calc_mean(data_dir,input_height, input_width, clipping_values, channels):
    
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    #pixels = np.ndarray(shape=(len(img_list), 13,201, 201))
    #Variable to save the summation of the pixels values
    sum_arr = 0
    #Count of pixels
    sum_pixel = input_height*input_width*len(img_list)

    for i, img_name in enumerate(img_list):
        img_dir = os.path.join(data_dir, img_name)
        with rasterio.open(img_dir) as img:
            if len(channels) == 0:
                array = img.read()
            else:
                array = img.read(channels)

        array = array.astype('float32')
        array[np.isnan(array)] = 0
        #Clipping
        array = np.clip(array,a_min = clipping_values[0],a_max = clipping_values[1])
        #Ensure that that all arrays have the same size
        array = array[:,:input_height,:input_width]
        #pixels[i] = array
        sum_arr += array.sum(axis = (1,2))
        
    #Calculate mean
    means = sum_arr/sum_pixel
    
    return means

In [7]:
#Calculate standard deviation (note:mean function has to be executed beforehand as it is required as input)
def calc_std(means, data_dir, input_height, input_width, clipping_values, channels):
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    
    sum_arr = 0
    #Count of pixels
    sum_pixel = input_height*input_width*len(img_list)
    
    #Calculate mean and take it ^2
    for i, img_name in enumerate(img_list):
        img_dir = os.path.join(data_dir, img_name)
        with rasterio.open(img_dir) as img:
            if len(channels) == 0:
                array = img.read()
            else:
                array = img.read(channels)

        array = array.astype('float32')
        array[np.isnan(array)] = 0

        #Clipping
        array = np.clip(array,a_min = clipping_values[0],a_max =clipping_values[1])
        #Ensure that that all arrays have the same size
        array = array[:,:input_height,:input_width]
    
        array = np.power(array.transpose(1,2,0) - means, 2).transpose(2, 0, 1)
        sum_arr += array.sum(axis = (1,2))
    
    stds = np.sqrt(sum_arr/sum_pixel)
    
    return stds

In [8]:
#Generate iterable object for model.fit()
def generator(x_dir, labels, batch_size, means, stds, input_height, input_width, clipping_values, channels, channel_size, num_labels):

    x_list = os.listdir(x_dir)
    assert all([i.endswith('.tif') for i in x_list])
    #Shuffle elements in list, so that batches consists of images of different surveys
    random.shuffle(x_list)
    #generate batches (x : input, y: label)
    batch_x = np.zeros(shape=(batch_size, channel_size,input_height, input_width))
    batch_y = np.zeros(shape=(batch_size,num_labels), dtype=int)
    #Iterator
    batch_ele = 0

    for x in x_list:
        #Get training sample x
        img_dir = os.path.join(x_dir, x)
        
        with rasterio.open(img_dir) as img:
            if len(channels) == 0:
                array = img.read().astype("float32")
            else:
                array = img.read(channels).astype("float32")

        
        array[np.isnan(array)] = 0
        assert not np.any(np.isnan(array)), "Float"
        #Clipping
        array = np.clip(array,a_min = clipping_values[0],a_max = clipping_values[1])

        assert not np.any(np.isnan(array)), "After clipping"
        #Ensure that that all arrays have the same size via cropping
        array = array[:,:input_height,:input_width]
        
        #Normalize the array
        array = ((array.transpose(1,2,0)-means)/stds).transpose(2, 0, 1)
        assert not np.any(np.isnan(array)), "Normalize"
        # Add to batch
        batch_x[batch_ele] = array     

        #Get corresponding Label y: old version
        '''for index, survey_name in enumerate(labels['ID']): 
            survey_name = survey_name.replace('HR', 'GE', 1)
            if survey_name in x:
                #find for this row in dataframe labels corresponding cluster and check with this if it is filename of image
                cluster = labels.loc[index]['cluster']
                #cluster solely not enough as e.g. 1 may also be in 100, 101, 110, ....
                cluster_string = '000'+str(cluster)+'.tif'
                if cluster_string in x:
                    one_hot = np.zeros(shape = 3)
                    label_pos = (labels.loc[index]['label'])-1
                    print(labels.loc[index]['label'], label_pos)
                    #One hot encoding
                    one_hot[label_pos] = 1
                    batch_y[batch_ele] = one_hot'''
        
        #Get corresponding label y
        for index, survey_name in enumerate(labels['name']):
            if survey_name in x:
                one_hot = np.zeros(shape = num_labels)
                #As indexing starts at 0 
                label_pos = (labels.loc[index]['label'])-1
                #One hot encoding
                one_hot[label_pos] = 1
                batch_y[batch_ele] = one_hot
                
        #Check if batch is already full (Note: Index in batch array is from 0...4 hence we need to add +1 to batch_ele)
        if (batch_ele+1) == batch_size:
            batch_x = batch_x.transpose(0,2,3,1)
            #Return of batch_x,batch_y
            yield batch_x,batch_y
            #Reset settings -> Start of next batch generation
            batch_ele = 0
            batch_x = np.zeros(shape=(batch_size, channel_size,input_height, input_width))
            batch_y = np.zeros(shape=(batch_size, num_labels), dtype=int)

        else:
            batch_ele += 1
    
    
#    return batch_x, batch_y

In [9]:

#PARAMETERS
base = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel/'
orig_dir = os.path.join(base, 'AOGE71FL')
water_source_file = "/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/SAV_Data/water-source/joined-surveys-2013-grouped.csv"
batch_size = 5
# 3 Main label categories (which are kept)
main_labels = ['piped', 'groundwater', 'bottled water']
num_labels = len(main_labels)
#Training, validation and test set size 
#[Training size, validation size, test size]
split_size = [0.8,0.1,0.1]
#Input height
input_height  = 201
#Input width 
input_width = 201
#Minimum and maximum values (for clipping above and below those values)
#[Minimum value, maximum value]
clipping_values = [0, 3000]

#channels (define channels which should be used, if all should list can stay empty channel = [])
channels = [4,3,2]

In [10]:
#Functions
urban_dir, channel_size =  get_urban_img(base, channels)
print('Moved urbane images to seperate folder')


3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
Moved urbane images to seperate folder


In [12]:
urban_dir ='/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel/urban'
labels_df = get_labels_df_for_img(water_source_file, urban_dir, main_labels)
print("Add to label names categorical labels and removed images which don't belong to one of the main labels")

                     name       source label
0    AOGE71FL00000124.tif        piped     1
1    AOGE71FL00000121.tif  groundwater     2
2    AOGE71FL00000101.tif        piped     1
3    AOGE71FL00000100.tif        piped     1
4    AOGE71FL00000094.tif        piped     1
..                    ...          ...   ...
356  BFGE71FL00000164.tif        piped     1
357  BFGE71FL00000163.tif        piped     1
358  BFGE71FL00000162.tif        piped     1
359  BFGE71FL00000149.tif  groundwater     2
360  BFGE71FL00000144.tif        piped     1

[361 rows x 3 columns]
Add to label names categorical labels and removed images which don't belong to one of the main labels


In [13]:
train_dir, val_dir, test_dir = created_data_sets(urban_dir, split_size)
print('Split up data set into training, validation and test data')

Split up data set into training, validation and test data


In [14]:
train_dir = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel/urban/training'
val_dir = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel/urban/validation'
means = calc_mean(train_dir, input_height, input_width, clipping_values, channels)
stds = calc_std(means, train_dir, input_height, input_width, clipping_values, channels)
print('Calculated mean and standard deviation for each channel (for training set)')

Calculated mean and standard deviation for each channel (for training set)


In [16]:
training_generator = generator(train_dir, labels_df, batch_size, means, stds, input_height, input_width, clipping_values, channels, channel_size, num_labels)
#Check if shape is correct
print('Created x and y for training data')
'''for data_batch, labels_batch in training_generator:
    print('This is the shape of the training data batch:', data_batch.shape)
    print('This is the shape of the training label batch:', labels_batch.shape)
    break'''

validation_generator = generator(val_dir, labels_df, batch_size, means, stds, input_height, input_width, clipping_values, channels, channel_size, num_labels)

Created x and y for training data


In [20]:
model = models.Sequential()
model.add(layers.Conv2D(16, (3, 3), activation='relu',input_shape=(201, 201, channel_size)))
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(32, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))

model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))


In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 199, 199, 16)      448       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 99, 99, 16)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 97, 97, 32)        4640      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 48, 48, 32)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 46, 46, 64)        18496     
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 23, 23, 64)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 21, 21, 64)       

In [22]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
history = model.fit(
            training_generator,
            steps_per_epoch=5,
            epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#FOLLOWING PART IS JUST TRY OUT AND NOT FINISHED AT ALL

In [38]:
#RESNET Model
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
#import cv2
import numpy as np
from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D
from keras.models import Model, load_model
from keras.initializers import glorot_uniform
from keras.utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import keras.backend as K
import tensorflow as tf

In [39]:
def identity_block(X, f, filters, stage, block):
    # defining name basis
    conv_name_base = 'res' + str(stage) + block + '_branch'
    bn_name_base = 'bn' + str(stage) + block + '_branch'

    # Retrieve Filters
    F1, F2, F3 = filters

    # Save the input value. We'll need this later to add back to the main path. 
    X_shortcut = X

    # First component of main path
    X = Conv2D(filters = F1, kernel_size = (1, 1), strides = (1,1), padding = 'valid', name = conv_name_base + '2a', kernel_initializer = glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis = 3, name = bn_name_base + '2a')(X)
    X = Activation('relu')(X)

    # Second component of main path
    X = Conv2D(filters = F2, kernel_size = (f, f), strides = (1,1), padding = 'same', name = conv_name_base + '2b', kernel_initializer = glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis = 3, name = bn_name_base + '2b')(X)
    X = Activation('relu')(X)

    # Third component of main path
    X = Conv2D(filters = F3, kernel_size = (1, 1), strides = (1,1), padding = 'valid', name = conv_name_base + '2c', kernel_initializer = glorot_uniform(seed=0))(X)
    X = BatchNormalization(axis = 3, name = bn_name_base + '2c')(X)

    # Final step: Add shortcut value to main path, and pass it through a RELU activation
    X = Add()([X, X_shortcut])
    X = Activation('relu')(X)

    return X

In [40]:
tf.reset_default_graph()
with tf.Session() as test:
    A_prev = tf.placeholder("float", [3, 4, 4, 6])
    X = np.random.randn(3, 4, 4, 6)
    A = identity_block(A_prev, f = 2, filters = [2, 4, 6], stage = 1, block = 'a')
    test.run(tf.global_variables_initializer())
    out = test.run([A], feed_dict={A_prev: X, K.learning_phase(): 0})
    print("out = ", out[0][1][1][0])

AttributeError: module 'tensorflow' has no attribute 'reset_default_graph'