# CNN for Urban Region

In [1]:
import os
import pandas as pd
import numpy as np
import rasterio
import random
import tensorflow
from sklearn.model_selection import train_test_split


from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import models

In [2]:
# Move the urban image tiles of one survey into their own directory
def get_urban(base, orig_dir):
    """Move the small ("urban") GeoTIFF tiles of one survey folder to ``<base>/urban``.

    Every ``.tif`` file in ``orig_dir`` whose raster height is below 500 pixels
    is moved. While moving, the first six characters of the file name are
    replaced by the name of the survey folder, so the result is
    ``<survey folder name><rest of the original name>``.

    Parameters
    ----------
    base : str
        Directory in which the ``urban`` folder is created (if missing).
    orig_dir : str
        Folder holding the tiles of a single survey; its base name is used
        as the new file-name prefix.

    Returns
    -------
    str
        Path of the ``urban`` directory.
    """
    urban_dir = os.path.join(base, 'urban')
    os.makedirs(urban_dir, exist_ok=True)

    # normpath drops a trailing separator, which would otherwise make basename ''
    survey_name = os.path.basename(os.path.normpath(orig_dir))

    for img_name in os.listdir(orig_dir):
        if not img_name.endswith('.tif'):
            continue

        img_path = os.path.join(orig_dir, img_name)
        # Only the raster height is needed, so read it from the dataset metadata
        # and close the file again before it is renamed.
        with rasterio.open(img_path) as img:
            height = img.height

        if height < 500:
            # Name tif: survey name + remainder of the original name (cluster part).
            # Slicing swaps only the prefix, even if the same six characters
            # happen to occur again later in the name.
            img_survey_name = survey_name + img_name[6:]
            os.rename(img_path, os.path.join(urban_dir, img_survey_name))

    return urban_dir

In [3]:
# Build the label table (pandas DataFrame): ID, cluster number, water source as
# string ('source') and water source as numerical code ('label')
def get_labels(csv_file):
    """Read the water-source CSV and derive one label per row.

    All columns except ``ID``, ``cluster``, ``residence`` and ``year`` are
    treated as water-source columns. Missing values count as 0 and the name of
    the column with the largest value in a row becomes that row's source.

    Parameters
    ----------
    csv_file : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``cluster`` (cast to int), ``source`` (column name of
        the row maximum) and ``label`` (categorical code of ``source``,
        numbered from 1 in order of first appearance).
    """
    survey = pd.read_csv(csv_file)

    meta_columns = ['ID', 'cluster', 'residence', 'year']
    dominant = survey.drop(columns=meta_columns).fillna(0).idxmax(axis=1)

    table = pd.concat(
        [
            survey['ID'],
            survey['cluster'],
            dominant.rename('source'),
            dominant.rename('label'),
        ],
        axis=1,
    )

    # factorize numbers the sources from 0; shift so the codes start at 1
    codes, _uniques = pd.factorize(table['label'])
    table['label'] = pd.Categorical(codes + 1)
    table['cluster'] = table['cluster'].astype('int')

    return table

In [4]:
def keep_only_main_group_labels(labels, urban_dir):
    """Delete every image in ``urban_dir`` whose water source is not a main group.

    An image belongs to a row of ``labels`` when its file name contains the
    row's ``ID`` (with the first ``'HR'`` replaced by ``'GE'``) and the string
    ``'000' + cluster + '.tif'``. If the ``source`` of a matching row is not
    one of ``'piped'``, ``'groundwater'`` or ``'bottled water'``, the file is
    removed and a message is printed.

    Parameters
    ----------
    labels : pandas.DataFrame
        Needs the columns ``ID``, ``cluster`` and ``source``. Rows are read
        positionally, so any index is accepted.
    urban_dir : str
        Directory whose files are checked. Sub-directories are ignored.
    """
    # 3 main label categories (which are kept)
    main_labels = ('piped', 'groundwater', 'bottled water')

    # Build the search strings once instead of once per image.
    # The cluster number alone is not enough, as e.g. 1 is also part of 100, 101, 110, ...
    rows = [
        (str(survey_id).replace('HR', 'GE', 1), '000' + str(cluster) + '.tif', source)
        for survey_id, cluster, source in zip(labels['ID'], labels['cluster'], labels['source'])
    ]

    for img in os.listdir(urban_dir):
        img_path = os.path.join(urban_dir, img)
        if not os.path.isfile(img_path):
            continue

        for survey_name, cluster_string, source in rows:
            if survey_name in img and cluster_string in img and source not in main_labels:
                print('Removed', img, 'as it has the label', source)
                os.remove(img_path)
                # The file is gone; a second matching row must not try to delete it again.
                break

In [5]:
def created_data_sets(urban_dir, random_state=None):
    """Split the images in ``urban_dir`` into training, validation and test folders.

    The files are divided 80% / 10% / 10% with ``train_test_split`` and moved
    into the sub-folders ``training``, ``validation`` and ``test`` of
    ``urban_dir`` (created if missing).

    Only regular files are split. Sub-directories, such as the three split
    folders left over from an earlier run, are never moved.

    Parameters
    ----------
    urban_dir : str
        Directory containing the image files.
    random_state : int or None, optional
        Passed to both ``train_test_split`` calls. Give an integer to get the
        same split on every run; ``None`` (default) keeps the split random.

    Returns
    -------
    tuple of str
        ``(train_dir, val_dir, test_dir)``.
    """
    # Sorted so that a fixed random_state does not depend on the listing order
    img_list = sorted(
        name for name in os.listdir(urban_dir)
        if os.path.isfile(os.path.join(urban_dir, name))
    )

    # Create training, validation and test folders
    train_dir = os.path.join(urban_dir, 'training')
    val_dir = os.path.join(urban_dir, 'validation')
    test_dir = os.path.join(urban_dir, 'test')
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(split_dir, exist_ok=True)

    # 80% training; the remaining 20% is halved into validation and test
    X_train, X_rem = train_test_split(img_list, train_size=0.8, random_state=random_state)
    X_val, X_test = train_test_split(X_rem, train_size=0.5, random_state=random_state)

    # Move every image into the folder of its split
    for split_dir, names in ((train_dir, X_train), (val_dir, X_val), (test_dir, X_test)):
        for img in names:
            os.rename(os.path.join(urban_dir, img), os.path.join(split_dir, img))

    return train_dir, val_dir, test_dir

In [8]:
# Alternative way to calculate mean and std in one go if the data set is not too big
def calc_mean_std(data_dir):
    """Compute the per-channel mean and standard deviation of all images at once.

    Every image is clipped to the range 0..10000 and cropped to its first
    201 x 201 pixels, then all images are stacked in memory. Use
    ``calc_mean`` / ``calc_std`` instead when the stack would not fit.

    Parameters
    ----------
    data_dir : str
        Directory that contains only ``.tif`` files (checked with an assert).
        Each image must have 3 bands and at least 201 x 201 pixels.

    Returns
    -------
    tuple of numpy.ndarray
        ``(means, stds)``, one float64 value per channel each.
    """
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])

    # float32 is what the images are converted to below; the statistics are
    # still accumulated in float64, so results are unchanged at half the memory.
    pixels = np.empty(shape=(len(img_list), 3, 201, 201), dtype='float32')

    # Create "array of all images"
    for i, img_name in enumerate(img_list):
        # Close every dataset right after reading so file handles do not pile up
        with rasterio.open(os.path.join(data_dir, img_name)) as img:
            array = img.read()
        array = array.astype('float32')
        # Clip to max 10.000
        array = np.clip(array, a_min=0, a_max=10000)
        # Ensure that all arrays have the same size
        pixels[i] = array[:, :201, :201]

    # Mean and standard deviation over images (axis 0) and height & width
    # (axes 2, 3), leaving one value per channel (axis 1)
    means = pixels.mean(axis=(0, 2, 3), dtype='float64')
    stds = pixels.std(axis=(0, 2, 3), dtype='float64')

    return means, stds

In [9]:
# Calculate the mean of each channel over all pixels of the training set. The
# validation and test sets must be normalised with the training mean and std
# too, because in a real scenario their statistics are not known beforehand.
def calc_mean(data_dir):
    """Compute the per-channel mean over all images, one image at a time.

    Every image is clipped to the range 0..10000 and cropped to at most its
    first 201 x 201 pixels before it is added to the running sum.

    Parameters
    ----------
    data_dir : str
        Directory that contains only ``.tif`` files (checked with an assert).

    Returns
    -------
    numpy.ndarray
        One float64 mean per channel.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no images.
    """
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    if not img_list:
        raise ValueError('no .tif images found in ' + str(data_dir))

    # Running per-channel sum of the pixel values
    sum_arr = 0.0
    # Number of pixels per channel that were actually summed
    sum_pixel = 0

    for img_name in img_list:
        # Close every dataset right after reading so file handles do not pile up
        with rasterio.open(os.path.join(data_dir, img_name)) as img:
            array = img.read()
        array = array.astype('float32')
        # Clip to max 10.000
        array = np.clip(array, a_min=0, a_max=10000)
        # Ensure that all arrays have the same size
        array = array[:, :201, :201]

        # Accumulate in float64: a float32 running sum loses precision once it
        # has grown large compared to the values being added.
        sum_arr = sum_arr + array.sum(axis=(1, 2), dtype='float64')
        # Count the real crop size, so an image smaller than 201 x 201 does not skew the mean
        sum_pixel += array.shape[1] * array.shape[2]

    # Calculate mean
    means = sum_arr / sum_pixel

    return means

In [10]:
# Calculate the standard deviation of each channel (note: calc_mean has to be
# run beforehand, as its result is required as input)
def calc_std(means, data_dir):
    """Compute the per-channel standard deviation over all images, one image at a time.

    Every image is clipped to the range 0..10000 and cropped to at most its
    first 201 x 201 pixels, then the squared deviations from ``means`` are
    summed. The result is the population standard deviation (divides by the
    pixel count, not by the count minus one).

    Parameters
    ----------
    means : array-like
        Per-channel means, one value per band (e.g. the result of ``calc_mean``).
    data_dir : str
        Directory that contains only ``.tif`` files (checked with an assert).

    Returns
    -------
    numpy.ndarray
        One float64 standard deviation per channel.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no images.
    """
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    if not img_list:
        raise ValueError('no .tif images found in ' + str(data_dir))

    # Shape (channels, 1, 1) so the means broadcast over height and width
    channel_means = np.asarray(means, dtype='float64').reshape(-1, 1, 1)

    # Running per-channel sum of the squared deviations
    sum_arr = 0.0
    # Number of pixels per channel that were actually summed
    sum_pixel = 0

    for img_name in img_list:
        # Close every dataset right after reading so file handles do not pile up
        with rasterio.open(os.path.join(data_dir, img_name)) as img:
            array = img.read()
        array = array.astype('float32')
        # Clip to max 10.000
        array = np.clip(array, a_min=0, a_max=10000)
        # Ensure that all arrays have the same size
        array = array[:, :201, :201]

        # Subtract the channel mean and square the difference
        squared_dev = np.power(array - channel_means, 2)
        sum_arr = sum_arr + squared_dev.sum(axis=(1, 2), dtype='float64')
        # Count the real crop size, so an image smaller than 201 x 201 does not skew the result
        sum_pixel += array.shape[1] * array.shape[2]

    stds = np.sqrt(sum_arr / sum_pixel)

    return stds


In [11]:
# Batch generator: shuffles the images, clips them to 10.000, crops them to
# 201x201, normalises them and looks up the label of each image
def generator(x_dir, labels, batch_size, means, stds, repeat=False):
    """Yield ``(batch_x, batch_y)`` batches built from the images in ``x_dir``.

    Each image is clipped to the range 0..10000, cropped to its first
    201 x 201 pixels and normalised per channel with ``(x - means) / stds``.
    The label of an image is the ``label`` value of the row of ``labels``
    whose ``ID`` (first ``'HR'`` replaced by ``'GE'``) and
    ``'000' + cluster + '.tif'`` both occur in the file name. If several rows
    match, the last one wins; if none matches, the label stays 0.

    Images that do not fill a complete batch at the end of a pass are dropped.

    Parameters
    ----------
    x_dir : str
        Directory that contains only ``.tif`` files (checked with an assert).
        Each image must have 3 bands and at least 201 x 201 pixels.
    labels : pandas.DataFrame
        Needs the columns ``ID``, ``cluster`` and ``label``. Rows are read
        positionally, so any index is accepted.
    batch_size : int
        Number of images per batch.
    means, stds : array-like
        Per-channel normalisation constants.
    repeat : bool, optional
        ``False`` (default): stop after one pass over the images.
        ``True``: reshuffle and start over after every pass, i.e. never stop.

    Yields
    ------
    tuple of numpy.ndarray
        ``batch_x`` with shape ``(batch_size, 201, 201, 3)`` and ``batch_y``
        with shape ``(batch_size,)``.

    Raises
    ------
    ValueError
        If ``repeat`` is true but there are fewer images than ``batch_size``
        (the generator could never yield anything).
    """
    x_list = os.listdir(x_dir)
    assert all([i.endswith('.tif') for i in x_list])
    if repeat and len(x_list) < batch_size:
        raise ValueError('fewer images than batch_size in ' + str(x_dir))

    # Shape (channels, 1, 1) so the constants broadcast over height and width
    channel_means = np.asarray(means, dtype='float64').reshape(-1, 1, 1)
    channel_stds = np.asarray(stds, dtype='float64').reshape(-1, 1, 1)

    # Build the search strings once instead of once per image.
    # The cluster number alone is not enough, as e.g. 1 is also part of 100, 101, 110, ...
    label_rows = [
        (str(survey_id).replace('HR', 'GE', 1), '000' + str(cluster) + '.tif', label)
        for survey_id, cluster, label in zip(labels['ID'], labels['cluster'], labels['label'])
    ]

    while True:
        # Shuffle so that a batch consists of images of different surveys
        random.shuffle(x_list)

        batch_x = np.zeros(shape=(batch_size, 3, 201, 201))
        batch_y = np.zeros(shape=(batch_size))
        batch_ele = 0

        for x in x_list:
            # Get sample x; close the dataset right after reading
            with rasterio.open(os.path.join(x_dir, x)) as img:
                array = img.read()
            array = array.astype('float32')
            # Clip to max 10.000
            array = np.clip(array, a_min=0, a_max=10000)
            # Ensure that all arrays have the same size
            array = array[:, :201, :201]
            # Normalise and add to batch
            batch_x[batch_ele] = (array - channel_means) / channel_stds

            # Get the corresponding label y
            for survey_name, cluster_string, label in label_rows:
                if survey_name in x and cluster_string in x:
                    batch_y[batch_ele] = label

            batch_ele += 1

            # Batch is full: hand it out channels-last and start the next one
            if batch_ele == batch_size:
                yield batch_x.transpose(0, 2, 3, 1), batch_y
                batch_ele = 0
                batch_x = np.zeros(shape=(batch_size, 3, 201, 201))
                batch_y = np.zeros(shape=(batch_size))

        if not repeat:
            return

In [15]:

# PARAMETERS
# NOTE(review): these are absolute paths on one machine; adapt them before running elsewhere.
base = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel/'
orig_dir = os.path.join(base, 'AOGE71FL')
water_source_file = "/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/SAV_Data/water-source/joined-surveys-2013-grouped.csv"
batch_size = 5

# PIPELINE
# 1. Move the urban tiles of the survey into <base>/urban
urban_dir = get_urban(base, orig_dir) # has to change if all surveys are used!
print('Moved urbane images to seperate folder')

# 2. Build the label table from the water-source CSV
labels_df = get_labels(water_source_file)
print('Add to label names categorical labels')

# 3. Delete the images whose label is not one of the three main groups
keep_only_main_group_labels(labels_df, urban_dir)
print("Removed every images which is not part of one of the main groups 'piped, groundwater, or bottled water'")

# 4. Move the remaining images into training / validation / test folders
train_dir, val_dir, test_dir = created_data_sets(urban_dir)
print('Split up data set into training, validation and test data')

# 5. Normalisation constants, computed on the training set only
means = calc_mean(train_dir)
stds = calc_std(means, train_dir)
print('Calculated mean and standard deviation for each channel (for training set)')

# 6. Check if the batch shapes are correct. A throwaway generator is used for
#    this, so the check does not consume the first batch of the generator that
#    is used for training below.
for data_batch, labels_batch in generator(train_dir, labels_df, batch_size, means, stds):
    print('This is the shape of the training data batch:', data_batch.shape)
    print('This is the shape of the training label batch:', labels_batch.shape)
    break

# 7. Fresh generators for training and validation
training_generator = generator(train_dir, labels_df, batch_size, means, stds)
validation_generator = generator(val_dir, labels_df, batch_size, means, stds)

Moved urbane images to seperate folder
Add to label names categorical labels
Removed AOGE71FL00000120.tif as it has the label surface water
Removed AOGE71FL00000115.tif as it has the label external source
Removed AOGE71FL00000113.tif as it has the label external source
Removed AOGE71FL00000112.tif as it has the label external source
Removed AOGE71FL00000080.tif as it has the label external source
Removed AOGE71FL00000073.tif as it has the label external source
Removed AOGE71FL00000070.tif as it has the label external source
Removed AOGE71FL00000064.tif as it has the label surface water
Removed AOGE71FL00000060.tif as it has the label external source
Removed AOGE71FL00000054.tif as it has the label external source
Removed AOGE71FL00000049.tif as it has the label external source
Removed AOGE71FL00000047.tif as it has the label external source
Removed AOGE71FL00000046.tif as it has the label surface water
Removed AOGE71FL00000044.tif as it has the label external source
Removed AOGE71FL000

In [None]:
# Minimal CNN: one 3x3 convolution (32 filters) on 201x201x3 inputs, flattened
# into a 512-unit dense layer, followed by a single output unit.
# NOTE(review): a softmax over ONE unit always evaluates to 1.0, so this output
# layer cannot separate classes. It presumably needs one unit per water-source
# class together with a matching loss and label encoding - confirm before training.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(201, 201, 3)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation='softmax'),
])


2021-07-15 17:38:58.490756: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-07-15 17:38:58.542672: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2595291136 exceeds 10% of free system memory.
2021-07-15 17:39:00.114084: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2595291136 exceeds 10% of free system memory.


In [None]:
# Show the layers of the model with their output shapes and parameter counts
model.summary()

In [None]:
def _repeat_batches(make_batches):
    """Yield batches forever by building a new generator after every pass.

    ``generator`` stops after one pass over its folder, but ``model.fit``
    draws ``epochs * steps_per_epoch`` training batches and
    ``validation_steps`` validation batches per epoch, which is more than one
    pass provides.

    Raises ``ValueError`` if a pass yields no batch at all (fewer images than
    ``batch_size``), instead of looping forever.
    """
    while True:
        produced = False
        for batch in make_batches():
            produced = True
            yield batch
        if not produced:
            raise ValueError('generator produced no complete batch; check the folder and batch_size')


# NOTE(review): 'categorical_crossentropy' expects one-hot targets with one
# column per class, while the generator yields one integer label per image -
# confirm the loss / label encoding together with the model's output layer.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(
    _repeat_batches(lambda: generator(train_dir, labels_df, batch_size, means, stds)),
    steps_per_epoch=5,
    epochs=10,
    validation_data=_repeat_batches(lambda: generator(val_dir, labels_df, batch_size, means, stds)),
    validation_steps=25)