# CNN for Urban Region

In [2]:
import os
from zipfile import ZipFile
import pandas as pd
import numpy as np
import rasterio
import random
import tensorflow
from sklearn.model_selection import train_test_split
import shutil

from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import models
#TODO:
# - generalize hard-coded values such as the pixel counts
# - create a preprocessing script and move the helper methods into it
# - write different methods for normalization
# - write some quality-control methods, e.g. probability density functions (PDF) and
#   outlier detection, and integrate them into the normalization methods
# - more to come

In [3]:
#Move Urban images to own directory

def get_urban(urban_dir, orig_dir):
    """Move the small GeoTIFFs of one survey folder into ``urban_dir``.

    Every ``.tif`` file in ``orig_dir`` whose raster has fewer than 500 rows is
    treated as an urban image and moved. While moving, the first six characters
    of the file name are replaced by the survey name (the base name of
    ``orig_dir``), so images of different surveys do not collide.

    Args:
        urban_dir: Existing directory that receives the selected images.
        orig_dir: Directory holding the images of a single survey.

    Returns:
        None. The selected files are moved on disk.
    """
    # normpath makes the survey name robust against a trailing path separator
    survey_name = os.path.basename(os.path.normpath(orig_dir))

    for img_name in os.listdir(orig_dir):
        if not img_name.endswith('.tif'):
            continue

        img_path = os.path.join(orig_dir, img_name)
        # Only the raster height is needed, so it is taken from the dataset
        # metadata instead of reading all bands. The context manager closes the
        # file before it is renamed.
        with rasterio.open(img_path) as img:
            is_urban = img.height < 500

        if is_urban:
            # Swap only the leading six characters for the survey name;
            # str.replace would also rewrite later occurrences of that prefix.
            urban_name = survey_name + img_name[6:]
            os.rename(img_path, os.path.join(urban_dir, urban_name))
                

In [4]:
base = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel'
def get_urban_img(base):
    """Extract every survey archive in ``base`` and collect its urban images.

    For each ``.zip`` file directly inside ``base`` the archive is extracted
    into ``base``, the folder named like the archive (without extension) is
    handed to ``get_urban`` and is deleted afterwards.

    Args:
        base: Directory that contains the survey ``.zip`` archives.

    Returns:
        Path of the ``urban`` sub-directory of ``base`` (created if missing).
    """
    urban_dir = os.path.join(base, 'urban')
    if not os.path.exists(urban_dir):
        os.mkdir(urban_dir)

    archive_names = [entry for entry in os.listdir(base) if entry.endswith('.zip')]
    for archive_name in archive_names:
        # The archive is expected to unpack into a folder named after itself
        extracted_dir = os.path.join(base, os.path.splitext(archive_name)[0])
        with ZipFile(os.path.join(base, archive_name), 'r') as archive:
            archive.extractall(base)
            get_urban(urban_dir, extracted_dir)
            # Remove the extracted folder once its urban images were moved out
            shutil.rmtree(extracted_dir)

    return urban_dir
    


In [5]:
def get_main_source_file(water_file):
    """Reduce the water-source table to the dominant source of each row.

    All columns except ``ID``, ``cluster``, ``residence`` and ``year`` are
    treated as water-source columns. Missing values count as 0 and the name of
    the column holding the row maximum becomes the row's ``source``.

    Args:
        water_file: DataFrame with the four metadata columns named above plus
            one numeric column per water source.

    Returns:
        DataFrame with the columns ``ID``, ``cluster`` and ``source``.
    """
    meta_columns = ['ID', 'cluster', 'residence', 'year']

    source_values = water_file.drop(labels=meta_columns, axis=1).fillna(0)
    # idxmax returns the column label of the largest value in each row
    main_source = source_values.idxmax(axis=1).rename('source')

    return pd.concat([water_file['ID'], water_file['cluster'], main_source], axis=1)

def get_labels_df_for_img(csv_file, urban_dir, main_labels):
    """Build the label table for the images in ``urban_dir``.

    An image belongs to a CSV row when its file name contains the row's survey
    ID (with the first ``HR`` replaced by ``GE``) and the zero-padded cluster
    number followed by ``.tif``. Images whose dominant water source is one of
    ``main_labels`` are added to the table; images with any other dominant
    source are DELETED from ``urban_dir``.

    NOTE(review): images that match no CSV row are neither labelled nor
    deleted, so they stay in ``urban_dir`` without a label.

    Args:
        csv_file: Path of the water-source CSV (read with ``pd.read_csv``).
        urban_dir: Directory with the urban images.
        main_labels: Collection of source names that are kept.

    Returns:
        DataFrame with the columns ``name`` (image file name), ``source``
        (water-source name) and ``label`` (categorical integer code starting
        at 1, numbered in order of first appearance). The frame is empty, but
        still has the three columns, when no image receives a label.
    """
    column_names = ["name", "source", "label"]

    img_list = os.listdir(urban_dir)
    water_file_max = get_main_source_file(pd.read_csv(csv_file))

    # Iterate the row values directly; this does not depend on the frame
    # having a default 0..n-1 index the way a positional .loc lookup would.
    rows = list(zip(water_file_max['ID'],
                    water_file_max['cluster'],
                    water_file_max['source']))

    label_list = []
    for img in img_list:
        for survey_id, cluster, source in rows:
            if survey_id.replace('HR', 'GE', 1) not in img:
                continue
            # The cluster number alone is not enough, because e.g. 1 is also
            # part of 100, 101, 110, ...; hence the zero padding and suffix.
            if '000' + str(int(cluster)) + '.tif' not in img:
                continue

            if source in main_labels:
                label_list.append([img, source, source])
            else:
                img_path = os.path.join(urban_dir, img)
                # A second matching row must not fail on an already removed file
                if os.path.exists(img_path):
                    os.remove(img_path)

    # Building the frame from the list keeps the three columns when the list is empty
    label_df = pd.DataFrame(label_list, columns=column_names)

    label_df.label = pd.Categorical(pd.factorize(label_df.label)[0] + 1)

    print(label_df)

    return label_df

In [6]:
def created_data_sets(urban_dir, random_state=None):
    """Split the images in ``urban_dir`` into training, validation and test sets.

    The files are divided 80% / 10% / 10% and moved into the sub-directories
    ``training``, ``validation`` and ``test`` of ``urban_dir`` (created when
    missing).

    Args:
        urban_dir: Directory that contains the image files to split.
        random_state: Optional seed passed to ``train_test_split`` so the
            split can be reproduced. ``None`` (default) keeps it random.

    Returns:
        Tuple ``(train_dir, val_dir, test_dir)`` with the three directory paths.
    """
    # Only regular files are split. On a re-run the split folders already
    # exist inside urban_dir and must not be treated as images themselves.
    # Sorting makes the result depend on random_state only, not on the
    # order in which the file system lists the entries.
    img_list = sorted(
        entry for entry in os.listdir(urban_dir)
        if os.path.isfile(os.path.join(urban_dir, entry))
    )

    # Create validation, training and test folder
    train_dir = os.path.join(urban_dir, 'training')
    val_dir = os.path.join(urban_dir, 'validation')
    test_dir = os.path.join(urban_dir, 'test')
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(split_dir, exist_ok=True)

    # 80% training; the remaining 20% are halved into validation and test
    X_train, X_rem = train_test_split(img_list, train_size=0.8, random_state=random_state)
    X_val, X_test = train_test_split(X_rem, train_size=0.5, random_state=random_state)

    # Move every image into the folder of its data set
    for split_dir, members in ((train_dir, X_train), (val_dir, X_val), (test_dir, X_test)):
        for img in members:
            os.rename(os.path.join(urban_dir, img), os.path.join(split_dir, img))

    return train_dir, val_dir, test_dir

In [7]:
#Alternative way to calculate mean and std: all images are held in memory at once, so use it only if the dataset is not too big
def calc_mean_std(data_dir, n_bands=13, size=201):
    """Compute per-channel mean and standard deviation with all images in memory.

    Every image is preprocessed the same way as in ``calc_mean`` and
    ``calc_std``: cast to float32, NaN replaced by 0, values clipped to
    [0, 10000] and cropped to the top-left ``size`` x ``size`` window.

    Args:
        data_dir: Directory that contains only ``.tif`` images.
        n_bands: Number of channels per image (default 13).
        size: Edge length in pixels every image is cropped to (default 201).
            Images smaller than this cannot be stored in the common array
            and make numpy raise a ``ValueError``.

    Returns:
        Tuple ``(means, stds)`` of float64 arrays with one entry per channel.

    Raises:
        ValueError: If ``data_dir`` contains no images.
    """
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    if not img_list:
        raise ValueError('No images found in %r' % (data_dir,))

    # float32 holds the clipped values exactly and halves the memory of the
    # "array of all images"; the reductions below still accumulate in float64.
    pixels = np.empty(shape=(len(img_list), n_bands, size, size), dtype='float32')

    for i, img_name in enumerate(img_list):
        img_path = os.path.join(data_dir, img_name)
        # Context manager closes the dataset as soon as the data is read
        with rasterio.open(img_path) as img:
            array = img.read()
        array = array.astype('float32')
        # Same NaN handling as calc_mean / calc_std, so the results agree
        array[np.isnan(array)] = 0
        # Clip to max 10.000
        array = np.clip(array, a_min=0, a_max=10000)
        # Ensure that all arrays have the same size
        pixels[i] = array[:, :size, :size]

    # Reduce over images (axis 0) and height/width (axes 2, 3) per channel (axis 1)
    means = pixels.mean(axis=(0, 2, 3), dtype='float64')
    stds = pixels.std(axis=(0, 2, 3), dtype='float64')

    return means, stds

In [9]:
#Calculate the mean of each channel over all pixels of the training set. The validation and test sets must be
#normalized with the mean and std of the training set as well, because in a real-world scenario their own statistics are not known beforehand.
def calc_mean(data_dir, size=201):
    """Compute the per-channel mean over all images, one image at a time.

    Every image is cast to float32, NaN values are replaced by 0, values are
    clipped to [0, 10000] and the image is cropped to the top-left
    ``size`` x ``size`` window before it is added to the running sum.

    Args:
        data_dir: Directory that contains only ``.tif`` images.
        size: Maximum height and width in pixels used per image (default 201).

    Returns:
        float64 array with one mean value per channel.

    Raises:
        ValueError: If ``data_dir`` contains no images.
    """
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    if not img_list:
        raise ValueError('No images found in %r' % (data_dir,))

    # Running per-channel sum of the pixel values
    sum_arr = 0
    # Number of pixels per channel that actually went into the sum. Counting
    # them keeps the mean correct for images smaller than the crop window.
    sum_pixel = 0

    for img_name in img_list:
        img_path = os.path.join(data_dir, img_name)
        with rasterio.open(img_path) as img:
            array = img.read()
        array = array.astype('float32')
        array[np.isnan(array)] = 0
        # Clip to max 10.000
        array = np.clip(array, a_min=0, a_max=10000)
        # Ensure that all arrays have (at most) the same size
        array = array[:, :size, :size]
        # Accumulate in float64 to avoid float32 rounding on large sums
        sum_arr = sum_arr + array.sum(axis=(1, 2), dtype='float64')
        sum_pixel += array.shape[1] * array.shape[2]

    # Calculate mean
    means = sum_arr / sum_pixel

    return means

In [10]:
#Calculate standard deviation (note:mean function has to be executed beforehand as it is required as input)
def calc_std(means, data_dir, size=201):
    """Compute the per-channel standard deviation, one image at a time.

    Uses the same preprocessing as ``calc_mean`` (float32 cast, NaN -> 0,
    clip to [0, 10000], crop to ``size`` x ``size``) and sums the squared
    deviations from ``means``.

    Args:
        means: Per-channel means, e.g. the result of ``calc_mean`` for the
            same directory (one value per channel).
        data_dir: Directory that contains only ``.tif`` images.
        size: Maximum height and width in pixels used per image (default 201).

    Returns:
        float64 array with one (population) standard deviation per channel.

    Raises:
        ValueError: If ``data_dir`` contains no images.
    """
    img_list = os.listdir(data_dir)
    assert all([i.endswith('.tif') for i in img_list])
    if not img_list:
        raise ValueError('No images found in %r' % (data_dir,))

    # Shape (channels, 1, 1) so the means broadcast over height and width
    channel_means = np.asarray(means, dtype='float64').reshape(-1, 1, 1)

    # Running per-channel sum of squared deviations
    sum_arr = 0
    # Number of pixels per channel that actually went into the sum
    sum_pixel = 0

    for img_name in img_list:
        img_path = os.path.join(data_dir, img_name)
        with rasterio.open(img_path) as img:
            array = img.read()
        array = array.astype('float32')
        array[np.isnan(array)] = 0
        # Clip to max 10.000
        array = np.clip(array, a_min=0, a_max=10000)
        # Ensure that all arrays have (at most) the same size
        array = array[:, :size, :size]

        # Subtract the channel mean and square; the float64 means promote the
        # result to float64, which avoids float32 rounding on large sums
        squared_dev = np.power(array - channel_means, 2)
        sum_arr = sum_arr + squared_dev.sum(axis=(1, 2), dtype='float64')
        sum_pixel += array.shape[1] * array.shape[2]

    stds = np.sqrt(sum_arr / sum_pixel)

    return stds

In [11]:
#Clip to 10.000
#201x201 size
#shuffle randomly order
#normalize 
def generator(x_dir, labels, batch_size, means, stds,
              num_classes=3, img_shape=(13, 201, 201), repeat=False):
    """Yield normalized image batches with one-hot encoded labels.

    The images in ``x_dir`` are visited in random order. Each image is cast to
    float32, NaN values are set to 0, values are clipped to [0, 10000], the
    image is cropped to the height/width of ``img_shape`` and every channel is
    normalized with ``(value - mean) / std``.

    Images whose file name has no entry in ``labels['name']`` are skipped, so
    no sample is emitted with an all-zero target vector. A trailing partial
    batch at the end of a pass is dropped.

    Args:
        x_dir: Directory that contains only ``.tif`` images.
        labels: DataFrame with a ``name`` column (image file name) and a
            ``label`` column (integer class code starting at 1).
        batch_size: Number of images per yielded batch.
        means: Per-channel means used for normalization.
        stds: Per-channel standard deviations used for normalization.
        num_classes: Length of the one-hot label vector (default 3).
        img_shape: ``(channels, height, width)`` of one image (default
            ``(13, 201, 201)``).
        repeat: If True, reshuffle and start over after each pass so the
            generator never runs out; the default False makes a single pass.

    Yields:
        Tuple ``(batch_x, batch_y)``: ``batch_x`` has the shape
        ``(batch_size, height, width, channels)`` and ``batch_y`` is an
        integer array of shape ``(batch_size, num_classes)``.

    Raises:
        ValueError: With ``repeat=True`` when a full pass cannot fill a single
            batch (otherwise the generator would loop forever).
    """
    x_list = os.listdir(x_dir)
    assert all([i.endswith('.tif') for i in x_list])

    # One dictionary lookup per image instead of scanning the label table
    label_by_name = dict(zip(labels['name'], labels['label']))

    # Shape (channels, 1, 1) so the statistics broadcast over height and width
    channel_means = np.asarray(means, dtype='float64').reshape(-1, 1, 1)
    channel_stds = np.asarray(stds, dtype='float64').reshape(-1, 1, 1)

    batch_shape = (batch_size,) + tuple(img_shape)
    _, height, width = img_shape

    while True:
        # Shuffle so that batches consist of images of different surveys
        random.shuffle(x_list)

        batch_x = np.zeros(shape=batch_shape)
        batch_y = np.zeros(shape=(batch_size, num_classes), dtype=int)
        batch_ele = 0
        batches_in_pass = 0

        for x in x_list:
            label = label_by_name.get(x)
            if label is None:
                # No label known for this image: skip it before reading the file
                continue

            with rasterio.open(os.path.join(x_dir, x)) as img:
                array = img.read().astype("float32")

            array[np.isnan(array)] = 0
            # Clip to max 10.000
            array = np.clip(array, a_min=0, a_max=10000)
            # Ensure that all arrays have the same size
            array = array[:, :height, :width]

            # Normalize the array; a zero std on a value equal to the mean gives NaN
            array = (array - channel_means) / channel_stds
            assert not np.any(np.isnan(array)), "Normalize"

            batch_x[batch_ele] = array
            # One hot encoding (labels start at 1)
            batch_y[batch_ele, int(label) - 1] = 1
            batch_ele += 1

            if batch_ele == batch_size:
                # Move the channel axis last: (batch, height, width, channels)
                yield batch_x.transpose(0, 2, 3, 1), batch_y
                batches_in_pass += 1
                # Fresh arrays so a batch the consumer still holds is not overwritten
                batch_x = np.zeros(shape=batch_shape)
                batch_y = np.zeros(shape=(batch_size, num_classes), dtype=int)
                batch_ele = 0

        if not repeat:
            return
        if batches_in_pass == 0:
            raise ValueError(
                'Not enough labelled images in %r to fill a batch of %d'
                % (x_dir, batch_size))
    
    
#    return batch_x, batch_y

In [12]:

#PARAMETERS
# NOTE(review): absolute, machine-specific paths; adjust them (or move them to a
# configurable data directory) before running the notebook elsewhere.
# Folder that holds the survey .zip archives; re-assigns the `base` defined
# next to get_urban_img, here with a trailing slash.
base = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel/'
# NOTE(review): orig_dir is assigned here but not used by any visible cell.
orig_dir = os.path.join(base, 'AOGE71FL')
# CSV with the water-source values per survey ID and cluster
water_source_file = "/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/SAV_Data/water-source/joined-surveys-2013-grouped.csv"
# Number of images per batch produced by generator()
batch_size = 5
# 3 Main label categories (which are kept); images with any other main
# source are deleted by get_labels_df_for_img
main_labels = ['piped', 'groundwater', 'bottled water']


#Functions
# Step 1: extract every archive in `base` and move its urban images to <base>/urban
urban_dir =  get_urban_img(base)# has to change if all surveys are used!
print('Moved urbane images to seperate folder')

# Step 2: build the name/source/label table (deletes images outside main_labels)
labels_df = get_labels_df_for_img(water_source_file, urban_dir, main_labels)
#labels_df = get_labels(water_source_file, urban_dir)
print('Add to label names categorical labels')

# Step 3: move the images into training/validation/test sub-folders (80/10/10)
train_dir, val_dir, test_dir = created_data_sets(urban_dir)
print('Split up data set into training, validation and test data')

# Step 4: per-channel statistics of the training set only; they are reused
# to normalize the validation data as well
means = calc_mean(train_dir)
stds = calc_std(means, train_dir)
print('Calculated mean and standard deviation for each channel (for training set)')





Moved urbane images to seperate folder
                     name       source label
0    AOGE71FL00000124.tif        piped     1
1    AOGE71FL00000121.tif  groundwater     2
2    AOGE71FL00000101.tif        piped     1
3    AOGE71FL00000100.tif        piped     1
4    AOGE71FL00000094.tif        piped     1
..                    ...          ...   ...
356  BFGE71FL00000164.tif        piped     1
357  BFGE71FL00000163.tif        piped     1
358  BFGE71FL00000162.tif        piped     1
359  BFGE71FL00000149.tif  groundwater     2
360  BFGE71FL00000144.tif        piped     1

[361 rows x 3 columns]
Add to label names categorical labels
Split up data set into training, validation and test data
Calculated mean and standard deviation for each channel (for training set)


In [13]:
# Batch generator over the training images, normalized with the training statistics.
# NOTE(review): generator() makes a single pass over the directory by default and
# is then exhausted, so this object can only be consumed once.
training_generator = generator(train_dir, labels_df, batch_size, means, stds)
#Check if shape is correct
print('Created x and y for training data')
# The string below is a disabled shape check. If it is re-enabled, it takes one
# batch out of training_generator before training starts.
'''for data_batch, labels_batch in training_generator:
    print('This is the shape of the training data batch:', data_batch.shape)
    print('This is the shape of the training label batch:', labels_batch.shape)
    break'''

# Validation batches are normalized with the mean/std of the TRAINING set on purpose
validation_generator = generator(val_dir, labels_df, batch_size, means, stds)

Created x and y for training data


In [14]:
# Small CNN: four Conv2D + MaxPooling2D stages followed by a dense classifier.
# Input is one image of 201 x 201 pixels with 13 channels (channels last);
# output is a softmax over the 3 water-source classes.
model = models.Sequential([
    layers.Conv2D(16, (3, 3), activation='relu', input_shape=(201, 201, 13)),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Classifier head
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(3, activation='softmax'),
])


2021-07-22 14:25:04.906470: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 199, 199, 16)      1888      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 99, 99, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 97, 97, 32)        4640      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 48, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 46, 46, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 23, 23, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 21, 21, 64)        3

In [16]:
# Categorical cross-entropy matches the one-hot label vectors produced by generator()
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# NOTE(review): validation_generator is created in an earlier cell but is not
# passed to fit(), so no validation metrics are computed during training.
# NOTE(review): 5 steps x 10 epochs draws 50 batches from training_generator,
# which makes only one pass over the training folder. Confirm that the folder
# holds at least 50 * batch_size labelled images, otherwise the input runs out.
history = model.fit(
            training_generator,
            steps_per_epoch=5,
            epochs=10)

2021-07-22 14:25:05.693018: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-07-22 14:25:05.705508: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2096155000 Hz


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
#RESNET Model