# Machine Learning Model Prep
**Model to provide a multiclass prediction of the LULC (land use / land cover) types contained within a patch. 
Prepares training and testing patches to fine-tune an EO (Earth observation) model. Intended for an image of 4 bands × 2,000 pixels × 2,000 pixels that contains plentiful rivers (label = 1) and roads (label = 2), among other LULC labels. There is a corresponding image of pixelwise labels with a single band and 2,000 pixels × 2,000 pixels. The image is cut into 4-band × 64 × 64 pixel patches.**
#### 1. Reads the image and labels, the filenames of which are provided to the function as input
#### 2. Creates an appropriate sampling grid of the image data
#### 3. Creates multiclass labels as the count of each label in a grid cell
#### 4. Ignores grid cells that do not contain one of the two target classes
#### 5. Splits the cells and their labels into training (80%) and test (20%) sets
#### 6. Applies bandwise zero-centering and scaling using constants: the band mean and standard deviation
#### 7. Returns the training and test sets and their respective labels.

In [None]:
# Libraries
import os
import numpy as np
import pandas as pd
import rasterio as rio
import tifffile as tiff
from patchify import patchify

In [None]:
# Remote Sensing test and training split function
def training_test_split_labels(image_path, label_path):
    '''Cut an image and its label raster into 64x64 patches and save them.

    Both rasters are read with rasterio and every band is tiled into
    non-overlapping 64x64 patches. For each label band a CSV listing the
    classes present in every patch (and their pixel counts) is written to
    ``Multilabels/``. Only patches whose label patch contains class 1 or
    class 2 are saved as TIFF files: the first ``round(0.8 * 64)`` rows of
    the patch go to ``Training/`` and the remaining rows go to ``Test/``.
    Image patches are zero-centred and scaled with their own mean and
    standard deviation before being split and saved.

    All output folders are created under the current working directory.

    image_path: path of the multiband image
    label_path: path of the label image

    Returns None; the results are the files written to disk.

    Raises ValueError if the label raster has no bands, or if an image band
    does not tile into the same patch grid as the labels.

    NOTE(review): the 80/20 split slices each patch by rows rather than
    assigning whole patches to a training or a test set, and normalisation
    uses per-patch (not band-wide) statistics. Both are kept as the original
    code had them; confirm that this is the intended behaviour.
    NOTE(review): when the label raster has several bands, the patches kept
    for the image are selected from the last label band, as before.
    '''
    patch_size = 64
    training_split = 0.8
    target_classes = (1, 2)  # River = 1, Road = 2

    # Row at which every patch is cut into its training / test part
    split_row = round(training_split * patch_size)

    # Set up folder environments for returned results
    cwd = os.getcwd()
    trainpath = os.path.join(cwd, 'Training')
    testpath = os.path.join(cwd, 'Test')
    multilabelpath = os.path.join(cwd, 'Multilabels')
    for folder in (trainpath, testpath, multilabelpath):
        os.makedirs(folder, exist_ok=True)

    # Rasterio open
    def open_raster(path):
        with rio.open(path, 'r') as ds:
            return ds.read()

    def split_and_save(patch, kind, band, tag):
        # Write the upper rows to Training/ and the lower rows to Test/
        tiff.imwrite(
            os.path.join(trainpath, f'training_{kind}_{band}_{tag}.tif'),
            patch[:split_row, :],
        )
        tiff.imwrite(
            os.path.join(testpath, f'test_{kind}_{band}_{tag}.tif'),
            patch[split_row:, :],
        )

    # Load images
    raster_img = open_raster(image_path)
    ground_labels = open_raster(label_path)

    # Boolean grid marking the patches that contain a target class
    keep_patch = None

    # Patch label image
    for lbl in range(ground_labels.shape[0]):
        patches_label = patchify(
            ground_labels[lbl], (patch_size, patch_size), step=patch_size
        )
        n_rows, n_cols = patches_label.shape[:2]
        keep_patch = np.zeros((n_rows, n_cols), dtype=bool)

        # Multiclass dictionary to store counts and label index
        multiclass_dict = {"label_index": [], "label_class": [], "count": []}

        for i in range(n_rows):
            for j in range(n_cols):
                single_patch_label = patches_label[i, j, :, :]

                # The row and column indices are separated by '_' so that
                # e.g. (1, 10) and (11, 0) no longer map to the same name.
                tag = str(i) + '_' + str(j)

                # Create multiclass labels
                classes, counts = np.unique(
                    single_patch_label, return_counts=True
                )
                multiclass_dict["label_index"].append(str(lbl) + '_' + tag)
                multiclass_dict["label_class"].append(classes)
                multiclass_dict["count"].append(counts)

                # Save the patch only if a river or a road is present
                if np.isin(target_classes, classes).any():
                    keep_patch[i, j] = True
                    split_and_save(single_patch_label, 'label', lbl, tag)

        # Save multiclass label dictionary as a CSV
        multiclass_df = pd.DataFrame.from_dict(multiclass_dict)
        multiclass_df.to_csv(
            os.path.join(multilabelpath, 'multiclass_labels_' + str(lbl) + '.csv')
        )

    if keep_patch is None:
        raise ValueError('label raster contains no bands: ' + str(label_path))

    # Patch raster image
    for img in range(raster_img.shape[0]):
        patches_img = patchify(
            raster_img[img], (patch_size, patch_size), step=patch_size
        )

        # The label-derived selection is only valid on an identical grid
        if patches_img.shape[:2] != keep_patch.shape:
            raise ValueError(
                'image and label rasters produce different patch grids: '
                + str(patches_img.shape[:2]) + ' vs ' + str(keep_patch.shape)
            )

        for i in range(patches_img.shape[0]):
            for j in range(patches_img.shape[1]):
                if not keep_patch[i, j]:
                    continue

                single_patch_img = patches_img[i, j, :, :]

                # Normalize with the patch's own statistics; a constant
                # patch has std 0, so only centre it instead of producing
                # NaN from a 0 / 0 division.
                patch_img_mean = np.mean(single_patch_img)
                patch_img_std = np.std(single_patch_img)
                if patch_img_std == 0:
                    patch_img_std = 1.0
                normalized_patch_img = (
                    single_patch_img - patch_img_mean
                ) / patch_img_std

                split_and_save(
                    normalized_patch_img, 'image', img, str(i) + '_' + str(j)
                )