In [0]:
!pip install -q rasterio rasterstats geopandas 

[K     |████████████████████████████████| 15.1MB 15.0MB/s 
[K     |████████████████████████████████| 921kB 22.8MB/s 
[K     |████████████████████████████████| 11.9MB 17.7MB/s 
[K     |████████████████████████████████| 81kB 21.3MB/s 
[K     |████████████████████████████████| 10.1MB 18.6MB/s 
[?25h  Building wheel for simplejson (setup.py) ... [?25l[?25hdone
  Building wheel for munch (setup.py) ... [?25l[?25hdone


In [0]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import svm

import rasterio
import rasterstats
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from os import path as op
import pickle

In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
root_dir = "/content/drive/My Drive"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


# Random Forest Model for Crop Type and Land Classification

Using data created by SERVIR East Africa, RCMRD, and FEWSET, we demonstrate how to train a Random Forest classifier over Trans Nzoia county, Kenya.

In [0]:
# read in training data
training_vectors = gpd.read_file(op.join(root_dir, 'servir-day5-gee-colab/training_combined.geojson'))
training_vectors.head()

Unnamed: 0,name,descripti,altitude,RTKStation,BUFF_DIST,ORIG_FID,OBJECTID_1,OBJECTID,Id,gridcode,Class,Shape_Leng,Shape_Area,geometry
0,Sugarcane,,-1.0,1,20.0,0.0,,,,,,,,"MULTIPOLYGON (((34.83212 1.17829, 34.83215 1.1..."
1,Forestland,,-1.0,1,20.0,1.0,,,,,,,,"MULTIPOLYGON (((35.30961 1.01328, 35.30964 1.0..."
2,Maize,early reproductive,-1.0,1,20.0,2.0,,,,,,,,"MULTIPOLYGON (((34.90904 1.09515, 34.90907 1.0..."
3,Sugarcane,no change..maize farm on the right and far lef...,-1.0,1,20.0,3.0,,,,,,,,"MULTIPOLYGON (((34.90750 1.08934, 34.90753 1.0..."
4,Maize,reproductive good crop,1763.4144,1,20.0,4.0,,,,,,,,"MULTIPOLYGON (((34.87144 0.82953, 34.87147 0.8..."


In [0]:
# find all unique values of training data names to use as classes
classes = np.unique(training_vectors.name)
# classes = np.array(sorted(training_vectors.name.unique()))
classes

array(['Built', 'Cloud', 'Fallow', 'Forestland', 'Grassland', 'Maize',
       'Shadow', 'Sugarcane', 'Sunflower', 'Waterbody'], dtype=object)

In [0]:
# create a dictionary to convert class names into integers for modeling
class_dict = dict(zip(classes, range(len(classes))))
class_dict                  

{'Built': 0,
 'Cloud': 1,
 'Fallow': 2,
 'Forestland': 3,
 'Grassland': 4,
 'Maize': 5,
 'Shadow': 6,
 'Sugarcane': 7,
 'Sunflower': 8,
 'Waterbody': 9}

In [0]:
# this larger cell reads data from a raster file for each training vector

# raster information
raster_file = op.join(root_dir, 'servir-day2/Trans_nzoia_2019_05_02.tif')
bands = 6

# a custom function for getting each value from the raster
def all_values(x):
    return x

# set up our training data lists
X_raw = list(range(bands))
y_raw = []

for band in range(bands):
    X_raw[band] = []
    
    # for each band and each training vector geometry, read the corresponding raster data
    stats = rasterstats.zonal_stats(
        training_vectors.geometry,
        raster_file,
        add_stats={'all_values':all_values},
        band=band+1
    )
    
    # for each returned value, store the valid observations and a
    # corresponding number of vector labels
    for idx, stat in enumerate(stats):
        vals = stat['all_values']
        obs = list(vals.data[~vals.mask])
        X_raw[band] += obs
        if band == 0:
            y_raw += [class_dict[training_vectors.name.iloc[idx]] for _ in obs]





In [0]:
# convert the training data lists into the appropriate shape and format for scikit-learn
X = np.array(X_raw).swapaxes(0, 1)
y = np.array(y_raw)
(X.shape, y.shape)

((143737, 6), (143737,))

In [0]:
# (optional) add extra band indices

# helper function for calculating ND*I indices (bands in the final dimension)
def band_index(arr, a, b):
    return np.expand_dims((arr[..., a] - arr[..., b]) / (arr[..., a] + arr[..., b]), axis=1)

ndvi = band_index(X, 3, 2)
ndwi = band_index(X, 1, 3)

X = np.concatenate([X, ndvi, ndwi], axis=1)
X.shape

(143737, 8)

In [0]:
# split the data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
# calculate class weights to allow for training on inbalanced training samples
labels, counts = np.unique(y_train, return_counts=True)
class_weight_dict = dict(zip(labels, 1 / counts))
class_weight_dict

{0: 0.0045662100456621,
 1: 0.006369426751592357,
 2: 0.00211864406779661,
 3: 2.1707984196587506e-05,
 4: 0.0009775171065493646,
 5: 1.7588293232024764e-05,
 6: 0.00048520135856380397,
 7: 0.000126806999746386,
 8: 0.1,
 9: 0.0041841004184100415}

In [0]:
%%time
# initialize a lightgbm


svm = svm.SVC(class_weight = class_weight_dict, gamma='scale', decision_function_shape='ovo')

CPU times: user 47 µs, sys: 4 µs, total: 51 µs
Wall time: 54.4 µs


In [0]:
%%time
# fit the model to the data (training)
svm.fit(X_train, y_train)

CPU times: user 1h 24min 30s, sys: 5.24 s, total: 1h 24min 35s
Wall time: 1h 24min 40s


SVC(C=1.0, cache_size=200,
    class_weight={0: 0.0045662100456621, 1: 0.006369426751592357,
                  2: 0.00211864406779661, 3: 2.1707984196587506e-05,
                  4: 0.0009775171065493646, 5: 1.7588293232024764e-05,
                  6: 0.00048520135856380397, 7: 0.000126806999746386, 8: 0.1,
                  9: 0.0041841004184100415},
    coef0=0.0, decision_function_shape='ovo', degree=3, gamma='scale',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [0]:
# predict on X_test to evaluate the model
preds = svm.predict(X_test)
cm = confusion_matrix(y_test, preds, labels=labels)

In [0]:
model_name = 'svm_model.sav'
pickle.dump(lgbm, open(op.join(root_dir, model_name), 'wb'))


NameError: ignored

In [0]:
# plot the confusion matrix
%matplotlib inline
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       # ... and label them with the respective list entries
       xticklabels=classes, yticklabels=classes,
       title='Normalized Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Loop over data dimensions and create text annotations.
fmt = '.2f'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()

## Generate predictions over the full image

In [0]:
# if want to use the pretrained model for new imagery

# helper function for calculating ND*I indices (bands in the final dimension)
# match the pretrained model weight with the saved model above
model_name = 'light_gbm.sav'

def band_index(arr, a, b):
    return np.expand_dims((arr[..., a] - arr[..., b]) / (arr[..., a] + arr[..., b]), axis=1)
  
lgbm = pickle.load(open(op.join(root_dir, model_name), 'rb'))

In [0]:
lgbm

LGBMClassifier(boosting_type='gbdt',
               class_weight={0: 0.0045662100456621, 1: 0.006369426751592357,
                             2: 0.00211864406779661, 3: 2.1707984196587506e-05,
                             4: 0.0009775171065493646,
                             5: 1.7588293232024764e-05,
                             6: 0.00048520135856380397, 7: 0.000126806999746386,
                             8: 0.1, 9: 0.0041841004184100415},
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, metric='multi_logloss', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_class=10, num_leaves=31, objective='multiclass',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [0]:
# src.close()
# dst.close()

In [0]:
# if want to use the pretrained model for new imagery
# The pretrained model is called "random_forest.sav"
# helper function for calculating ND*I indices (bands in the final dimension)

# open connections to our input and output images

new_image = op.join(root_dir, 'Trans_nzoia_2019_10-04.tif')

output_image = op.join(root_dir, "lgbm_classification.tif")
src = rasterio.open(new_image, 'r')
profile = src.profile
profile.update(
    dtype=rasterio.uint8,
    count=1,
)
dst = rasterio.open(output_image, 'w', **profile)

# perform prediction on each small image patch to minimize required memory
patch_size = 500

for i in range((src.shape[0] // patch_size) + 1):
    for j in range((src.shape[1] // patch_size) + 1):
        # define the pixels to read (and write)
        window = rasterio.windows.Window(
            j * patch_size,
            i * patch_size,
            # don't read past the image bounds
            min(patch_size, src.shape[1] - j * patch_size),
            min(patch_size, src.shape[0] - i * patch_size)
        )
        
        data = src.read(window=window)
        # read the image into the proper format, adding indices if necessary
        img_swp = np.moveaxis(data, 0, 2)
        img_flat = img_swp.reshape(-1, img_swp.shape[-1])

        img_ndvi = band_index(img_flat, 3, 2)
        img_ndwi = band_index(img_flat, 1, 3)

        img_w_ind = np.concatenate([img_flat, img_ndvi, img_ndwi], axis=1)

        # remove no data values, store the indices for later use
        # a later cell makes the assumption that all bands have identical no-data value arrangements
        m = np.ma.masked_invalid(img_w_ind)
        to_predict = img_w_ind[~m.mask].reshape(-1, img_w_ind.shape[-1])
        
        # predict
        if not len(to_predict):
            continue
        img_preds = lgbm.predict(to_predict)
        
        # add the prediction back to the valid pixels (using only the first band of the mask to decide on validity)
        # resize to the original image dimensions
        output = np.zeros(img_flat.shape[0])
        output[~m.mask[:,0]] = img_preds.flatten()
        output = output.reshape(*img_swp.shape[:-1])
        
        # create our final mask
        mask = (~m.mask[:,0]).reshape(*img_swp.shape[:-1])

        # write to the final file
        dst.write(output.astype(rasterio.uint8), 1, window=window)
        dst.write_mask(mask, window=window)
        # write to the final file
        dst.write(output.astype(rasterio.uint8), 1, window=window)
        dst.write_mask(mask, window=window)

src.close()
dst.close()