# Week 8: Creating an automated Sentinel-2 processing chain

Individual learning outcomes: At the end of this week, all students should be able to calculate the Normalized Difference Vegetation Index (NDVI) from the Sentinel-2 spectral bands and extract statistics of pixel values within polygons of a shapefile from within Python.

We will use the images we downloaded with the processing chain we built in last week's practical.

Connect to our Google Drive from Colab.

In [None]:
# Load the Drive helper and mount your Google Drive as a drive in the virtual machine
# The mount point is /content/drive; the data paths used later in this notebook
# all start with /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Install libraries

In [None]:
#import required libraries, including the sentinelsat library this time
!pip install rasterio
!pip install sentinelsat
!pip install geopandas
# install the function to get zonal statistics from the rasterstats library
!pip install rasterstats
from rasterstats import zonal_stats
import csv 
import pickle


import geopandas as gpd
import rasterio
from rasterio import plot
from rasterio.plot import show_hist
from rasterio.windows import Window
import matplotlib.pyplot as plt
import numpy as np
from sentinelsat.sentinel import SentinelAPI, read_geojson, geojson_to_wkt, geojson
from collections import OrderedDict
from osgeo import gdal, ogr
import json
import os
from os import listdir
from os.path import isfile, isdir, join
import math
from math import floor, ceil
from pyproj import Proj
from pprint import pprint
import shutil
import sys
import zipfile
%matplotlib inline

We need a helper function at a later stage, so let's define it now. It converts latitude/longitude coordinates into pixel locations in a raster.

In [None]:
# define a helper function that converts latitude, longitude coordinates into pixel locations
def longlat2window(lon, lat, dataset):
    """
    Turn a longitude/latitude bounding box into a raster read window.

    The corner coordinates are first passed through a pyproj ``Proj`` built
    from the dataset's CRS and then through the inverse of the dataset's
    affine transform, which yields fractional (column, row) positions.

    Args:
        lon (tuple): (min, max) longitude of the bounding box
        lat (tuple): (min, max) latitude of the bounding box
        dataset: Open rasterio dataset providing ``crs`` and ``transform``

    Returns:
        rasterio.windows.Window: window whose bounds are rounded outwards
        (floor for the start, ceil for the stop) to whole pixels
    """
    projection = Proj(dataset.crs)
    affine = dataset.transform

    # corner coordinates after projection
    x_lower, y_lower = projection(lon[0], lat[0])
    x_upper, y_upper = projection(lon[1], lat[1])

    # map coordinates -> fractional pixel positions (inverse affine transform)
    first_col, last_row = ~affine * (x_lower, y_lower)
    last_col, first_row = ~affine * (x_upper, y_upper)

    # the row derived from the maximum latitude is used as the window's first row
    row_slice = (floor(first_row), ceil(last_row))
    col_slice = (floor(first_col), ceil(last_col))
    return Window.from_slices(rows=row_slice, cols=col_slice)


# Processing Sentinel-2 images

The workflow for this practical is:
* Calculate the Normalised Difference Vegetation Index from the 10 m bands of the images we downloaded last time
* Save the NDVI files as Geotiff format
* Visualise the NDVI images
* Extract zonal statistics of NDVI for polygons in our shapefile
* Save the statistics as a csv file for use in Excel
* Plot the statistics to explore them

In [None]:
# set up your directories for the satellite data
# Note that we do all the downloading and data analysis on the temporary drive
#    on Colab. We will copy the output directory to our Google Drive at the end.
#    Colab has more disk space (about 40 GB free space) than Google Drive (15 GB).
#    However, the data on the Colab disk space are NOT kept when you log out.

# path to your Google Drive
wd = "/content/drive/MyDrive/practicals20-21"
print("Connected to data directory: " + wd)

# path to the directory where we saved the downloaded Sentinel-2 images last time
# (derived from wd so that the two paths cannot get out of step)
datadir = join(wd, "download")
print("Looking for image data in: " + datadir)

# path to your temporary drive on the Colab Virtual Machine
cd = "/content/work"

# Name of the shape file
shapefile = join(wd, 'oakham', 'Polygons_small.shp') # ESRI Shapefile of the study area

# directory for downloading the Sentinel-2 granules
# Note that we are using the 'join' function imported from the os library here for the first time
# It is an easy way of merging strings into a directory structure.
# It is clever and chooses the / or \ depending on whether you are on Windows or Linux.
downloaddir = join(cd, 'download') # where we save the downloaded images
quickdir = join(cd, 'quicklooks')  # where we save the quicklooks
outdir = join(cd, 'out')           # where we save any other outputs

# CAREFUL: This code removes the named directories and everything inside them to free up space
# Note: shutil provides a lot of useful functions for file and directory management
for stale_dir in (downloaddir, quickdir, outdir):
  try:
    shutil.rmtree(stale_dir)
  except FileNotFoundError:
    # nothing to clean up, e.g. on the first run in a fresh Colab session
    print(stale_dir + " not found.")
  except OSError as err:
    # any other problem (permissions, path is a file, ...) is reported as such
    # instead of being mislabelled as "not found"
    print("WARNING: could not remove " + stale_dir + ": " + str(err))

# create the new directories, unless they already exist
for new_dir in (cd, downloaddir, quickdir, outdir):
  os.makedirs(new_dir, exist_ok=True)

print("Connected to Colab temporary data directory: " + cd)

# read the directory listing once and reuse it for the check below
wd_contents = sorted(os.listdir(wd))

print("\nList of contents of " + wd)
for f in wd_contents:
  print(f)

# check whether the file with the login details exists
if "sencredentials.txt" not in wd_contents:
  print("\nERROR: File sencredentials.txt not found. Cannot log in to Data Hub.\n")

Get the extent of the shapefile to define our search area.

In [None]:
# Get the shapefile layer's extent
driver = ogr.GetDriverByName("ESRI Shapefile")
ds = driver.Open(shapefile, 0) # 0 opens the file read-only

# OGR signals a failed open by returning None (unless GDAL exceptions are enabled),
# so check here to get a clear error message instead of an AttributeError below
if ds is None:
  raise FileNotFoundError("Could not open shapefile: " + shapefile)

lyr = ds.GetLayer()
# the extent is used later as (extent[0], extent[1]) for x and (extent[2], extent[3]) for y
extent = lyr.GetExtent()
print("Extent of the area of interest (shapefile):\n", extent)

# get projection information from the shapefile to reproject the images to
outSpatialRef = lyr.GetSpatialRef()
print("\nSpatial referencing information of the shapefile:\n", outSpatialRef)

# close file
ds = None


# Explore the data directory structure of our downloaded files


In [None]:
# where we stored the downloaded files
# change into the download directory on Google Drive so that the listing below refers to it
os.chdir(datadir)
print("contents of ", datadir, ":")
# the leading '!' runs a shell command from the notebook cell
!ls -l

# copy the data from Google Drive (permanent storage) to Colab (temporary storage but more disk space)
# we write a function to delete the destination directory if it exists to avoid errors
def copytree_overwrite(from_path, to_path):
    """
    Copy the directory tree at from_path to to_path, replacing to_path.

    Anything that already exists at to_path is removed first, so the result
    is an exact copy of from_path rather than a merge of old and new content.
    """
    destination_exists = os.path.exists(to_path)
    if destination_exists:
        # copytree refuses to write into an existing directory, so clear it
        shutil.rmtree(to_path)

    shutil.copytree(from_path, to_path)

# call our function
# this replaces any existing copy of the download directory on the Colab disk
# with the contents of the download directory on Google Drive
copytree_overwrite(datadir, downloaddir)

# look at copied files on the Colab drive
os.chdir(downloaddir)
print("contents of ", downloaddir, ":")
# the leading '!' runs a shell command from the notebook cell
!ls -l


# Calculate NDVI

Before we calculate the Normalised Difference Vegetation Index from our images, we need to find the directories that contain the band files. They are located in the `IMG_DATA/R10m` subdirectory (the 10 m resolution bands) of each downloaded Sentinel-2 image directory. We modify the code from last week as follows.

We will mark up any changes as follows:

```
'''
Changed from last week
'''
```



In [None]:
'''
Changed from last week:
* Modified variable names
* Changed from 20 m to 10 m resolution
* Changed from TCI file to RED and NIR files
'''

# make a list of all RED and NIR image band files across all downloaded image directories
# NOTE: the four lists are parallel, i.e. element x of each list belongs to the same image.
#       We therefore only add an image to the lists once BOTH band files have been found.
s2IDs = [] # empty list of all Sentinel-2 granule IDs we have downloaded
s2dirs = [] # empty list of all directory paths pointing to the 10 m resolution band files
files_red = [] # empty list of all Red band file names
files_nir = [] # empty list of all NIR band file names

# get the list of all directories in the download directory
# there is one directory for each Sentinel-2 image (granule)
# listdir returns names in arbitrary order, so we sort them to get the same order on every run
dirlist = sorted(d for d in listdir(downloaddir) if isdir(join(downloaddir, d)))

# iterate over all Sentinel-2 image directories
for granule_dir in dirlist:
  # the directory names have the following structure, for example:
  # S2A_MSIL2A_20190919T110721_N0213_R137_T30UXD_20190919T140654.SAFE
  # the first part of the directory name is the granule ID
  # so we split off the ".SAFE" as follows:
  sceneID = granule_dir.split(".")[0]

  # find the GRANULE, then L2A_*, then IMG_DATA, then R10m directory
  granule_root = join(downloaddir, granule_dir, "GRANULE")
  if not isdir(granule_root):
    print("WARNING: skipping " + granule_dir + " (no GRANULE subdirectory)")
    continue

  # find the full name of the L2A_* subdirectory (contains the scene ID)
  l2a_dirs = sorted(s for s in listdir(granule_root)
                    if isdir(join(granule_root, s)) and s.split("_")[0] == "L2A")
  if not l2a_dirs:
    print("WARNING: skipping " + granule_dir + " (no L2A_* subdirectory)")
    continue

  # add IMG_DATA/R10m to subdirectory, this is where the 10 m band files are found
  s2dir = join(granule_root, l2a_dirs[0], "IMG_DATA", "R10m")
  if not isdir(s2dir):
    print("WARNING: skipping " + granule_dir + " (no IMG_DATA/R10m subdirectory)")
    continue

  # list all files in the 10 m directory
  # sorted, so that "..._B04_10m.jp2" comes before sidecar files like "..._B04_10m.jp2.aux.xml"
  files_10m = sorted(f for f in listdir(s2dir) if isfile(join(s2dir, f)))

  # Changed from last week:
  # * Changed from TCI file to RED and NIR files
  #
  # We split the filename into components based on the underscore _
  # e.g. "T30UXD_20190919T110721_B04_10m.jp2"
  # becomes ["T30UXD", "20190919T110721", "B04", "10m.jp2"]
  # so the component indexed 2 contains the band number
  file_red = None
  file_nir = None
  for fname in files_10m:
    parts = fname.split("_")
    if len(parts) < 3:
      continue # not a band file, it has no band component
    if parts[2] == "B04" and file_red is None: # Band 4 is the Red band
      file_red = fname
    if parts[2] == "B08" and file_nir is None: # Band 8 is the NIR band
      file_nir = fname

  # without both bands we cannot calculate NDVI, so leave this image out of all lists
  if file_red is None or file_nir is None:
    print("WARNING: skipping " + granule_dir + " (Red or NIR band file not found)")
    continue

  s2IDs.append(sceneID) # append the unique identifier to the list
  s2dirs.append(s2dir) # remember the 10 m directory
  files_red.append(file_red) # remember the name of the Red band file
  files_nir.append(file_nir) # remember the name of the NIR band file

# the output looks neater if we print each element of the list of strings in a new line
print("List of all Granule IDs:")
for i in s2IDs:
  print(i)
print("List of all Sentinel-2 directories:")
for i in s2dirs:
  print(i)
print("List of all Red band image files:")
for i in files_red:
  print(i)
print("List of all NIR band image files:")
for i in files_nir:
  print(i)


```
'''
Changed from last week
'''
```


# Warp all our images to the same projection as our shapefile.

Remember we did this last week with GDAL.

In [None]:
'''
Changed from last week:
* Includes getting information on the number of files to process
* Processes both the Red and the NIR image files instead of just the TCI files
'''

# how many files are in the file list?
nfiles = len(files_red)

# the lists are used in parallel (element x belongs to image x),
# so they must all have the same length, otherwise bands of different images get mixed up
if not (len(files_nir) == len(s2dirs) == len(s2IDs) == nfiles):
  raise ValueError("The lists of granule IDs, directories, Red files and NIR files "
                   "have different lengths. Check the downloaded image directories.")

# get the spatial referencing system of our shapefile into which we want to reproject the band images
# remember, we did this when we opened the shapefile earlier and saved it in outSpatialRef
print("Reprojecting all band images to the following projection:")
print(outSpatialRef)

warpfiles_red = [] # make an empty list where we can remember all the warped Red file names
warpfiles_nir = [] # same for NIR files

# iterate over all Sentinel-2 image directories and warp the image
for x in range(nfiles):
  # join the directory path with the Red file name
  file_red = join(s2dirs[x], files_red[x])
  # do the same for NIR
  file_nir = join(s2dirs[x], files_nir[x])

  # make a directory path and file name for the warped output file
  warpfile_red = join(quickdir, s2IDs[x] + "_Red_warped.jp2")
  warpfiles_red.append(warpfile_red) # add it to our list
  # same for NIR
  warpfile_nir = join(quickdir, s2IDs[x] + "_NIR_warped.jp2")
  warpfiles_nir.append(warpfile_nir)

  # call the GDAL Warp command for the Red and NIR bands
  for infile, warpfile in ((file_red, warpfile_red), (file_nir, warpfile_nir)):
    ds = gdal.Warp(warpfile, infile, dstSRS=outSpatialRef)
    # gdal.Warp returns None if it fails (unless GDAL exceptions are enabled);
    # stop here rather than fail later when the missing output file is opened
    if ds is None:
      raise RuntimeError("gdal.Warp failed for input file: " + infile)
    ds = None #remember to close and save the output file

pprint(warpfiles_red)
pprint(warpfiles_nir)

# Now calculate the NDVI from the warped Red and NIR bands

The Normalized Difference Vegetation Index (NDVI) is an indicator of the proportion and condition of green vegetation. It is calculated from the red and near-infrared (NIR) bands as $NDVI = \frac{NIR - Red}{NIR + Red}$, so its values range from -1 to +1. Generally, for surfaces with some vegetation the value of NDVI is positive, for surfaces without vegetation the value is near zero, while for water and clouds the value is usually negative. The closer the value is to the positive end of the range, the denser and more developed the vegetation cover. Less dense vegetation cover has lower values, which are still positive but not very high.

In [None]:
'''
Changed from last week:
* This cell is new and calculates NDVI.
'''

# make an empty list to store the file names of our new NDVI files
ndvifiles = []

# iterate over all warped Sentinel-2 images and calculate NDVI
for x in range(nfiles):
  # get the file names of the warped input bands
  file_red = warpfiles_red[x]
  file_nir = warpfiles_nir[x]

  # open the red band file, load its data and remember its georeferencing
  # the 'with' statement closes the file again as soon as we have read from it
  with rasterio.open(file_red, 'r') as redfile:
    band_red = redfile.read(1)
    ndvi_crs = redfile.crs
    ndvi_transform = redfile.transform
    ndvi_width = redfile.width
    ndvi_height = redfile.height

  # open the NIR band file and load its data
  with rasterio.open(file_nir, 'r') as nirfile:
    band_nir = nirfile.read(1)

  # The Sentinel-2 bands are delivered as uint16 data type (unsigned integer 16 bits per pixel).
  # This means that we cannot do floating point calculations on them without first converting them to float.
  # When I first tried this practical, I got strange NDVI images without this conversion!
  # Convert the band arrays to float:
  print("Data type when band is read from file: ", band_red.dtype)
  band_red = np.float32(band_red)
  print("Data type after we have converted the band: ", band_red.dtype)
  band_nir = np.float32(band_nir)

  # Calculate the vegetation index. This is done pixel by pixel using NumPy array arithmetic.
  # We need to handle exceptions to the calculation. Where the sum of the two bands
  # in the denominator is zero (NIR+Red), the NDVI formula would divide zero by zero.
  # We avoid this by starting with an array of zeros and only doing the division
  # for the pixels where the denominator is not zero (the 'where' option of np.divide):
  # https://numpy.org/doc/stable/reference/generated/numpy.divide.html
  denominator = band_nir + band_red
  ndvi = np.zeros_like(denominator) # where NIR + Red is zero, the NDVI stays zero
  np.divide(band_nir - band_red, denominator, out=ndvi, where=(denominator != 0))

  # make a directory path and file name for the NDVI output file in Geotiff format
  ndvifile = join(quickdir, s2IDs[x] + "_NDVI_warped.tif")
  ndvifiles.append(ndvifile)
  print("\nCreating NVDI file with CRS and Transform:")
  print(ndvifile)
  print(ndvi_crs)
  print(ndvi_transform)

  # save the NDVI image band as a Geotiff file
  # the output has the same size and georeferencing as the red band file
  with rasterio.open(ndvifile, 'w', driver='Gtiff', width=ndvi_width,
                     height=ndvi_height, count=1, crs=ndvi_crs,
                     transform=ndvi_transform, dtype=np.float32) as outfile:
    outfile.write(ndvi, 1)

# Clip the NDVI files to the shapefile extent

In [None]:
'''
Changed from last week:
* This cell is modified from something we did before.
* It has been changed to work on a single data band in the image file.
* The colour map is set to 'Greens'.
* It clips the NDVI raster files in the list 'ndvifiles' to the shapefile extent.
'''

# We need our old helper function to convert an image to uint8 data type for plotting
def show_ndvi(afile, ax=None, xlim=None, ylim=None): 
  '''
  Plot band 1 of an open raster file as an NDVI map with the 'Greens' colour map.

  Changed from last week:
  * This is an adaptation of our old function 'tci'
  * It rescales the image data, changes it to uint8 data type and plots it
  * Negative values are set to zero first, so we ignore all negative NDVI values in the plotting

  afile is a handle to an image file opened with rasterio.open()
  ax is the axes handle to plot the map on (default: the current matplotlib axes)
  xlim =[xmin, xmax] is the map extent to be shown in x direction (default: full image)
  ylim =[ymin, ymax] is the map extent to be shown in y direction (default: full image)

  Returns an empty tuple.
  '''
  # fall back to the current axes if the caller did not pass any
  if ax is None:
    ax = plt.gca()

  # read band data
  a = afile.read(1)

  # exclude negative values
  a[a<0] = 0

  amin = a.min()
  amax = a.max()

  anewmin = 0.0
  anewmax = 255.0

  if (amax-amin) == 0:
    # all values are the same: rescaling would divide by zero,
    # so we plot a constant image at the lower end of the colour scale
    print("WARNING: max and min are the same values")
    ascaled = np.full(a.shape, anewmin)
  else:
    ascaled = (a - amin) * ((anewmax - anewmin) / (amax - amin)) + anewmin
  a_uint8 = ascaled.astype(np.uint8)

  if xlim is None:
    xlim=[afile.bounds.left, afile.bounds.right]
    # afile.bounds returns a BoundingBox(left, bottom, right, top) object

  if ylim is None:
    ylim=[afile.bounds.bottom, afile.bounds.top]
  
  # zoom in to an area of interest
  ax.set_xlim(xlim)
  ax.set_ylim(ylim)

  # plot the rescaled array directly; passing the transform of the source file
  # places it at the right map coordinates, so no temporary image file is needed
  plot.show(a_uint8, transform=afile.transform, ax=ax, cmap='Greens')

  return()


# clip the files
zoomfiles = [] # remember the file names

# arrange our subplots, assuming a 16:9 screen ratio
# we always need at least one column and one row, even if there are no files
cols = max(1, min(nfiles, 4)) # maximum of 4 plots in one row
rows = max(1, math.ceil(nfiles / cols)) # round up to nearest integer

# create a figure with subplots (7 inches of height for each row of plots)
fig, ax = plt.subplots(rows, cols, figsize=(21, 7 * rows))
fig.patch.set_facecolor('white')

# plt.subplots returns a single axes object for one plot, a 1D array for one row
# and a 2D array for several rows. We turn all of these into a 1D array,
# so that ax[x] works for any number of files.
ax = np.atleast_1d(ax).ravel()

# iterate over all NDVI files and plot them
for x in range(nfiles):
  ndvifile = ndvifiles[x]

  # make the filename of the new zoom image file
  # splitext only removes the file extension, even if a directory name contains a dot
  zoomfile = os.path.splitext(ndvifile)[0] + "_zoom.tif"
  zoomfiles.append(zoomfile) # remember the zoom file name in our list
  print(zoomfile)

  # clip it with rasterio to the shapefile extent
  # rasterio offers an option called 'window' to load a subset of a raster file

  # open the source file (the 'with' statement closes it for us at the end)
  with rasterio.open(ndvifile, 'r') as src:
    
    # convert the shapefile extent to a rasterio window object
    window = longlat2window((extent[0], extent[1]), (extent[2], extent[3]), src)
    print("Window coordinates: ", window)
    
    # read all bands but only for the window extent
    arr = src.read(window=window, out_shape=(src.count, window.height, window.width))
    print("Window array size: ", arr.shape)

    # get the data type
    dt = arr.dtype

    # open the destination file
    # copy metadata from source file
    # BUT we must change the geotransform to the window with the update below
    # https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html
    kwargs = src.meta.copy()
    kwargs.update({'height': window.height,
                   'width': window.width,
                   'transform': rasterio.windows.transform(window, src.transform),
                   'driver': 'Gtiff', 
                   'count': src.count,
                   'crs': src.crs,
                   'dtype': dt
                   })

    # write the clipped data (the 'with' statement saves and closes the file)
    with rasterio.open(zoomfile, 'w', **kwargs) as dst:
      dst.write(arr)

  # plot it
  with rasterio.open(zoomfile, "r") as img:
    show_ndvi(img, ax=ax[x])
    # set a title for the subplot
    mytitle = s2IDs[x]
    ax[x].set_title(mytitle, fontsize=8)

# switch off any subplots that are left over in the last row
for unused_ax in ax[nfiles:]:
  unused_ax.set_axis_off()

# Now let's extract some statistics on NDVI from a small polygon

We will use the zonal statistics function from the Rasterstats library for this purpose, as it is easy to use.

https://pythonhosted.org/rasterstats/manual.html

We will save the files in .csv format, so they can be read into Excel. We generate one file for each image.

In Python we can also save files with entire objects in their original form. The pickle library allows us to do that.

In [None]:
# make an empty list to store our output statistics csv file names, one per image
statsfiles = []

# make an empty list to store our output statistics pickle file names, one per image
pklfiles = []

# the statistics we request below, used as the header line of the csv files
header = ["count", "min", "mean", "max", "median"]

# iterate over all NDVI files and extract zonal statistics
for x in range(nfiles):
  zoomfile = zoomfiles[x]

  # file name without the extension, used to build the output file names
  zoombase = os.path.splitext(zoomfile)[0]

  # make the filename of the new statistics output file
  statsfile = zoombase + "_stats.csv"
  statsfiles.append(statsfile) # remember the file name in our list

  # get zonal statistics for all polygons in the shapefile
  # the result is a list of dictionaries
  # exclude zero values from calculation by setting a nodata value
  stats = zonal_stats(shapefile, zoomfile, nodata=0, stats="count min mean max median")

  # get the number of polygons of the shapefile.
  # there should be one row with statistics for each of them.
  n = len(stats)
  print("The shapefile has ", n, " polygons.")

  # Write the statistics results to a text file (overwrite if exists)
  # The 'with' statement saves and closes the file for us.
  # newline="" is what the csv library documentation recommends for csv files.
  with open(statsfile, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=header)
    # write the header line
    writer.writeheader()
    # write one row for each polygon
    writer.writerows(stats)

  # open the file again, then read and print its contents (all lines)
  with open(statsfile, "r") as f:
    pprint(f.read().splitlines())
  print("\nSaved statistics file: " + statsfile)

  # make the filename of the new pickle file for the stats object
  pklfile = zoombase + "_stats.pkl"
  pklfiles.append(pklfile) # remember the file name in our list

  # write object to file and close the file, so that everything is saved to disk
  with open(pklfile, 'wb') as f:
    pickle.dump(stats, f)
  print("\nSaved statistics file: " + pklfile + "\n")


# Plotting data as graphs
Before we finish, let's look at how we can make graphs to explore our results.

In [None]:
# iterate over all statistics files
for x in range(nfiles):
  pklfile = pklfiles[x]

  # read object from file (the 'with' statement closes the file again)
  # NOTE: only unpickle files that you have created yourself.
  #       Loading a pickle file from an untrusted source can run harmful code.
  with open(pklfile, 'rb') as pkl:
    s = pickle.load(pkl)
  print("\nThe pickled file looks like this:")
  print(s)
  print("The stored object is of type: ", type(s))
  print("The list has this many elements: ", len(s))
  
  print("We can iterate over the elements in the list.")
  for poly_stats in s:
    print(poly_stats)

  # only look at the first element if the list is not empty
  if s:
    print("The elements are of type:", type(s[0]))
  print("We can get the elements in the dictionary like this:")
  for a in s: # get the list element
    b = a["min"] # get the dictionary entry for "min"
    print("Minimum = ", b)

  print("To get all dictionary entries for the minimum NDVI per polygon as a list object:")
  a_min = [b["min"] for b in s]
  pprint(a_min)

  # and get the other statistics too
  a_max = [b["max"] for b in s]
  a_mn = [b["mean"] for b in s]
  a_n = [b["count"] for b in s]
  a_md = [b["median"] for b in s]

  # make a bar chart of the data

