# Patch downloading

This notebook contains the code to download the training patches. The steps are as follows:

1. Filter the leak data to only contain leaks that were present in the main distribution pipe. 
2. Set the coordinates of the leak to the right CRS in order to be compatible with GEE.
3. Define the functions for the download task: 
    - `get_image`: Filter the Sentinel-2 collection to the images that contain the leak coordinate and were captured before the leak repair, and return the least cloudy one.
    - `bands_clip_image`: Select the bands that will be downloaded and the patch size (in meters).
    - `download_image`: Download and write the patch in the folder named patches_raw.
4. Download the patches. Each patch is downloaded in a zip file which contains one tif file per band. 

In [1]:
import pandas as pd
from janitor import clean_names
from datetime import timedelta
import ee
import pyproj
import requests
import winsound

In [2]:
# Read leak df to get dates
# Loads the raw leak work orders from the Excel export; the date columns and
# the X/Y coordinates of each order are used by the cells below.
leaks_raw = pd.read_excel("../data/raw/Ordenes Fugas Ejecutadas Dic2022 XY.xlsx")
leaks_raw.head()

Unnamed: 0,Id,FECHA DE CREACIÓN,FECHA DE INICIO,FECHA FINAL,CODUNIDAD,FECHALEGALIZACION,NOMBREESTADO,NOMBRETRABAJO,CAUSAL,X,Y
0,1,2021-10-14,2022-11-24,2022-12-07,757,2022-12-07,Cerrada,FTA - FUGA EN TOMA DE AGUA,SE CANCELA,781021.888082,2428227.0
1,2,2021-09-28,2022-11-24,2022-12-07,757,2022-12-07,Cerrada,FTA - FUGA EN TOMA DE AGUA,SE CANCELA,778394.89032,2426703.0
2,3,2022-03-30,2022-11-24,2022-11-26,757,2022-12-22,Cerrada,FTA - FUGA EN TOMA DE AGUA,SE CANCELA,786182.935,2421650.0
3,4,2022-04-24,2022-11-25,2022-11-26,757,2022-12-22,Cerrada,FTA - FUGA EN TOMA DE AGUA,SE CANCELA,780261.739782,2421571.0
4,5,2022-06-07,2022-11-25,2022-11-26,757,2022-12-22,Cerrada,FTA - FUGA EN TOMA DE AGUA,SE CANCELA,781486.184276,2430545.0


## Cleaning

In [3]:
# Clean leaks data
# clean_names normalises the column names (see the printed index: lower case,
# underscores instead of spaces, accents removed).
leaks_clean = clean_names(leaks_raw)
leaks_clean.columns # fecha_de_inicio is used for sorting; fecha_final and fechalegalizacion drive the download windows and file labels

Index(['id', 'fecha_de_creacion', 'fecha_de_inicio', 'fecha_final',
       'codunidad', 'fechalegalizacion', 'nombreestado', 'nombretrabajo',
       'causal', 'x', 'y'],
      dtype='object')

In [4]:
# Build the filtered leak table in one chain that starts from leaks_raw, so the
# cell is idempotent: the previous version mutated leaks_clean and then dropped
# "nombretrabajo", which made a second run of the cell fail.
leaks_clean = (
    clean_names(leaks_raw)
    # Column with codes: first run of capital letters in nombretrabajo (e.g. "FTA", "FTC")
    .assign(leak_type = lambda df: df["nombretrabajo"].str.extract(r"([A-Z]+)", expand = False))
    # First, keep leaks detected in the main pipe (code FTC).
    # Second, keep only orders whose causal is "SE REPARA FTC, GENERA BA".
    .query("leak_type == 'FTC' and causal == 'SE REPARA FTC, GENERA BA'")
    # Drop irrelevant columns
    .drop(columns = ["codunidad", "nombreestado", "nombretrabajo", "fecha_de_creacion"])
    # Sort df by fecha_de_inicio and rebuild a 0..n-1 index; the download loops
    # below index rows by position through that index.
    .sort_values(by = "fecha_de_inicio")
    .reset_index(drop = True)
)

# Show clean df
leaks_clean.head() # This df is going to be necessary to define the date of each image to be downloaded

Unnamed: 0,id,fecha_de_inicio,fecha_final,fechalegalizacion,causal,x,y,leak_type
0,2366,2022-12-01,2022-12-01,2022-12-01,"SE REPARA FTC, GENERA BA",781563.33511,2419492.0,FTC
1,2479,2022-12-01,2022-12-01,2022-12-01,"SE REPARA FTC, GENERA BA",785033.684233,2423310.0,FTC
2,2443,2022-12-01,2022-12-01,2022-12-01,"SE REPARA FTC, GENERA BA",781521.553633,2419196.0,FTC
3,2460,2022-12-02,2022-12-02,2022-12-02,"SE REPARA FTC, GENERA BA",779658.087686,2416386.0,FTC
4,2409,2022-12-02,2022-12-02,2022-12-02,"SE REPARA FTC, GENERA BA",781491.668852,2419196.0,FTC


## Functions to download patches

In [5]:
# Get image from image collection
def get_image(start, end, poi_leak):
    """Return the least cloudy Sentinel-2 SR image covering a point in a date window.

    Parameters
    ----------
    start, end : date-like
        Bounds of the acquisition window; both are wrapped in ``ee.Date`` and
        passed to ``filterDate``.
    poi_leak : ee.Geometry
        Geometry the images must intersect (``filterBounds``).

    Returns
    -------
    ee.Image
        First image of the collection after sorting by ascending
        ``CLOUDY_PIXEL_PERCENTAGE``.

    Raises
    ------
    ValueError
        If no image matches the filters. Previously the null image only failed
        later, inside ``getInfo()``, with an opaque Earth Engine error.
    """
    collection = (
        ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
        .filterBounds(poi_leak)
        .filterDate(ee.Date(start), ee.Date(end))
        .sort("CLOUDY_PIXEL_PERCENTAGE")
    )

    # One round trip to the server; the count is reused for the guard below.
    n_images = collection.size().getInfo()
    print("Images found: ", n_images)

    if n_images == 0:
        raise ValueError(
            "No Sentinel-2 images found between {} and {} for the requested point".format(start, end)
        )

    img = collection.first()

    print("Date of selected image: ", ee.Date(img.get("system:time_start")).format("yyyy-MM-dd").getInfo(),
          "\nCloud %: ", img.get("CLOUDY_PIXEL_PERCENTAGE").getInfo())

    return img

In [6]:
# Define the bands to select and the patch size (radius in meters respect to leak point)
def bands_clip_image(image, buffer_size = 100, bands = ["B4", "B3", "B2"], poi_leak = None):
    """Clip an image to a square patch around a point and keep the requested bands.

    Parameters
    ----------
    image : ee.Image
        Image to clip.
    buffer_size : number
        Buffer distance applied to the point before taking its bounding box
        (in EPSG:32613).
    bands : list of str
        Band names passed to ``image.select``. The default list is never
        mutated here.
    poi_leak : ee.Geometry, optional
        Centre of the patch. When omitted, the notebook-level variable ``poi``
        is used, which is what the function always did before this parameter
        existed.

    Returns
    -------
    ee.Image
        The clipped image restricted to ``bands``.
    """
    # Prefer the explicit argument; fall back to the global for existing callers.
    center = poi if poi_leak is None else poi_leak

    # Bounding box of the buffered point, i.e. the patch footprint
    patch_region = center.buffer(buffer_size).bounds(proj = "EPSG:32613", maxError = 0.001)

    # Clip first, then select bands (same order as before)
    return image.clip(patch_region).select(bands)

In [7]:
# Download patch image
def download_image(image, path, date_label):
    """Download an Earth Engine image and save it as ``S2_<date_label>.zip``.

    Parameters
    ----------
    image : ee.Image
        Image to download; a download URL is requested with a 10 m scale in
        EPSG:32613.
    path : str
        Destination folder. It is joined with the file name, so it works with
        or without a trailing separator.
    date_label : str
        Label inserted in the file name.

    Raises
    ------
    requests.HTTPError
        If the download request returns an error status. Previously the error
        body was silently written to disk as if it were the zip file.
    """
    import os  # local import so the function does not depend on the notebook's import cell

    # NOTE(review): "fileFormat" and "maxPixels" do not seem to be among the
    # getDownloadURL parameters documented by Earth Engine ("format" is the
    # documented key) - confirm. They are left untouched because the notebook
    # expects a zip file with one tif per band.
    url = image.getDownloadURL(
        {
        "scale": 10,
        "crs": "EPSG:32613",
        "fileFormat": "GeoTIFF",
        "maxPixels": 1e13
        }
    )

    r = requests.get(url, allow_redirects = True)
    # Fail loudly instead of saving an error page as a .zip
    r.raise_for_status()

    zip_path = os.path.join(path, "S2" + "_" + date_label + ".zip")
    # Context manager guarantees the file handle is flushed and closed
    with open(zip_path, "wb") as zip_file:
        zip_file.write(r.content)
    print("Download complete")

## Functions to estimate LST 10m band

In [8]:
def get_landsat_images(start, end, poi_leak):
    """Return the least cloudy Landsat 8 C2 L2 image covering a point in a date window.

    Parameters
    ----------
    start, end : date-like
        Bounds of the acquisition window; both are wrapped in ``ee.Date`` and
        passed to ``filterDate``.
    poi_leak : ee.Geometry
        Geometry the images must intersect (``filterBounds``).

    Returns
    -------
    ee.Image
        First image of the collection after sorting by ascending ``CLOUD_COVER``.

    Raises
    ------
    ValueError
        If no image matches the filters. The original emptiness check ran after
        the ``getInfo()`` calls on the (null) first image, so it could never be
        reached for an empty collection.
    """
    selected_Landsat_collection = (
        ee.ImageCollection("LANDSAT/LC08/C02/T1_L2")
        .filterBounds(poi_leak)
        .filterDate(ee.Date(start), ee.Date(end))
        .sort("CLOUD_COVER")
    )

    # Check the size BEFORE touching the first image: on an empty collection
    # first() is null and every getInfo() on its properties fails.
    n_images = selected_Landsat_collection.size().getInfo()

    if n_images == 0:
        print("THERES NO LANDSAT IMAGES FOR THE SELECTED DATES")
        raise ValueError(
            "No Landsat 8 images found between {} and {} for the requested point".format(start, end)
        )

    img_landsat = selected_Landsat_collection.first()

    print("Date of selected image (Landsat):", ee.Date(img_landsat.get("system:time_start")).format("yyyy-MM-dd").getInfo(),
        "\nLandsat images found: ", n_images,
        "\nCloud %: ", img_landsat.get("CLOUD_COVER").getInfo())

    return img_landsat

$LST_{10m} = \beta_0 + \beta_1 NDVI_{10m} + \beta_2 NDBI_{10m} + \beta_3 NDWI_{10m}$

where: 

$\beta_0, \beta_1, \beta_2, \beta_3$ are the coefficients estimated by fitting the following linear regression model on the 30 m Landsat data:

$LST_{30m} = \beta_0 + \beta_1 NDVI_{30m} + \beta_2 NDBI_{30m} + \beta_3 NDWI_{30m}$


In [9]:
# Create compound bands in order to perform linear regression
def lst_regression(landsat_image, sentinel_image, buffer_size, residuals = True, poi_leak = None):
    """Estimate a 10 m LST band from Sentinel-2 indices using a Landsat-fitted regression.

    A linear model ``lst ~ constant + ndvi + ndbi + ndwi`` is fitted on the
    Landsat image (30 m scale) inside the patch, and its coefficients are then
    applied to the same indices computed from the Sentinel-2 image.

    Parameters
    ----------
    landsat_image : ee.Image
        Landsat image providing ``SR_B3``, ``SR_B4``, ``SR_B5``, ``SR_B6`` and
        ``ST_B10``.
    sentinel_image : ee.Image
        Sentinel-2 image providing ``B3``, ``B4``, ``B8`` and ``B11``.
    buffer_size : number
        Buffer distance around the point that defines the regression/clip region.
    residuals : bool
        When true, the smoothed Landsat residuals (observed minus modelled LST)
        are added to the downscaled image.
    poi_leak : ee.Geometry, optional
        Centre of the patch. When omitted, the notebook-level variable ``poi``
        is used, as before.

    Returns
    -------
    ee.Image
        Downscaled LST image clipped to the patch region.

    Notes
    -----
    Fixes relative to the previous version:

    * The ``residuals`` flag used to be overwritten by the local residual image,
      so ``residuals == True`` compared an ``ee.Image`` with ``True`` and the
      uncorrected image was always returned. The residual image now has its own
      name and the flag is honoured.
    * The Sentinel-2 NDWI used green/SWIR (B3, B11) while the Landsat NDWI the
      coefficient is fitted on uses green/NIR (SR_B3, SR_B5). The Sentinel-2
      index now uses green/NIR (B3, B8) so both sides are the same index.

    NOTE(review): Landsat Collection 2 Level-2 bands are stored as scaled
    integers and no scale/offset is applied here, so the result is in raw
    ``ST_B10`` units - confirm this is intended.
    """
    # Prefer the explicit argument; fall back to the global for existing callers.
    center = poi if poi_leak is None else poi_leak
    patch_region = center.buffer(buffer_size).bounds(proj = "EPSG:32613", maxError = 0.001)

    # Landsat predictors (30 m) and response
    ndvi_landsat = landsat_image.normalizedDifference(["SR_B5", "SR_B4"]).rename("ndvi")
    ndwi_landsat = landsat_image.normalizedDifference(["SR_B3", "SR_B5"]).rename("ndwi")
    ndbi_landsat = landsat_image.normalizedDifference(["SR_B6", "SR_B5"]).rename("ndbi")
    lst_landsat_30m = landsat_image.select("ST_B10").rename("Landsat_LST_30m")

    # Same indices from Sentinel-2 (10 m)
    ndvi_sentinel = sentinel_image.normalizedDifference(["B8", "B4"]).rename("s2_ndvi")
    ndwi_sentinel = sentinel_image.normalizedDifference(["B3", "B8"]).rename("s2_ndwi")
    ndbi_sentinel = sentinel_image.normalizedDifference(["B11", "B8"]).rename("s2_ndbi")

    # Linear regression: 4 predictors (constant first) and 1 response (last band)
    regression_input = ee.Image(1).addBands(ndvi_landsat).addBands(ndbi_landsat).addBands(ndwi_landsat).addBands(lst_landsat_30m).rename(["constant", "ndvi", "ndbi", "ndwi", "lst"])

    img_landsat_regression = regression_input.reduceRegion(
        reducer = ee.Reducer.linearRegression(4, 1),
        geometry = patch_region,
        scale = 30,
        maxPixels = 1e13
    )

    # Get coefficients of linear regression; rows follow the predictor band
    # order above: 0 = constant, 1 = ndvi, 2 = ndbi, 3 = ndwi
    coef_rows = ee.Array(img_landsat_regression.get("coefficients")).toList()

    def _coef_image(row):
        # Constant image holding one regression coefficient
        return ee.Image(ee.Number(ee.List(coef_rows.get(row)).get(0))).reproject(crs = "EPSG:32613")

    intercept = _coef_image(0)
    slope_ndvi = _coef_image(1)
    slope_ndbi = _coef_image(2)
    slope_ndwi = _coef_image(3)

    # Model evaluated on the Landsat indices (30 m), used only to get residuals
    lst_model_30m = intercept.add(slope_ndvi.multiply(ndvi_landsat))\
                .add(slope_ndbi.multiply(ndbi_landsat))\
                .add(slope_ndwi.multiply(ndwi_landsat)).clip(patch_region)

    # Get residuals (kept under a name that does not shadow the `residuals` flag)
    residuals_30m = lst_landsat_30m.subtract(lst_model_30m)

    # Define gaussian kernel in order to perform convolution for image smoothness
    gaussian = ee.Kernel.gaussian(radius = 1.5, units = "pixels")

    # Apply convolution
    residuals_gaussian = residuals_30m.resample("bicubic").convolve(gaussian)

    # Final downscaled image: same model evaluated on the Sentinel-2 indices
    lst_landsat_10m_final = intercept.add(slope_ndvi.multiply(ndvi_sentinel))\
                                     .add(slope_ndbi.multiply(ndbi_sentinel))\
                                     .add(slope_ndwi.multiply(ndwi_sentinel))

    if residuals:
        # Get final downscaled image with residuals
        return lst_landsat_10m_final.add(residuals_gaussian).clip(patch_region)

    return lst_landsat_10m_final.clip(patch_region)

In [10]:
def add_lst10m_band(sentinel_image, lst_image):
    """Append ``lst_image`` to ``sentinel_image`` as a band named ``LST_10m``.

    Returns the result of ``sentinel_image.addBands`` applied to the renamed
    LST image; neither input is modified by this function itself.
    """
    lst_band = lst_image.rename("LST_10m")
    combined = sentinel_image.addBands(lst_band)
    return combined

## Automate Download for all the leaks

In [11]:
# Initialize GEE API
# Must run before the download cells below, which all query Earth Engine.
# NOTE(review): called without arguments, so it presumably relies on
# credentials stored by a previous ee.Authenticate() - confirm.
ee.Initialize()

### Download leak patches

In [13]:
bands = ["B4", "B3", "B2", "B1", "B5", "B6", "B7", "B8", "B8A", "B9", "B11", "B12", "WVP"]
i = 0
download_path = "../data/patches_raw/leak/"
patch_size = 100
sentinel_end_date_list = []
sentinel_start_date_list = []
landsat_end_date_list = []
landsat_start_date_list = []
sentinel_img_date = []
landsat_img_date = []
sentinel_cloud = []
landsat_cloud = []

# Built once instead of on every iteration. always_xy = True makes transform()
# take (easting, northing) and return (longitude, latitude); the previous code
# got (lat, lon) back, stored them in swapped variable names and swapped them
# again when building the point.
transformer = pyproj.Transformer.from_crs("epsg:32613", "epsg:4326", always_xy = True)

# How many patches have already been labelled with each legalisation date.
# The suffix used to count consecutive leaks sharing the same Sentinel end date,
# while the label itself comes from fechalegalizacion, so two leaks legalised
# on the same day could receive the same file name and overwrite each other.
labels_used = {}

for leak in range(leaks_clean.shape[0]):
    print("="*100)
    print("Leak index: ", leak)

    # Dates for patch: Sentinel window is the 7 days ending 5 days before fecha_final
    end_date_sentinel = leaks_clean.fecha_final[leak] - timedelta(days = 5)
    start_date_sentinel = end_date_sentinel - timedelta(days = 7)
    sentinel_start_date_list.append(start_date_sentinel)
    sentinel_end_date_list.append(end_date_sentinel)

    # Landsat window is the 17 days ending 2 days before fecha_final
    end_date_landsat = leaks_clean.fecha_final[leak] - timedelta(days = 2)
    start_date_landsat = end_date_landsat - timedelta(days = 17)
    landsat_start_date_list.append(start_date_landsat)
    landsat_end_date_list.append(end_date_landsat)

    # Date for zip label: legalisation date plus a per-date running index
    label_date = str(leaks_clean.fechalegalizacion[leak])[:10]
    i = labels_used.get(label_date, 0)
    labels_used[label_date] = i + 1
    if i > 0:
        print("Leak detected at the same date")
    date_lab = label_date + "_" + str(i)

    # Coords of leaks (projected, EPSG:32613)
    leak_x = leaks_clean["x"][leak]
    leak_y = leaks_clean["y"][leak]

    # Transform to degrees
    leak_lon, leak_lat = transformer.transform(leak_x, leak_y)

    # Point of leak (Earth Engine expects [lon, lat]).
    # `poi` must stay a notebook-level name: bands_clip_image and
    # lst_regression read it as a global.
    poi = ee.Geometry.Point([leak_lon, leak_lat])

    print("Leak coords: ", (leak_lon, leak_lat), "\n",
          "Start and end dates: ", (start_date_sentinel, end_date_sentinel))

    # Get sentinel and landsat collection of images according to leak coord
    img = get_image(start = start_date_sentinel, end = end_date_sentinel, poi_leak = poi)
    img_landsat = get_landsat_images(start = start_date_landsat, end = end_date_landsat, poi_leak = poi)

    # Append date of images
    sentinel_img_date.append(ee.Date(img.get("system:time_start")).format("yyyy-MM-dd").getInfo())
    landsat_img_date.append(ee.Date(img_landsat.get("system:time_start")).format("yyyy-MM-dd").getInfo())

    # Append cloud %
    sentinel_cloud.append(img.get("CLOUDY_PIXEL_PERCENTAGE").getInfo())
    landsat_cloud.append(img_landsat.get("CLOUD_COVER").getInfo())

    # Select bands and reduce the whole image to a patch centered on the leak
    img = bands_clip_image(img, buffer_size = patch_size, bands = bands)

    # Estimate lst from landsat image and add its lst band to sentinel image
    lst_image = lst_regression(landsat_image = img_landsat, sentinel_image = img, buffer_size = patch_size, residuals = True)
    img = add_lst10m_band(img, lst_image = lst_image)

    # Download image
    download_image(image = img, path = download_path, date_label = date_lab)

print("+"*100)
print("PATCH DOWNLOAD COMPLETED")
# Audible end-of-run signal; winsound is only available on Windows
winsound.Beep(2000, 1000)

Leak index:  0
Leak coords:  (-102.27569493914878, 21.857000495984423) 
 Start and end dates:  (Timestamp('2022-11-19 00:00:00'), Timestamp('2022-11-26 00:00:00'))
Images found:  3
Date of selected image:  2022-11-25 
Cloud %:  0.874395
Date of selected image (Landsat): 2022-11-19 
Landsat images found:  2 
Cloud %:  2.08
Download complete
Leak index:  1
Leak detected at the same date
Leak coords:  (-102.24148408060327, 21.890893572554063) 
 Start and end dates:  (Timestamp('2022-11-19 00:00:00'), Timestamp('2022-11-26 00:00:00'))
Images found:  3
Date of selected image:  2022-11-25 
Cloud %:  0.874395
Date of selected image (Landsat): 2022-11-19 
Landsat images found:  2 
Cloud %:  2.08
Download complete
Leak index:  2
Leak detected at the same date
Leak coords:  (-102.27614954129095, 21.854335579474345) 
 Start and end dates:  (Timestamp('2022-11-19 00:00:00'), Timestamp('2022-11-26 00:00:00'))
Images found:  3
Date of selected image:  2022-11-25 
Cloud %:  0.874395
Date of selected 

### Download non leak patches

In [14]:
bands = ["B4", "B3", "B2", "B1", "B5", "B6", "B7", "B8", "B8A", "B9", "B11", "B12", "WVP"]
i = 0
download_path = "../data/patches_raw/non_leak/"
patch_size = 100

sentinel_end_date_list_nonleak = []
sentinel_start_date_list_nonleak = []
landsat_end_date_list_nonleak = []
landsat_start_date_list_nonleak = []
sentinel_img_date_nonleak = []
landsat_img_date_nonleak = []
sentinel_cloud_nonleak = []
landsat_cloud_nonleak = []

# Built once instead of on every iteration. always_xy = True makes transform()
# take (easting, northing) and return (longitude, latitude); the previous code
# got (lat, lon) back, stored them in swapped variable names and swapped them
# again when building the point.
transformer = pyproj.Transformer.from_crs("epsg:32613", "epsg:4326", always_xy = True)

# How many patches have already been labelled with each legalisation date.
# The suffix used to count consecutive rows sharing the same Sentinel end date,
# while the label itself comes from fechalegalizacion, so two rows legalised
# on the same day could receive the same file name and overwrite each other.
labels_used_nonleak = {}

for leak in range(leaks_clean.shape[0]):
    print("="*100)
    print("Leak index: ", leak)

    # Dates for patch after leak: Sentinel window is the 7 days ending 15 days after fecha_final
    end_date_sentinel = leaks_clean.fecha_final[leak] + timedelta(days = 15)
    start_date_sentinel = end_date_sentinel - timedelta(days = 7)
    sentinel_start_date_list_nonleak.append(start_date_sentinel)
    sentinel_end_date_list_nonleak.append(end_date_sentinel)

    # Landsat window is the 15 days ending 20 days after fecha_final.
    # NOTE(review): the leak loop uses a 17-day Landsat window; 15 days is
    # shorter than Landsat 8's 16-day revisit cycle, so some windows may
    # contain no image - confirm whether 15 is intended.
    end_date_landsat = leaks_clean.fecha_final[leak] + timedelta(days = 20)
    start_date_landsat = end_date_landsat - timedelta(days = 15)
    landsat_start_date_list_nonleak.append(start_date_landsat)
    landsat_end_date_list_nonleak.append(end_date_landsat)

    # Date for zip label: legalisation date plus a per-date running index
    label_date = str(leaks_clean.fechalegalizacion[leak])[:10]
    i = labels_used_nonleak.get(label_date, 0)
    labels_used_nonleak[label_date] = i + 1
    if i > 0:
        print("Leak detected at the same date")
    date_lab = label_date + "_" + str(i)

    # Coords of leaks (projected, EPSG:32613)
    leak_x = leaks_clean["x"][leak]
    leak_y = leaks_clean["y"][leak]

    # Transform to degrees
    leak_lon, leak_lat = transformer.transform(leak_x, leak_y)

    # Point of leak (Earth Engine expects [lon, lat]).
    # `poi` must stay a notebook-level name: bands_clip_image and
    # lst_regression read it as a global.
    poi = ee.Geometry.Point([leak_lon, leak_lat])

    print("Leak coords: ", (leak_lon, leak_lat), "\n",
          "Start and end dates: ", (start_date_sentinel, end_date_sentinel))

    # Get sentinel and landsat collection of images according to leak coord
    img = get_image(start = start_date_sentinel, end = end_date_sentinel, poi_leak = poi)
    img_landsat = get_landsat_images(start = start_date_landsat, end = end_date_landsat, poi_leak = poi)

    # Append date of images
    sentinel_img_date_nonleak.append(ee.Date(img.get("system:time_start")).format("yyyy-MM-dd").getInfo())
    landsat_img_date_nonleak.append(ee.Date(img_landsat.get("system:time_start")).format("yyyy-MM-dd").getInfo())

    # Append cloud %
    sentinel_cloud_nonleak.append(img.get("CLOUDY_PIXEL_PERCENTAGE").getInfo())
    landsat_cloud_nonleak.append(img_landsat.get("CLOUD_COVER").getInfo())

    # Select bands and reduce the whole image to a patch centered on the leak
    img = bands_clip_image(img, buffer_size = patch_size, bands = bands)

    # Estimate lst from landsat image and add its lst band to sentinel image
    lst_image = lst_regression(landsat_image = img_landsat, sentinel_image = img, buffer_size = patch_size, residuals = True)
    img = add_lst10m_band(img, lst_image = lst_image)

    # Download image
    download_image(image = img, path = download_path, date_label = date_lab)

print("+"*100)
print("PATCH DOWNLOAD COMPLETED")
# Audible end-of-run signal; winsound is only available on Windows
winsound.Beep(2000, 1000)

Leak index:  0
Leak coords:  (-102.27569493914878, 21.857000495984423) 
 Start and end dates:  (Timestamp('2022-12-09 00:00:00'), Timestamp('2022-12-16 00:00:00'))
Images found:  3
Date of selected image:  2022-12-15 
Cloud %:  8.1e-05
Date of selected image (Landsat): 2022-12-12 
Landsat images found:  1 
Cloud %:  0.04
Download complete
Leak index:  1
Leak detected at the same date
Leak coords:  (-102.24148408060327, 21.890893572554063) 
 Start and end dates:  (Timestamp('2022-12-09 00:00:00'), Timestamp('2022-12-16 00:00:00'))
Images found:  3
Date of selected image:  2022-12-15 
Cloud %:  8.1e-05
Date of selected image (Landsat): 2022-12-12 
Landsat images found:  1 
Cloud %:  0.04
Download complete
Leak index:  2
Leak detected at the same date
Leak coords:  (-102.27614954129095, 21.854335579474345) 
 Start and end dates:  (Timestamp('2022-12-09 00:00:00'), Timestamp('2022-12-16 00:00:00'))
Images found:  3
Date of selected image:  2022-12-15 
Cloud %:  8.1e-05
Date of selected ima

In [None]:
# Attach the bookkeeping collected by both download loops to leaks_clean,
# one column per list (each list has one entry per row, in row order).
download_info = {
    # Leak patches
    "sentinel_end_date_list": sentinel_end_date_list,
    "sentinel_start_date_list": sentinel_start_date_list,
    "landsat_end_date_list": landsat_end_date_list,
    "landsat_start_date_list": landsat_start_date_list,
    "sentinel_img_date": sentinel_img_date,
    "landsat_img_date": landsat_img_date,
    "sentinel_cloud": sentinel_cloud,
    "landsat_cloud": landsat_cloud,
    # Non-leak patches
    "sentinel_end_date_list_nonleak": sentinel_end_date_list_nonleak,
    "sentinel_start_date_list_nonleak": sentinel_start_date_list_nonleak,
    "landsat_end_date_list_nonleak": landsat_end_date_list_nonleak,
    "landsat_start_date_list_nonleak": landsat_start_date_list_nonleak,
    "sentinel_img_date_nonleak": sentinel_img_date_nonleak,
    "landsat_img_date_nonleak": landsat_img_date_nonleak,
    "sentinel_cloud_nonleak": sentinel_cloud_nonleak,
    "landsat_cloud_nonleak": landsat_cloud_nonleak,
}

# Dict order is insertion order, so the columns are added in the same order as before
for column_name, column_values in download_info.items():
    leaks_clean[column_name] = column_values

In [None]:
# Load the previously saved CSV before the next cell overwrites the same path.
# leaks_clean_old is not referenced again in the visible notebook.
leaks_clean_old = pd.read_csv("../data/clean/leaks_clean.csv")

In [None]:
# Persist the leak table, now including the download bookkeeping columns.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column - confirm downstream readers expect it.
leaks_clean.to_csv("../data/clean/leaks_clean.csv")