## Filtering and Batch Downloading the SEN12MS Dataset (~500 GB)

### Version 1.0.0
### Created by: Cesar Aybar
### Modified by: N/A
### Date: 2018-09-25
### Dataset Creator: [Schmitt et al. 2019](https://www.isprs-ann-photogramm-remote-sens-spatial-inf-sci.net/IV-2-W7/153/2019/isprs-annals-IV-2-W7-153-2019.pdf)

If you are planning to use this dataset, don't forget to cite it as follows:

Schmitt, M., Hughes, L. H., Qiu, C., & Zhu, X. X. (2019). SEN12MS--A Curated Dataset of Georeferenced Multi-Spectral Sentinel-1/2 Imagery for Deep Learning and Data Fusion. arXiv preprint arXiv:1906.07789.


In [0]:
# Install the system development libraries (udunits2, GDAL, GEOS, PROJ) from
# the ubuntugis PPA; presumably these are needed to build the spatial R
# packages installed in the next cell.
# "-y" answers apt's confirmation prompts automatically: a notebook cell has
# no interactive terminal, so without it the commands stall or abort.
invisible(lapply(
  c(
    "sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable",
    "sudo apt-get update",
    "sudo apt-get install -y libudunits2-dev libgdal-dev libgeos-dev libproj-dev"
  ),
  function(apt_command) {
    exit_status <- system(apt_command)
    # Surface failures instead of silently carrying on with a broken setup
    if (exit_status != 0) {
      warning(
        "Command failed (exit status ", exit_status, "): ", apt_command,
        call. = FALSE
      )
    }
    exit_status
  }
))

In [43]:
# Install the required packages one after another, in the original order
invisible(lapply(
  c('RCurl', 'rgdal', 'raster', 'reticulate'),
  install.packages
))

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)


In [0]:
# Statistical mode: returns the most frequent value of `x`.
# When several values are equally frequent, the one that appears first in
# `x` is returned.
Mode <- function(x) {
  distinct_values <- unique(x)
  # Map every element to the position of its distinct value, then count
  # how many elements landed on each position
  occurrences <- tabulate(match(x, distinct_values))
  most_frequent <- which.max(occurrences)
  distinct_values[most_frequent]
}

In [0]:
# Start Batch Download (> 500 GB)
library(RCurl)
library(raster)
library(tidyverse)
library(reticulate)

In [0]:
# Global parameters -------------------------------------------------------
# Root of the FTP server (login credentials are part of the URL)
FTP_DATASET <- 'ftp://m1474000:m1474000@dataserv.ub.tum.de/'
# Remote CSV that is loaded into `table_landuse` below
TABLE_DATASET <- "https://raw.githubusercontent.com/csaybar/Geohormiguitas/master/data/table_landuse_report.csv"
# Names of the "*.tar.gz" archives found in the FTP root: request a
# names-only listing, split it into one entry per line, keep the archives
FTP_DATASETS <- getURL(
  FTP_DATASET,
  ftp.use.epsv = FALSE,
  ftplistonly = TRUE,
  crlf = TRUE
) %>%
  strsplit('\r*\n') %>%
  pluck(1) %>%
  grep("\\.tar\\.gz$", ., value = TRUE)
table_landuse <- read_csv(TABLE_DATASET)

In [0]:
##############################
# JUST CHANGE THESE PARAMETERS
#summer, spring, winter, fall
SEASON <- "summer"
# Fail fast on a typo: SEASON is used further down as a pattern to pick the
# archives by name, so an unknown value would only fail later and obscurely.
SEASON <- match.arg(SEASON, c("summer", "spring", "winter", "fall"))
LAND_USE_SELECTED <- c("2", "8", "10", "11", "9", "17", "12", "14", "13") # see table_landuse
LAND_USE_SELECTED <- as.numeric(LAND_USE_SELECTED)
# showWarnings = FALSE: re-running this cell when data/ already exists is fine
dir.create('data', showWarnings = FALSE)
##############################

In [0]:
# Download dataset by season ----------------------------------------------
# Keep only the archives whose name matches the SEASON pattern
ftp_dataset_season <- grep(SEASON, FTP_DATASETS, value = TRUE)

In [0]:
# land_use modis
# Pick the season's land-use archive (name containing "_lc"), download it
# into data/ and unpack it there.
where_landuse <- grepl("_lc", ftp_dataset_season)
# Exactly one archive is expected. With zero matches paste0() would silently
# return the bare FTP root and "data/", and the download would fail obscurely.
if (sum(where_landuse) != 1) {
  stop(
    "Expected exactly one land-use ('_lc') archive for the selected season, ",
    "found ", sum(where_landuse), ".",
    call. = FALSE
  )
}
landuse_url <- paste0(FTP_DATASET, ftp_dataset_season[where_landuse])
landuse_local <- paste0("data/", ftp_dataset_season[where_landuse])
# R's default 60 s internet timeout is too short for a large archive; raise it
# for this transfer only and restore the previous setting afterwards.
old_options <- options(timeout = max(3600, getOption("timeout")))
download_status <- tryCatch(
  download.file(url = landuse_url, destfile = landuse_local, mode = "wb"),
  finally = options(old_options)
)
# download.file() returns 0 on success
if (download_status != 0) {
  stop("Download of ", landuse_url, " failed (status ", download_status, ").",
       call. = FALSE)
}
untar(landuse_local, exdir = "data/")

In [0]:
# List every GeoTIFF below the folder whose path is the archive path with its
# "_lc.tar.gz" suffix removed. The dots are escaped and the pattern is
# anchored so that only that literal suffix is stripped.
full_modis_images <- list.files(
  path = sub("_lc\\.tar\\.gz$", "", landuse_local),
  pattern = "\\.tif$",
  recursive = TRUE,
  full.names = TRUE
)
# Result table pre-allocated with one all-NA row per image; rows that are
# never filled are dropped with na.omit() before saving.
# tibble() replaces the deprecated data_frame().
selected_images <- tibble(
  mode_value = rep(NA, length(full_modis_images)),
  filename = rep(NA, length(full_modis_images))
)
# Index of the next free row in selected_images
count <- 1

In [13]:
#Time for a diet-soda
# Scan every land-use image: compute the most frequent cell value (Mode) and,
# when it is one of LAND_USE_SELECTED, record that value and the raster's
# layer name in the next free row of selected_images.
for (mod in full_modis_images) {
  modis_img <- raster(mod)
  value <- Mode(getValues(modis_img))
  if (value %in% LAND_USE_SELECTED) {
    # Fill the columns one by one: the class code stays numeric instead of
    # being coerced to character by c(value, name), and the assignment does
    # not depend on tibble's whole-row type-coercion rules.
    selected_images$mode_value[count] <- value
    selected_images$filename[count] <- names(modis_img)
    # At this point `count` equals the number of images selected so far.
    # Reporting inside this branch prints each milestone exactly once,
    # instead of once per rejected image while count sits on a multiple
    # of 1000.
    if (count %% 1000 == 0) {
      print(sprintf('%s files selected', count))
    }
    count <- count + 1
  }
}

ERROR: ignored

In [0]:
# Drop the pre-allocated rows that were never filled (still NA)
selected_images_no_na <- na.omit(selected_images) %>% as_tibble()
# Name the output after SEASON so that a run for another season is neither
# mislabelled as "summer" nor overwrites the summer result
write_csv(selected_images_no_na, sprintf("selected_image_%s.csv", SEASON))