### main


In [1]:
import os
import numpy as np
import pandas as pd
from osgeo import gdal
from joblib import dump, load
from sklearn.cluster import KMeans
from image import read_image, create_image

SEED = 42

In [7]:
img_dir = "data/images/high_resolution/SENTINEL-2B_MSI_20210511_084252/"
bands_filenames = [
    "SENTINEL-2B_MSI_20210511_084252_channel2_1.tif",
    "SENTINEL-2B_MSI_20210511_084252_channel3_1.tif",
    "SENTINEL-2B_MSI_20210511_084252_channel4_1.tif",
    "SENTINEL-2B_MSI_20210511_084252_channel8_1.tif",
]
bands_names = ["Blue", "Green", "Red", "VNIR"]
data_path = "data/SENTINEL-2B_MSI_20210511_084252_Blue_Red_Green_VNIR.csv"
kmeans_backup_path = "data/backups/SENTINEL-2B_MSI_20210511_084252_kmeans_trained_on_Blue_Red_Green_VNIR.joblib"
labels_path = "data/SENTINEL-2B_MSI_20210511_084252_kmeans_labels_trained_on_Blue_Red_Green_VNIR.csv"

In [3]:
%%time

data = pd.DataFrame(columns=bands_names)
for band_name, band_filename in zip(bands_names, bands_filenames):
    band_path = os.path.join(img_dir, band_filename)
    data[band_name] = read_image(band_path)

CPU times: total: 41.2 s
Wall time: 4min 17s


In [4]:
data

Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
120560395,837,628,496,376
120560396,823,634,496,395
120560397,819,628,495,399
120560398,827,626,490,440


In [5]:
%%time

data.to_csv(data_path, index=False)

CPU times: total: 3min
Wall time: 3min 35s


In [6]:
chunksize = 5000000
chunks_number = int(np.ceil(data.shape[0] / chunksize))
chunks_number

25

In [15]:
%%time

path, ext = os.path.splitext(data_path)
for i, chunk in enumerate(pd.read_csv(data_path, chunksize=chunksize)):
    chunk.to_csv(f"{path}_chunk_{i}{ext}", index=False)

CPU times: total: 3min 27s
Wall time: 7min 36s


In [3]:
%%time

path, ext = os.path.splitext(data_path)
data_chunk = pd.read_csv(f"{path}_chunk_{0}{ext}", index_col=False)

CPU times: total: 1.27 s
Wall time: 1.93 s


In [4]:
data_chunk

Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
4999995,984,836,926,1763
4999996,1024,896,1074,2012
4999997,1039,930,1174,2094
4999998,1055,959,1182,2245


In [3]:
%%time

data = pd.read_csv(data_path, index_col=False)

CPU times: total: 59.1 s
Wall time: 1min 9s


In [4]:
data

Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
120560395,837,628,496,376
120560396,823,634,496,395
120560397,819,628,495,399
120560398,827,626,490,440


In [5]:
%%time

kmeans = KMeans(n_clusters=20, random_state=SEED).fit(data)

CPU times: total: 5h 19min 57s
Wall time: 5h 16min 55s


In [12]:
%%time

dump(kmeans, kmeans_backup_path)
# kmeans = load(kmeans_backup_path)

CPU times: total: 453 ms
Wall time: 514 ms


['data/backups/SENTINEL-2B_MSI_20210511_084252_kmeans_trained_on_Blue_Red_Green_VNIR.joblib']

In [8]:
kmeans_labels = pd.Series(kmeans.labels_)

In [9]:
kmeans_labels

0            3
1            3
2            3
3            3
4            3
            ..
120560395    9
120560396    9
120560397    9
120560398    9
120560399    9
Length: 120560400, dtype: int32

In [10]:
%%time

kmeans_labels.to_csv(labels_path, index=False)

CPU times: total: 1min 59s
Wall time: 2min 4s


In [11]:
%%time

sample_img_path = os.path.join(img_dir, bands_filenames[0])
path, ext = os.path.splitext(labels_path)
img_path = f"{path}.tif"
sample_img = gdal.Open(sample_img_path, gdal.GA_ReadOnly)
driver = sample_img.GetDriver()
x_size, y_size = sample_img.RasterXSize, sample_img.RasterYSize
img = driver.Create(img_path, x_size, y_size, 1, gdal.GDT_UInt16)
img.SetGeoTransform(sample_img.GetGeoTransform())
img.SetProjection(sample_img.GetProjection())
img.GetRasterBand(1).Fill(0)
band = img.GetRasterBand(1)
raster = np.zeros((y_size, x_size), dtype=np.uint8)
for y in y_size:
    for x in x_size:
        raster[y][x] = value[0]
band.WriteArray(raster)
img = band = None

CPU times: total: 0 ns
Wall time: 0 ns


(5000000,)

In [35]:


for band_name, band_filename in zip(bands_names, bands_filenames):
    band_path = os.path.join(IMG_DIR, band_filename)
    band_img = gdal.Open(band_path, gdal.GA_ReadOnly)

    x_size, y_size = band_img.RasterXSize, band_img.RasterYSize

    band = band_img.GetRasterBand(1)
    band_data = band.ReadAsArray(0, 0, x_size, y_size)

    band_col = band_data.ravel()
    data[band_name] = band_col

    band_img = band = band_col = None

CPU times: total: 53.5 s
Wall time: 1min 19s


In [36]:
data


Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
120560395,837,628,496,376
120560396,823,634,496,395
120560397,819,628,495,399
120560398,827,626,490,440


In [37]:
%%time

data.to_csv('data/SENTINEL-2B_MSI_20210511_084252_Blue_Red_Green_VNIR.csv', index=False)

CPU times: total: 3min 35s
Wall time: 3min 52s


In [None]:
%%time

data = pd.read_csv('data/SENTINEL-2B_MSI_20210511_084252_kmeans_labels_trained_on_Blue_Red_Green_VNIR.csv', index_col=False, dtype=np.uint8)

In [39]:
%%time

kmeans = KMeans(n_clusters=20, random_state=SEED).fit(data)

CPU times: total: 4h 38min 36s
Wall time: 5h 15min 1s


In [40]:
kmeans.labels_


array([3, 3, 3, ..., 9, 9, 9])

In [23]:
dump(kmeans, kmeans_backup_path)
# rf = load(kmeans_backup_path)


FileNotFoundError: [Errno 2] No such file or directory: 'data/backups/rf_fitted_on_data_train.joblib'

In [41]:
%%time

data.to_csv('data/SENTINEL-2B_MSI_20210511_084252_kmeans_labels_trained_on_Blue_Red_Green_VNIR.csv', index=False)

CPU times: total: 3min 32s
Wall time: 3min 58s


In [6]:
%%time

kmeans_labels = pd.read_csv('data/SENTINEL-2B_MSI_20210511_084252_kmeans_labels_trained_on_Blue_Red_Green_VNIR.csv', index_col=False, dtype=np.uint8)

CPU times: total: 31.2 s
Wall time: 1min 40s


In [7]:
%%time

sample_path = os.path.join(IMG_DIR, bands_filenames[0])
sample_img = gdal.Open(sample_path, gdal.GA_ReadOnly)

output_path = os.path.join(IMG_DIR, 'kmeans_labels_trained_on_Blue_Red_Green_VNIR.img')
driver = sample_img.GetDriver()
output_img = driver.Create(output_path, sample_img.RasterXSize, sample_img.RasterYSize, 1, gdal.GDT_UInt16)

output_img.SetGeoTransform(sample_img.GetGeoTransform())
output_img.SetProjection(sample_img.GetProjection())
output_img.GetRasterBand(1).Fill(0)

output_band = output_img.GetRasterBand(1)

CPU times: total: 141 ms
Wall time: 6.08 s


In [8]:
%%time

x_size, y_size = sample_img.RasterXSize, sample_img.RasterYSize
raster = np.zeros((y_size, x_size), dtype=np.uint8)

CPU times: total: 0 ns
Wall time: 0 ns


In [11]:
kmeans_labels = np.array(kmeans_labels)


In [12]:
kmeans_labels


array([[230, 238,  93, 142],
       [240, 234, 100, 170],
       [240, 246, 100, 187],
       ...,
       [ 51, 116, 239, 143],
       [ 59, 114, 234, 184],
       [ 69, 127, 254, 234]], dtype=uint8)

In [13]:
%%time

for y, x, kmeans_label in zip(list(range(y_size)), list(range(x_size)), kmeans_labels):
    raster[y][x] = kmeans_label

ValueError: setting an array element with a sequence.

In [None]:
%%time

output_band.WriteArray(raster)
output_img = None
output_band = None

In [None]:
[
    "channel2",
    "channel3",
    "channel4",
    "channel8_",
]


In [12]:
for img in sentinel_imgs:
    img_path = os.path.join(HIGH_RES_IMG_DIR, img)
    bands_filenames = list(
        filter(
            lambda x: any(band_name in x for band_name in bands_names),
            os.listdir(img_path),
        )
    )
    print(img, bands_filenames)


SENTINEL-2B_MSI_20210325_085404 ['SENTINEL-2B_MSI_20210325_085404_channel2_1.tif', 'SENTINEL-2B_MSI_20210325_085404_channel3_1.tif', 'SENTINEL-2B_MSI_20210325_085404_channel4_1.tif', 'SENTINEL-2B_MSI_20210325_085404_channel8_1.tif']
SENTINEL-2B_MSI_20190425_085420 ['SENTINEL-2B_MSI_20190425_085420_channel2_1.tif', 'SENTINEL-2B_MSI_20190425_085420_channel3_1.tif', 'SENTINEL-2B_MSI_20190425_085420_channel4_1.tif', 'SENTINEL-2B_MSI_20190425_085420_channel8_1.tif']
SENTINEL-2B_MSI_20210511_084252 ['SENTINEL-2B_MSI_20210511_084252_channel2_1.tif', 'SENTINEL-2B_MSI_20210511_084252_channel3_1.tif', 'SENTINEL-2B_MSI_20210511_084252_channel4_1.tif', 'SENTINEL-2B_MSI_20210511_084252_channel8_1.tif']
SENTINEL-2B_MSI_20211008_084414 ['SENTINEL-2B_MSI_20211008_084414_channel2_1.tif', 'SENTINEL-2B_MSI_20211008_084414_channel3_1.tif', 'SENTINEL-2B_MSI_20211008_084414_channel4_1.tif', 'SENTINEL-2B_MSI_20211008_084414_channel8_1.tif']
SENTINEL-2A_MSI_20201117_084357 ['SENTINEL-2A_MSI_20201117_084357_ch

In [2]:
DATA_DIR = "data/"
IMG_DIR = "data/images/"
LOW_RES_IMG_DIR = "data/images/low_resolution/"
HIGH_RES_IMG_DIR = "data/images/high_resolution/"

sentinel_imgs = [
    "SENTINEL-2B_MSI_20210325_085404",
    "SENTINEL-2B_MSI_20190425_085420",
    "SENTINEL-2B_MSI_20210511_084252",
    "SENTINEL-2B_MSI_20211008_084414",
    "SENTINEL-2A_MSI_20201117_084357",
]

bands_names = [
    "channel2",
    "channel3",
    "channel4",
    "channel8_",
]


In [None]:
%%time

for band_filename in bands_filenames:
    band_path = os.path.join(img_path, band_filename)
    band = gdal.Open(band_path, gdal.GA_ReadOnly)
    geotransform = band.GetGeoTransform()
    name = band_filename[-9:-6]
    x_size, y_size = band.RasterXSize, band.RasterYSize
    top_left_x, top_left_y = geotransform[0], geotransform[3]
    pixel_width, pixel_height = geotransform[1], geotransform[5]
    rotation_1, rotation_2 = geotransform[2], geotransform[4]
    print(f"name: {name}, size: {x_size}x{y_size}, origin: ({top_left_x}, {top_left_y}), pixel size: {pixel_width}x{pixel_height}, rotation: ({rotation_1}, {rotation_2})")

### auxiliary


In [5]:
os.listdir(HIGH_RES_IMG_DIR)


['LANDSAT_8_OLI_TIRS_20210604_082849',
 'LANDSAT_8_OLI_TIRS_20210706_082857',
 'SENTINEL-2A_MSI_20200901_053433',
 'SENTINEL-2A_MSI_20200921_053434',
 'SENTINEL-2A_MSI_20201117_084357',
 'SENTINEL-2A_MSI_20210327_052418',
 'SENTINEL-2A_MSI_20220524_053423',
 'SENTINEL-2B_MSI_20190425_085420',
 'SENTINEL-2B_MSI_20190813_053920',
 'SENTINEL-2B_MSI_20210325_085404',
 'SENTINEL-2B_MSI_20210511_084252',
 'SENTINEL-2B_MSI_20210531_052344',
 'SENTINEL-2B_MSI_20210703_053405',
 'SENTINEL-2B_MSI_20211008_084414']

In [6]:
list(filter(lambda x: x.endswith(".img"), os.listdir(LOW_RES_IMG_DIR)))


['2010-05-01.img',
 '2010-05-02.img',
 '2010-05-03.img',
 '2010-07-01.img',
 '2010-07-02.img',
 '2010-07-03.img',
 '2010-09-01.img',
 '2010-09-02.img',
 '2010-09-03.img',
 'lccswm2010_4.img',
 'priv_mod_v10_2011layer1.img',
 'priv_mod_v10_2011layer2.img']

In [16]:
img_path = os.path.join(HIGH_RES_IMG_DIR, high_res_img_dirs[0])
img_filenames = list(filter(lambda x: x.endswith(".tif"), os.listdir(img_path)))
img_filenames


['LANDSAT_8_OLI_TIRS_20210604_082849_lc2_ang_sen_azimuth_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_ang_sen_zenith_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_ang_sol_azimuth_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_ang_sol_zenith_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b10_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b11_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b1_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b2_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b3_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b4_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b5_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b6_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b7_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b8_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b9_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_qa_l1_px_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_lc2_qa_l1_rs_1.tif',
 'LANDSAT_8_OLI_TIRS_20210604_082849_source_1.tif',


In [15]:
bands_filenames = [
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b1_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b2_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b3_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b4_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b5_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b6_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b7_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b8_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b9_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b10_1.tif",
    "LANDSAT_8_OLI_TIRS_20210604_082849_lc2_b11_1.tif",
]


In [21]:
%%time

for band_filename in bands_filenames:
    band_path = os.path.join(img_path, band_filename)
    band = gdal.Open(band_path, gdal.GA_ReadOnly)
    geotransform = band.GetGeoTransform()
    name = band_filename[-9:-6]
    x_size, y_size = band.RasterXSize, band.RasterYSize
    top_left_x, top_left_y = geotransform[0], geotransform[3]
    pixel_width, pixel_height = geotransform[1], geotransform[5]
    rotation_1, rotation_2 = geotransform[2], geotransform[4]
    print(f"name: {name}, size: {x_size}x{y_size}, origin: ({top_left_x}, {top_left_y}), pixel size: {pixel_width}x{pixel_height}, rotation: ({rotation_1}, {rotation_2})")

name: _b1, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b2, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b3, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b4, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b5, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b6, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b7, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: _b8, size: 16181x16341, origin: (320392.5, 6318607.5), pixel size: 15.0x-15.0, rotation: (0.0, 0.0)
name: _b9, size: 8091x8171, origin: (320385.0, 6318615.0), pixel size: 30.0x-30.0, rotation: (0.0, 0.0)
name: b10, size: 8091x8171, origin: (320385.0, 6318615.0), pix