# low to high resolution

In [12]:
import os
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.cluster import KMeans
from image import read_image, create_image

# Shared seed; passed as `random_state` to the KMeans fits further down.
SEED = 42

## Sentinel-2B MSI, 10 m resolution: blue, green, red and VNIR bands

In [13]:
# Identifier of the Sentinel-2B MSI scene; it is embedded in the image
# directory name and in every band / table file name below.
SCENE_ID = "SENTINEL-2B_MSI_20210511_084252"
img_dir = f"data/images/high_resolution/{SCENE_ID}/"

# Table column name -> channel number that appears in the band's file name.
# Keeping both in one mapping guarantees names and files stay aligned.
BAND_CHANNELS = {"Blue": 2, "Green": 3, "Red": 4, "VNIR": 8}
bands_names = list(BAND_CHANNELS)
bands_filenames = [
    f"{SCENE_ID}_channel{channel}_1.tif" for channel in BAND_CHANNELS.values()
]

# CSV holding the extracted band table. The file name lists the bands as
# Blue_Red_Green_VNIR, which differs from the column order in `bands_names`;
# it is kept as-is so existing files on disk are still found.
data_path = f"data/{SCENE_ID}_Blue_Red_Green_VNIR.csv"

### Extracting tabular data from the band images

In [3]:
%%time

# Read every band file once and build the table in a single constructor call.
# Starting from an empty frame and assigning one column at a time makes pandas
# reindex the still-empty columns to full length as NaN placeholders, which is
# wasted memory at this row count.
# Assumes read_image returns one flat sequence of pixel values per band, as
# the recorded four-column table suggests -- TODO confirm in image.py.
data = pd.DataFrame(
    {
        band_name: read_image(os.path.join(img_dir, band_filename))
        for band_name, band_filename in zip(bands_names, bands_filenames)
    },
    columns=bands_names,  # same column set and order as before
)

CPU times: total: 41.2 s
Wall time: 4min 17s


In [4]:
# Preview the assembled band table (one column per band).
data

Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
120560395,837,628,496,376
120560396,823,634,496,395
120560397,819,628,495,399
120560398,827,626,490,440


In [5]:
%%time

# Persist the band table as CSV without the index column; the clustering
# sections reload it from `data_path` instead of re-reading the images.
data.to_csv(data_path, index=False)

CPU times: total: 3min
Wall time: 3min 35s


### kmeans, 20 clusters


In [10]:
# Locations used by the 20-cluster KMeans run.
# joblib backup of the fitted estimator:
kmeans_backup_path = "data/backups/SENTINEL-2B_MSI_20210511_084252_kmeans_20_clusters_trained_on_Blue_Red_Green_VNIR.joblib"
# CSV with one cluster label per table row:
labels_path = "data/SENTINEL-2B_MSI_20210511_084252_kmeans_20_clusters_labels_trained_on_Blue_Red_Green_VNIR.csv"
# First band file (channel 2), passed to create_image alongside the result path:
sample_img_path = os.path.join(img_dir, bands_filenames[0])
# Second argument to create_image; presumably the image file it writes -- confirm in image.py:
result_img_path = "data/images/high_resolution/results/SENTINEL-2B_MSI_20210511_084252_kmeans_20_clusters_labels_trained_on_Blue_Red_Green_VNIR.tif"

In [3]:
%%time

# Reload the band table from CSV. index_col=False tells pandas not to use the
# first column as the index, so all four bands stay regular columns.
data = pd.read_csv(data_path, index_col=False)

CPU times: total: 59.1 s
Wall time: 1min 9s


In [4]:
# Preview the reloaded band table.
data

Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
120560395,837,628,496,376
120560396,823,634,496,395
120560397,819,628,495,399
120560398,827,626,490,440


In [5]:
%%time

# K-means with 20 clusters over every row of the band table, seeded with SEED.
# The estimator is configured first and trained in a second step (fit returns
# the estimator itself). The recorded run took about 5 h 17 min of wall time.
kmeans = KMeans(n_clusters=20, random_state=SEED)
kmeans = kmeans.fit(data)

CPU times: total: 5h 19min 57s
Wall time: 5h 16min 55s


In [12]:
%%time

# Back up the fitted estimator with joblib so it can be restored later without
# repeating the fit.
# NOTE(review): the recorded output of this cell shows a file name without
# "_20_clusters", i.e. it predates the current kmeans_backup_path -- verify
# which backup file actually exists on disk.
dump(kmeans, kmeans_backup_path)

CPU times: total: 453 ms
Wall time: 514 ms


['data/backups/SENTINEL-2B_MSI_20210511_084252_kmeans_trained_on_Blue_Red_Green_VNIR.joblib']

In [None]:
%%time

# Restore the fitted 20-cluster estimator from its backup instead of refitting.
# joblib.load is pickle-based and can run arbitrary code while loading: only
# load backup files that you created yourself or otherwise trust.
kmeans = load(kmeans_backup_path)

In [8]:
# Wrap the fitted model's per-row cluster assignments in a Series so they can
# be previewed and written out with to_csv.
kmeans_labels = pd.Series(kmeans.labels_)

In [9]:
# Preview the label Series (one cluster id per table row).
kmeans_labels

0            3
1            3
2            3
3            3
4            3
            ..
120560395    9
120560396    9
120560397    9
120560398    9
120560399    9
Length: 120560400, dtype: int32

In [10]:
%%time

# Save the labels as a single-column CSV; the index is not written.
kmeans_labels.to_csv(labels_path, index=False)

CPU times: total: 1min 59s
Wall time: 2min 4s


In [3]:
%%time

# Reload the saved labels. read_csv returns a DataFrame, so `kmeans_labels` is
# rebound from a Series to a DataFrame here; it is flattened to a NumPy array
# in the following cell.
kmeans_labels = pd.read_csv(labels_path, index_col=False)

CPU times: total: 10 s
Wall time: 10.8 s


In [4]:
%%time

# Convert the reloaded label table to a NumPy array, then collapse it to a
# flat 1-D vector of cluster ids.
label_matrix = np.array(kmeans_labels)
kmeans_labels = label_matrix.reshape(-1)

CPU times: total: 344 ms
Wall time: 361 ms


In [14]:
%%time

# Show the flat label array together with the distinct cluster ids it contains.
kmeans_labels, np.unique(kmeans_labels)

CPU times: total: 3.41 s
Wall time: 3.65 s


(array([3, 3, 3, ..., 9, 9, 9], dtype=int64),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19], dtype=int64))

In [7]:
%%time

# Call the project helper with the first band file, the result path and the
# flat label array.
# NOTE(review): presumably the sample image supplies size/georeferencing and
# the labels are written as pixel values to result_img_path -- confirm in
# image.create_image.
create_image(sample_img_path, result_img_path, kmeans_labels)

CPU times: total: 45.5 s
Wall time: 47.1 s


### kmeans, 10 clusters

In [14]:
# Locations used by the 10-cluster KMeans run (these rebind the names used by
# the 20-cluster section).
# joblib backup of the fitted estimator:
kmeans_backup_path = "data/backups/SENTINEL-2B_MSI_20210511_084252_kmeans_10_clusters_trained_on_Blue_Red_Green_VNIR.joblib"
# CSV with one cluster label per table row:
labels_path = "data/SENTINEL-2B_MSI_20210511_084252_kmeans_10_clusters_labels_trained_on_Blue_Red_Green_VNIR.csv"
# First band file (channel 2), passed to create_image alongside the result path:
sample_img_path = os.path.join(img_dir, bands_filenames[0])
# Second argument to create_image; presumably the image file it writes -- confirm in image.py:
result_img_path = "data/images/high_resolution/results/SENTINEL-2B_MSI_20210511_084252_kmeans_10_clusters_labels_trained_on_Blue_Red_Green_VNIR.tif"

In [4]:
%%time

# Reload the band table from CSV. index_col=False tells pandas not to use the
# first column as the index, so all four bands stay regular columns.
data = pd.read_csv(data_path, index_col=False)

CPU times: total: 50.9 s
Wall time: 1min 3s


In [5]:
# Preview the reloaded band table.
data

Unnamed: 0,Blue,Green,Red,VNIR
0,998,750,605,654
1,1008,746,612,682
2,1008,758,612,699
3,1016,748,610,703
4,1022,765,603,717
...,...,...,...,...
120560395,837,628,496,376
120560396,823,634,496,395
120560397,819,628,495,399
120560398,827,626,490,440


In [6]:
%%time

# K-means with 10 clusters over every row of the band table, seeded with SEED.
# The estimator is configured first and trained in a second step (fit returns
# the estimator itself). The recorded run took about 2 h 29 min of wall time.
kmeans = KMeans(n_clusters=10, random_state=SEED)
kmeans = kmeans.fit(data)

CPU times: total: 2h 13min 32s
Wall time: 2h 28min 47s


In [7]:
%%time

# Back up the fitted 10-cluster estimator with joblib so it can be restored
# later without repeating the fit.
dump(kmeans, kmeans_backup_path)

CPU times: total: 547 ms
Wall time: 2.94 s


['data/backups/SENTINEL-2B_MSI_20210511_084252_kmeans_10_clusters_trained_on_Blue_Red_Green_VNIR.joblib']

In [None]:
%%time

# Restore the fitted 10-cluster estimator from its backup instead of refitting.
# joblib.load is pickle-based and can run arbitrary code while loading: only
# load backup files that you created yourself or otherwise trust.
kmeans = load(kmeans_backup_path)

In [8]:
# Wrap the fitted model's per-row cluster assignments in a Series so they can
# be previewed and written out with to_csv.
kmeans_labels = pd.Series(kmeans.labels_)

In [9]:
# Preview the label Series (one cluster id per table row).
kmeans_labels

0            0
1            0
2            0
3            0
4            0
            ..
120560395    0
120560396    0
120560397    0
120560398    0
120560399    0
Length: 120560400, dtype: int32

In [10]:
%%time

# Save the labels as a single-column CSV; the index is not written.
kmeans_labels.to_csv(labels_path, index=False)

CPU times: total: 1min 45s
Wall time: 1min 47s


In [4]:
%%time

# Reload the saved labels. read_csv returns a DataFrame, so `kmeans_labels` is
# rebound from a Series to a DataFrame here; it is flattened to a NumPy array
# in the following cell.
kmeans_labels = pd.read_csv(labels_path, index_col=False)

CPU times: total: 11.9 s
Wall time: 13.1 s


In [5]:
%%time

# Convert the reloaded label table to a NumPy array, then collapse it to a
# flat 1-D vector of cluster ids.
label_matrix = np.array(kmeans_labels)
kmeans_labels = label_matrix.reshape(-1)

CPU times: total: 562 ms
Wall time: 600 ms


In [6]:
%%time

# Show the flat label array together with the distinct cluster ids it contains.
kmeans_labels, np.unique(kmeans_labels)

CPU times: total: 4.16 s
Wall time: 4.44 s


(array([0, 0, 0, ..., 0, 0, 0], dtype=int64),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64))

In [15]:
%%time

# Call the project helper with the first band file, the result path and the
# flat label array.
# NOTE(review): presumably the sample image supplies size/georeferencing and
# the labels are written as pixel values to result_img_path -- confirm in
# image.create_image.
create_image(sample_img_path, result_img_path, kmeans_labels)

CPU times: total: 55.5 s
Wall time: 1min 2s


### gaussian mixture