# Практика

In [47]:
%%time

import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from osgeo import gdal
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

SEED = 42

CPU times: total: 0 ns
Wall time: 0 ns


In [27]:
%%time

img_dir = "data/images/"

sample_filename = "lccswm2010_4.img"
channels_filenames = [
    "priv_mod_v10_2011layer1.img",
    "priv_mod_v10_2011layer2.img",
    "2010-05-01.img",
    "2010-05-02.img",
    "2010-05-03.img",
    "2010-07-01.img",
    "2010-07-02.img",
    "2010-07-03.img",
    "2010-09-01.img",
    "2010-09-02.img",
    "2010-09-03.img",
]
output_filename = "lc_map.img"

column_names = [
    'CLASS',
    'X',
    'Y',
    'WINTER1',
    'WINTER2',
    'SPRING1',
    'SPRING2',
    'SPRING3',
    'SUMMER1',
    'SUMMER2',
    'SUMMER3',
    'FALL1',
    'FALL2',
    'FALL3'
]

CPU times: total: 0 ns
Wall time: 0 ns


## Получение табличных данных из изображений

In [3]:
%%time

sample_path = os.path.join(img_dir, sample_filename)
sample_img = gdal.Open(sample_path, gdal.GA_ReadOnly)

sample_x_size = sample_img.RasterXSize
sample_y_size = sample_img.RasterYSize

sample_band = sample_img.GetRasterBand(1)
sample_data = sample_band.ReadAsArray(0, 0, sample_x_size, sample_y_size)

sample_img = None
sample_band = None

class_col = sample_data.ravel()[np.flatnonzero(sample_data)]
y_col, x_col = np.nonzero(sample_data)

CPU times: total: 10.7 s
Wall time: 11.3 s


In [5]:
%%time

data = np.column_stack((class_col, x_col, y_col))

CPU times: total: 3.73 s
Wall time: 5.27 s


In [6]:
%%time

data[:10]

CPU times: total: 0 ns
Wall time: 18 ms


array([[   20, 37492,  1429],
       [   20, 37494,  1429],
       [   20, 37495,  1429],
       [   20, 37496,  1429],
       [   20, 37487,  1431],
       [   20, 37486,  1432],
       [   20, 37487,  1432],
       [   20, 37483,  1434],
       [   20, 37484,  1434],
       [   13, 37515,  1434]], dtype=int64)

In [7]:
%%time

for channel_filename in channels_filenames:
    channel_path = os.path.join(img_dir, channel_filename)
    channel_img = gdal.Open(channel_path, gdal.GA_ReadOnly)

    channel_x_size = channel_img.RasterXSize
    channel_y_size = channel_img.RasterYSize

    channel_band = channel_img.GetRasterBand(1)
    channel_data = channel_band.ReadAsArray(0, 0, channel_x_size, channel_y_size)

    channel_img = None
    channel_band = None

    channel_col = channel_data[y_col, x_col]

    data = np.column_stack((data, channel_col))

CPU times: total: 7min 21s
Wall time: 24min 23s


In [31]:
%%time

data[:10]

CPU times: total: 0 ns
Wall time: 0 ns


array([[   20, 37492,  1429,  5517,  4840,   406,   450,   264,   375,
          450,   264,   406,   450,   264],
       [   20, 37494,  1429,  6099,  5531,   499,   580,   349,   224,
          580,   349,   499,   580,   349],
       [   20, 37495,  1429,  6351,  5795,   533,   578,   596,   186,
          578,   596,   533,   578,   596],
       [   20, 37496,  1429,  6729,  6179,   379,   411,   573,   196,
          411,   573,   379,   411,   573],
       [   20, 37487,  1431,  7467,  6886,   197,   158,    90,   185,
          158,    90,   145,    68,     3],
       [   20, 37486,  1432,  7284,  6719,   191,   218,    47,   260,
          218,    47,   127,    57,     8],
       [   20, 37487,  1432,  6999,  6378,   343,   288,   379,   247,
          288,   379,   190,    34,   239],
       [   20, 37483,  1434,  5548,  4771,    30,    23,    22,   181,
           23,    22,    30,    23,    22],
       [   20, 37484,  1434,  5419,  4707,   113,    98,   197,   285,
         

In [23]:
%%time

data_pandas = pd.DataFrame(data=data, columns=column_names, dtype=np.uint16)

CPU times: total: 46.7 s
Wall time: 3min 14s


In [24]:
%%time

data_pandas.to_csv('data/lc_sample.csv', index=False)

CPU times: total: 5min 10s
Wall time: 5min 50s


In [34]:
%%time

data_pandas.head(10)

CPU times: total: 0 ns
Wall time: 0 ns


Unnamed: 0,CLASS,X,Y,WINTER1,WINTER2,SPRING1,SPRING2,SPRING3,SUMMER1,SUMMER2,SUMMER3,FALL1,FALL2,FALL3
0,20,37492,1429,5517,4840,406,450,264,375,450,264,406,450,264
1,20,37494,1429,6099,5531,499,580,349,224,580,349,499,580,349
2,20,37495,1429,6351,5795,533,578,596,186,578,596,533,578,596
3,20,37496,1429,6729,6179,379,411,573,196,411,573,379,411,573
4,20,37487,1431,7467,6886,197,158,90,185,158,90,145,68,3
5,20,37486,1432,7284,6719,191,218,47,260,218,47,127,57,8
6,20,37487,1432,6999,6378,343,288,379,247,288,379,190,34,239
7,20,37483,1434,5548,4771,30,23,22,181,23,22,30,23,22
8,20,37484,1434,5419,4707,113,98,197,285,98,197,197,124,38
9,13,37515,1434,9940,9614,602,928,1977,776,928,1977,625,863,1771


In [35]:
%%time

data_pandas.describe()

CPU times: total: 1min 2s
Wall time: 1min 36s


Unnamed: 0,CLASS,X,Y,WINTER1,WINTER2,SPRING1,SPRING2,SPRING3,SUMMER1,SUMMER2,SUMMER3,FALL1,FALL2,FALL3
count,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0,74029670.0
mean,7.61112,24376.72,12002.4,4548.318,4731.402,548.9833,2052.998,1848.421,421.4263,2556.332,1787.649,512.4377,2131.846,1697.764
std,6.342813,8210.278,3491.216,2416.994,2041.213,222.3997,591.3594,544.046,251.9643,701.8792,547.6155,253.7448,548.4777,582.3136
min,1.0,5820.0,1429.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,3.0,17874.0,9354.0,2529.0,3018.0,383.0,1693.0,1519.0,255.0,2174.0,1474.0,338.0,1835.0,1355.0
50%,4.0,25348.0,12143.0,3989.0,4178.0,506.0,1966.0,1812.0,333.0,2462.0,1704.0,470.0,2098.0,1613.0
75%,12.0,30730.0,14441.0,6667.0,6565.0,680.0,2374.0,2191.0,508.0,2939.0,2069.0,623.0,2467.0,1994.0
max,23.0,40161.0,20580.0,10583.0,10584.0,8653.0,7018.0,5211.0,8653.0,6907.0,4923.0,8653.0,6907.0,5280.0


## Классификация

### Разделение выборки на обучающую и тестовую

In [3]:
%%time

data = pd.read_csv('data/lc_sample.csv', index_col=False, dtype=np.uint16)

CPU times: total: 1min 17s
Wall time: 1min 22s


In [14]:
%%time

msk = np.random.rand(len(data)) <= 0.9

data_train = data[~msk]
data_test = data[msk]

CPU times: total: 32.9 s
Wall time: 47.1 s


In [16]:
%%time

data.shape, data_train.shape, data_test.shape

CPU times: total: 0 ns
Wall time: 0 ns


((74029669, 14), (7406365, 14), (66623304, 14))

In [17]:
%%time

data_train.to_csv('data/lc_sample_train.csv', index=False)
data_test.to_csv('data/lc_sample_control.csv', index=False)

CPU times: total: 5min 30s
Wall time: 6min 6s


### Обучение и классификация

In [2]:
%%time

data_train = pd.read_csv('data/lc_sample_train.csv', dtype=np.uint16, index_col=False)

CPU times: total: 7.45 s
Wall time: 8.3 s


In [6]:
%%time

X_train = data_train[data_train.columns.drop("CLASS")]
y_train = data_train["CLASS"]
X_train.shape, y_train.shape

CPU times: total: 1.56 s
Wall time: 2.08 s


((7406365, 13), (7406365,))

In [7]:
%%time

rf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=SEED)

CPU times: total: 15.6 ms
Wall time: 9 ms


In [8]:
%%time

rf.fit(X_train, y_train)

CPU times: total: 2h 48min 24s
Wall time: 52min 27s


In [2]:
%%time

# dump(rf, 'data/backups/rf_fitted_on_data_train.joblib')
rf = load('data/backups/rf_fitted_on_data_train.joblib')

CPU times: total: 26.2 s
Wall time: 2min 25s


In [11]:
%%time

oob = rf.oob_score_
oob

CPU times: total: 0 ns
Wall time: 1 ms


0.976938484668255

In [12]:
%%time

data_test = pd.read_csv('data/lc_sample_control.csv', dtype=np.uint16, index_col=False)

CPU times: total: 1min 21s
Wall time: 1min 32s


In [13]:
%%time

X_test = data_test[data_test.columns.drop("CLASS")]
y_test = data_test["CLASS"]
X_test.shape, y_test.shape

CPU times: total: 4.7 s
Wall time: 7.28 s


((66623304, 13), (66623304,))

In [18]:
%%time

X_test.to_csv('data/X_test.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

CPU times: total: 5min 38s
Wall time: 6min 25s


In [3]:
%%time

X_test = pd.read_csv('data/X_test.csv', dtype=np.uint16, index_col=False)
y_test = pd.read_csv('data/y_test.csv', dtype=np.uint16, index_col=False)

CPU times: total: 1min 13s
Wall time: 1min 22s


In [5]:
chunksize = 5000000
chunks_number = int(np.ceil(X_test.shape[0] / chunksize))
chunks_number

14

In [6]:
%%time

for i, chunk in enumerate(pd.read_csv('data/X_test.csv', dtype=np.uint16, chunksize=chunksize)):
    chunk.to_csv(f'data/backups/X_test_chunk_{i}.csv', index=False)

CPU times: total: 5min 49s
Wall time: 7min 4s


In [7]:
%%time

y_pred = []
for i in range(chunks_number):
    X_test = pd.read_csv(f'data/backups/X_test_chunk_{i}.csv', dtype=np.uint16)
    y_pred += list(rf.predict(X_test))

y_pred = np.array(y_pred)

CPU times: total: 1h 39min 12s
Wall time: 40min 2s


In [9]:
%%time

y_pred_series = pd.Series(y_pred)
y_pred_series.to_csv('data/y_pred.csv', index=False)

CPU times: total: 1min 4s
Wall time: 1min 6s


In [13]:
%%time

f1_score_weighted = f1_score(y_test, y_pred, average='weighted')
f1_score_weighted

CPU times: total: 22.5 s
Wall time: 23.4 s


0.9775497095053868

In [14]:
%%time

accuracy = accuracy_score(y_test, y_pred)
accuracy

CPU times: total: 3.08 s
Wall time: 3.69 s


0.9777492272073447

## Получение карты земного покрова по результатам классификации

In [3]:
%%time

X_test = pd.read_csv('data/X_test.csv', index_col=False, dtype=np.uint16)
y_pred = pd.read_csv('data/y_pred.csv', index_col=False, dtype=np.uint16)

CPU times: total: 1min 10s
Wall time: 1min 17s


In [8]:
%%time

xs, ys = X_test['X'], X_test['Y']
X_test = None

CPU times: total: 0 ns
Wall time: 0 ns


In [22]:
%%time

xs = np.array(xs)
ys = np.array(ys)
y_pred = np.array(y_pred).ravel()

CPU times: total: 125 ms
Wall time: 146 ms


In [25]:
xs, ys, y_pred

(array([37492, 37494, 37495, ..., 35628, 35631, 35632], dtype=uint16),
 array([ 1429,  1429,  1429, ..., 20580, 20580, 20580], dtype=uint16),
 array([20, 13, 13, ...,  8,  8,  8], dtype=uint16))

In [52]:
%%time

sample_path = os.path.join(img_dir, sample_filename)
sample_img = gdal.Open(sample_path, gdal.GA_ReadOnly)

output_path = os.path.join(img_dir, output_filename)
driver = sample_img.GetDriver()
output_img = driver.Create(output_path, sample_img.RasterXSize, sample_img.RasterYSize, 1, gdal.GDT_UInt16)

output_img.SetGeoTransform(sample_img.GetGeoTransform())
output_img.SetProjection(sample_img.GetProjection())
output_img.GetRasterBand(1).Fill(0)

output_band = output_img.GetRasterBand(1)

CPU times: total: 609 ms
Wall time: 720 ms


In [39]:
%%time

raster = np.zeros((sample_img.RasterYSize, sample_img.RasterXSize), dtype=np.uint8)

CPU times: total: 0 ns
Wall time: 0 ns


In [40]:
%%time

for y, x, predicted_class in zip(ys, xs, y_pred):
    raster[y][x] = predicted_class

CPU times: total: 50.1 s
Wall time: 51.9 s


In [53]:
%%time

output_band.WriteArray(raster)
output_img = None
output_band = None

CPU times: total: 1.86 s
Wall time: 13.5 s
