# Анализ метрик, полученных из Prometheus, с помощью машинного обучения
### Загрузка модуля $sktime$ 

In [None]:
!pip install sktime



### Импорт необходимых модулей

In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sktime.classification.compose import ColumnEnsembleClassifier
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.datatypes._panel._convert import from_2d_array_to_nested

### Загрузка данных

#### Считывание данных, полученных с помощью Prometheus с виртуальных машин (train)
Данные собраны в 3 состояниях:
- "Пустое" состояние
- Состояние повышенной нагрузки (с помощью утилиты stress)
- Состояние полезной нагрузки (работает Prometheus)

Файлы diam_\*, rept_\* - зараженные \\
Файлы free_\* - чистые

In [None]:
# Training captures, one CSV per VM run, read in a fixed order:
# diam_1..3, rept_1..3, then free_1..3. Later cells depend on this order
# when they assign labels by position.
train_paths = (
    '/content/drive/MyDrive/curs_work/train/diam_1.csv',
    '/content/drive/MyDrive/curs_work/train/diam_2.csv',
    '/content/drive/MyDrive/curs_work/train/diam_3.csv',
    '/content/drive/MyDrive/curs_work/train/rept_1.csv',
    '/content/drive/MyDrive/curs_work/train/rept_2.csv',
    '/content/drive/MyDrive/curs_work/train/rept_3.csv',
    '/content/drive/MyDrive/curs_work/train/free_1.csv',
    '/content/drive/MyDrive/curs_work/train/free_2.csv',
    '/content/drive/MyDrive/curs_work/train/free_3.csv',
)
# The first CSV column is used as the row index of each frame.
fd = [pd.read_csv(path, index_col=0) for path in train_paths]

Разобьем данные с каждой ВМ на 3 части для дальнейшего обучения

In [None]:
# Cut every training frame into three consecutive 40-row windows
# (rows 0-39, 40-79, 80-119) and collect them in order, three per frame.
enlarged = []
for vm_frame in fd:
    head, middle, tail = (
        vm_frame.iloc[np.arange(start, start + 40)] for start in (0, 40, 80)
    )
    # Only the second and third windows get a fresh 0-based index; the first
    # window keeps the index it was read with.
    enlarged.extend(
        [head, middle.reset_index(drop=True), tail.reset_index(drop=True)]
    )

##### Перевод данных из pandas DataFrame в тип данных, необходимый для работы с модулем sktime

In [None]:
# Convert every 40-row window into sktime's nested layout and stack the
# converted windows into one frame, in the same order as `enlarged`.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# calling it in a loop copies the whole frame on each step; a single pd.concat
# gives the same rows (original per-window index labels are kept) in one pass.
sk_frame = pd.concat(
    [from_2d_array_to_nested(window.T).T for window in enlarged]
)

In [None]:
# Restore the metric names as column labels and give the rows a 0..26 index.
sk_frame.columns = enlarged[0].columns
sk_frame = sk_frame.reset_index(drop=True)
# Rows arrive grouped per source frame (three consecutive windows each).
# Reorder so that all first windows come first, then all second windows, then
# all third windows: positions 0,3,6,... then 1,4,7,... then 2,5,8,...
row_order = np.concatenate([np.arange(offset, 27, 3) for offset in range(3)])
sk_frame = sk_frame.reindex(row_order).reset_index(drop=True)
sk_frame.head(5)


Unnamed: 0,libvirt_domain_block_stats_flush_requests_total,libvirt_domain_block_stats_flush_time_seconds_total,libvirt_domain_block_stats_read_bytes_total,libvirt_domain_block_stats_read_requests_total,libvirt_domain_block_stats_read_time_seconds_total,libvirt_domain_block_stats_write_bytes_total,libvirt_domain_block_stats_write_requests_total,libvirt_domain_block_stats_write_time_seconds_total,libvirt_domain_info_cpu_time_seconds_total,libvirt_domain_info_memory_usage_bytes,...,libvirt_domain_interface_stats_transmit_packets_total,libvirt_domain_memory_stats_disk_cache_bytes,libvirt_domain_memory_stats_major_fault_total,libvirt_domain_memory_stats_minor_fault_total,libvirt_domain_memory_stats_rss_bytes,libvirt_domain_memory_stats_unused_bytes,libvirt_domain_memory_stats_usable_bytes,libvirt_domain_memory_stats_used_percent,libvirt_domain_vcpu_cpu,libvirt_domain_vcpu_time_seconds_total
0,0 51896.0 1 51896.0 2 51908.0 3 ...,0 3.812795 1 3.812795 2 3.814651 3...,0 1.046134e+09 1 1.046134e+09 2 1....,0 48260.0 1 48260.0 2 48260.0 3 ...,0 140.977919 1 140.977919 2 140.97...,0 5.335634e+09 1 5.335634e+09 2 5....,0 75123.0 1 75123.0 2 75191.0 3 ...,0 136.586810 1 136.586810 2 136.61...,0 7242.05 1 7245.63 2 7249.16 3 ...,0 1.073742e+09 1 1.073742e+09 2 1....,...,0 52300.0 1 52321.0 2 52371.0 3 ...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 3810.0 1 3810.0 2 3810.0 3 3...,0 22545435.0 1 22545435.0 2 225471...,0 1.121731e+09 1 1.121731e+09 2 1....,0 163254272.0 1 163287040.0 2 1637...,0 768663552.0 1 768696320.0 2 7692...,0 25.569054 1 25.565881 2 25.51670...,0 5.0 1 13.0 2 21.0 3 5.0 4 ...,0 615.10 1 615.17 2 615.30 3 6...
1,0 24780672.0 1 26145937.0 2 276454...,0 208.361958 1 219.678610 2 232.07...,0 1.107918e+09 1 1.107918e+09 2 1....,0 60581.0 1 60581.0 2 60581.0 3 ...,0 161.944016 1 161.944016 2 161.94...,0 5.486572e+09 1 5.486572e+09 2 5....,0 92736.0 1 92736.0 2 92736.0 3 ...,0 214.482965 1 214.482965 2 214.48...,0 13493.85 1 13642.20 2 13793.26 3...,0 1.073742e+09 1 1.073742e+09 2 1....,...,0 82356.0 1 82380.0 2 82395.0 3 ...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 3960.0 1 3960.0 2 3960.0 3 3...,0 138040414.0 1 145709580.0 2 1527...,0 1.121944e+09 1 1.121944e+09 2 1....,0 408174592.0 1 536453120.0 2 3890...,0 618303488.0 1 746582016.0 2 5991...,0 40.128664 1 27.707245 2 41.97890...,0 15.0 1 11.0 2 3.0 3 3.0 4 ...,0 1721.68 1 1781.44 2 1841.19 3 ...
2,0 217333486.0 1 217333494.0 2 2173...,0 1745.715011 1 1745.716272 2 1745...,0 1.290030e+09 1 1.290067e+09 2 1....,0 62469.0 1 62473.0 2 62473.0 3 ...,0 166.998447 1 167.065036 2 167.06...,0 6.043132e+09 1 6.043615e+09 2 6....,0 111134.0 1 111204.0 2 111217.0 3...,0 226.401918 1 226.440294 2 226.44...,0 32085.56 1 32089.23 2 32092.89 3...,0 1.073742e+09 1 1.073742e+09 2 1....,...,0 115307.0 1 115318.0 2 115325.0 3...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 4042.0 1 4043.0 2 4043.0 3 4...,0 1.066024e+09 1 1.066025e+09 2 1....,0 1.121886e+09 1 1.121886e+09 2 1....,0 162742272.0 1 162504704.0 2 1625...,0 753201152.0 1 753078272.0 2 7530...,0 27.066303 1 27.078202 2 27.07621...,0 9.0 1 1.0 2 1.0 3 27.0 4 ...,0 8959.46 1 8959.66 2 8959.82 3 ...
3,0 52634.0 1 52638.0 2 52638.0 3 ...,0 3.719479 1 3.720177 2 3.720177 3...,0 961037312.0 1 961037312.0 2 9610...,0 44975.0 1 44975.0 2 44975.0 3 ...,0 147.13808 1 147.13808 2 147.1380...,0 5.431456e+09 1 5.431472e+09 2 5....,0 82255.0 1 82258.0 2 82259.0 3 ...,0 114.547265 1 114.548299 2 114.54...,0 7192.28 1 7195.84 2 7199.23 3 ...,0 1.073742e+09 1 1.073742e+09 2 1....,...,0 51890.0 1 51908.0 2 51936.0 3 ...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 2339.0 1 2339.0 2 2339.0 3 2...,0 18651846.0 1 18651846.0 2 186518...,0 1.119650e+09 1 1.119650e+09 2 1....,0 193613824.0 1 193646592.0 2 1936...,0 769568768.0 1 769601536.0 2 7696...,0 25.481400 1 25.478227 2 25.47505...,0 25.0 1 21.0 2 31.0 3 7.0 4 ...,0 915.08 1 915.17 2 915.26 3 9...
4,0 15871504.0 1 17405784.0 2 190254...,0 131.631761 1 143.894649 2 156.57...,0 980214784.0 1 980214784.0 2 9802...,0 46481.0 1 46481.0 2 46481.0 3 ...,0 155.539272 1 155.539272 2 155.53...,0 5.811597e+09 1 5.811597e+09 2 5....,0 111165.0 1 111165.0 2 111165.0 3...,0 139.674985 1 139.674985 2 139.67...,0 12577.01 1 12728.06 2 12880.53 3...,0 1.073742e+09 1 1.073742e+09 2 1....,...,0 82599.0 1 82620.0 2 82643.0 3 ...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 2421.0 1 2421.0 2 2421.0 3 2...,0 93421994.0 1 101188184.0 2 1089...,0 1.119658e+09 1 1.119658e+09 2 1....,0 253440000.0 1 242671616.0 2 4057...,0 564498432.0 1 553172992.0 2 7162...,0 45.338696 1 46.435357 2 30.64502...,0 13.0 1 25.0 2 13.0 3 6.0 4 ...,0 1678.57 1 1738.35 2 1798.10 3 ...


In [None]:
# Labels follow the row order of sk_frame: each group of nine rows holds the
# six diam_*/rept_* windows (label 1) followed by the three free_* windows
# (label 0), and that group repeats once per window position (three times).
labels_per_group = [1] * 6 + [0] * 3
y_train = np.array(labels_per_group * 3)
X_train = sk_frame

#### Считывание данных, полученных с помощью Prometheus с виртуальных машин (test)

In [None]:
# Read every file found under the test directory and build the matching label
# vector: the three listed instance files get label 1, everything else 0.
# NOTE(review): 1 presumably means "infected", as in the training labels - confirm.
infected_files = {
    'instance-00006400.csv',
    'instance-000063fe.csv',
    'instance-00006401.csv',
}
fd_test = []
test_labels = []
for root, _dirs, files in os.walk('/content/drive/MyDrive/curs_work/test/'):
    for f in files:
        print(f)
        # Join with `root`, not the base directory: os.walk also yields files
        # from subdirectories, and those live under their own `root`.
        fd_test.append(pd.read_csv(os.path.join(root, f), index_col=0))
        test_labels.append(1.0 if f in infected_files else 0.0)
# Float labels, as before; built once instead of re-allocating via np.append.
y_test = np.array(test_labels)

instance-0000096b.csv
instance-000063fe.csv
instance-000060f1.csv
instance-0000094d.csv
instance-0000055f.csv
instance-0000054d.csv
instance-000002cb.csv
instance-000001c3.csv
instance-000002bf.csv
instance-00006479.csv
instance-00006400.csv
instance-00000962.csv
instance-00006401.csv
instance-00005145.csv
instance-00000968.csv
instance-00000932.csv
instance-00000607.csv
instance-00000538.csv
instance-00000409.csv
instance-00000286.csv


##### Преобразование в sktime DataFrame

In [None]:
# Convert every test frame into sktime's nested layout and stack the results
# in the same order as `fd_test`.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat produces the same stacked frame without repeated copying.
X_test = pd.concat(
    [from_2d_array_to_nested(frame.T).T for frame in fd_test]
)
# Restore the metric names and number the rows 0..n-1.
X_test.columns = fd_test[0].columns
X_test = X_test.reset_index(drop=True)
X_test.head(26)

Unnamed: 0,libvirt_domain_block_stats_flush_requests_total,libvirt_domain_block_stats_flush_time_seconds_total,libvirt_domain_block_stats_read_bytes_total,libvirt_domain_block_stats_read_requests_total,libvirt_domain_block_stats_read_time_seconds_total,libvirt_domain_block_stats_write_bytes_total,libvirt_domain_block_stats_write_requests_total,libvirt_domain_block_stats_write_time_seconds_total,libvirt_domain_info_cpu_time_seconds_total,libvirt_domain_info_memory_usage_bytes,...,libvirt_domain_interface_stats_transmit_packets_total,libvirt_domain_memory_stats_disk_cache_bytes,libvirt_domain_memory_stats_major_fault_total,libvirt_domain_memory_stats_minor_fault_total,libvirt_domain_memory_stats_rss_bytes,libvirt_domain_memory_stats_unused_bytes,libvirt_domain_memory_stats_usable_bytes,libvirt_domain_memory_stats_used_percent,libvirt_domain_vcpu_cpu,libvirt_domain_vcpu_time_seconds_total
0,0 1924658.0 1 1924658.0 2 1924658....,0 278.026807 1 278.026807 2 278.02...,0 1.116498e+10 1 1.116498e+10 2 1....,0 1019192.0 1 1019192.0 2 1019192....,0 6924.720159 1 6924.720159 2 6924...,0 1.752094e+11 1 1.752094e+11 2 1....,0 12572097.0 1 12572097.0 2 125720...,0 35232.144585 1 35232.144585 2 35...,0 1881644.44 1 1881646.54 2 188164...,0 8.589935e+09 1 8.589935e+09 2 8....,...,0 20452181.0 1 20452211.0 2 204522...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 4581.0 1 4581.0 2 4581.0 3 4...,0 560520931.0 1 560520931.0 2 5605...,0 8.623546e+09 1 8.623546e+09 2 8....,0 1.910174e+09 1 1.910272e+09 2 1....,0 7.745942e+09 1 7.746040e+09 2 7....,0 7.388321 1 7.387146 2 7.390672 3...,0 4.0 1 28.0 2 12.0 3 26.0 4 ...,0 171748.23 1 171748.50 2 171748.8...
1,0 207915675.0 1 207915685.0 2 2079...,0 1901.827081 1 1901.828116 2 1901...,0 3.666002e+09 1 3.666002e+09 2 3....,0 140992.0 1 140992.0 2 140992.0 3...,0 1317.435048 1 1317.435048 2 1317...,0 2.231612e+10 1 2.231637e+10 2 2....,0 1467940.0 1 1467984.0 2 1468057....,0 20081.778790 1 20083.600719 2 20...,0 161247.10 1 161249.38 2 161251.5...,0 1.073742e+09 1 1.073742e+09 2 1....,...,0 1369761.0 1 1369791.0 2 1369824....,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 7763.0 1 7763.0 2 7763.0 3 7...,0 1.093241e+09 1 1.093242e+09 2 1....,0 1.168650e+09 1 1.168650e+09 2 1....,0 264060928.0 1 263872512.0 2 2641...,0 731688960.0 1 731512832.0 2 7318...,0 29.149364 1 29.166419 2 29.13786...,0 28.0 1 31.0 2 0.0 3 14.0 4 ...,0 14812.25 1 14812.35 2 14812.47 3...
2,0 416490.0 1 416516.0 2 416546.0 3...,0 351.748647 1 351.750412 2 351.75...,0 1.908914e+09 1 1.908914e+09 2 1....,0 110227.0 1 110227.0 2 110227.0 3...,0 1085.833508 1 1085.833508 2 1085...,0 6.295988e+10 1 6.296431e+10 2 6....,0 4398420.0 1 4399326.0 2 4400260....,0 6817.273820 1 6817.462357 2 6817...,0 273496.40 1 273513.27 2 273530.3...,0 3.435974e+10 1 3.435974e+10 2 3....,...,0 208603.0 1 208635.0 2 208653.0 3...,0 3.540468e+09 1 3.540496e+09 2 3....,0 2931.0 1 2931.0 2 2931.0 3 2...,0 2.279965e+09 1 2.280234e+09 2 2....,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 2.738000e+10 1 2.735354e+10 2 2....,0 3.063694e+10 1 3.061054e+10 2 3....,0 9.012911 1 9.091299 2 9.008021 3...,0 9.0 1 29.0 2 3.0 3 11.0 4 ...,0 35723.34 1 35723.83 2 35725.32 3...
3,0 1864439.0 1 1864443.0 2 1864445....,0 454.507718 1 454.508690 2 454.50...,0 7.788868e+10 1 7.788868e+10 2 7....,0 651847.0 1 651847.0 2 651847.0 3...,0 30913.838704 1 30913.838704 2 30...,0 1.467438e+11 1 1.467438e+11 2 1....,0 4547747.0 1 4547751.0 2 4547754....,0 35218.455417 1 35218.456340 2 35...,0 161904.34 1 161904.52 2 161904.8...,0 2.147484e+09 1 2.147484e+09 2 2....,...,0 83627318.0 1 83627338.0 2 836285...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 11201.0 1 11201.0 2 11201.0 3 ...,0 1.274827e+09 1 1.274827e+09 2 1....,0 2.140529e+09 1 2.140529e+09 2 2....,0 189808640.0 1 189808640.0 2 1896...,0 1.411596e+09 1 1.411596e+09 2 1....,0 26.752660 1 26.752660 2 26.75691...,0 12.0 1 22.0 2 2.0 3 8.0 4 ...,0 143611.52 1 143611.65 2 143611.9...
4,0 2185480.0 1 2185484.0 2 2185488....,0 274.517765 1 274.518062 2 274.51...,0 1.166085e+09 1 1.166085e+09 2 1....,0 118743.0 1 118743.0 2 118743.0 3...,0 1222.559447 1 1222.559447 2 1222...,0 1.068237e+11 1 1.068238e+11 2 1....,0 5576595.0 1 5576604.0 2 5576613....,0 19499.658551 1 19499.661733 2 19...,0 1766509.19 1 1766511.56 2 176651...,0 8.589935e+09 1 8.589935e+09 2 8....,...,0 3983814.0 1 3983814.0 2 3983814....,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 2059.0 1 2059.0 2 2059.0 3 2...,0 112438885.0 1 112438885.0 2 1124...,0 2.328211e+09 1 2.328211e+09 2 2....,0 6.494118e+09 1 6.494380e+09 2 6....,0 7.648182e+09 1 7.648444e+09 2 7....,0 8.637503 1 8.634371 2 8.635937 3...,0 14.0 1 6.0 2 16.0 3 0.0 4 ...,0 116509.66 1 116509.84 2 116510.0...
5,0 2573707.0 1 2573707.0 2 2573707....,0 350.439009 1 350.439009 2 350.43...,0 8.743032e+12 1 8.743032e+12 2 8....,0 19931368.0 1 19931368.0 2 199313...,0 364360.611894 1 364360.611894 2 ...,0 4.067009e+11 1 4.067009e+11 2 4....,0 14567147.0 1 14567147.0 2 145671...,0 34329.632140 1 34329.632140 2 34...,0 7612228.40 1 7612231.49 2 761223...,0 8.589935e+09 1 8.589935e+09 2 8....,...,0 26662006.0 1 26662047.0 2 266620...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 3247.0 1 3247.0 2 3247.0 3 3...,0 152659363.0 1 152662087.0 2 1526...,0 8.628576e+09 1 8.628576e+09 2 8....,0 6.117143e+09 1 6.116561e+09 2 6....,0 7.508005e+09 1 7.507423e+09 2 7....,0 10.219410 1 10.226365 2 10.22509...,0 17.0 1 4.0 2 17.0 3 25.0 4 ...,0 785965.42 1 785965.44 2 785965.4...
6,0 1252291.0 1 1252291.0 2 1252291....,0 125.051482 1 125.051482 2 125.05...,0 3.760165e+09 1 3.760165e+09 2 3....,0 388098.0 1 388098.0 2 388098.0 3...,0 843.173041 1 843.173041 2 843.17...,0 1.225191e+11 1 1.225191e+11 2 1....,0 4794158.0 1 4794158.0 2 4794158....,0 11689.431430 1 11689.431430 2 11...,0 1718702.62 1 1718704.18 2 171870...,0 8.589935e+09 1 8.589935e+09 2 8....,...,0 591946.0 1 591946.0 2 591946.0 3...,0 2.929267e+09 1 2.929267e+09 2 2....,0 6516.0 1 6516.0 2 6516.0 3 6...,0 717637755.0 1 717638438.0 2 7176...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 4.554895e+09 1 4.555043e+09 2 4....,0 7.727763e+09 1 7.727911e+09 2 7....,0 7.424133 1 7.422366 2 7.422660 3...,0 11.0 1 27.0 2 7.0 3 1.0 4 ...,0 13233.79 1 13233.81 2 13233.82 3...
7,0 88740296.0 1 88740422.0 2 887406...,0 11593.453459 1 11593.465196 2 11...,0 1.184182e+11 1 1.184198e+11 2 1....,0 13369285.0 1 13369635.0 2 133696...,0 52581.623027 1 52585.033637 2 52...,0 2.224863e+13 1 2.224867e+13 2 2....,0 214867098.0 1 214867451.0 2 2148...,0 273485.496014 1 273485.860114 2 ...,0 30935958.45 1 30936007.75 2 3093...,0 4.294967e+09 1 4.294967e+09 2 4....,...,0 1.109860e+09 1 1.109861e+09 2 1....,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 115568.0 1 115571.0 2 115571.0 3...,0 8.201143e+10 1 8.201159e+10 2 8....,0 4.320616e+09 1 4.320731e+09 2 4....,0 180850688.0 1 183971840.0 2 1754...,0 664760320.0 1 669970432.0 2 6618...,0 83.956128 1 83.830382 2 84.02740...,0 18.0 1 10.0 2 6.0 3 4.0 4 ...,0 14684159.02 1 14684182.34 2 1468...
8,0 1017857.0 1 1017857.0 2 1017861....,0 244.152348 1 244.152348 2 244.15...,0 468919808.0 1 468919808.0 2 4689...,0 20858.0 1 20858.0 2 20858.0 3 ...,0 145.517519 1 145.517519 2 145.51...,0 2.941898e+10 1 2.941898e+10 2 2....,0 2173418.0 1 2173418.0 2 2173427....,0 12600.403094 1 12600.403094 2 12...,0 7019912.61 1 7019923.11 2 701993...,0 8.589935e+09 1 8.589935e+09 2 8....,...,0 1258597.0 1 1258597.0 2 1258597....,0 741605376.0 1 741605376.0 2 7416...,0 819.0 1 819.0 2 819.0 3 819....,0 115775776.0 1 115775776.0 2 1157...,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 6.755987e+09 1 6.755918e+09 2 6....,0 7.090258e+09 1 7.090188e+09 2 7....,0 12.953925 1 12.954780 2 12.95638...,0 10.0 1 14.0 2 20.0 3 10.0 4 ...,0 4629211.28 1 4629215.67 2 462921...
9,0 1036021.0 1 1036047.0 2 1036080....,0 345.959152 1 345.963141 2 345.96...,0 690943488.0 1 690943488.0 2 6909...,0 25428.0 1 25428.0 2 25428.0 3 ...,0 101.071546 1 101.071546 2 101.07...,0 1.004270e+11 1 1.004289e+11 2 1....,0 17320304.0 1 17320671.0 2 173213...,0 12650.216638 1 12650.289063 2 12...,0 1237798.25 1 1237821.65 2 123784...,0 1.717987e+10 1 1.717987e+10 2 1....,...,0 3764246.0 1 3764313.0 2 3764377....,0 7.782605e+09 1 7.782830e+09 2 7....,0 6637.0 1 6637.0 2 6637.0 3 6...,0 9.267874e+09 1 9.268131e+09 2 9....,0 0.0 1 0.0 2 0.0 3 0.0 4 ...,0 7.507198e+09 1 7.523946e+09 2 7....,0 1.537577e+10 1 1.539275e+10 2 1....,0 8.403585 1 8.302444 2 8.308471 3...,0 26.0 1 6.0 2 16.0 3 30.0 4 ...,0 307849.98 1 307853.32 2 307861.8...


In [None]:
# Debug cell: print every (row, column) position of X_test on one line,
# then show the nested cell at row 9, column 9.
for row, col in np.ndindex(X_test.shape):
    print(row, col, end=" ")

print()
print(X_test.iloc[9, 9])

# cc.fit_transform(X_test.iloc[:, [0,9]])

0 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 10 0 11 0 12 0 13 0 14 0 15 0 16 0 17 0 18 0 19 0 20 0 21 0 22 0 23 0 24 0 25 0 26 1 0 1 1 1 2 1 3 1 4 1 5 1 6 1 7 1 8 1 9 1 10 1 11 1 12 1 13 1 14 1 15 1 16 1 17 1 18 1 19 1 20 1 21 1 22 1 23 1 24 1 25 1 26 2 0 2 1 2 2 2 3 2 4 2 5 2 6 2 7 2 8 2 9 2 10 2 11 2 12 2 13 2 14 2 15 2 16 2 17 2 18 2 19 2 20 2 21 2 22 2 23 2 24 2 25 2 26 3 0 3 1 3 2 3 3 3 4 3 5 3 6 3 7 3 8 3 9 3 10 3 11 3 12 3 13 3 14 3 15 3 16 3 17 3 18 3 19 3 20 3 21 3 22 3 23 3 24 3 25 3 26 4 0 4 1 4 2 4 3 4 4 4 5 4 6 4 7 4 8 4 9 4 10 4 11 4 12 4 13 4 14 4 15 4 16 4 17 4 18 4 19 4 20 4 21 4 22 4 23 4 24 4 25 4 26 5 0 5 1 5 2 5 3 5 4 5 5 5 6 5 7 5 8 5 9 5 10 5 11 5 12 5 13 5 14 5 15 5 16 5 17 5 18 5 19 5 20 5 21 5 22 5 23 5 24 5 25 5 26 6 0 6 1 6 2 6 3 6 4 6 5 6 6 6 7 6 8 6 9 6 10 6 11 6 12 6 13 6 14 6 15 6 16 6 17 6 18 6 19 6 20 6 21 6 22 6 23 6 24 6 25 6 26 7 0 7 1 7 2 7 3 7 4 7 5 7 6 7 7 7 8 7 9 7 10 7 11 7 12 7 13 7 14 7 15 7 16 7 17 7 18 7 19 7 20 7 21 7 22 7 23 7 24 7 25 7 26 

#### Обучение моделей

Функция, делящая n элементов на нужное количество фолдов

In [None]:
def kfold_split(num_objects: int, num_folds: int) -> list:
    """Split ``range(num_objects)`` into consecutive cross-validation folds.

    The indices are not shuffled. Every fold except the last holds
    ``num_objects // num_folds`` consecutive indices; the last fold also
    takes the remainder.

    Args:
        num_objects: Total number of samples to split.
        num_folds: Number of folds to produce.

    Returns:
        A list of ``num_folds`` tuples ``(train_indices, validation_indices)``,
        each a 1-D integer numpy array. For a given fold, the train indices
        are all indices outside that fold, in ascending order.

    Raises:
        ValueError: If ``num_folds`` is less than 1, or if there are fewer
            objects than folds (a fold would be empty).
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")
    fold_size = num_objects // num_folds
    if fold_size == 0:
        raise ValueError("num_objects must be at least num_folds")

    indices = np.arange(num_objects)
    folds = []
    for fold in range(num_folds):
        start = fold * fold_size
        # The last fold runs to the end so leftover samples are not dropped.
        stop = num_objects if fold == num_folds - 1 else start + fold_size
        train = np.concatenate([indices[:start], indices[stop:]])
        folds.append((train, indices[start:stop]))
    return folds

### Анализ признаков 

Сначала проведем кросс-валидацию по каждому столбцу отдельно и посмотрим, какие столбцы дадут наилучший результат

##### Отбор через TimeSeriesForestClassifier

In [None]:
# Per-column screening with TimeSeriesForestClassifier: for every column and
# every forest size, run 3-fold cross-validation using that single column and
# store the mean fold score under the key (column position, n_estimators).
tf_ans = dict()
folds = kfold_split(len(X_train), 3)
# Use the actual number of columns instead of a hard-coded 27, so the
# screening still covers every metric if the exported column set changes.
for i in range(X_train.shape[1]):
    # Ten log-spaced forest sizes between 10 and 1000, truncated to integers.
    for k in np.logspace(1, 3, 10):
        n_estimators = int(k)
        metrics = 0
        for train_idx, valid_idx in folds:
            # The ensemble wraps one forest restricted to column i.
            clf = ColumnEnsembleClassifier(
                estimators=[
                    ("TSF", TimeSeriesForestClassifier(n_estimators=n_estimators), [i]),
                ]
            )
            clf.fit(X_train.iloc[train_idx], y_train[train_idx])
            metrics += clf.score(X_train.iloc[valid_idx], y_train[valid_idx])
        tf_ans[(i, n_estimators)] = metrics / len(folds)

Посмотрим на столбцы, по которым модель наиболее эффективно определяет зараженность ВМ

In [None]:
# Keep the (column, n_estimators) entries whose mean CV score exceeds 0.7,
# then collect the distinct column positions among them.
final_dict = {key: score for key, score in tf_ans.items() if score > 0.7}
tf_cols = list({column for column, _ in final_dict})
sk_frame.columns[tf_cols]

Index(['libvirt_domain_block_stats_flush_requests_total',
       'libvirt_domain_block_stats_flush_time_seconds_total',
       'libvirt_domain_block_stats_read_bytes_total',
       'libvirt_domain_block_stats_read_requests_total',
       'libvirt_domain_block_stats_read_time_seconds_total',
       'libvirt_domain_block_stats_write_bytes_total',
       'libvirt_domain_block_stats_write_requests_total',
       'libvirt_domain_block_stats_write_time_seconds_total',
       'libvirt_domain_info_cpu_time_seconds_total',
       'libvirt_domain_interface_stats_receive_bytes_total',
       'libvirt_domain_interface_stats_receive_packets_total',
       'libvirt_domain_interface_stats_transmit_bytes_total',
       'libvirt_domain_interface_stats_transmit_packets_total ',
       'libvirt_domain_memory_stats_major_fault_total',
       'libvirt_domain_memory_stats_minor_fault_total',
       'libvirt_domain_memory_stats_rss_bytes',
       'libvirt_domain_memory_stats_unused_bytes',
       'libvirt_do

##### Отбор через KNeighborsTimeSeriesClassifier

In [None]:
# Per-column screening with KNeighborsTimeSeriesClassifier: for every column
# and every neighbour count, run 3-fold cross-validation using that single
# column and store the mean fold score under (column position, n_neighbors).
knn_ans = dict()
folds = kfold_split(len(X_train), 3)
# Use the actual number of columns instead of a hard-coded 27, so the
# screening still covers every metric if the exported column set changes.
for i in range(X_train.shape[1]):
    for k in [1, 3, 5, 7, 9]:
        n_neighbors = int(k)
        metrics = 0
        for train_idx, valid_idx in folds:
            # The estimator label "TSF" is kept from the original cell; it is
            # only a name inside the ensemble and wraps a KNN classifier here.
            clf = ColumnEnsembleClassifier(
                estimators=[
                    ("TSF", KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors), [i]),
                ]
            )
            clf.fit(X_train.iloc[train_idx], y_train[train_idx])
            metrics += clf.score(X_train.iloc[valid_idx], y_train[valid_idx])
        knn_ans[(i, n_neighbors)] = metrics / len(folds)

Посмотрим на столбцы, по которым модель наиболее эффективно определяет зараженность ВМ

In [None]:
# Keep the (column, n_neighbors) entries whose mean CV score exceeds 0.7,
# then collect the distinct column positions among them.
final_dict = {key: score for key, score in knn_ans.items() if score > 0.7}
knn_cols = list({column for column, _ in final_dict})
sk_frame.columns[knn_cols]

Index(['libvirt_domain_block_stats_flush_requests_total',
       'libvirt_domain_block_stats_flush_time_seconds_total',
       'libvirt_domain_block_stats_read_bytes_total',
       'libvirt_domain_block_stats_read_requests_total',
       'libvirt_domain_block_stats_read_time_seconds_total',
       'libvirt_domain_block_stats_write_bytes_total',
       'libvirt_domain_block_stats_write_requests_total',
       'libvirt_domain_block_stats_write_time_seconds_total',
       'libvirt_domain_info_cpu_time_seconds_total',
       'libvirt_domain_interface_stats_receive_bytes_total',
       'libvirt_domain_interface_stats_receive_packets_total',
       'libvirt_domain_interface_stats_transmit_bytes_total',
       'libvirt_domain_interface_stats_transmit_packets_total ',
       'libvirt_domain_memory_stats_major_fault_total',
       'libvirt_domain_memory_stats_minor_fault_total',
       'libvirt_domain_memory_stats_rss_bytes',
       'libvirt_domain_memory_stats_unused_bytes',
       'libvirt_do

##### Агрегирование результатов отбора с двух разных моделей

Таким образом мы получили наиболее информативные столбцы для отслеживания.

In [None]:
# Columns that passed the 0.7 threshold for both classifiers.
common_columns = set(tf_cols).intersection(knn_cols)
inter = list(common_columns)
sk_frame.columns[inter]

Index(['libvirt_domain_block_stats_flush_requests_total',
       'libvirt_domain_block_stats_flush_time_seconds_total',
       'libvirt_domain_block_stats_read_bytes_total',
       'libvirt_domain_block_stats_read_requests_total',
       'libvirt_domain_block_stats_read_time_seconds_total',
       'libvirt_domain_block_stats_write_bytes_total',
       'libvirt_domain_block_stats_write_requests_total',
       'libvirt_domain_block_stats_write_time_seconds_total',
       'libvirt_domain_info_cpu_time_seconds_total',
       'libvirt_domain_interface_stats_receive_bytes_total',
       'libvirt_domain_interface_stats_receive_packets_total',
       'libvirt_domain_interface_stats_transmit_bytes_total',
       'libvirt_domain_interface_stats_transmit_packets_total ',
       'libvirt_domain_memory_stats_major_fault_total',
       'libvirt_domain_memory_stats_minor_fault_total',
       'libvirt_domain_memory_stats_rss_bytes',
       'libvirt_domain_memory_stats_unused_bytes',
       'libvirt_do

### Обучение и тестирование моделей

#### ColumnConcatenator

Мы можем объединить многомерные временные ряды/панельные данные в длинные одномерные временные ряды/панели, а затем применить классификатор к одномерным данным. 

##### TimeSeriesForestClassifier

1) кросс-валидация

In [None]:
# 3-fold cross-validation of the concatenate-then-classify pipeline with a
# Time Series Forest, over ten log-spaced forest sizes (10..1000). Only the
# columns selected in `inter` are used. `ans` maps forest size -> mean score.
ans = {}
folds = kfold_split(len(X_train), 3)
selected = X_train.iloc[:, inter]
for k in np.logspace(1, 3, 10):
    n_trees = int(k)
    fold_scores = []
    for train_idx, valid_idx in folds:
        clf = Pipeline(
            [
                ("concatenate", ColumnConcatenator()),
                ("classify", TimeSeriesForestClassifier(n_estimators=n_trees)),
            ]
        )
        clf.fit(selected.iloc[train_idx], y_train[train_idx])
        fold_scores.append(clf.score(selected.iloc[valid_idx], y_train[valid_idx]))
    ans[n_trees] = sum(fold_scores) / len(folds)


2) поиск наилучшего значения

In [None]:
# Show every parameter value that reached the best mean CV score.
max_val = max(ans.values())
final_dict = dict(filter(lambda item: item[1] == max_val, ans.items()))
final_dict

{10: 1.0,
 16: 1.0,
 27: 1.0,
 46: 1.0,
 77: 1.0,
 129: 1.0,
 215: 1.0,
 359: 1.0,
 599: 1.0,
 1000: 1.0}

3) обучение и тест

In [None]:
# Refit the concatenate-then-classify pipeline (forest of 100 trees) on the
# whole training set, restricted to the selected columns, and score it on the
# test VMs.
X_fit = X_train.iloc[:, inter]
X_eval = X_test.iloc[:, inter]
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100)),
]
clf = Pipeline(steps=steps)
clf.fit(X_fit, y_train)
clf.score(X_eval, y_test)

0.15

In [None]:
# Display the fitted pipeline's predictions for the test VMs next to the true
# test labels, so the two arrays can be compared position by position.
clf.predict(X_test.iloc[:, inter]), y_test

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 0.]))

##### KNeighborsTimeSeriesClassifier

1) кросс-валидация

In [None]:
# 3-fold cross-validation of the concatenate-then-classify pipeline with a
# k-nearest-neighbours time series classifier, for k in 1, 3, 5, 7, 9. Only
# the columns selected in `inter` are used. `ans` maps k -> mean fold score.
ans = {}
folds = kfold_split(len(X_train), 3)
selected = X_train.iloc[:, inter]
for k in [1, 3, 5, 7, 9]:
    n_neighbors = int(k)
    fold_scores = []
    for train_idx, valid_idx in folds:
        clf = Pipeline(
            [
                ("concatenate", ColumnConcatenator()),
                ("classify", KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors)),
            ]
        )
        clf.fit(selected.iloc[train_idx], y_train[train_idx])
        fold_scores.append(clf.score(selected.iloc[valid_idx], y_train[valid_idx]))
    ans[n_neighbors] = sum(fold_scores) / len(folds)


2) поиск наилучшего значения k

In [None]:
# Show every k that reached the best mean CV score.
max_val = max(ans.values())
final_dict = dict(filter(lambda item: item[1] == max_val, ans.items()))
final_dict

{1: 0.9259259259259259}

3) обучение и тест

In [None]:
# Refit the concatenate-then-classify pipeline (1 nearest neighbour) on the
# whole training set, restricted to the selected columns, and score it on the
# test VMs.
X_fit = X_train.iloc[:, inter]
X_eval = X_test.iloc[:, inter]
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", KNeighborsTimeSeriesClassifier(n_neighbors=1)),
]
clf = Pipeline(steps=steps)
clf.fit(X_fit, y_train)
clf.score(X_eval, y_test)

0.2

In [None]:
# Display the fitted pipeline's predictions for the test VMs next to the true
# test labels, so the two arrays can be compared position by position.
clf.predict(X_test.iloc[:, inter]), y_test

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 0.]))

#### ColumnEnsembleClassifier

Мы также можем подобрать один классификатор для определенных столбцов временных рядов, а затем агрегировать их прогнозы.

##### TimeSeriesForestClassifier

1) Кросс-валидация

In [None]:
# 3-fold cross-validation of a column ensemble: one Time Series Forest per
# selected column, over ten log-spaced forest sizes (10..1000).
# `ans` maps forest size -> mean fold score.
ans = {}
folds = kfold_split(len(X_train), 3)
selected = X_train.iloc[:, inter]
for k in np.logspace(1, 3, 10):
    n_trees = int(k)
    fold_scores = []
    for train_idx, valid_idx in folds:
        # Column positions are relative to `selected`, hence range(len(inter)).
        per_column = [
            ("TSF" + str(col), TimeSeriesForestClassifier(n_estimators=n_trees), [col])
            for col in range(len(inter))
        ]
        clf = ColumnEnsembleClassifier(estimators=per_column)
        clf.fit(selected.iloc[train_idx], y_train[train_idx])
        fold_scores.append(clf.score(selected.iloc[valid_idx], y_train[valid_idx]))
    ans[n_trees] = sum(fold_scores) / len(folds)

2) Поиск наилучшего k

In [None]:
# Show every forest size that reached the best mean CV score.
max_val = max(ans.values())
final_dict = dict(filter(lambda item: item[1] == max_val, ans.items()))
final_dict

{10: 1.0,
 16: 1.0,
 27: 1.0,
 46: 1.0,
 77: 1.0,
 129: 1.0,
 215: 1.0,
 359: 1.0,
 599: 1.0,
 1000: 1.0}

3) Обучение и тест

In [None]:
# Refit the column ensemble (one 100-tree forest per selected column) on the
# whole training set and score it on the test VMs.
X_fit = X_train.iloc[:, inter]
X_eval = X_test.iloc[:, inter]
per_column = [
    ("TSF" + str(col), TimeSeriesForestClassifier(n_estimators=100), [col])
    for col in range(len(inter))
]
clf = ColumnEnsembleClassifier(estimators=per_column)
clf.fit(X_fit, y_train)
clf.score(X_eval, y_test)

0.2

In [None]:
# Display the fitted ensemble's predictions for the test VMs next to the true
# test labels, so the two arrays can be compared position by position.
clf.predict(X_test.iloc[:, inter]), y_test

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 0.]))

##### KNeighborsTimeSeriesClassifier

1) Кросс-валидация

In [None]:
# 3-fold cross-validation of a column ensemble: one k-nearest-neighbours time
# series classifier per selected column, for k in 1, 3, 5, 7, 9.
# `ans` maps k -> mean fold score.
ans = {}
folds = kfold_split(len(X_train), 3)
selected = X_train.iloc[:, inter]
for k in [1, 3, 5, 7, 9]:
    n_neighbors = int(k)
    fold_scores = []
    for train_idx, valid_idx in folds:
        # Column positions are relative to `selected`, hence range(len(inter)).
        per_column = [
            ("TSF" + str(col), KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors), [col])
            for col in range(len(inter))
        ]
        clf = ColumnEnsembleClassifier(estimators=per_column)
        clf.fit(selected.iloc[train_idx], y_train[train_idx])
        fold_scores.append(clf.score(selected.iloc[valid_idx], y_train[valid_idx]))
    ans[n_neighbors] = sum(fold_scores) / len(folds)

2) Поиск наилучшего k

In [None]:
# Show every k that reached the best mean CV score.
max_val = max(ans.values())
final_dict = dict(filter(lambda item: item[1] == max_val, ans.items()))
final_dict

{1: 1.0, 3: 1.0}

3) Обучение и тест

In [None]:
# Refit the column ensemble (one 1-nearest-neighbour classifier per selected
# column) on the whole training set and score it on the test VMs.
X_fit = X_train.iloc[:, inter]
X_eval = X_test.iloc[:, inter]
per_column = [
    ("TSF" + str(col), KNeighborsTimeSeriesClassifier(n_neighbors=1), [col])
    for col in range(len(inter))
]
clf = ColumnEnsembleClassifier(estimators=per_column)
clf.fit(X_fit, y_train)
clf.score(X_eval, y_test)

0.15

In [None]:
# Display the fitted ensemble's predictions for the test VMs next to the true
# test labels, so the two arrays can be compared position by position.
clf.predict(X_test.iloc[:, inter]), y_test

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 0.]))