In [7]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import os
from os.path import join
import numpy as np
import pandas as pd
from sklearn import metrics

In [8]:
WINDOW_SIZE = 80


In [9]:
def divide_windows(features, labels, window_size, step_factor):
    step = int(window_size * step_factor)
    new_features, new_labels = list(), list()
    for i in range(0, features.shape[0] - window_size + 1, step):
        if len(set(labels[i:i + window_size])) != 1:
            continue
        new_features.append([features[i:i + window_size, j] for j in range(features.shape[1])])
        new_labels.append(labels[i])
    return np.array(new_features), np.array(new_labels)

In [10]:
clean_data_path = r"C:\Users\ke4\Desktop\Goznak_ML_Tasks\train\train\clean"
noisy_data_path = r"C:\Users\ke4\Desktop\Goznak_ML_Tasks\train\train\noisy"

In [11]:
clean_data_path_val = r"C:\Users\ke4\Desktop\Goznak_ML_Tasks\val\clean"
noisy_data_path_val = r"C:\Users\ke4\Desktop\Goznak_ML_Tasks\val\noisy"

In [12]:
clean_data = []
for folder in os.listdir(clean_data_path):
    for file in os.listdir(join(clean_data_path, folder)):
        data = np.load(join(clean_data_path, folder, file))
        clean_data.append(data)
        
noisy_data = []
for folder in os.listdir(noisy_data_path):
    for file in os.listdir(join(noisy_data_path, folder)):
        data= np.load(join(noisy_data_path, folder, file))
        noisy_data.append(data)

In [13]:
clean_data_val = []
for folder in os.listdir(clean_data_path_val):
    for file in os.listdir(join(clean_data_path_val, folder)):
        data = np.load(join(clean_data_path_val, folder, file))
        clean_data_val.append(data)
        
noisy_data_val = []
for folder in os.listdir(noisy_data_path_val):
    for file in os.listdir(join(noisy_data_path_val, folder)):
        data= np.load(join(noisy_data_path_val, folder, file))
        noisy_data_val.append(data)

In [14]:
df_clean = pd.DataFrame(np.vstack(clean_data))
df_clean['class']=0

df_noisy = pd.DataFrame(np.vstack(noisy_data))
df_noisy['class']=1

# df_total = pd.concat([df_clean,df_noisy])
# df_total.reset_index(drop=True, inplace=True)

# df_clean_val = pd.DataFrame(np.vstack(clean_data_val))
# df_clean_val['class']=0
# df_noisy_val = pd.DataFrame(np.vstack(noisy_data_val))
# df_noisy_val['class']=1
# df_total_val = pd.concat([df_clean_val,df_noisy_val])
# df_total_val.reset_index(drop=True, inplace=True)


In [15]:
wind_feature_clean, wind_label_clean=divide_windows(df_clean.drop('class', axis=1).values,
                                        labels = df_clean['class'],
                                        window_size=WINDOW_SIZE,
                                        step_factor=1)

wind_feature_noise, wind_label_noise=divide_windows(df_noisy.drop('class', axis=1).values,
                                        labels = df_noisy['class'],
                                        window_size=WINDOW_SIZE,
                                        step_factor=1)

In [16]:
wind_feature_noise[:100000].shape

(100000, 80, 80)

In [17]:
from keras.layers import Input, concatenate 
from keras.models import Model
from keras.layers import UpSampling1D, Conv1D

def define_skip_model():
  
    input_net = Input((80,80))
  
    conv1 = Conv1D(32, 3, strides=(2), activation = 'relu', padding = 'same')(input_net)
    conv2 = Conv1D(64, 3, strides=(2), activation = 'relu', padding = 'same')(conv1)
    conv3 = Conv1D(128, 3, strides=(2), activation = 'relu', padding = 'same')(conv2)
  
    conv4 = Conv1D(128, 3, strides=(2), activation = 'relu', padding = 'same')(conv3)
  
    up1 = Conv1D(128, 3, activation = 'relu', padding = 'same')(UpSampling1D(size = (2))(conv4))
    merge1 = concatenate([conv3,up1], axis = 2)
    up2 = Conv1D(64, 3, activation = 'relu', padding = 'same')(UpSampling1D(size = (2))(merge1))
    merge2 = concatenate([conv2,up2], axis = 2)
    up3 = Conv1D(32, 3, activation = 'relu', padding = 'same')(UpSampling1D(size = (2))(merge2))
    merge3 = concatenate([conv1,up3], axis = 2)
  
    up4 = Conv1D(32, 3, padding = 'same')(UpSampling1D(size = (2))(merge3))
  
    output_net = Conv1D(80, 80, padding = 'same')(up4)
    model = Model(inputs = input_net, outputs = output_net)
  
    return model

In [18]:
skip_model = define_skip_model()
skip_model.compile(optimizer = 'Adam', loss = 'mean_squared_error', metrics = ['mse'])
# не влезает весь датасет в ОЗУ, пришлось порезать до 100к, не стал разбираться с .fit_generator
skip_model.fit(wind_feature_noise[:100000], wind_feature_clean[:100000], batch_size=128, epochs=10, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ec54fe5e50>

In [None]:
# skip_model.save("models/denoise_autoencoder.h5")


In [19]:
def reverse_window(window_arr):
    last_arr = np.empty((0,80))
    for arr in window_arr:
        last_arr = np.concatenate([last_arr, arr.T])
    return last_arr

In [20]:
def get_predict(meg):
    window_size = 80
    step_factor = 1
    step = int(window_size * step_factor)
    new_features = list()
    for i in range(0, meg.shape[0] - window_size + 1, step):
        new_features.append([meg[i:i + window_size, j] for j in range(meg.shape[1])])
    wind_feature = np.array(new_features)
    return wind_feature
    
   

In [21]:
#для сравнения данные обрезаются под денойсед сигнал, костыльно
#нужно наполнять исходные данные нулями в конце,  при дилении на окна, что бы хорошо разбивалось на окна
mse_denoised = []
mse_noised = []
for index in range(len(noisy_data_val)):
    denoised = reverse_window(skip_model.predict(get_predict(noisy_data_val[index])))
    denoised_mse = metrics.mean_squared_error(denoised, clean_data_val[index][:denoised.shape[0]])
    noised_mse = metrics.mean_squared_error(noisy_data_val[index], clean_data_val[index])
    mse_noised.append(noised_mse)
    mse_denoised.append(denoised_mse)



























In [22]:
mean_mse_denoised = sum(mse_denoised) / len(mse_denoised)
mean_mse_noised = sum(mse_noised) / len(mse_noised)
print('Средний MSE на валидции у зашумленных данных - {}, у очищенных {}'.format(mean_mse_noised, mean_mse_denoised))


Среднее MSE на валидции у зашумленных данных - 0.13281359243392946, у очищенных 0.05745127119250464
