## auto encoder を使って欠損を補完する

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable #自動微分用

import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

import torchvision
import torchvision.transforms as transforms


import matplotlib.pyplot as plt
import numpy as np

import numpy as np
import pandas as pd
from scipy.special import erfinv

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.preprocessing import StandardScaler
import lightgbm

from logging import getLogger
from tqdm import tqdm_notebook as tqdm
import argparse
import datetime
import random
from itertools import chain
import pickle
import warnings
from matplotlib import pyplot as plt
import seaborn as sns

import gc
import sys
from multiprocessing import Pool
sys.path.append('../')

from tools.my_logging import logInit
from tools.feature_tools import feature_engineering
from tools.objective_function import weighted_multi_logloss, lgb_multi_weighted_logloss, wloss_objective, wloss_metric, softmax, calc_team_score
from tools.model_io import save_models, load_models
from tools.fold_resampling import get_fold_resampling_dict

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
%config InlineBackend.figure_formats = {'png', 'retina'}

  from pandas.core import datetools


In [3]:
DEVICE='cpu'

#### まずは biLSTM AE を実装してみる

TODO
0. 出力層の acivation, loss func 考える
0. 動作確認 
0. bilstm に改良 
0. three layer に改良 
0. dropout, normalization 追加 

In [4]:
NTHREAD = 62
random.seed(71)
np.random.seed(42)
torch.manual_seed(71)
%load_ext autoreload
%autoreload 2

In [24]:
class EncoderbiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, isCuda, dropout=0.):
        super(EncoderbiLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.isCuda = isCuda
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout) #bidirectional=True)
        self.relu = nn.ReLU()
        
        #initialize weights
        nn.init.xavier_uniform(self.lstm.weight_ih_l0, gain=np.sqrt(2))
        nn.init.xavier_uniform(self.lstm.weight_hh_l0, gain=np.sqrt(2))

    def forward(self, input):
        tt = torch.cuda if self.isCuda else torch
        h0 = Variable(tt.FloatTensor(torch.zeros(self.num_layers*2, input.size(0), self.hidden_size)))
        c0 = Variable(tt.FloatTensor(torch.zeros(self.num_layers*2, input.size(0), self.hidden_size)))
        encoded_input, hidden = self.lstm(input, (h0, c0))
        encoded_input = self.relu(encoded_input)
        return encoded_input

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, isCuda, dropout=0.):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.isCuda = isCuda
        self.lstm = nn.LSTM(hidden_size, output_size, num_layers, batch_first=True, dropout=dropout)
        #self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
        #initialize weights
        nn.init.xavier_uniform(self.lstm.weight_ih_l0, gain=np.sqrt(2))
        nn.init.xavier_uniform(self.lstm.weight_hh_l0, gain=np.sqrt(2))
        
    def forward(self, encoded_input):
        tt = torch.cuda if self.isCuda else torch
        h0 = Variable(tt.FloatTensor(torch.zeros(self.num_layers, encoded_input.size(0), self.output_size)))
        c0 = Variable(tt.FloatTensor(torch.zeros(self.num_layers, encoded_input.size(0), self.output_size)))
        decoded_output, hidden = self.lstm(encoded_input, (h0, c0))
        #decoded_output = self.sigmoid(decoded_output)
        decoded_output = decoded_output
        return decoded_output

class LSTMAE(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, isCuda, dropout=0.):
        super(LSTMAE, self).__init__()
        self.encoder = EncoderbiLSTM(input_size, hidden_size, num_layers, isCuda, dropout=dropout)
        self.decoder = DecoderRNN(hidden_size*2, input_size, num_layers, isCuda, dropout=dropout)
        
    def forward(self, input):
        encoded_input = self.encoder(input)
        decoded_output = self.decoder(encoded_input)
        return decoded_output

### 試運用


In [6]:
class MKRankGaussScalar(object):
    """usage: 
    rgs = RankGaussScalar()
    rgs.fit(df_X)
    df_X_converted = rgs.transform(df_X)
    df_X_test_converted = rgs.transform(df_X_test)
    """
    def __init__(self):
        self.fit_done = False

    def rank_gauss(self, x):
        N = x.shape[0]
        temp = x.argsort()
        rank_x = temp.argsort() / N
        rank_x -= rank_x.mean()
        rank_x *= 2
        efi_x = erfinv(rank_x)
        efi_x -= efi_x.mean()
        return efi_x

    def fit(self, df_x):
        """
        df_x: fitting対象のDataFrame
        """
        self.train_unique_rankgauss = {}
        self.target_cols = np.sort(df_x.columns)
        for c in self.target_cols:
            unique_val = np.sort(df_x[c].unique())
            self.train_unique_rankgauss[c]= [unique_val, self.rank_gauss(unique_val)]
        self.fit_done = True

    def transform(self, df_target):
        """
        df_target: transform対象のDataFrame
        """
        assert self.fit_done
        assert np.all(np.sort(np.intersect1d(df_target.columns, self.target_cols)) == np.sort(self.target_cols))
        df_converted_rank_gauss = pd.DataFrame(index=df_target.index)
        for c in self.target_cols:
            df_converted_rank_gauss[c] = np.interp(df_target[c], 
                                                   self.train_unique_rankgauss[c][0], 
                                                   self.train_unique_rankgauss[c][1]) # ,left=0, right=0)
        return df_converted_rank_gauss

In [7]:
#train_df = pd.read_feather('../features/train/meta_features.ftr')
train_set_df = pd.read_csv('/home/naoya.taguchi/.kaggle/competitions/PLAsTiCC-2018/training_set.csv')

In [8]:
#train_df = train_df.replace(np.inf, np.nan)
#train_df = train_df.replace(-np.inf, np.nan)

In [8]:
train_set_df['int_mjd'] = train_set_df.mjd.astype(int)

In [9]:
pv_train_set_df = pd.pivot_table(data=train_set_df, values='flux', index=['object_id', 'int_mjd'], columns='passband', dropna=False, aggfunc='max')
pv_train_set_df

Unnamed: 0_level_0,passband,0,1,2,3,4,5
object_id,int_mjd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
615,59580,,,,,,
615,59581,,,,,,
615,59582,,,,,,
615,59583,,,,,,
615,59584,,,,,,
615,59585,,,,,,
615,59586,,,,,,
615,59587,,,,,,
615,59588,,,,,,
615,59589,,,,,,


In [10]:
pv_train_set_err_df = pd.pivot_table(data=train_set_df, values='flux_err', index=['object_id', 'int_mjd'], columns='passband', dropna=False, aggfunc='max')
pv_train_set_err_df

Unnamed: 0_level_0,passband,0,1,2,3,4,5
object_id,int_mjd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
615,59580,,,,,,
615,59581,,,,,,
615,59582,,,,,,
615,59583,,,,,,
615,59584,,,,,,
615,59585,,,,,,
615,59586,,,,,,
615,59587,,,,,,
615,59588,,,,,,
615,59589,,,,,,


In [11]:
# scaling 
#scaler = MKRankGaussScalar()
scaler = StandardScaler()
scaler.fit(pv_train_set_df)
scaled_pv_train_set_df = pv_train_set_df.copy()
scaled_pv_train_set_df[[0, 1, 2, 3, 4, 5]] = scaler.transform(pv_train_set_df)
scaled_pv_train_set_df.fillna(0, inplace=True)

In [12]:
# scaling 
#scaler = MKRankGaussScalar()
#err_scaler = StandardScaler()
#err_scaler.fit(pv_train_set_err_df)
#scaled_pv_train_set_err_df = pv_train_set_err_df.copy()
#scaled_pv_train_set_err_df[[0, 1, 2, 3, 4, 5]] = scaler.transform(pv_train_set_err_df)
scaled_pv_train_set_err_df = pv_train_set_err_df.copy()
scaled_pv_train_set_err_df.fillna(-1, inplace=True)

In [13]:
#test_set_df = pd.read_feather('/home/naoya.taguchi/.kaggle/competitions/PLAsTiCC-2018/test_set.fth', nthreads=NTHREAD)
#test_set_df['int_mjd'] = test_set_df.mjd.astype(int)

In [13]:
scaled_pv_train_set_panel = scaled_pv_train_set_df.to_panel()
scaled_pv_train_set_err_panel = scaled_pv_train_set_err_df.to_panel()
scaled_pv_train_set_panel.shape

(6, 7848, 951)

### 学習
 - 欠損値をどう扱うか
     - GPU を使うなら zero_padding が丸いが、スパースだし大体欠損している期間が同じなので欠損期間がうまく出ないのでは...？
     - 線形補間とか、何かしらで補完しても良さそう、ただ一方でこれは補完の精度によってきそうだし、良い補完ができるなら別に AE 要らなそう　

In [14]:
def worker_init_fn(worker_id):                                                          
    np.random.seed(np.random.get_state()[1][0] + worker_id)

class plasticcDatasetForAE(Dataset):
    def __init__(self, x, y, err):
        self.x = torch.FloatTensor(x.astype(np.float32), device=DEVICE)
        #self.x = torch.from_numpy(x.astype(np.float32))
        self.y = torch.FloatTensor(y.astype(np.float32), device=DEVICE)
        self.err = torch.FloatTensor(err.astype(np.float32), device=DEVICE)
 
    def __len__(self):
        return self.x.shape[0]
 
    def __getitem__(self, idx):
        return self.x[idx, :, :], self.y[idx, :, :], self.err[idx, :, :]

In [29]:
train_set_df.flux_err.min(), train_set_df.flux_err.max()

(0.46375299999999997, 2234069.25)

In [15]:
# loss 関数を定義
# weighted mean square error (https://arxiv.org/abs/1711.10609)

def WMSE(true, pred, err):
    # 元々欠損値だったところには loss をかけない
    mask = (true != -1)#.detach().numpy().astype(np.int64)
    mask = mask.float()
    n_T = pred.shape[1]
    print(torch.max(err), torch.min(err), )
    wmse = (1/n_T) * torch.mean(torch.sum((torch.pow(true - pred, 2) / (torch.pow(err, 2))) * mask, dim=0))
#    wmse = (1/n_T) * torch.sum((torch.pow(true - pred, 2) / (torch.pow(err, 2))))
#    print(np.isnan(mask.detach().numpy()).any()) # -> Flase
#    print(np.isnan(torch.sum(mask).detach().numpy()).any()) # -> Flase
#    print(np.isnan(((torch.pow(true - pred, 2) / (torch.pow(err, 2))) * mask).detach().numpy()).any()) # -> True -> mask と loss　の shape が違う？
#    print(np.isnan(((torch.pow(true - pred, 2) / (torch.pow(err, 2)))).detach().numpy()).any()) # -> False
    #print(np.isnan(true).detach().numpy(), np.isnan(pred).detach().numpy(), np.isnan(err).detach().numpy(),)
    #print(np.isnan((true - pred).detach().numpy()).any())
    #wmse = (1/n_T) * torch.sum(torch.pow((true - pred), 2))
    return wmse

In [16]:
# data 準備
#x_trn = scaled_pv_train_set_df.values
#y_trn = scaled_pv_train_set_df.values
x_trn = scaled_pv_train_set_panel.swapaxes(0, 1).swapaxes(1, 2).values
y_trn = scaled_pv_train_set_panel.swapaxes(0, 1).swapaxes(1, 2).values
err_trn = scaled_pv_train_set_err_panel.swapaxes(0, 1).swapaxes(1, 2).values

In [None]:
epochs = 1000

for i in range(1):
    # ae 定義
    bilstm_ae = LSTMAE(input_size=6, hidden_size=32, num_layers=3, isCuda=False, dropout=0.2)
    #bilstm_ae = LSTMAE(input_size=6, hidden_size=64, num_layers=3, isCuda=False, dropout=0.2)
    #bilstm_ae = LSTMAE(input_size=6, hidden_size=128, num_layers=3, isCuda=False)
    # dataset 加工
    dataset = plasticcDatasetForAE(x_trn, y_trn, err_trn)
    dataloader = DataLoader(dataset, batch_size=1000, shuffle=True, num_workers=0, worker_init_fn=worker_init_fn)
    #optimizer = optim.SGD(bilstm_ae.parameters(), lr=0.001, momentum=0.0)
    #scheduler = StepLR(optimizer, step_size=5, gamma=0.6)
    optimizer = optim.Adam(bilstm_ae.parameters(), lr=0.001)
    
    for epoch in range(epochs):
        #scheduler.step()
        for i_batch, sample_batched in enumerate(dataloader):
            bilstm_ae = bilstm_ae.train()
            optimizer.zero_grad()
            x_trn_batch, y_trn_batch, err_trn_batch = sample_batched
            x_trn_batch, y_trn_batch, err_trn_batch = Variable(x_trn_batch), Variable(y_trn_batch), Variable(err_trn_batch)
            y_pred = bilstm_ae(x_trn_batch)
            loss = WMSE(y_trn_batch, y_pred, err_trn_batch)
            loss.backward()
            optimizer.step()
            print(loss.detach().numpy())
            #losses.append(loss.detach().numpy())

tensor(21215.0332) tensor(-1.)


In [51]:
gc.collect()

183

In [20]:
np.isnan(y_pred.detach().numpy()).any()

True

In [267]:
bilstm_ae = LSTMAE(input_size=6, hidden_size=64, num_layers=2, isCuda=False)
bilstm_ae(torch.FloatTensor(
    np.array(
        [[
            [0., 1., 3., 2., 4., 5.], 
            [0., 1., 3., 2., 4., 5.], 
            [0., 1., 3., 2., 4., 5.], 
            [0., 1., 3., 2., 4., 5.], 
        ],
         [
            [0., 1., 3., 2., 4., 5.], 
            [0., 1., 3., 2., 4., 5.], 
            [0., 1., 3., 2., 4., 5.], 
            [0., 1., 3., 2., 4., 5.], 
        ],       
        ]
    )
))

tensor([[[-0.0307, -0.0775, -0.1336, -0.0143, -0.0987, -0.0795],
         [-0.0502, -0.1022, -0.1904, -0.0041, -0.1747, -0.1190],
         [-0.0656, -0.1092, -0.2135,  0.0120, -0.2250, -0.1374],
         [-0.0776, -0.1089, -0.2206,  0.0269, -0.2561, -0.1466]],

        [[-0.0307, -0.0775, -0.1336, -0.0143, -0.0987, -0.0795],
         [-0.0502, -0.1022, -0.1904, -0.0041, -0.1747, -0.1190],
         [-0.0656, -0.1092, -0.2135,  0.0120, -0.2250, -0.1374],
         [-0.0776, -0.1089, -0.2206,  0.0269, -0.2561, -0.1466]]],
       grad_fn=<TransposeBackward0>)
(tensor([[[-0.0935,  0.2743, -0.0708, -0.0153, -0.1340,  0.3244],
         [-0.0935,  0.2743, -0.0708, -0.0153, -0.1340,  0.3244]],

        [[-0.0776, -0.1089, -0.2206,  0.0269, -0.2561, -0.1466],
         [-0.0776, -0.1089, -0.2206,  0.0269, -0.2561, -0.1466]]],
       grad_fn=<ViewBackward>), tensor([[[-0.1542,  0.5232, -0.1357, -0.0332, -0.2322,  0.7198],
         [-0.1542,  0.5232, -0.1357, -0.0332, -0.2322,  0.7198]],

        [[

tensor([[[-0.0307, -0.0775, -0.1336, -0.0143, -0.0987, -0.0795],
         [-0.0502, -0.1022, -0.1904, -0.0041, -0.1747, -0.1190],
         [-0.0656, -0.1092, -0.2135,  0.0120, -0.2250, -0.1374],
         [-0.0776, -0.1089, -0.2206,  0.0269, -0.2561, -0.1466]],

        [[-0.0307, -0.0775, -0.1336, -0.0143, -0.0987, -0.0795],
         [-0.0502, -0.1022, -0.1904, -0.0041, -0.1747, -0.1190],
         [-0.0656, -0.1092, -0.2135,  0.0120, -0.2250, -0.1374],
         [-0.0776, -0.1089, -0.2206,  0.0269, -0.2561, -0.1466]]],
       grad_fn=<TransposeBackward0>)