In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [101]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

本次比赛的使用的通风机的数据主要来自于[open-source ventilator](https://www.kaggle.com/competitions/ventilator-pressure-prediction/data),数据的使用流程包括：第一个控制是0-100的连续变量，用于控制空气进行肺部的百分比（0关闭/1打开），第二个控制是否让空气排出（0关闭/1打开）
id：全局时间标识符
breath_id:全局呼吸时间步骤
R：气道手限成都，越高的越难吹
C：肺部顺从性，越高的越容易吹
time_step：实际的时间戳
u_in:吸气电磁阀的控制输入0-100
u_oyt:探索性电磁阀的控制输入 0-1
pressure：在探索回路中测量的气道压力

In [102]:
print(df_train.head())
print(df_test.head())

   id  breath_id   R   C  time_step       u_in  u_out   pressure
0   1          1  20  50   0.000000   0.083334      0   5.837492
1   2          1  20  50   0.033652  18.383041      0   5.907794
2   3          1  20  50   0.067514  22.509278      0   7.876254
3   4          1  20  50   0.101542  22.808822      0  11.742872
4   5          1  20  50   0.135756  25.355850      0  12.234987
   id  breath_id  R   C  time_step       u_in  u_out
0   1          0  5  20   0.000000   0.000000      0
1   2          0  5  20   0.031904   7.515046      0
2   3          0  5  20   0.063827  14.651675      0
3   4          0  5  20   0.095751  21.230610      0
4   5          0  5  20   0.127644  26.320956      0


In [103]:
print('The shape of train data is: ', df_train.shape)
print('The shape of test data is: ', df_test.shape)
print('The different types of id are: ', df_train['id'].unique())
print('The different types of breath id are: ', df_train['breath_id'].unique())

The shape of train data is:  (6036000, 8)
The shape of test data is:  (4024000, 7)
The different types of id are:  [      1       2       3 ... 6035998 6035999 6036000]
The different types of breath id are:  [     1      2      3 ... 125743 125745 125749]


In [104]:
# 计算每个breath_id的长度
breath_id_len = df_train.groupby('breath_id').size().values
print('The length of breath id is: ', breath_id_len)

The length of breath id is:  [80 80 80 ... 80 80 80]


In [105]:
print('Feature engineering...')
mask = np.array(df_train['u_out']==0).reshape(-1,80)

def add_feature(df):
    # 累积呼吸的量
    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()
    # 时间差
    df['time_diff']=df['time_step'].diff()
    df['time_diff'].fillna(0, inplace=True)
    df['time_diff'].mask(df['time_diff']<0,0,inplace=True)

    # 累积压力量
    df['tmp']=df['u_in']*df['time_diff']
    df['area_true']=df['tmp'].groupby(df['breath_id']).cumsum()
    df['tmp'] = df['u_out']*(-1)+1

add_feature(df_train)
add_feature(df_test)

Feature engineering...


In [106]:
print('Drop the useless columns...')
targets = df_train[['pressure']].to_numpy()
df_train.drop(['id', 'breath_id', 'pressure'], axis=1, inplace=True)
df_test.drop(['id', 'breath_id'], axis=1, inplace=True)

Drop the useless columns...


In [107]:
print('Normalize the data...')
from sklearn.preprocessing import RobustScaler,normalize
RS = RobustScaler()
df_train = RS.fit_transform(df_train)
df_test = RS.transform(df_test)

df_train = df_train.reshape(-1,80,df_train.shape[1])
df_test = df_test.reshape(-1,80,df_test.shape[1])

Normalize the data...


In [108]:
print('K fold...')

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=2021)

train_features =[df_train[i] for i in list(kf.split(df_train))[0][0]]
train_targets =[targets[i] for i in list(kf.split(df_train))[0][0]]
val_features =[df_train[i] for i in list(kf.split(df_train))[0][1]]
val_targets =[targets[i] for i in list(kf.split(df_train))[0][1]]
train_mask =[mask[i] for i in list(kf.split(df_train))[0][0]]
val_mask =[mask[i] for i in list(kf.split(df_train))[0][1]]

print('Build the dataset...')
print('The shape of train features is: ', np.array(train_features).shape)

K fold...
Build the dataset...
The shape of train features is:  (60360, 80, 9)


In [109]:
print('Create the dataloader...')
import torch
from torch.utils.data import Dataset, DataLoader
import random

bacth_size = 256

class MyDataset(Dataset):
    def __init__(self, features, targets, mask):
        super(MyDataset, self).__init__()
        self.features = features
        self.targets = targets
        self.mask = mask

    def __getitem__(self, index):
        return self.features[index].astype('float32'),
        self.targets[index].astype('float32'),
        self.mask[index].astype('bool')

    def __len__(self):
        return len(self.features)

class TestDataset(Dataset):
    def __init__(self, features, mask):
        super(TestDataset, self).__init__()
        self.features = features
        self.mask = mask

    def __getitem__(self, index):
        return self.features[index].astype('float32'),self.mask[index].astype('bool')

    def __len__(self):
        return len(self.features)

train_dataset = MyDataset(train_features, train_targets, train_mask)
train_loader = DataLoader(train_dataset, batch_size=bacth_size, shuffle=True, num_workers=4)

val_dataset = MyDataset(val_features, val_targets, val_mask)
val_loader = DataLoader(val_dataset, batch_size=bacth_size, shuffle=False, num_workers=4)


Create the dataloader...


In [118]:
train_features = torch.tensor(train_features, dtype=torch.float32)
train_targets = torch.tensor(train_targets, dtype=torch.float32)
train_mask = torch.tensor(train_mask, dtype=torch.bool)
val_features = torch.tensor(val_features, dtype=torch.float32)
val_targets = torch.tensor(val_targets, dtype=torch.float32)
val_mask = torch.tensor(val_mask, dtype=torch.bool)

train_dataset = torch.utils.data.TensorDataset(train_features, train_targets, train_mask)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=bacth_size, shuffle=True, num_workers=4)

val_dataset = torch.utils.data.TensorDataset(val_features, val_targets, val_mask)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=bacth_size, shuffle=False, num_workers=4)

In [129]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RedisualLSTM(nn.Module):
    def __init__(self,d_model):
        super().__init__()
        self.LSTM = nn.LSTM(d_model, d_model, num_layers=2, bidirectional=True)
        self.linear = nn.Linear(d_model*2, d_model*4)
        self.linear2 = nn.Linear(d_model*4, d_model)
    
    def forward(self, x):
        res = x
        x, _ = self.LSTM(x)
        x = F.relu(self.linear(x))
        x = self.linear2(x)
        x = x + res
        return x

class SAKTModel(nn.Module):
    def __init__(self,n_skill,n_cat,nout,max_seq=100,embed_dim=128,pos_encode='LSTM',nlayers=2,rnnlayers=3,dropout=0.1,nheads=8):
        super().__init__()
        self.n_skill = n_skill
        self.embed_dim = embed_dim
        if pos_encode == 'LSTM':
            self.pos_encoder = nn.ModuleList([RedisualLSTM(embed_dim) for _ in range(nlayers)])
        
        self.pos_encoder_dropout = nn.Dropout(dropout)
        self.embedding = nn.Linear(n_skill, embed_dim)
        self.cat_embedding = nn.Embedding(n_cat, embed_dim,padding_idx=0)
        self.layer_norm = nn.LayerNorm(embed_dim)

        encoder_layer =[nn.TransformerEncoderLayer(embed_dim, nheads,embed_dim*4 ,dropout) for _ in range(nlayers)]
        conv_layer = [nn.Conv1d(embed_dim,embed_dim,(nlayers-1)*2-1,stride=1,padding=0) for _ in range(nlayers)]
        deconv_layers = [nn.ConvTranspose1d(embed_dim,embed_dim,(nlayers-1)*2-1,stride=1,padding=0) for _ in range(nlayers)]

        layer_norm_layers = [nn.LayerNorm(embed_dim) for _ in range(nlayers)]
        layer_norm_layers2 = [nn.LayerNorm(embed_dim) for _ in range(nlayers)]

        self.transformer_encoder = nn.ModuleList(encoder_layer)
        self.conv = nn.ModuleList(conv_layer)
        self.deconv = nn.ModuleList(deconv_layers)
        self.layer_norm = nn.ModuleList(layer_norm_layers)
        self.layer_norm2 = nn.ModuleList(layer_norm_layers2)
        self.nhead = nheads
        self.pred = nn.Linear(embed_dim, nout)
        self.downsample = nn.Linear(embed_dim*2, embed_dim)
    
    def forward(self, x, cat=None):
        device = x.device
        x = self.embedding(x)
        x = x.permute(1,0,2) # ?什么意思 为什么要转置
        for lstm in self.pos_encoder:
            lstm.LSTM.flatten_parameters()
            x= lstm(x)
        x = self.pos_encoder_dropout(x)
        x = self.layer_norm(x)
        for conv,transformer_layer,layer_norm,layer_norm2,deconv in zip(self.conv,self.transformer_encoder,self.layer_norm,self.layer_norm2,self.deconv):
            # LXBXC -> BXCXL
            res = x
            x = F.relu(conv(x.permute(1,2,0)).permute(2,0,1))
            x = layer_norm(x)
            x= F.relu(deconv(x.permute(1,2,0)).permute(2,0,1))
            x = layer_norm2(x)
            x = res + x
        x = x.permute(1,0,2)
        output = self.pred(x)
        return output.squeeze(-1)

In [130]:
model = SAKTModel(n_skill=df_train.shape[-1],n_cat=10,nout=1,max_seq=None,embed_dim=256,pos_encode='LSTM',nlayers=2,rnnlayers=3,dropout=0.1,nheads=8)

In [131]:
from pytorch_ranger import Ranger
optimizer = Ranger(mdoel.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
import tqdm

def train_epoch(model,train_loader,optimizer,criterion):
    model.train()
    train_loss = []
    for (x,y,mask) in train_loader:
        x = x.cuda()
        y = y.cuda()
        mask = mask.cuda()
        optimizer.zero_grad()
        print(model.forward(x).shape)
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        bar.set_description(f'loss - {loss.item():.5f}')
    return np.mean(train_loss)

def val_epoch(model,val_loader,criterion):
    model.eval()
    val_loss = []
    preds = []
    bar = tqdm(val_loader)
    with torch.no_grad():
        for (x,y,mask) in bar:
            x = x.cuda()
            y = y.cuda()
            mask = mask.cuda()
            output = model(x)
            loss = criterion(output, y)
            val_loss.append(loss.item())
            preds.append(output.sigmoid().detach().cpu().numpy())
            bar.set_description(f'loss - {loss.item():.5f}')
    val_loss = np.mean(val_loss)
    preds = np.concatenate(preds)
    return val_loss, preds

epochs = 20
best_loss = np.inf
for epoch in range(epochs):
    train_loss = train_epoch(mdoel,train_loader,optimizer,criterion)
    val_loss, preds = val_epoch(mdoel,val_loader,criterion)
    print(f'epoch - {epoch + 1} train_loss - {train_loss:.5f} val_loss - {val_loss:.5f}')
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(mdoel.state_dict(), 'best_model.pt')

print('Predict...')


NotImplementedError: Module [ModuleList] is missing the required "forward" function