In [1]:
# PIPインストール
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
Installing collected packages: autograd
Successfully installed autograd-1.7.0
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l- \ done
[?25h  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4030 sha256=b7cfff1f8a2eb13f7c65826886657b0d4571aeee404d1b9e63c110dade300f8c
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-meta
Successfully installed interface-meta-1.3.0


In [2]:
# 訓練、テストデータ読み込み
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")

In [3]:
# 死亡者の寿命を最大・最小化して再スケーリング
train["y"] = train.efs_time.values
mx = train.loc[train.efs==1,"efs_time"].max()
mn = train.loc[train.efs==0,"efs_time"].min()
train.loc[train.efs==0,"y"] = train.loc[train.efs==0,"y"] + mx - mn
train.y = train.y.rank()

# 生存者に大きな値を与えることで分離
train.loc[train.efs==0,"y"] += 2*len(train)
train.y = train.y / train.y.max() # 正規化
train.y = np.log(train.y) # スケール縮小
train.y -= train.y.mean() # ゼロ平均
train.y *= -1.0 # 反転

In [4]:
RMV = ["ID","efs","efs_time","y"] # 取り除く特徴量
FEATURES = [c for c in train.columns if not c in RMV] # 全特徴量

# カテゴリ列と数値列を区別する
catc = [] # カテゴリ列
numc = [] # 数値列

for c in FEATURES:
    # カテゴリ変数の場合
    if train[c].dtype=="object":
        # NANで埋める
        catc.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
    else:
        numc.append(c)

In [5]:
# 訓練・テストデータ結合
combined = pd.concat([train,test],axis=0,ignore_index=True)

# カテゴリ列ワンホットエンコード
combined_c = pd.get_dummies(combined[catc], dtype=int)
# 数値列分類
combined_n = combined[numc]
combined_y = combined["y"]

# 数値変数の処理
for num in numc:
    if combined_n[num].dtype=="float64":
        combined_n.loc[:,num] = combined_n[num].astype("float32")
    if combined_n[num].dtype=="int64":
        combined_n.loc[:,num] = combined_n[num].astype("int32")
            
    m = combined_n[num].mean() # 平均
    s = combined_n.loc[:,num].std()  # 標準偏差
    combined_n.loc[:,num] = (combined_n[num]-m)/s    # 各行を標準化
    combined_n.loc[:,num] = combined_n[num].fillna(0) # 各行を0で補完
        
# 横に結合
combined_total = pd.concat([combined_c,combined_n,combined_y],axis=1)

# 縦に分割
train = combined_total.iloc[:len(train)].copy()
test = combined_total.iloc[len(train):].reset_index(drop=True).copy()

  1.21131943]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  combined_n.loc[:,num] = (combined_n[num]-m)/s    # 各行を標準化


In [6]:
FEATURES2 = [c for c in train.columns if not c in RMV]

# ワンホットエンコード後に改めて特徴量を分類
catc2 = [] # カテゴリ列
numc2 = [] # 数値列

for c in FEATURES2:
    if train[c].dtype=="object":
        catc2.append(c)
    else:
        numc2.append(c)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

In [8]:
class SurvivalModel(nn.Module):
    def __init__(self, num_features):
        super(SurvivalModel, self).__init__()
        # 数値列の数
        self.num_features = num_features
        # 数値列の数 + 各カテゴリの埋め込みユニーク数合計
        total_emb_size = num_features

        # 少し修正を加え改善
        self.fc1 = nn.Linear(total_emb_size, 100)
        self.drop1 = nn.Dropout(0.3)
        self.bn1 = nn.BatchNorm1d(100, track_running_stats=True)
        self.fc2 = nn.Linear(100, 20)
        self.drop2 = nn.Dropout(0.3)
        self.bn2 = nn.BatchNorm1d(20, track_running_stats=True)
        self.fc3 = nn.Linear(20, 1)
    
    def forward(self, x_num):
        # 列方向に結合する
        x = x_num

        x = torch.relu(self.bn1(self.drop1(self.fc1(x))))
        x = torch.relu(self.bn2(self.drop2(self.fc2(x))))
        
        return self.fc3(x) # 出力

In [9]:
# K Fold学習
def train_model(model, train_loader, valid_loader, optimizer, criterion, device, epochs):
    for epoch in range(epochs):
        # 訓練モード
        model.train()
        for x_num, y in train_loader:
            x_num, y = x_num.to(device), y.to(device) # GPU仕様
            optimizer.zero_grad() # 勾配初期化
            output = model(x_num)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
        
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for x_num, y in valid_loader:
                x_num, y = x_num.to(device), y.to(device)
                output = model(x_num)
                valid_loss += criterion(output, y).item()
        print(f"Epoch {epoch+1}, Validation Loss: {valid_loss/len(valid_loader)}")

In [10]:
# cudaセッティング
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [11]:
EPOCHS = 10  # エポック数
REPEATS = 1 # 繰り返し計算数
FOLDS = 5   # 分割数
kf = KFold(n_splits=FOLDS, random_state=42, shuffle=True)

oof_nn = np.zeros(len(train)) # 検証結果
pred_nn = np.zeros(len(test)) # テスト結果

for r in range(REPEATS):
    print(f"### REPEAT {r+1} ###")

    # K分割
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        print(f"### Fold {i+1} ###")

        # 訓練データ
        X_train_nums = torch.tensor(train.loc[train_index,numc2].values, dtype=torch.float32) # 数値列
        y_train = torch.tensor(train.loc[train_index, "y"].values, dtype=torch.float32).unsqueeze(1) # 結合

        # 検証データ
        X_valid_nums = torch.tensor(train.loc[test_index,numc2].values, dtype=torch.float32) # 数値列
        y_valid = torch.tensor(train.loc[test_index, "y"].values, dtype=torch.float32).unsqueeze(1) # 結合

        # テンソルデータセット
        train_dataset = TensorDataset(X_train_nums, y_train)
        valid_dataset = TensorDataset(X_valid_nums, y_valid)

        # データローダー
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=16)

        model = SurvivalModel(len(numc2)).to(device) # モデル定義
        optimizer = optim.Adam(model.parameters(), lr=0.0001) # 学習アルゴリズム
        criterion = nn.MSELoss() # 平均二乗誤差

        # 学習フェーズ
        train_model(model, train_loader, valid_loader, optimizer, criterion, device, EPOCHS)

        # 予測フェーズ
        model.eval()
        #ここから見直す
        with torch.no_grad():
            # 検証結果
            oof_nn[test_index] += model(X_valid_nums.to(device)).cpu().numpy().flatten()

        X_test_nums = torch.tensor(test[numc2].values, dtype=torch.float32) # 数値列
        with torch.no_grad():
            # テスト結果
            pred_nn += model(X_test_nums.to(device)).cpu().numpy().flatten()

# 平均
oof_nn /= (FOLDS * REPEATS)
# oof_nn /= REPEATS
pred_nn /= (FOLDS * REPEATS)

### REPEAT 1 ###
### Fold 1 ###
Epoch 1, Validation Loss: 1.9433732223179605
Epoch 2, Validation Loss: 1.8971728944116169
Epoch 3, Validation Loss: 1.880023988419109
Epoch 4, Validation Loss: 1.8692535193430053
Epoch 5, Validation Loss: 1.8661757331755426
Epoch 6, Validation Loss: 1.8573319322533077
Epoch 7, Validation Loss: 1.8529393793808089
Epoch 8, Validation Loss: 1.8540423144896825
Epoch 9, Validation Loss: 1.8495018367966016
Epoch 10, Validation Loss: 1.8523025947312515
### Fold 2 ###
Epoch 1, Validation Loss: 2.00480393005742
Epoch 2, Validation Loss: 1.920201799273491
Epoch 3, Validation Loss: 1.875380888250139
Epoch 4, Validation Loss: 1.8620477729373508
Epoch 5, Validation Loss: 1.8474765436516867
Epoch 6, Validation Loss: 1.8406981486413214
Epoch 7, Validation Loss: 1.8393554233842426
Epoch 8, Validation Loss: 1.836840652094947
Epoch 9, Validation Loss: 1.8308999634451337
Epoch 10, Validation Loss: 1.830257082813316
### Fold 3 ###
Epoch 1, Validation Loss: 1.960573694441053

In [12]:
test3 = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
train3 = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")

In [13]:
# スコア計算
from metric import score

y_true = train3[["ID","efs","efs_time","race_group"]].copy()
y_pred = train3[["ID"]].copy()
y_pred["prediction"] = oof_nn
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for NN =",m)


Overall CV for NN = 0.6619128897944809


In [14]:
# 提出データ
sub = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv")
sub.prediction = pred_nn
sub.to_csv("submission.csv",index=False)
print("Sub shape:",sub.shape)
sub.head()

Sub shape: (3, 2)


Unnamed: 0,ID,prediction
0,28800,-1.104125
1,28801,-0.096913
2,28802,-1.413874
