In [None]:
# Mount Google Drive at /content/drive; the paths used later in this notebook
# (the utils_moa folder and ROOT) all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
# Make the project folder on the mounted Drive importable before importing from it
# (presumably this is where utils_moa.py lives -- confirm if the folder layout changes).
sys.path.append('/content/drive/My Drive/Colab Notebooks/kaggle/MoA')
# Project helpers; used below as utils.Engine, utils.MoADataset and utils.Model.
import utils_moa as utils

In [None]:
# Sanity check: warn if the helper module is absent from the interpreter's
# module cache, i.e. the import cell above has not been run in this kernel.
modulename = 'utils_moa'
module_is_loaded = modulename in sys.modules
if not module_is_loaded:
    print('You have not imported the {} module'.format(modulename))

In [None]:
%%capture
!pip install optuna
import optuna
import torch
import numpy as np
import pandas as pd

In [57]:
# Config
# Folder on the mounted Drive holding the input CSVs; model checkpoints are written here too.
ROOT = '/content/drive/My Drive/Colab Notebooks/kaggle/MoA/input'
# Use the GPU when the runtime has one, otherwise fall back to CPU so that
# model.to(DEVICE) does not fail on a CPU-only runtime.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 4          # maximum epochs per fold
BATCH_SIZE = 1024   # batch size for both the train and validation loaders
KFOLDS = 5          # number of folds iterated over (values 0..KFOLDS-1 of the kfold column)

In [58]:
def run_training(fold, params, save_model=False):
    """Train one cross-validation fold and return its best validation loss.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column held out for validation; rows with any
        other ``kfold`` value are used for training.
    params : dict
        Hyperparameters with keys ``"num_layers"``, ``"hidden_size"``,
        ``"dropout"`` and ``"learning_rate"``.
    save_model : bool, default False
        When True, the model's ``state_dict`` is written to
        ``{ROOT}/model_{fold}.bin`` every time the validation loss improves.

    Returns
    -------
    The lowest validation loss returned by ``utils.Engine.evaluate`` over the
    epochs that ran (``np.inf`` if no epoch produced a finite improvement).
    """
    df = pd.read_csv(f"{ROOT}/train_features.csv")
    df = utils.Engine.process_data(df)  # original note: no embedding layer cause 3 vars

    targets_df = pd.read_csv(f"{ROOT}/train_targets_folds.csv")

    # Record feature and target column names before merging, so the merged
    # frame can be split back into inputs and labels.
    feature_columns = df.drop("sig_id", axis=1).columns
    target_columns = targets_df.drop(["sig_id", "kfold"], axis=1).columns

    df = df.merge(targets_df, on="sig_id", how="left")

    # Hold out the requested fold for validation; train on the rest.
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    # Split features and targets into numpy arrays.
    x_train = train_df[feature_columns].to_numpy()
    y_train = train_df[target_columns].to_numpy()

    x_valid = valid_df[feature_columns].to_numpy()
    y_valid = valid_df[target_columns].to_numpy()

    # Wrap the arrays in the project's dataset class.
    train_dataset = utils.MoADataset(features=x_train, targets=y_train)
    valid_dataset = utils.MoADataset(features=x_valid, targets=y_valid)

    # Training batches are reshuffled every epoch; validation order is left fixed.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=BATCH_SIZE,
        num_workers=8, shuffle=True
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=BATCH_SIZE, num_workers=8
    )

    model = utils.Model(
        nfeatures=x_train.shape[1],
        ntargets=y_train.shape[1],
        nlayers=params["num_layers"],
        hidden_size=params["hidden_size"],
        dropout=params["dropout"],
    )

    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=params["learning_rate"])
    # Halve the learning rate every 20 scheduler steps (one step per epoch below).
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)
    # Alternative kept for reference:
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=0.00001,
    #                                                        mode='min', verbose=True)
    eng = utils.Engine(model, optimizer, device=DEVICE)

    best_loss = np.inf
    early_stopping_iter = 10    # patience: tolerated consecutive epochs without improvement
    early_stopping_counter = 0

    for epoch in range(EPOCHS):
        train_loss = eng.train(train_loader)
        valid_loss = eng.evaluate(valid_loader)
        print(f"fold: {fold}, epoch: {epoch}, {train_loss}, {valid_loss}")
        if valid_loss < best_loss:
            best_loss = valid_loss
            # Improvement: restart the patience window so that only
            # *consecutive* non-improving epochs count towards early stopping.
            early_stopping_counter = 0
            if save_model:
                torch.save(model.state_dict(), f"{ROOT}/model_{fold}.bin")
        else:
            early_stopping_counter += 1

        # Stop once more than `early_stopping_iter` epochs in a row failed to improve.
        if early_stopping_counter > early_stopping_iter:
            break

        scheduler.step()  # TODO: check effectiveness of scheduler
    # TODO: figure out how to best store models, e.g. models.append(model.state_dict())
    return best_loss

    


In [59]:
def objective(trial):
    """Optuna objective: mean best validation loss across all folds.

    Samples one hyperparameter configuration from ``trial``, trains every fold
    in ``range(KFOLDS)`` with it via ``run_training`` (without saving models),
    and returns the mean of the per-fold best validation losses.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    Mean of the values returned by ``run_training`` over the folds.
    """
    # can add more params
    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform;
    # log=True samples the learning rate on a log scale, as suggest_loguniform did.
    params = {
        "num_layers": trial.suggest_int("num_layers", 1, 7),
        "hidden_size": trial.suggest_int("hidden_size", 16, 2048),
        "dropout": trial.suggest_float("dropout", 0.1, 0.7),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-3, log=True),
    }
    all_losses = [
        run_training(f_, params, save_model=False) for f_ in range(KFOLDS)
    ]

    return np.mean(all_losses)


In [61]:
if __name__ == "__main__":
    # Hyperparameter search: every trial is scored by objective(); lower is better.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=3)

    # Report the winning trial's score and its hyperparameters.
    print("best trial:")
    trial_ = study.best_trial
    for reported in (trial_.value, trial_.params):
        print(reported)

    # Retrain every fold with the best hyperparameters, this time saving the
    # per-fold checkpoints, and accumulate the per-fold best validation losses.
    scores = 0
    for fold_id in range(KFOLDS):
        scores += run_training(fold_id, trial_.params, save_model=True)

    print(f"final_cv: {scores / KFOLDS}")



[32m[I 2020-10-09 06:37:50,733][0m A new study created in memory with name: no-name-66f063c5-a356-41ca-92b9-305a7df8e470[0m


fold: 0, epoch: 0, 0.7182412994535345, 0.693225359916687
fold: 0, epoch: 1, 0.7101190466629831, 0.6913039445877075
fold: 0, epoch: 2, 0.7021365228452181, 0.6886212587356567
fold: 0, epoch: 3, 0.6942946440295169, 0.6862565994262695
fold: 1, epoch: 0, 0.7214398321352506, 0.6949021697044373
fold: 1, epoch: 1, 0.713549017906189, 0.6933799028396607
fold: 1, epoch: 2, 0.7052856495505885, 0.6909747242927551
fold: 1, epoch: 3, 0.6973650612329182, 0.6882490634918212
fold: 2, epoch: 0, 0.7168367912894801, 0.6906760334968567
fold: 2, epoch: 1, 0.7086111401256762, 0.6879485130310059
fold: 2, epoch: 2, 0.7008030163614374, 0.6855356693267822
fold: 2, epoch: 3, 0.6929597697759929, 0.6825750708580017
fold: 3, epoch: 0, 0.7243734880497581, 0.6939130544662475
fold: 3, epoch: 1, 0.7164391279220581, 0.6921635746955872
fold: 3, epoch: 2, 0.7087389607178537, 0.6901207208633423
fold: 3, epoch: 3, 0.7005074337909096, 0.6879220962524414
fold: 4, epoch: 0, 0.714921091732226, 0.688745379447937
fold: 4, epoch: 1,

[32m[I 2020-10-09 06:38:57,994][0m Trial 0 finished with value: 0.685040512084961 and parameters: {'num_layers': 6, 'hidden_size': 1559, 'dropout': 0.4511122359747308, 'learning_rate': 1.5984442085192133e-06}. Best is trial 0 with value: 0.685040512084961.[0m


fold: 4, epoch: 3, 0.6912515288905093, 0.6801997303962708
fold: 0, epoch: 0, 0.6603661681476393, 0.6144968152046204
fold: 0, epoch: 1, 0.47955514255322906, 0.40520462989807127
fold: 0, epoch: 2, 0.34040456696560506, 0.2858272850513458
fold: 0, epoch: 3, 0.24858566883363223, 0.2315227806568146
fold: 1, epoch: 0, 0.6421449749093306, 0.6082555174827575
fold: 1, epoch: 1, 0.46415029230870697, 0.41668822169303893
fold: 1, epoch: 2, 0.33461067864769384, 0.30827507972717283
fold: 1, epoch: 3, 0.2473542619692652, 0.25635032057762147
fold: 2, epoch: 0, 0.6462506902845282, 0.6039087772369385
fold: 2, epoch: 1, 0.46508009025925084, 0.38716139793396
fold: 2, epoch: 2, 0.33167430131058945, 0.2739458024501801
fold: 2, epoch: 3, 0.2431209518721229, 0.22786185145378113
fold: 3, epoch: 0, 0.6463860524328131, 0.5949306964874268
fold: 3, epoch: 1, 0.46574913671142176, 0.38567407727241515
fold: 3, epoch: 2, 0.33352108691868027, 0.28547051548957825
fold: 3, epoch: 3, 0.244573922533738, 0.25200403332710264


[32m[I 2020-10-09 06:39:58,976][0m Trial 1 finished with value: 0.24168220579624178 and parameters: {'num_layers': 7, 'hidden_size': 194, 'dropout': 0.532661414443959, 'learning_rate': 0.0002984682517979398}. Best is trial 1 with value: 0.24168220579624178.[0m


fold: 4, epoch: 3, 0.24545699593267942, 0.24067204296588898
fold: 0, epoch: 0, 0.718909878479807, 0.6872681140899658
fold: 0, epoch: 1, 0.7066578833680404, 0.6823024868965148
fold: 0, epoch: 2, 0.6944093798336229, 0.6772769451141357
fold: 0, epoch: 3, 0.6827474989389118, 0.6715740084648132
fold: 1, epoch: 0, 0.7191553335440787, 0.6888874888420105
fold: 1, epoch: 1, 0.7067439305154901, 0.683750593662262
fold: 1, epoch: 2, 0.6944212191983273, 0.678686797618866
fold: 1, epoch: 3, 0.6822752293787504, 0.6731544852256774
fold: 2, epoch: 0, 0.7247039707083451, 0.6904198288917541
fold: 2, epoch: 1, 0.7119425409718564, 0.6849652171134949
fold: 2, epoch: 2, 0.7001048514717504, 0.6793313264846802
fold: 2, epoch: 3, 0.68796075645246, 0.6735563635826111
fold: 3, epoch: 0, 0.7148984388301247, 0.6847242951393128
fold: 3, epoch: 1, 0.7025359021989923, 0.6790600419044495
fold: 3, epoch: 2, 0.6900880023052818, 0.6735920429229736
fold: 3, epoch: 3, 0.6782148323561016, 0.6677269220352173
fold: 4, epoch: 0

[32m[I 2020-10-09 06:41:00,552][0m Trial 2 finished with value: 0.6733462905883789 and parameters: {'num_layers': 6, 'hidden_size': 501, 'dropout': 0.5476935073380687, 'learning_rate': 7.857843609345945e-06}. Best is trial 1 with value: 0.24168220579624178.[0m


fold: 4, epoch: 3, 0.694114719566546, 0.6807196736335754
best trial:
0.24168220579624178
{'num_layers': 7, 'hidden_size': 194, 'dropout': 0.532661414443959, 'learning_rate': 0.0002984682517979398}
fold: 0, epoch: 0, 0.6457030051632932, 0.6018311381340027
fold: 0, epoch: 1, 0.46618772964728505, 0.3899673104286194
fold: 0, epoch: 2, 0.3343554045024671, 0.27712319493293763
fold: 0, epoch: 3, 0.2452164064896734, 0.23208097517490386
fold: 1, epoch: 0, 0.6473035341814944, 0.6092552185058594
fold: 1, epoch: 1, 0.4662097049386878, 0.41510764360427854
fold: 1, epoch: 2, 0.33376763996325043, 0.3042787492275238
fold: 1, epoch: 3, 0.2452813818266517, 0.2475907623767853
fold: 2, epoch: 0, 0.6289254175989252, 0.6104933500289917
fold: 2, epoch: 1, 0.454204294242357, 0.427494603395462
fold: 2, epoch: 2, 0.3276837154438621, 0.32498748898506163
fold: 2, epoch: 3, 0.24198642608366513, 0.2796162724494934
fold: 3, epoch: 0, 0.6478072561715779, 0.5917270541191101
fold: 3, epoch: 1, 0.4683400094509125, 0.381

In [None]:

# test_features = pd.read_csv('/content/drive/My Drive/Colab Notebooks/kaggle/MoA/input/test_features.csv')
# test_features = pd.concat([test_features, pd.get_dummies(test_features['cp_time'], prefix='cp_time')], axis=1)
# test_features = pd.concat([test_features, pd.get_dummies(test_features['cp_dose'], prefix='cp_dose')], axis=1)
# test_features = pd.concat([test_features, pd.get_dummies(test_features['cp_type'], prefix='cp_type')], axis=1)
# test_features = test_features.drop(['cp_type', 'cp_time', 'cp_dose'], axis=1)
# class TestMoADataset:
#     def __init__(self, dataset):
#         self.dataset = dataset
    
#     def __len__(self):
#         return self.dataset.shape[0]
    
#     def __getitem__(self, item):
#         return {
#             "x": torch.tensor(self.dataset[item, :], dtype=torch.float)
#         }
# test_dataset = TestMoADataset(dataset=test_features.iloc[:, 1:].values)
# # num_workers=0?
# test_loader = torch.utils.data.DataLoader(
#             test_dataset,
#             batch_size=1024,
#             num_workers=0,
#             shuffle=False
#         )
# predictions = np.zeros((test_features.shape[0], 206))
# inference_model = model.model
# inference_model.eval()

# for ind, batch in enumerate(test_loader):
#     p = inference_model(batch['x'])[0].detach().cpu().numpy()
#     predictions[ind * 1024:(ind + 1) * 1024] = p
# test_features1 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/kaggle/MoA/input/test_features.csv')
# s = pd.DataFrame({'sig_id': test_features1['sig_id'].values})
# for col in train_targets_scored.columns[1:].tolist():
#     s[col] = 0
# s.loc[:, train_targets_scored.columns[1:]] = predictions
# s.loc[s['sig_id'].isin(test_features1.loc[test_features1['cp_type'] == 'ctl_vehicle', 'sig_id']), train_targets_scored.columns[1:]] = 0
# s.to_csv('submission.csv', index=False)
# torch.save(model.model.state_dict(),'model.pt')