In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/iterative-stratification/iterative-stratification-master/.gitignore
/kaggle/input/iterative-stratification/iterative-stratification-master/LICENSE
/kaggle/input/iterative-stratification/iterative-stratification-master/README.md
/kaggle/input/iterative-stratification/iterative-stratification-master/setup.cfg
/kaggle/input/iterative-stratification/iterative-stratification-master/setup.py
/kaggle/input/iterative-stratification/iterative-stratification-master/.travis.yml
/kaggle/input/iterative-stratification/iterative-stratification-master/tests/__init__.py
/kaggle/input/iterative-stratification/iterative-stratification-master/tests/test_ml_stratifiers.py
/kaggle/input/iterative-stratification/iterative-stratification-master/iterstrat/__init__.py
/kaggle/input/iterative-stratification/iterative-stratification-master/iterstrat/ml_stratifiers.py
/kaggle/input/lish-moa/test_features.csv
/kaggle/input/lish-moa/train_features.csv
/kaggle/input/lish-moa/train_targets_scored.csv
/k

In [2]:
import sys
# Make the attached dataset folders importable as packages. They are inserted
# at positions 1 and 2, i.e. right after the first sys.path entry.
for position, package_dir in enumerate(
        ('/kaggle/input/permutation-importance',
         '/kaggle/input/iterative-stratification/iterative-stratification-master'),
        start=1):
    sys.path.insert(position, package_dir)

## Import Packages

In [3]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow_addons.optimizers import AdamW, Lookahead
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from PermutationImportance import PermutationImportance

 The versions of TensorFlow you are currently using is 2.3.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


## Helper Functions

In [4]:
def logloss_metric(df, y_true, y_pred):
    """Return the mean of the per-column log loss.

    Parameters
    ----------
    df : object exposing ``.columns``
        Only its column labels are used; they name the targets to score.
    y_true, y_pred : pandas.DataFrame
        Ground truth and predictions, each addressable by the labels in
        ``df.columns``. Predictions are cast to float before scoring.

    Returns
    -------
    float
        ``np.mean`` of the individual ``log_loss`` values.
    """
    # One log_loss per target column; the label set is passed explicitly as [0, 1].
    per_target_losses = [
        log_loss(y_true.loc[:, column],
                 y_pred.loc[:, column].astype(float),
                 labels=[0, 1])
        for column in df.columns
    ]
    return np.mean(per_target_losses)

In [5]:
def pca_fet(train, test, features, kind, n_components, SEED=10):
    """Append PCA component columns, fitted on train and test together.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the columns listed in ``features``.
    features : list
        Column labels fed to the PCA.
    kind : str
        Tag used in the new column names (``pca_{kind}1`` ... ``pca_{kind}N``).
    n_components : int
        Number of principal components to keep.
    SEED : int, default 10
        Passed to ``PCA`` as ``random_state``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with the component columns appended on the
        right. The input frames are not modified.
    """
    n_train = train.shape[0]

    # Stack train over test so a single PCA is fitted on both
    stacked = pd.concat([train[features], test[features]], axis=0)

    # Perform PCA to create new features
    pca = PCA(n_components=n_components, random_state=SEED)
    components = pca.fit_transform(stacked)
    columns = [f'pca_{kind}{i + 1}' for i in range(n_components)]

    # Give each component frame the index of the frame it is attached to.
    # concat(axis=1) aligns on index labels, so a fresh 0..n-1 index would
    # misalign rows (and introduce NaNs) whenever the caller's index is not
    # the default one.
    train_pca = pd.DataFrame(components[:n_train], columns=columns, index=train.index)
    test_pca = pd.DataFrame(components[n_train:], columns=columns, index=test.index)

    # Append new features to train and test datasets
    train = pd.concat([train, train_pca], axis=1)
    test = pd.concat([test, test_pca], axis=1)
    return train, test

In [6]:
def kmeans_fet(train, test, features, kind, n_clusters, SEED=10):
    """Append one-hot KMeans cluster membership, fitted on train and test together.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the columns listed in ``features``.
    features : list
        Column labels fed to KMeans.
    kind : str
        Tag used in the new column names (``clusters_{kind}_{cluster_id}``).
    n_clusters : int
        Number of clusters.
    SEED : int, default 10
        Passed to ``KMeans`` as ``random_state``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames, each with exactly ``n_clusters`` indicator
        columns. The input frames are not modified.
    """
    n_train = train.shape[0]

    # Stack train over test so a single clustering is fitted on both
    stacked = pd.concat([train[features], test[features]], axis=0)

    # Perform KMeans to create new features
    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED).fit(stacked)

    cluster_col = f'clusters_{kind}'
    cluster_ids = list(range(n_clusters))

    # Work on copies so the caller's frames do not pick up a stray label column
    train = train.copy()
    test = test.copy()

    # A fixed categorical makes get_dummies emit a column for every cluster id,
    # so train and test end up with identical columns even when some cluster
    # has no member in one of them.
    train[cluster_col] = pd.Categorical(kmeans.labels_[:n_train], categories=cluster_ids)
    test[cluster_col] = pd.Categorical(kmeans.labels_[n_train:], categories=cluster_ids)

    train = pd.get_dummies(train, columns=[cluster_col])
    test = pd.get_dummies(test, columns=[cluster_col])
    return train, test

In [7]:
def data_preprocess(dataset_dict, SEED=10):
    """Load the CSV inputs and build the model-ready feature and label arrays.

    Parameters
    ----------
    dataset_dict : dict
        Must provide the CSV paths under the keys ``'train_dataset'``,
        ``'train_labels_scored'`` and ``'predict_dataset'``.
    SEED : int, default 10
        Forwarded to ``pca_fet`` for the PCA feature step.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, numpy.ndarray)
        ``train_x``, ``train_y`` and ``predict_x``. Training rows whose
        ``cp_type`` is not ``'trt_cp'`` are dropped; prediction rows are all kept.
    """
    # Extract dataset paths from input dict
    train_ds = dataset_dict['train_dataset']
    train_labels_ds = dataset_dict['train_labels_scored']
    predict_ds = dataset_dict['predict_dataset']

    # Read and display individual dataframe shapes
    train_df = pd.read_csv(train_ds)
    train_label_df = pd.read_csv(train_labels_ds)
    predict_df = pd.read_csv(predict_ds)
    print("\n------------- Input Dataset Shapes -------------")
    print("train_df: {}".format(train_df.shape))
    print("train_label_df: {}".format(train_label_df.shape))
    print("predict_df: {}".format(predict_df.shape))

    # Drop training rows with cp_type = ctl_vehicle.
    # NOTE(review): the mask comes from train_df and is applied to the label
    # frame too, which presumably relies on both CSVs listing rows in the same
    # order - confirm.
    is_treated = train_df['cp_type'] == 'trt_cp'
    train_label_df = train_label_df.loc[is_treated].reset_index(drop=True)
    train_df = train_df.loc[is_treated].reset_index(drop=True)
    train_samples = train_df.shape[0]
    print("\n------------- Dataset Shapes after removing ctl_vehicle rows -------------")
    print("train_df: {}".format(train_df.shape))
    print("train_label_df: {}".format(train_label_df.shape))

    # Separate the columns for gene expression and cell viability
    # (selected by position in the training frame)
    features_g = list(train_df.columns[4:776])
    features_c = list(train_df.columns[776:876])

    # Generate PCA features
    print("\n------------- Performing PCA -------------")
    train, test = pca_fet(train_df, predict_df, features_g, kind='g', n_components=80, SEED=SEED)
    train, test = pca_fet(train, test, features_c, kind='c', n_components=80, SEED=SEED)
    print("Dataset size after PCA: \ntrain: {} \npredict: {}".format(train.shape, test.shape))

    # KMeans features are currently disabled; kept for reference:
    # print("\n------------- Performing KMeans -------------")
    # train, test = kmeans_fet(train, test, features_g, kind='g', n_clusters=50, SEED=SEED)
    # train, test = kmeans_fet(train, test, features_c, kind='c', n_clusters=50, SEED=SEED)
    # print("Dataset size after KMeans: \ntrain: {} \npredict: {}".format(train.shape, test.shape))

    # Combine both input dataframes (train rows first, then predict rows).
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    combined_df = pd.concat([train, test], sort=False, ignore_index=True)

    # Additional Feature Engineering
    combined_df['cp_type_enc'] = combined_df['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    combined_df['cp_dose_enc'] = combined_df['cp_dose'].map({'D1': 0, 'D2': 1})
    combined_df['cp_time_days'] = combined_df['cp_time'] // 24
    combined_df = combined_df.drop(columns=['sig_id', 'cp_type', 'cp_dose', 'cp_time'])

    # Row-wise summary statistics for each feature group. Column order matters
    # because the frame is converted to a plain array below.
    feature_groups = (
        ('g', features_g),
        ('c', features_c),
        ('gc', features_g + features_c),
    )
    for prefix, cols in feature_groups:
        group_values = combined_df[cols]
        combined_df[f'{prefix}_sum'] = group_values.sum(axis=1)
        combined_df[f'{prefix}_mean'] = group_values.mean(axis=1)
        combined_df[f'{prefix}_std'] = group_values.std(axis=1)
        combined_df[f'{prefix}_kurt'] = group_values.kurtosis(axis=1)
        combined_df[f'{prefix}_skew'] = group_values.skew(axis=1)

    # Segregate train and predict datasets
    train_y = train_label_df.drop(columns=['sig_id']).values
    train_x = combined_df[:train_samples].values
    predict_x = combined_df[train_samples:].values

    print("\n------------- Final Dataset Shapes -------------")
    print("\ntrain_x: {}".format(train_x.shape))
    print("train_y: {}".format(train_y.shape))
    print("predict_x: {}".format(predict_x.shape))

    return train_x, train_y, predict_x

In [8]:
def moa_prediction_model_2L(input_shape, output_shape):
    """Build the network with two ReLU hidden layers.

    Parameters
    ----------
    input_shape : int
        Number of input features (single dimension of the input tensor).
    output_shape : int
        Number of units in the sigmoid output layer.

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model named ``MOA_Prediction_Model_2L``.
    """
    # (units, L2 penalty) for each hidden layer, in order
    hidden_specs = [(1024, 0.0005), (256, 0.0001)]

    # Input is batch-normalised before the first dense layer
    x_input = Input(shape=(input_shape, ), name='INPUT')
    x = BatchNormalization(name='BN-INPUT')(x_input)

    # Each hidden stage: Dense(relu) -> BatchNorm -> Dropout(0.5)
    for layer_no, (n_units, weight_penalty) in enumerate(hidden_specs, start=1):
        x = Dense(units=n_units, name=f'FC-{layer_no}', activation='relu',
                  kernel_regularizer=l2(weight_penalty))(x)
        x = BatchNormalization(name=f'BN_FC-{layer_no}')(x)
        x = Dropout(rate=0.5, name=f'DROPOUT_FC-{layer_no}')(x)

    outputs = Dense(units=output_shape, activation='sigmoid', name='OUTPUT')(x)

    return Model(inputs=x_input, outputs=outputs, name='MOA_Prediction_Model_2L')

In [9]:
def moa_prediction_model_3L(input_shape, output_shape):
    """Build the network with three SELU hidden layers.

    Parameters
    ----------
    input_shape : int
        Number of input features (single dimension of the input tensor).
    output_shape : int
        Number of units in the sigmoid output layer.

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model named ``MOA_Prediction_Model_3L``.
    """
    # (units, L2 penalty) for each hidden layer, in order
    hidden_specs = [(1024, 0.0005), (512, 0.0003), (256, 0.0001)]

    # Input is batch-normalised before the first dense layer
    x_input = Input(shape=(input_shape, ), name='INPUT')
    x = BatchNormalization(name='BN-INPUT')(x_input)

    # Each hidden stage: Dense(selu) -> BatchNorm -> Dropout(0.5)
    for layer_no, (n_units, weight_penalty) in enumerate(hidden_specs, start=1):
        x = Dense(units=n_units, name=f'FC-{layer_no}', activation='selu',
                  kernel_regularizer=l2(weight_penalty))(x)
        x = BatchNormalization(name=f'BN_FC-{layer_no}')(x)
        x = Dropout(rate=0.5, name=f'DROPOUT_FC-{layer_no}')(x)

    outputs = Dense(units=output_shape, activation='sigmoid', name='OUTPUT')(x)

    return Model(inputs=x_input, outputs=outputs, name='MOA_Prediction_Model_3L')

In [10]:
def moa_prediction_model_4L(input_shape, output_shape):
    """Build the network with four LeakyReLU hidden layers.

    Parameters
    ----------
    input_shape : int
        Number of input features (single dimension of the input tensor).
    output_shape : int
        Number of units in the sigmoid output layer.

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model named ``MOA_Prediction_Model_4L``.
    """
    # (units, L2 penalty) for each hidden layer, in order
    hidden_specs = [(1024, 0.0005), (512, 0.0003), (512, 0.0003), (256, 0.0001)]

    # Input is batch-normalised before the first dense layer
    x_input = Input(shape=(input_shape, ), name='INPUT')
    x = BatchNormalization(name='BN-INPUT')(x_input)

    # Each hidden stage: linear Dense -> LeakyReLU -> BatchNorm -> Dropout(0.5).
    # The activation is a separate layer here, so Dense gets no activation.
    for layer_no, (n_units, weight_penalty) in enumerate(hidden_specs, start=1):
        x = Dense(units=n_units, name=f'FC-{layer_no}',
                  kernel_regularizer=l2(weight_penalty))(x)
        x = LeakyReLU()(x)
        x = BatchNormalization(name=f'BN_FC-{layer_no}')(x)
        x = Dropout(rate=0.5, name=f'DROPOUT_FC-{layer_no}')(x)

    outputs = Dense(units=output_shape, activation='sigmoid', name='OUTPUT')(x)

    return Model(inputs=x_input, outputs=outputs, name='MOA_Prediction_Model_4L')

In [11]:
def moa_prediction_model_5L(input_shape, output_shape):
    """Build the network with five swish hidden layers.

    Parameters
    ----------
    input_shape : int
        Number of input features (single dimension of the input tensor).
    output_shape : int
        Number of units in the sigmoid output layer.

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model named ``MOA_Prediction_Model_5L``.
    """
    # (units, L2 penalty) for each hidden layer, in order
    hidden_specs = [
        (1024, 0.0005),
        (512, 0.0003),
        (512, 0.0003),
        (256, 0.0001),
        (256, 0.0001),
    ]

    # Input is batch-normalised before the first dense layer
    x_input = Input(shape=(input_shape, ), name='INPUT')
    x = BatchNormalization(name='BN-INPUT')(x_input)

    # Each hidden stage: Dense(swish) -> BatchNorm -> Dropout(0.5)
    for layer_no, (n_units, weight_penalty) in enumerate(hidden_specs, start=1):
        x = Dense(units=n_units, name=f'FC-{layer_no}', activation='swish',
                  kernel_regularizer=l2(weight_penalty))(x)
        x = BatchNormalization(name=f'BN_FC-{layer_no}')(x)
        x = Dropout(rate=0.5, name=f'DROPOUT_FC-{layer_no}')(x)

    outputs = Dense(units=output_shape, activation='sigmoid', name='OUTPUT')(x)

    return Model(inputs=x_input, outputs=outputs, name='MOA_Prediction_Model_5L')

## Data Preprocessing

### Set file paths for train and predict datasets

In [12]:
# CSV locations: training features, scored training targets, features to predict on
train_dataset, train_labels_scored, predict_dataset = (
    "/kaggle/input/lish-moa/train_features.csv",
    "/kaggle/input/lish-moa/train_targets_scored.csv",
    "/kaggle/input/lish-moa/test_features.csv",
)

### Process train and predict features datasets

In [13]:
# Load the three raw tables. The label frame is kept without its sig_id
# column so that its columns are exactly the target names.
train_df = pd.read_csv(train_dataset)
predict_df = pd.read_csv(predict_dataset)
train_label_df = pd.read_csv(train_labels_scored).drop(columns=['sig_id'])

# Report the shape of each frame
print("train_df: {}".format(train_df.shape))
print("train_label_df: {}".format(train_label_df.shape))
print("predict_df: {}".format(predict_df.shape))

train_df: (23814, 876)
train_label_df: (23814, 206)
predict_df: (3982, 876)


In [14]:
# Bundle the CSV paths under the keys that data_preprocess reads
datasets = {
    'train_dataset': train_dataset,
    'train_labels_scored': train_labels_scored,
    'predict_dataset': predict_dataset,
}

In [15]:
# Build the model inputs once. SEED is left at data_preprocess's default,
# which is what seeds the PCA feature step.
Xtrain, Ytrain, Xpredict = data_preprocess(datasets)


------------- Input Dataset Shapes -------------
train_df: (23814, 876)
train_label_df: (23814, 207)
predict_df: (3982, 876)

------------- Dataset Shapes after removing ctl_vehicle rows -------------
train_df: (21948, 876)
train_label_df: (21948, 207)

------------- Performing PCA -------------
Dataset size after PCA: 
train: (21948, 1036) 
predict: (3982, 1036)

------------- Final Dataset Shapes -------------

train_x: (21948, 1050)
train_y: (21948, 206)
predict_x: (3982, 1050)


## Build and validate the model

In [16]:
# Mini-batch size used for model fitting
mini_batch_size = 128

# Lower / upper bounds applied when clipping predictions
p_min, p_max = 0.001, 0.999

# Number of cross-validation folds
FOLD = 7

# Draw the CV seeds for each architecture. The global NumPy RNG is seeded
# first and the draws happen in this exact order, so the seeds are reproducible.
np.random.seed(1)
seed_2L, seed_3L, seed_4L, seed_5L = (
    np.random.randint(low, high, size=n_seeds)
    for low, high, n_seeds in (
        (0, 50, 3),       # relu
        (51, 100, 3),     # selu
        (101, 150, 2),    # leaky-relu
        (151, 200, 3),    # swish
    )
)
seed_mstr = [seed_2L, seed_3L, seed_4L, seed_5L]

In [17]:
# Train every architecture over its seeds and CV folds. Each fitted fold model
# contributes its validation log loss to the running metrics and its
# predictions on Xpredict to the ensemble sum; both are averaged at the end.
val_metric_final = 0      # sum (later mean) of validation log loss over all fits
y_pred_final = 0          # sum (later mean) of predictions on Xpredict over all fits
idx = 0                   # total number of fitted models

# (number of hidden layers, builder) pairs, listed in the same order as seed_mstr
model_builders = [
    (2, moa_prediction_model_2L),
    (3, moa_prediction_model_3L),
    (4, moa_prediction_model_4L),
    (5, moa_prediction_model_5L),
]

for (n_layers, build_model), seeds in zip(model_builders, seed_mstr):
    val_metric = 0        # sum of validation log loss for this architecture
    j = 0                 # number of fits for this architecture

    for seed in seeds:
        seed_metric = 0   # sum of validation log loss for this seed

        # Features are built once before the loop (Xtrain / Ytrain / Xpredict);
        # the seed only drives the fold split here.

        # Define K-fold cross validation test harness
        kfold = MultilabelStratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

        # The fold counter has its own name so it does not clobber an outer loop variable
        for fold, (train_rows, val_rows) in enumerate(kfold.split(Xtrain, Ytrain)):

            idx += 1
            j += 1
            train_x_tmp, val_x_tmp = Xtrain[train_rows], Xtrain[val_rows]
            train_y_tmp, val_y_tmp = Ytrain[train_rows], Ytrain[val_rows]

            # Create a fresh model for this fold
            model = build_model(Xtrain.shape[1], Ytrain.shape[1])

            # Compile model to configure the learning process
            model.compile(loss='binary_crossentropy',
                          optimizer=Lookahead(AdamW(lr=1e-2,
                                                    weight_decay=1e-5,
                                                    clipvalue=700),
                                              sync_period=10))

            # Early stopping policy
            early = EarlyStopping(monitor="val_loss", mode="min",
                                  restore_best_weights=True,
                                  patience=10, verbose=0)

            # Reduce LR on plateau policy
            reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                          min_lr=1e-5, patience=5,
                                          verbose=0, mode='min')

            # Fit the model
            history = model.fit(x=train_x_tmp, y=train_y_tmp,
                                batch_size=mini_batch_size,
                                epochs=85, verbose=0, workers=5,
                                callbacks=[reduce_lr, early],
                                validation_data=(val_x_tmp, val_y_tmp))

            # Get logloss metric on validation dataset (predictions clipped first)
            y_pred = model.predict(val_x_tmp)
            y_pred = np.clip(y_pred, p_min, p_max)
            true_labels = pd.DataFrame(val_y_tmp, columns=train_label_df.columns)
            pred_labels = pd.DataFrame(y_pred, columns=train_label_df.columns)
            metric = logloss_metric(train_label_df, true_labels, pred_labels)

            # Accumulate the metric at seed, architecture and global level
            seed_metric += metric
            val_metric += metric
            val_metric_final += metric

            # Accumulate this fold model's predictions for the ensemble average
            pred_final = model.predict(Xpredict)
            y_pred_final += pred_final
            print("{}-Layer Model | Seed: {} | Fold: {} | Log Loss: {}".format(n_layers, seed, (fold+1), metric))


        agg_metric_per_fold = seed_metric / FOLD
        print("\n{}-Layer Model | Seed: {} | Aggregate Log Loss: {}".format(n_layers, seed, agg_metric_per_fold))
        print("--------------------------------------------------------------------")


    agg_metric_per_seed = val_metric / j
    print("--------------------------------------------------------------------")
    print("{}-Layer Model | Aggregate Log Loss: {}".format(n_layers, agg_metric_per_seed))
    print("--------------------------------------------------------------------")
    print("--------------------------------------------------------------------\n\n")


print("\nTotal #iterations: {}".format(idx))
val_metric_final /= float(idx)
y_pred_final /= float(idx)
# Report the mean over all fits, not the last architecture's running sum
print("Validation Logloss Metric: {}".format(val_metric_final))

2-Layer Model | Seed: 37 | Fold: 1 | Log Loss: 0.01628473008242043
2-Layer Model | Seed: 37 | Fold: 2 | Log Loss: 0.016675568282555793
2-Layer Model | Seed: 37 | Fold: 3 | Log Loss: 0.016449822286556236
2-Layer Model | Seed: 37 | Fold: 4 | Log Loss: 0.016618990101947423
2-Layer Model | Seed: 37 | Fold: 5 | Log Loss: 0.016336608981526286
2-Layer Model | Seed: 37 | Fold: 6 | Log Loss: 0.016604642013457335
2-Layer Model | Seed: 37 | Fold: 7 | Log Loss: 0.016507218687340872

2-Layer Model | Seed: 37 | Aggregate Log Loss: 0.01649679720511491
--------------------------------------------------------------------
2-Layer Model | Seed: 43 | Fold: 1 | Log Loss: 0.016509745287787165
2-Layer Model | Seed: 43 | Fold: 2 | Log Loss: 0.016677909837265654
2-Layer Model | Seed: 43 | Fold: 3 | Log Loss: 0.016234026289052725
2-Layer Model | Seed: 43 | Fold: 4 | Log Loss: 0.016435054032969593
2-Layer Model | Seed: 43 | Fold: 5 | Log Loss: 0.016373127844958663
2-Layer Model | Seed: 43 | Fold: 6 | Log Loss: 0

## Create submission file

In [18]:
# Clip the averaged predictions and wrap them in a frame with the target column names
y_pred_final = np.clip(y_pred_final, p_min, p_max)
pred_labels = pd.DataFrame(y_pred_final, columns=train_label_df.columns)

# Set every target to 0 for rows whose cp_type is ctl_vehicle. This runs after
# clipping, so those rows stay exactly 0 rather than p_min.
is_control = predict_df['cp_type'] == 'ctl_vehicle'
pred_labels.loc[is_control, train_label_df.columns] = 0
print(pred_labels.shape)
pred_labels.head()

(3982, 206)


Unnamed: 0,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,adrenergic_receptor_agonist,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,0.001195,0.001304,0.001383,0.01509,0.024498,0.005698,0.003395,0.00601,0.001,0.017447,...,0.001,0.001,0.003421,0.001868,0.001427,0.001,0.001225,0.001789,0.002849,0.001934
1,0.001,0.001,0.002028,0.003667,0.004198,0.001933,0.002291,0.005213,0.003312,0.01143,...,0.001,0.002008,0.002123,0.001019,0.010571,0.001,0.007473,0.001098,0.002142,0.001517
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001,0.001,0.001738,0.009912,0.011846,0.003252,0.002705,0.005112,0.001,0.011944,...,0.001,0.001116,0.00254,0.003387,0.007088,0.001,0.004711,0.00182,0.001835,0.001908
4,0.001,0.001071,0.001255,0.009942,0.014895,0.003758,0.003373,0.003761,0.001,0.012268,...,0.001,0.001163,0.002768,0.003787,0.001627,0.001,0.001232,0.001209,0.001831,0.001368


In [19]:
# Use the sample submission as the template; it supplies the sig_id column
submit_df = pd.read_csv("/kaggle/input/lish-moa/sample_submission.csv")
# Overwrite every column except sig_id with the predictions.
# NOTE(review): this presumably relies on sample_submission.csv listing rows
# in the same order as test_features.csv and using the same target column
# names as pred_labels - confirm.
submit_df.loc[:, submit_df.columns != 'sig_id'] = pred_labels
submit_df.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.001195,0.001304,0.001383,0.01509,0.024498,0.005698,0.003395,0.00601,0.001,...,0.001,0.001,0.003421,0.001868,0.001427,0.001,0.001225,0.001789,0.002849,0.001934
1,id_001897cda,0.001,0.001,0.002028,0.003667,0.004198,0.001933,0.002291,0.005213,0.003312,...,0.001,0.002008,0.002123,0.001019,0.010571,0.001,0.007473,0.001098,0.002142,0.001517
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,0.001,0.001,0.001738,0.009912,0.011846,0.003252,0.002705,0.005112,0.001,...,0.001,0.001116,0.00254,0.003387,0.007088,0.001,0.004711,0.00182,0.001835,0.001908
4,id_0027f1083,0.001,0.001071,0.001255,0.009942,0.014895,0.003758,0.003373,0.003761,0.001,...,0.001,0.001163,0.002768,0.003787,0.001627,0.001,0.001232,0.001209,0.001831,0.001368


In [20]:
# Write the submission file to the Kaggle working directory.
# index=False keeps the row index out of the CSV.
submit_df.to_csv("/kaggle/working/submission.csv", index=False)