In [1]:
import os
import pandas as pd
import numpy as np
import time

# sklearn utilities
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, f1_score, accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier

# filter warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
for file in os.listdir():
    if file.endswith('.csv'):
        print(file)

initial_submission.csv
train_targets_scored.csv
train_features.csv
train_drug.csv
sample_submission.csv
train_targets_nonscored.csv
test_features.csv


#### We have 5 files with data and/or samples
1. sample_submission.csv
2. test_features.csv
3. train_features.csv
4. train_drug.csv
5. train_targets_nonscored.csv
6. train_targets_scored.csv

In [3]:
# Check sample_submission file
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,id_001897cda,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,id_002429b5b,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,id_00276f245,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,id_0027f1083,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [4]:
sample_submission.columns

Index(['sig_id', '5-alpha_reductase_inhibitor', '11-beta-hsd1_inhibitor',
       'acat_inhibitor', 'acetylcholine_receptor_agonist',
       'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor',
       'adenosine_receptor_agonist', 'adenosine_receptor_antagonist',
       'adenylyl_cyclase_activator',
       ...
       'tropomyosin_receptor_kinase_inhibitor', 'trpv_agonist',
       'trpv_antagonist', 'tubulin_inhibitor', 'tyrosine_kinase_inhibitor',
       'ubiquitin_specific_protease_inhibitor', 'vegfr_inhibitor', 'vitamin_b',
       'vitamin_d_receptor_agonist', 'wnt_inhibitor'],
      dtype='object', length=207)

In [5]:
# Check train_features file
train_features = pd.read_csv('train_features.csv')
train_features.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [6]:
# Check test_features file
test_features = pd.read_csv('test_features.csv')
test_features.head()

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,...,0.0981,0.7978,-0.143,-0.2067,-0.2303,-0.1193,0.021,-0.0502,0.151,-0.775
1,id_001897cda,trt_cp,72,D1,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,...,-0.119,-0.1852,-1.031,-1.367,-0.369,-0.5382,0.0359,-0.4764,-1.381,-0.73
2,id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.131,-1.438,0.2455,...,-0.2261,0.337,-1.384,0.8604,-1.953,-1.014,0.8662,1.016,0.4924,-0.1942
3,id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,...,0.126,0.157,-0.1784,-1.12,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
4,id_0027f1083,trt_cp,48,D1,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,...,0.4965,0.7578,-0.158,1.051,0.5742,1.09,-0.2962,-0.5313,0.9931,1.838


In [7]:
# Check train_drug file
train_drug = pd.read_csv('train_drug.csv')
train_drug.head()

Unnamed: 0,sig_id,drug_id
0,id_000644bb2,b68db1d53
1,id_000779bfc,df89a8e5a
2,id_000a6266a,18bb41b2c
3,id_0015fd391,8c7f86626
4,id_001626bd3,7cbed3131


In [8]:
# Check train_targets_nonscored file
train_targets_nonscored = pd.read_csv('train_targets_nonscored.csv')
train_targets_nonscored.head()

Unnamed: 0,sig_id,abc_transporter_expression_enhancer,abl_inhibitor,ace_inhibitor,acetylcholine_release_enhancer,adenosine_deaminase_inhibitor,adenosine_kinase_inhibitor,adenylyl_cyclase_inhibitor,age_inhibitor,alcohol_dehydrogenase_inhibitor,...,ve-cadherin_antagonist,vesicular_monoamine_transporter_inhibitor,vitamin_k_antagonist,voltage-gated_calcium_channel_ligand,voltage-gated_potassium_channel_activator,voltage-gated_sodium_channel_blocker,wdr5_mll_interaction_inhibitor,wnt_agonist,xanthine_oxidase_inhibitor,xiap_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Check train_targets_scored file
train_targets_scored = pd.read_csv('train_targets_scored.csv')
train_targets_scored.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Any missing values in train_features
train_features.isnull().sum()

sig_id     0
cp_type    0
cp_time    0
cp_dose    0
g-0        0
          ..
c-95       0
c-96       0
c-97       0
c-98       0
c-99       0
Length: 876, dtype: int64

In [11]:
# Any missing values in test features
test_features.isnull().sum()

sig_id     0
cp_type    0
cp_time    0
cp_dose    0
g-0        0
          ..
c-95       0
c-96       0
c-97       0
c-98       0
c-99       0
Length: 876, dtype: int64

In [12]:
# Take a copy of data sets and exclude sig_id from working sets
train_f_copy = train_features.copy()
test_f_copy = test_features.copy()
train_targets_scored_copy = train_targets_scored.copy()
train_features = train_features.drop('sig_id',axis=1)
test_features = test_features.drop('sig_id',axis=1)
train_targets_scored = train_targets_scored.drop('sig_id',axis=1)

In [13]:
# Select Numerical columns
numerical_cols = [cname for cname in train_features.columns if train_features[cname].dtype in ['int64', 'float64']]

# Select Categorical columns
categorical_cols = [cname for cname in train_features.columns if train_features[cname].dtype == 'object']

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [14]:
categorical_cols

['cp_type', 'cp_dose']

In [15]:
print(train_features.cp_type.nunique())
print(train_features.cp_dose.nunique())
print(train_features.cp_type.unique())
print(train_features.cp_dose.unique())

2
2
['trt_cp' 'ctl_vehicle']
['D1' 'D2']


In [16]:
train_features = preprocessor.fit_transform(train_features)
test_features = preprocessor.transform(test_features)

In [17]:
# function to perform logistic regression against a target feature
def log_reg(train_X, y, test_X):
    n = train_X.shape[1]
    lg = RandomForestClassifier()
    lg.fit(train_X,y)
    y_hat = lg.predict(train_X)
    y_test_hat = lg.predict(test_X)
    loss = -1/n * np.sum(
        np.multiply(y, np.log(y_hat)) + np.multiply(1-y, np.log(1-y_hat))
    )
    return (y_hat, loss, y_test_hat)

In [18]:
# train and predict the targets
train_results=pd.DataFrame()
test_results=pd.DataFrame()
loss = []
train_f1_score = []
for i in train_targets_scored.columns:
    start_time = time.time()
    y_preds, log_loss, y_test_hat = log_reg(train_features, train_targets_scored[i], test_features)
    train_results[i] = y_preds
    test_results[i] = y_test_hat
    loss.append(log_loss)
    train_f1_score.append(f1_score(train_targets_scored[i], y_preds))
    time_taken = time.time() - start_time 
    print(i,'- time taken',round(time_taken,2),' seconds')

5-alpha_reductase_inhibitor - time taken 21.41  seconds
11-beta-hsd1_inhibitor - time taken 24.35  seconds
acat_inhibitor - time taken 37.61  seconds
acetylcholine_receptor_agonist - time taken 103.87  seconds
acetylcholine_receptor_antagonist - time taken 110.56  seconds
acetylcholinesterase_inhibitor - time taken 56.41  seconds
adenosine_receptor_agonist - time taken 49.51  seconds
adenosine_receptor_antagonist - time taken 66.43  seconds
adenylyl_cyclase_activator - time taken 18.78  seconds
adrenergic_receptor_agonist - time taken 91.35  seconds
adrenergic_receptor_antagonist - time taken 133.59  seconds
akt_inhibitor - time taken 54.51  seconds
aldehyde_dehydrogenase_inhibitor - time taken 10.03  seconds
alk_inhibitor - time taken 41.99  seconds
ampk_activator - time taken 20.35  seconds
analgesic - time taken 19.23  seconds
androgen_receptor_agonist - time taken 43.95  seconds
androgen_receptor_antagonist - time taken 61.88  seconds
anesthetic_-_local - time taken 56.46  seconds


In [19]:
# Average train F1 score
np.array(train_f1_score).mean()

0.9982089684164919

In [20]:
# Average log loss
np.array(loss).mean()

inf

In [21]:
loss_pd = pd.DataFrame(loss)

In [22]:
loss_pd 

Unnamed: 0,0
0,-0.0
1,-0.0
2,-0.0
3,-0.0
4,inf
...,...
201,-0.0
202,-0.0
203,-0.0
204,-0.0


In [23]:
out_df = pd.concat([test_f_copy['sig_id'], test_results], axis=1)
out_df.columns = sample_submission.columns
out_df.to_csv('initial_submission.csv', index=False)  