In [126]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns

In [127]:
features = pd.read_csv('data/train_features.csv')
targets = pd.read_csv('data/train_targets_scored.csv')
# TODO: nonscored as well?

In [128]:
### Data preprocessing

# Drop a useless column
features = features.drop('sig_id', axis=1)
targets = targets.drop('sig_id', axis=1)

# One-hot encode categorical vars
cp_cols = ['cp_type', 'cp_time', 'cp_dose']
features = pd.get_dummies(features, columns=cp_cols)

# Standardize data
from sklearn.preprocessing import StandardScaler
scaler_ = StandardScaler().fit(features)
features = scaler_.transform(features)

# Perform PCA
from sklearn.decomposition import PCA
pca_ = PCA(n_components=0.8).fit(features)
features = pca_.transform(features)
print(f'Data reduced to {features.shape[1]} dimensions')

Data reduced to 198 dimensions


In [129]:
pos_counts = targets.sum(axis=0)
pos_counts.sort_values(ascending=False)

nfkb_inhibitor                                832
proteasome_inhibitor                          726
cyclooxygenase_inhibitor                      435
dopamine_receptor_antagonist                  424
serotonin_receptor_antagonist                 404
                                             ... 
protein_phosphatase_inhibitor                   6
autotaxin_inhibitor                             6
diuretic                                        6
erbb2_inhibitor                                 1
atp-sensitive_potassium_channel_antagonist      1
Length: 206, dtype: int64

In [130]:
from sklearn.utils import resample, shuffle

def resample_data(X, y):
    new_pos, new_neg = 1000, 1000
    pos_idx, neg_idx = np.where(y == 1), np.where(y == 0)
    new_X_pos = resample(X[pos_idx], n_samples=new_pos, replace=True)
    new_X_neg = resample(X[neg_idx], n_samples=new_neg, replace=False)
    new_X = np.concatenate([new_X_pos, new_X_neg])
    new_y = np.concatenate([np.repeat(1, new_pos), np.repeat(0, new_neg)])
    return shuffle(new_X, new_y)

In [145]:
### Linear classification

np.random.seed(0)

from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split

losses = []
for target in targets.columns.values:
    X, y = resample_data(features, targets[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)
    
    model = LogisticRegressionCV(Cs=10, cv=5, scoring='f1')
    model.fit(X_train, y_train)
    print(f"Cs: {model.Cs_}")
    print(f"best C: {model.C_}")
    y_pred, y_prob = model.predict(X_test), model.predict_proba(X_test)
    
    n_pos = sum(targets[target] == 1)
    print(f"{target} - {n_pos} positive examples")
    print(classification_report(y_test, y_pred))
    losses.append(log_loss(y_test, y_prob))

print(f"loss: {np.mean(losses)}")

Cs: [1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04]
best C: [166.81005372]
5-alpha_reductase_inhibitor - 17 positive examples
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       100
           1       1.00      1.00      1.00       100

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cs: [1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04]
best C: [1291.54966501]
11-beta-hsd1_inhibitor - 18 positive examples
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       100
           1       0.95      1.00      0.98       100

    accuracy                           0.97       200
   macro avg       0.98      0.97      0.97       200
weighted avg       0.98      0.97      0.97       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cs: [1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04]
best C: [2.7825594]
acat_inhibitor - 24 positive examples
              precision    recall  f1-score   support

           0       1.00      0.94      0.97       100
           1       0.94      1.00      0.97       100

    accuracy                           0.97       200
   macro avg       0.97      0.97      0.97       200
weighted avg       0.97      0.97      0.97       200

Cs: [1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04]
best C: [166.81005372]
acetylcholine_receptor_agonist - 190 positive examples
              precision    recall  f1-score   support

           0       0.84      0.73      0.78       100
           1       0.76      0.86      0.81       100

    accuracy                           0.80       200
   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cs: [1.00000000e-04 7.74263683e-04 5.99484250e-03 4.64158883e-02
 3.59381366e-01 2.78255940e+00 2.15443469e+01 1.66810054e+02
 1.29154967e+03 1.00000000e+04]
best C: [10000.]
acetylcholinesterase_inhibitor - 73 positive examples
              precision    recall  f1-score   support

           0       1.00      0.78      0.88       100
           1       0.82      1.00      0.90       100

    accuracy                           0.89       200
   macro avg       0.91      0.89      0.89       200
weighted avg       0.91      0.89      0.89       200



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [146]:
### Ensemble model

np.random.seed(0)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

losses = []
for target in targets.columns.values:
    X, y = resample_data(features, targets[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)
    
    model = GridSearchCV(
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
        param_grid={
            'n_estimators': [10, 25, 50, 100],
            'learning_rate': [0.01, 0.1, 1, 10]
        },
        scoring='f1',
        cv=5)
    model.fit(X_train, y_train)
    print(f"best params: {model.best_params_}")
    y_pred, y_prob = model.predict(X_test), model.predict_proba(X_test)
    
    n_pos = sum(targets[target] == 1)
    print(f"{target} - {n_pos} positive examples")
    print(classification_report(y_test, y_pred))
    losses.append(log_loss(y_test, y_prob))

print(f"loss: {np.mean(losses)}")

best params: {'learning_rate': 0.01, 'n_estimators': 100}
5-alpha_reductase_inhibitor - 17 positive examples
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       100
           1       0.97      1.00      0.99       100

    accuracy                           0.98       200
   macro avg       0.99      0.98      0.98       200
weighted avg       0.99      0.98      0.98       200



KeyboardInterrupt: 

In [133]:
### Deep learning

import torch
from torch import nn
from torch.nn import functional as F

class Net(nn.Module):
    def __init__(self, n_feats, n_labels):
        super().__init__()
        self.fc1 = nn.Linear(n_feats, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, n_labels)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [134]:
from sklearn import metrics

def train(model, train_loader, criterion, optimizer, device, epoch):
    model.train()
    
    #all_preds = []
    #all_targets = []
    losses = []
    
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        '''
        batch_size = inputs.size()[0]
        n_labels = targets.size()[1]
        assert outputs.size() == (batch_size, n_labels)
        preds = np.zeros((batch_size, n_labels))
        preds[outputs.detach().numpy() > 0] = 1
        all_preds.extend(preds)
        all_targets.extend(targets.detach().numpy())
        '''
        losses.append(loss.item())
        
        if batch_idx % 100 == 0:
            print(f"\tepoch {epoch} batch {batch_idx} loss {np.mean(losses)}")
            #print(metrics.classification_report(all_targets, all_preds))
    
    return np.mean(losses)

def test(model, test_loader, criterion, device):
    model.eval()

    losses = []
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            losses.append(loss)
    
    return np.mean(losses)

In [148]:
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

np.random.seed(0)
torch.manual_seed(0)

batch_size = 10
learning_rate = 0.01
n_epochs = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_feats, n_labels = features.shape[1], targets.shape[1]
model = Net(n_feats, n_labels).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

X_train, X_test, y_train, y_test = [torch.tensor(np.asarray(split), dtype=torch.float).to(device) for split in train_test_split(features, targets, test_size=0.1)]
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

for epoch in range(n_epochs):
    # Train the neural network
    # Note: we output the probs for each label simultaneously instead of training a bunch of different models
    print(f"starting epoch {epoch}")
    train_loss = train(model, train_loader, criterion, optimizer, device, epoch)
    print(f"epoch {epoch} completed, training loss {train_loss}")
    
    # Test the neural network
    test_loss = test(model, test_loader, criterion, device)
    print(f"test loss {test_loss}")

starting epoch 0
	epoch 0 batch 0 loss 0.6996876001358032
	epoch 0 batch 100 loss 0.05308194199374112
	epoch 0 batch 200 loss 0.03912204130093998
	epoch 0 batch 300 loss 0.03375352380265083
	epoch 0 batch 400 loss 0.031246723991556282
	epoch 0 batch 500 loss 0.029460908112649672
	epoch 0 batch 600 loss 0.02902440421636559
	epoch 0 batch 700 loss 0.0284137892843717
	epoch 0 batch 800 loss 0.028173149339552415
	epoch 0 batch 900 loss 0.027842701077316533
	epoch 0 batch 1000 loss 0.027593464659458094
	epoch 0 batch 1100 loss 0.028082180440892803
	epoch 0 batch 1200 loss 0.027995471557219213
	epoch 0 batch 1300 loss 0.028215831877960094
	epoch 0 batch 1400 loss 0.028186913865547277
	epoch 0 batch 1500 loss 0.02801094133578127
	epoch 0 batch 1600 loss 0.027716666688650457
	epoch 0 batch 1700 loss 0.027402381006976496
	epoch 0 batch 1800 loss 0.027083218006157563
	epoch 0 batch 1900 loss 0.026833143834806568
	epoch 0 batch 2000 loss 0.026572826127384392
	epoch 0 batch 2100 loss 0.02630335943

	epoch 7 batch 1200 loss 0.02064615107187187
	epoch 7 batch 1300 loss 0.02065760110770506
	epoch 7 batch 1400 loss 0.020688681169908272
	epoch 7 batch 1500 loss 0.020672476906912634
	epoch 7 batch 1600 loss 0.020698155783149392
	epoch 7 batch 1700 loss 0.02072789723700737
	epoch 7 batch 1800 loss 0.02071891486234156
	epoch 7 batch 1900 loss 0.020747096877180623
	epoch 7 batch 2000 loss 0.020802625705362825
	epoch 7 batch 2100 loss 0.02077951222154596
epoch 7 completed, training loss 0.020769814560155093
test loss 0.021047938615083694
starting epoch 8
	epoch 8 batch 0 loss 0.017106814309954643
	epoch 8 batch 100 loss 0.020814212481721793
	epoch 8 batch 200 loss 0.02100498531365869
	epoch 8 batch 300 loss 0.020838440941999423
	epoch 8 batch 400 loss 0.020804728751774516
	epoch 8 batch 500 loss 0.020726526424682843
	epoch 8 batch 600 loss 0.020813048247731416
	epoch 8 batch 700 loss 0.020744841008416552
	epoch 8 batch 800 loss 0.02068029023351127
	epoch 8 batch 900 loss 0.0206968337540348

In [153]:
### Generate our predictions for the test set

test_features = pd.read_csv('data/test_features.csv')
test_ids = test_features['sig_id']
test_features = test_features.drop('sig_id', axis=1)
test_features = pd.get_dummies(test_features, columns=cp_cols)
test_features = scaler_.transform(test_features)
test_features = pca_.transform(test_features)

model.eval()
test_loader = DataLoader(TensorDataset(torch.tensor(test_features, dtype=torch.float).to(device)), batch_size=batch_size)

all_outputs = []
with torch.no_grad():
    for (inputs,) in test_loader:
        #print(inputs)
        outputs = model(inputs)
        all_outputs.extend(outputs)
all_outputs = torch.stack(all_outputs, dim=0).numpy()

n_instances, n_labels = test_features.shape[0], targets.shape[1]
test_preds = np.zeros((n_instances, n_labels))
test_preds[all_outputs > 0] = 1

submission = pd.DataFrame(test_preds, columns=targets.columns.values)
submission['sig_id'] = test_ids
submission = submission.loc[:, ['sig_id'] + list(targets.columns.values)] # Reorder
submission.to_csv('submission.csv', index=False)