In [1]:
import pandas as pd
import numpy as np

In [2]:
"""
Reading data
"""
df_train = pd.read_csv('cs589_mini_project/data/train_features.csv')
df_test = pd.read_csv('cs589_mini_project/data/test_features.csv')
df_target_s = pd.read_csv('cs589_mini_project/data/train_targets_scored.csv')
submission = pd.read_csv('cs589_mini_project/data/sample_submission.csv')

In [3]:
# display(df_train.head(5))
# display(df_test.head(5))
# display(df_target_s.head(5))
# display(submission.head(5))
print(df_train.shape)
print(df_test.shape)
print(df_target_s.shape)
print(submission.shape)

(23814, 876)
(3982, 876)
(23814, 207)
(3982, 207)


In [4]:
def preprocess(df):
    df['cp_type'] = df['cp_type'].map({'trt_cp':0, 'ctl_vehicle':1})
    df['cp_time'] = df['cp_time'].map({24:1, 48:2, 72:3})
    df['cp_dose'] = df['cp_dose'].map({'D1':0, 'D2':1})
    return df
X_train = preprocess(df_train)
X_test = preprocess(df_test)
display(X_train)

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,0,1,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,0,3,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,0,2,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,id_0015fd391,0,2,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,id_001626bd3,0,3,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,0,1,1,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,...,0.1969,0.0262,-0.8121,0.3434,0.5372,-0.3246,0.0631,0.9171,0.5258,0.4680
23810,id_fffb70c0c,0,1,1,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,...,0.4286,0.4426,0.0423,-0.3195,-0.8086,-0.9798,-0.2084,-0.1224,-0.2715,0.3689
23811,id_fffc1c3f4,1,2,1,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,...,0.5409,0.3755,0.7343,0.2807,0.4116,0.6422,0.2256,0.7592,0.6656,0.3808
23812,id_fffcb9e7c,0,1,0,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,...,-0.1105,0.4258,-0.2012,0.1506,1.5230,0.7101,0.1732,0.7015,-0.6290,0.0740


In [5]:
from sklearn.decomposition import PCA

"""
PCA on Genes 
"""
n_comp = 200

genes = [col for col in X_train.columns if col.startswith('g-')]
cells = [col for col in X_train.columns if col.startswith('c-')]

data_genes = pd.concat([pd.DataFrame(X_train[genes]), pd.DataFrame(X_test[genes])])
data_genes_pca = PCA(n_components=n_comp, random_state=42).fit_transform(data_genes)

train_gene_pca = data_genes_pca[:X_train.shape[0]]
test_gene_pca = data_genes_pca[-X_test.shape[0]:]

train_gene_pca = pd.DataFrame(train_gene_pca, columns=[f'pca_G-{i}' for i in range(n_comp)])
test_gene_pca = pd.DataFrame(test_gene_pca, columns=[f'pca_G-{i}' for i in range(n_comp)])

"""
PCA on Cells
"""
n_comp = 50 

data_cells = pd.concat([pd.DataFrame(X_train[cells]), pd.DataFrame(X_test[cells])])
data_cells_pca = PCA(n_components=n_comp, random_state=42).fit_transform(data_cells)

train_cells_pca = data_cells_pca[:X_train.shape[0]] 
test_cells_pca = data_cells_pca[-X_test.shape[0]:]

train_cells_pca = pd.DataFrame(train_cells_pca, columns=[f'pca_C-{i}' for i in range(n_comp)])
test_cells_pca = pd.DataFrame(test_cells_pca, columns=[f'pca_C-{i}' for i in range(n_comp)])

In [6]:
train_features = pd.concat((train_gene_pca, train_cells_pca), axis=1)
test_features = pd.concat((test_gene_pca, test_cells_pca), axis=1)

# y_labels
df_target_s = df_target_s.drop(['sig_id'], axis=1)

display(train_features)

Unnamed: 0,pca_G-0,pca_G-1,pca_G-2,pca_G-3,pca_G-4,pca_G-5,pca_G-6,pca_G-7,pca_G-8,pca_G-9,...,pca_C-40,pca_C-41,pca_C-42,pca_C-43,pca_C-44,pca_C-45,pca_C-46,pca_C-47,pca_C-48,pca_C-49
0,-8.191034,-0.488171,-3.967531,6.922384,3.426204,-4.939660,-4.021629,3.108678,2.719793,-2.409787,...,-0.511125,1.022494,-0.136036,0.075919,-0.066207,-0.101230,0.225362,0.057230,-0.107296,-0.341431
1,-6.531898,3.288575,9.420453,-0.574416,-2.009119,4.779334,2.585964,1.996409,0.305785,1.425774,...,-0.319058,0.319740,-0.159246,-0.345882,0.150300,-0.216133,0.687300,0.400472,-0.136589,0.428745
2,-1.669338,2.297153,-0.782439,-7.036332,-1.434308,-1.718394,2.841784,-0.774921,-1.731597,-4.942427,...,-0.149934,-0.867463,-0.574558,-0.320742,0.147007,0.119810,0.768706,-1.156860,-0.365257,-0.456086
3,9.268336,-4.829223,-1.752497,-11.326041,-1.178113,-10.384511,5.975081,-2.337151,-0.157038,4.973514,...,-1.084507,0.665303,-0.475641,-0.746390,1.684805,-0.698474,-0.320341,0.882071,1.369027,1.143375
4,-7.185918,0.111345,8.256769,-7.394481,-0.632286,-4.333923,-1.741193,0.646718,-6.767281,5.830797,...,-1.127680,-0.199497,0.697061,-0.557235,-0.913216,0.174704,0.175920,0.543692,0.291145,0.513161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,-6.018252,2.927708,-0.626853,-3.892167,-0.773846,-0.904330,1.530116,0.114901,2.579791,-1.943151,...,-0.287854,-0.227900,0.129922,0.478464,0.659682,-0.359301,0.600943,-0.788161,-0.156615,-1.434734
23810,-4.952320,-0.545998,-1.626794,0.221293,4.770532,-0.212782,0.308716,-2.716075,-1.425005,0.317466,...,-0.101267,-0.316646,-0.221708,-0.471236,-0.263232,-0.692952,-1.059063,0.699861,0.985178,0.260118
23811,-6.522456,-1.372804,-0.831251,0.531181,-0.837211,-1.502036,0.003087,2.996020,-1.024627,-0.891801,...,-0.542585,0.421446,0.047425,-0.000176,-0.890236,0.125915,-0.517586,0.500146,0.305829,-0.503617
23812,7.535654,-20.646006,-1.614500,12.610355,-8.006033,-0.190805,-3.018044,5.666977,7.751807,7.323825,...,0.472907,-1.584457,-0.692551,0.229086,-0.850729,0.451430,0.720103,0.530429,0.165919,0.857855


In [10]:
from sklearn.linear_model import LogisticRegression
# from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, log_loss

"""
Linear Model - Logistic Regression (multinomial)
"""
# 3_Fold cross validation
n_splits = 3

kf = KFold(n_splits=n_splits)
kf.get_n_splits(train_features)

log_Loss = 0.0

for train_index, test_index in kf.split(train_features):
    
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train_, X_val_ = train_features.iloc[train_index], train_features.iloc[test_index]
    y_train_, y_val_ = df_target_s.iloc[train_index], df_target_s.iloc[test_index]
    
    X_train_, X_val_ = np.array(X_train_), np.array(X_val_)
    y_train_, y_val_ = np.array(y_train_), np.array(y_val_)
    
    # fix for empty columns - LogisticRegression blows up otherwise 
    check_for_empty_cols = np.where(y_train_.sum(axis = 0) == 0)[0]
    if len(check_for_empty_cols):
        y_train_[0, check_for_empty_cols] = 1
    
    clf = MultiOutputClassifier(LogisticRegression(max_iter=10000, tol=0.1, C = 0.5), n_jobs=-1)
    clf.fit(X_train_, y_train_)
    y_pred = clf.predict(X_val_)
    
    # calc averag log loss
    print("loss: ", log_loss(y_val_, y_pred))
    loss += log_loss(y_val_, y_pred)
    
    
loss /= n_splits
print("average log loss: ", loss)

TRAIN: [ 7938  7939  7940 ... 23811 23812 23813] TEST: [   0    1    2 ... 7935 7936 7937]
loss:  5.038902350517758
TRAIN: [    0     1     2 ... 23811 23812 23813] TEST: [ 7938  7939  7940 ... 15873 15874 15875]
loss:  5.138453919425258
TRAIN: [    0     1     2 ... 15873 15874 15875] TEST: [15876 15877 15878 ... 23811 23812 23813]
loss:  5.124433833132402
average log loss:  6.780230817864393


In [11]:
y_pred.shape

(7938, 206)

In [16]:
log_Loss = 0.0
for c in range(y_pred.shape[0]):
    y_pred_col = y_pred[:, c]
    y_val_col = y_val_[:, c]
    # fix empty y_val columns if there is any
    if y_val_col.sum()==0:
        y_val_col[0] = 1
    log_Loss += log_loss(y_val_col, y_pred_col)
    
print(log_Loss)
    

print(log_Loss / y_pred.shape[0])

IndexError: index 206 is out of bounds for axis 1 with size 206

In [15]:
np.sum(y_pred, axis=0)

array([  1,   1,   2,   1,   4,   6,  13,   4,   2,   8,   0,  19,   2,
        11,   1,   0,  12,   1,   6,   9,   5,  13,   0,  14,   0,   0,
         2,   3,  11,   8,   0,  17,   7,   0,   0,   1,  17,   2,  32,
         0,   6,   9,   9,   6,   6,   4,   1,   4,   7,  10,   1,  19,
         2,   0,   2,  12,  13,  12,   7,   2,   0,   4,   2,  99,   5,
         4,  10,  21,  15,   0,   4,   0,   5,   9,  10,   0,  10,  14,
         3,   5, 110,   0,   0,   8,  16,   8,   1,   6,  18,  79,   3,
         0,   2,   5,   3,  11,  86,   0,   2,   1,   1,  18,   4,  30,
         9,   2,   1,   1,  13,  75,  32,   5,   5,   8,  21,   9,  12,
         8,  30,  94,   0,   1,   8,   0,  13,   0,   8,  18,  12,   8,
         0,   3,   2,  41,   8,   8, 259,   0,   1,   1,   1,   0,   0,
         7,   8,   8,  16,   2,  22,  96,   1,   1,   4,  32,   1,  14,
         2,  16,   7,  16,   3,   7,   4, 254,  12,   0,  12,   0,  10,
        78,   0,  12,   0,   2,  13,   5,   2,   1,   8,   9,  1

In [17]:
from sklearn.ensemble import RandomForestClassifier

# 3_Fold cross validation
n_splits = 3

kf = KFold(n_splits=n_splits)
kf.get_n_splits(train_features)

log_Loss = 0.0

for train_index, test_index in kf.split(train_features):
    
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train_, X_val_ = train_features.iloc[train_index], train_features.iloc[test_index]
    y_train_, y_val_ = df_target_s.iloc[train_index], df_target_s.iloc[test_index]
    
    X_train_, X_val_ = np.array(X_train_), np.array(X_val_)
    y_train_, y_val_ = np.array(y_train_), np.array(y_val_)
    
    # fix for empty columns - LogisticRegression blows up otherwise 
    check_for_empty_cols = np.where(y_train_.sum(axis = 0) == 0)[0]
    if len(check_for_empty_cols):
        y_train_[0, check_for_empty_cols] = 1
    
    clf = RandomForestClassifier(n_estimators=200,max_depth=10, random_state=43,min_samples_split=10)
    clf.fit(X_train_, y_train_)
    y_pred = clf.predict(X_val_)
    
    # calc averag log loss
    loss += calc_log_loss(y_val_, y_pred)
    print("loss: ", loss)
    break
    
    
# loss /= n_splits
# print("average log loss: ", loss)


TRAIN: [ 7938  7939  7940 ... 23811 23812 23813] TEST: [   0    1    2 ... 7935 7936 7937]
loss:  3.3897107329389637


In [19]:
np.sum(y_pred, axis=0)

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  59,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,  15,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   8,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,  15,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0, 236,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0, 236,   0,   0,   0,   0,   0,
        43,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [24]:
def calc_log_loss(y_true, y_pred):
    log_Loss = 0.0
    for c in range(y_pred.shape[1]):
        y_pred_col = y_pred[:, c]
        y_val_col = y_true[:, c]
        # fix empty y_val columns if there is any
        if y_val_col.sum()==0:
            y_val_col[0] = 1
        log_Loss += log_loss(y_val_col, y_pred_col)
    return (log_Loss / y_pred.shape[1])

In [29]:
from sklearn.neural_network import MLPClassifier


# 3_Fold cross validation
n_splits = 3

kf = KFold(n_splits=n_splits)
kf.get_n_splits(train_features)

log_Loss = 0.0

for train_index, test_index in kf.split(train_features):
    
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train_, X_val_ = train_features.iloc[train_index], train_features.iloc[test_index]
    y_train_, y_val_ = df_target_s.iloc[train_index], df_target_s.iloc[test_index]
    
    X_train_, X_val_ = np.array(X_train_), np.array(X_val_)
    y_train_, y_val_ = np.array(y_train_), np.array(y_val_)
    
    # fix for empty columns - LogisticRegression blows up otherwise 
    check_for_empty_cols = np.where(y_train_.sum(axis = 0) == 0)[0]
    if len(check_for_empty_cols):
        y_train_[0, check_for_empty_cols] = 1
    
    clf = MLPClassifier(random_state=1, max_iter=1500, verbose=1)
    clf.fit(X_train_, y_train_)
    y_pred = clf.predict(X_val_)
    
    # calc averag log loss
    loss += calc_log_loss(y_val_, y_pred)
    print("loss: ", loss)
    break


TRAIN: [ 7938  7939  7940 ... 23811 23812 23813] TEST: [   0    1    2 ... 7935 7936 7937]
Iteration 1, loss = 42.99590457
Iteration 2, loss = 5.54297738
Iteration 3, loss = 4.48677032
Iteration 4, loss = 4.14306158
Iteration 5, loss = 3.94652223
Iteration 6, loss = 3.80286040
Iteration 7, loss = 3.68554338
Iteration 8, loss = 3.58586791
Iteration 9, loss = 3.49817901
Iteration 10, loss = 3.41959404
Iteration 11, loss = 3.34823711
Iteration 12, loss = 3.28317527
Iteration 13, loss = 3.22223705
Iteration 14, loss = 3.16426173
Iteration 15, loss = 3.10923112
Iteration 16, loss = 3.05739863
Iteration 17, loss = 3.00756836
Iteration 18, loss = 2.96075274
Iteration 19, loss = 2.91452584
Iteration 20, loss = 2.87037213
Iteration 21, loss = 2.82754729
Iteration 22, loss = 2.78580015
Iteration 23, loss = 2.74530670
Iteration 24, loss = 2.70658176
Iteration 25, loss = 2.66799632
Iteration 26, loss = 2.63039656
Iteration 27, loss = 2.59435412
Iteration 28, loss = 2.55809771
Iteration 29, loss = 

Iteration 250, loss = 0.28482232
Iteration 251, loss = 0.28170962
Iteration 252, loss = 0.28142849
Iteration 253, loss = 0.27709573
Iteration 254, loss = 0.27391333
Iteration 255, loss = 0.27335542
Iteration 256, loss = 0.27033037
Iteration 257, loss = 0.26989770
Iteration 258, loss = 0.26470822
Iteration 259, loss = 0.26152284
Iteration 260, loss = 0.25708147
Iteration 261, loss = 0.25526161
Iteration 262, loss = 0.25191281
Iteration 263, loss = 0.25059495
Iteration 264, loss = 0.24779866
Iteration 265, loss = 0.24327413
Iteration 266, loss = 0.23962821
Iteration 267, loss = 0.23775276
Iteration 268, loss = 0.23582779
Iteration 269, loss = 0.23220085
Iteration 270, loss = 0.23084939
Iteration 271, loss = 0.23031638
Iteration 272, loss = 0.22936258
Iteration 273, loss = 0.22703009
Iteration 274, loss = 0.22614737
Iteration 275, loss = 0.22118515
Iteration 276, loss = 0.22043437
Iteration 277, loss = 0.21559933
Iteration 278, loss = 0.21168882
Iteration 279, loss = 0.21000955
Iteration 

In [30]:
calc_log_loss(y_val_, y_pred)

0.23810826590639605