In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [2]:
"""
Reading data
"""
df_train = pd.read_csv('cs589_mini_project/data/train_features.csv')
df_test = pd.read_csv('cs589_mini_project/data/test_features.csv')
df_target_s = pd.read_csv('cs589_mini_project/data/train_targets_scored.csv')
submission = pd.read_csv('cs589_mini_project/data/sample_submission.csv')

# train on X_train and y_train
# test on X_test

In [3]:
# display(df_train.head(5))
# display(df_test.head(5))
# display(df_target_s.head(5))
# display(submission.head(5))
print(df_train.shape)
print(df_test.shape)
print(df_target_s.shape)
print(submission.shape)

(23814, 876)
(3982, 876)
(23814, 207)
(3982, 207)


In [4]:
def preprocess(df):
    df['cp_type'] = df['cp_type'].map({'trt_cp':0, 'ctl_vehicle':1})
    df['cp_time'] = df['cp_time'].map({24:1, 48:2, 72:3})
    df['cp_dose'] = df['cp_dose'].map({'D1':0, 'D2':1})
    return df
X_train = preprocess(df_train)
X_test = preprocess(df_test)
display(X_train)

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,0,1,0,1.0620,0.5577,-0.2479,-0.6208,-0.1944,-1.0120,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,0,3,0,0.0743,0.4087,0.2991,0.0604,1.0190,0.5207,...,-0.4265,0.7543,0.4708,0.0230,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,0,2,0,0.6280,0.5817,1.5540,-0.0764,-0.0323,1.2390,...,-0.7250,-0.6297,0.6103,0.0223,-1.3240,-0.3174,-0.6417,-0.2187,-1.4080,0.6931
3,id_0015fd391,0,2,0,-0.5138,-0.2491,-0.2656,0.5288,4.0620,-0.8095,...,-2.0990,-0.6441,-5.6300,-1.3780,-0.8632,-1.2880,-1.6210,-0.8784,-0.3876,-0.8154
4,id_001626bd3,0,3,1,-0.3254,-0.4009,0.9700,0.6919,1.4180,-0.8244,...,0.0042,0.0048,0.6670,1.0690,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,0,1,1,0.1394,-0.0636,-0.1112,-0.5080,-0.4713,0.7201,...,0.1969,0.0262,-0.8121,0.3434,0.5372,-0.3246,0.0631,0.9171,0.5258,0.4680
23810,id_fffb70c0c,0,1,1,-1.3260,0.3478,-0.3743,0.9905,-0.7178,0.6621,...,0.4286,0.4426,0.0423,-0.3195,-0.8086,-0.9798,-0.2084,-0.1224,-0.2715,0.3689
23811,id_fffc1c3f4,1,2,1,0.3942,0.3756,0.3109,-0.7389,0.5505,-0.0159,...,0.5409,0.3755,0.7343,0.2807,0.4116,0.6422,0.2256,0.7592,0.6656,0.3808
23812,id_fffcb9e7c,0,1,0,0.6660,0.2324,0.4392,0.2044,0.8531,-0.0343,...,-0.1105,0.4258,-0.2012,0.1506,1.5230,0.7101,0.1732,0.7015,-0.6290,0.0740


In [6]:
"""
PCA on Genes 
"""
n_comp = 200

genes = [col for col in X_train.columns if col.startswith('g-')]
cells = [col for col in X_train.columns if col.startswith('c-')]

data_genes = pd.concat([pd.DataFrame(X_train[genes]), pd.DataFrame(X_test[genes])])
data_genes_pca = PCA(n_components=n_comp, random_state=42).fit_transform(data_genes)

train_gene_pca = data_genes_pca[:X_train.shape[0]]
test_gene_pca = data_genes_pca[-X_test.shape[0]:]

train_gene_pca = pd.DataFrame(train_gene_pca, columns=[f'pca_G-{i}' for i in range(n_comp)])
test_gene_pca = pd.DataFrame(test_gene_pca, columns=[f'pca_G-{i}' for i in range(n_comp)])

In [7]:
"""
PCA on Cells
"""
n_comp = 50 

data_cells = pd.concat([pd.DataFrame(X_train[cells]), pd.DataFrame(X_test[cells])])
data_cells_pca = PCA(n_components=n_comp, random_state=42).fit_transform(data_cells)

train_cells_pca = data_cells_pca[:X_train.shape[0]] 
test_cells_pca = data_cells_pca[-X_test.shape[0]:]

train_cells_pca = pd.DataFrame(train_cells_pca, columns=[f'pca_C-{i}' for i in range(n_comp)])
test_cells_pca = pd.DataFrame(test_cells_pca, columns=[f'pca_C-{i}' for i in range(n_comp)])

In [8]:
train_features = pd.concat((train_gene_pca, train_cells_pca), axis=1)
test_features = pd.concat((test_gene_pca, test_cells_pca), axis=1)

# y_labels
df_target_s = df_target_s.drop(['sig_id'], axis=1)

display(train_features)

Unnamed: 0,pca_G-0,pca_G-1,pca_G-2,pca_G-3,pca_G-4,pca_G-5,pca_G-6,pca_G-7,pca_G-8,pca_G-9,...,pca_C-40,pca_C-41,pca_C-42,pca_C-43,pca_C-44,pca_C-45,pca_C-46,pca_C-47,pca_C-48,pca_C-49
0,-8.191034,-0.488171,-3.967531,6.922384,3.426204,-4.939660,-4.021629,3.108678,2.719793,-2.409787,...,-0.511125,1.022494,-0.136036,0.075919,-0.066207,-0.101230,0.225362,0.057230,-0.107296,-0.341431
1,-6.531898,3.288575,9.420453,-0.574416,-2.009119,4.779334,2.585964,1.996409,0.305785,1.425774,...,-0.319058,0.319740,-0.159246,-0.345882,0.150300,-0.216133,0.687300,0.400472,-0.136589,0.428745
2,-1.669338,2.297153,-0.782439,-7.036332,-1.434308,-1.718394,2.841784,-0.774921,-1.731597,-4.942427,...,-0.149934,-0.867463,-0.574558,-0.320742,0.147007,0.119810,0.768706,-1.156860,-0.365257,-0.456086
3,9.268336,-4.829223,-1.752497,-11.326041,-1.178113,-10.384511,5.975081,-2.337151,-0.157038,4.973514,...,-1.084507,0.665303,-0.475641,-0.746390,1.684805,-0.698474,-0.320341,0.882071,1.369027,1.143375
4,-7.185918,0.111345,8.256769,-7.394481,-0.632286,-4.333923,-1.741193,0.646718,-6.767281,5.830797,...,-1.127680,-0.199497,0.697061,-0.557235,-0.913216,0.174704,0.175920,0.543692,0.291145,0.513161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,-6.018252,2.927708,-0.626853,-3.892167,-0.773846,-0.904330,1.530116,0.114901,2.579791,-1.943151,...,-0.287854,-0.227900,0.129922,0.478464,0.659682,-0.359301,0.600943,-0.788161,-0.156615,-1.434734
23810,-4.952320,-0.545998,-1.626794,0.221293,4.770532,-0.212782,0.308716,-2.716075,-1.425005,0.317466,...,-0.101267,-0.316646,-0.221708,-0.471236,-0.263232,-0.692952,-1.059063,0.699861,0.985178,0.260118
23811,-6.522456,-1.372804,-0.831251,0.531181,-0.837211,-1.502036,0.003087,2.996020,-1.024627,-0.891801,...,-0.542585,0.421446,0.047425,-0.000176,-0.890236,0.125915,-0.517586,0.500146,0.305829,-0.503617
23812,7.535654,-20.646006,-1.614500,12.610355,-8.006033,-0.190805,-3.018044,5.666977,7.751807,7.323825,...,0.472907,-1.584457,-0.692551,0.229086,-0.850729,0.451430,0.720103,0.530429,0.165919,0.857855


In [17]:
from sklearn.linear_model import LogisticRegression
# from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, log_loss

"""
Linear Model - Logistic Regression (multinomial)
"""
# 3_Fold cross validation
kf = KFold(n_splits=3)
kf.get_n_splits(train_features)

log_Loss = 0.0

for train_index, test_index in kf.split(train_features):
    
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train_, X_val_ = train_features.iloc[train_index], train_features.iloc[test_index]
    y_train_, y_val_ = df_target_s.iloc[train_index], df_target_s.iloc[test_index]
    
    X_train_, X_val_ = np.array(X_train_), np.array(X_val_)
    y_train_, y_val_ = np.array(y_train_), np.array(y_val_)
    
    # stupid fix for empty columns - LogisticRegression blows up otherwise 
    # (the problem occurs for two folds only, each time for a single column)
    # yes, i know it's ugly
    check_for_empty_cols = np.where(y_train_.sum(axis = 0) == 0)[0]
    if len(check_for_empty_cols):
        y_train_[0, check_for_empty_cols] = 1
    
    
    clf = MultiOutputClassifier(LogisticRegression(max_iter=10000, tol=0.1, C = 0.5), n_jobs=-1)
    clf.fit(X_train_, y_train_)
    y_pred = clf.predict(X_val_)
    
    break

TRAIN: [ 7938  7939  7940 ... 23811 23812 23813] TEST: [   0    1    2 ... 7935 7936 7937]


In [18]:
y_pred.shape

(7938, 206)

In [47]:
log_Loss = 0.0
for c in range(y_pred.shape[1]):
    y_pred_col = y_pred[:, c]
    y_val_col = y_val_[:, c]
    if y_val_col.sum()==0:
        y_val_col[0] = 1
    log_Loss += log_loss(y_val_col, y_pred_col)
    

print(log_Loss / y_pred.shape[1])

0.1149870627261709


In [35]:
np.sum(y_pred, axis=0)

array([  0,   5,   7,   0,   0,   5,  10,   8,   0,   3,   0,  17,   0,
         5,   1,   0,  13,   2,   6,   5,  12,  11,   0,   9,   0,   0,
         0,   1,   4,  11,   1,  10,  10,   0,   0,   0,  14,   7,  23,
         0,  17,   5,   9,   4,   7,   5,   0,   9,   9,   7,   5,  16,
         3,   0,   4,   4,  13,   6,  10,   2,   0,   3,   3, 105,  16,
         5,   6,  10,  16,   0,   5,   2,  10,  15,   5,   0,  16,  11,
         8,   6, 103,   0,   0,   4,  16,   8,   6,   2,   6,  68,   1,
         2,   0,   3,   5,   9,  94,   0,   6,   2,   1,   9,  14,  31,
         7,   1,   1,   4,   4, 114,  18,   5,   3,   6,  13,   4,  18,
         3,  19,  81,   0,   0,  10,   1,  16,   0,   2,  14,   7,   2,
         2,   8,   1,  33,   8,   6, 235,   0,   8,   0,   1,   1,   0,
         9,   8,   5,   8,   0,  18,  85,   0,   2,   3,  29,   5,   6,
         5,  10,   4,   9,   0,   7,   5, 236,   8,   0,  18,   2,  17,
        61,   0,  17,   0,   3,   7,   4,   0,   6,   3,  14,   