In [1]:
import pandas as pd
import glob
from io import StringIO
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
with open('train_features_0.csv', 'r') as f, open('train_features_1.csv', 'r')  as f1 :  # I split the training feature data into two files for the sake of meeting github's file size limit
    content = f.read()
    content1 = f1.read()
    
    obj = StringIO(content + content1)
    
train_feats_df = pd.read_csv(obj)
train_target_df = pd.read_csv("train_targets_scored.csv")

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from tqdm import tqdm
import pickle
from sklearn.utils import resample
from sklearn.model_selection import train_test_split



clf_dict = {} # A dictionary for saving the individual models

X_train, X_test, y_train, y_test = train_test_split(train_feats_df.values, train_target_df.values, test_size=0.2,)  # Train-test split. One fold for now.



In [3]:
X_train

array([['id_92898f216', 'trt_cp', 48, ..., -0.5912, 0.9209, -0.0457],
       ['id_faf092875', 'trt_cp', 72, ..., 1.026, -0.9019, -0.9099],
       ['id_a7d37d9c7', 'ctl_vehicle', 24, ..., 0.9264, 0.8839, 0.3978],
       ...,
       ['id_a23c124dd', 'trt_cp', 24, ..., 0.3596, -0.8082, -1.886],
       ['id_7eba18346', 'trt_cp', 72, ..., 0.9712, -1.13, 1.19],
       ['id_07f7bedce', 'trt_cp', 72, ..., -1.025, -0.7848, -0.8253]],
      dtype=object)

In [4]:
y_train

array([['id_92898f216', 0, 0, ..., 0, 0, 0],
       ['id_faf092875', 0, 0, ..., 0, 0, 0],
       ['id_a7d37d9c7', 0, 0, ..., 0, 0, 0],
       ...,
       ['id_a23c124dd', 0, 0, ..., 0, 0, 0],
       ['id_7eba18346', 0, 0, ..., 0, 0, 0],
       ['id_07f7bedce', 0, 0, ..., 0, 0, 0]], dtype=object)

In [6]:



from sklearn.ensemble import RandomForestClassifier



for combination in tqdm([(24, 'D1'), (48, 'D1'), (72, 'D1'), (24, 'D2'), (48, 'D2'), (72, 'D2')]):  # Each particular combination will have their own classifier
    label_clfs_container = []  # A list to keep the classifier
    lable_feats_container = []  # A list to keep the feature importance values
    for i, label in enumerate(train_target_df.iloc[:,1:].columns):  # Train each label with their own classifier.
        
    
        ind = (X_train[:, 2] == combination[0])  & (X_train[:, 3] == combination[1]) &  (X_train[:, 1] == 'trt_cp')  # Obtaining the training index based on combination



        feature_mat = X_train[ind][:, 4:]

        target_vector = y_train[ind][:, i + 1]
        
        # Feature selection
        RF_clf = RandomForestClassifier(max_depth=100, random_state=0,n_jobs = 20)
        RF_clf.fit(feature_mat.astype(float), target_vector.astype(int))
        selected_features =  RF_clf.feature_importances_
        feature_mat = feature_mat[:, selected_features != 0]  # I actually found that many feature importance are zero... 
                                                              # So I only select those that have non-zero feature importance.

        lable_feats_container.append(selected_features)
        
        if (target_vector == 1).any():  # If and only if the the FILTERED target vector contains any activation, then proceed with fitting a classifier. See else below.
            
            ############### Dealing with class inbalance by over sampling the activated samples ##########################
            assert  (target_vector == 1).sum() < (target_vector == 0).sum()
            
            
            idx_of_label_one = target_vector == 1
            idx_of_label_zero = target_vector == 0
            
            num_sample_0 = (target_vector == 0).sum()
            
            
            
            resampled_one_feat, resampled_one_target = \
                     resample(feature_mat[idx_of_label_one] , target_vector[idx_of_label_one], n_samples =num_sample_0)
            
            final_feat = np.append(feature_mat[idx_of_label_zero], resampled_one_feat, axis = 0)
            
            final_target = np.append(target_vector[idx_of_label_zero], resampled_one_target, axis = 0)
            
            
            
            ################# Finally, fit a simple LS model and add it to the class dictionary #########################
            clf = LogisticRegression(max_iter =1000)
            clf.fit(final_feat.astype('float32'), final_target.astype('int'))
            
            label_clfs_container.append(clf)
            
            
            
            
        else:  # Else (if no activiation is found), then add zero as a classifier. 
            label_clfs_container.append(0)
        
    clf_dict[combination] = (label_clfs_container, lable_feats_container)
        

    
    

100%|██████████| 6/6 [27:58<00:00, 279.70s/it]


In [7]:
pred = np.zeros(y_test[:, 1:].shape)  # Initiazlie a prediction matrix to hold prediction; matrix has the same size as the y_test
for combination in tqdm([(24, 'D1'), (48, 'D1'), (72, 'D1'), (24, 'D2'), (48, 'D2'), (72, 'D2')]):  # Iterating over each combiantions, same as what I did before
    
    temp_result = [ ]
    
    idx = (X_test[:, 2] == combination[0])  & (X_test[:, 3] == combination[1]) &  (X_test[:, 1] == 'trt_cp')
    test_feat_mat = X_test[idx][:, 4:]
    for classifier,feature_idx in zip(clf_dict[combination][0], clf_dict[combination][1],):  # Iterating over classifer for EACH label. `feature_idx` contains the feature importance for each classifier and it is used to filter the test features.
        
        if classifier != 0:  # Proceed if a classifer exist
            assert classifier.classes_[0] == 0
            
            feat_mat = test_feat_mat[:, feature_idx != 0]
      

            pred_prob = classifier.predict_proba(feat_mat)[:, 1]
            
            temp_result.append(pred_prob)
        else:  # If a classifier is not available, then just append zeros since there is no activiation found for that label.
            temp_result.append([0] * sum(idx))
            
    temp_result = np.array(temp_result).T
    
    pred[idx] = temp_result  # Assign these values to the prediciotn matrix based on the index



    

100%|██████████| 6/6 [00:34<00:00,  5.82s/it]


In [8]:
def evaluation(y_true, y_pred):
    return -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)).mean(axis = 0).mean()

pred[pred < 1e-15] = 1e-15  
pred[pred == 1] = 1 - 1e-15 

evaluation(y_test[:, 1:], pred)

0.030326035639150442