In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [43]:
!pwd

/home/jwillow/Documents/UofT/MIE1628/notebook


In [44]:
train_features = './lish-moa/train_features.csv'
train_labels = './lish-moa/train_targets_scored.csv'

test_features = './lish-moa/test_features.csv'
test_labels = './lish-moa/train_targets_nonscored.csv'

train_features_df = pd.read_csv(
    filepath_or_buffer = train_features,
    sep=',',
    delimiter=None,
    header='infer',
    index_col='sig_id',
)

train_labels_df = pd.read_csv(
    filepath_or_buffer = train_labels,
    sep=',',
    delimiter=None,
    header='infer',
    index_col='sig_id',
)

Data-Process: Checking for data types

In [45]:
print('Train features dimensions: ', train_features_df.shape)
total_nans = train_features_df.isnull().sum().sum()
print('Number of NULL values: ', total_nans)

# train_features_df['cp_type'].dtype
numeric_cols = train_features_df.dtypes[train_features_df.dtypes != "object"].index.tolist() 
categorical_cols = train_features_df.dtypes[train_features_df.dtypes == "object"].index.tolist()

print(f'{len(numeric_cols)} numeric columns and {len(categorical_cols)} categoical columns')

Train features dimensions:  (23814, 875)
Number of NULL values:  0
873 numeric columns and 2 categoical columns


### Data Process - check cp_type, drop rows where cp_type == ctl_vehicle

In [46]:
# train_features_df[train_features_df['cp_type']=='ctl_vehicle']
'''
Label-based indexing. Check the activation of MoI for trt_cp type 

can use this to plot the distribution plot for different cp_types
'''
index_for_trt_cp_type = train_features_df[train_features_df['cp_type']=='trt_cp'].index
trt_cp_label_df = train_labels_df.loc[index_for_trt_cp_type]
trt_cp_label_df.sum()

index_for_ctl_cp_type = train_features_df[train_features_df['cp_type']=='ctl_vehicle'].index
ctl_cp_label_df = train_labels_df.loc[index_for_ctl_cp_type]
ctl_cp_label_df.sum().sum()

print('If cp_type==ctl_vehicle, there is no MoI activation')

# Drop the rows with cp_type = ctl_vehicle
train_features_df = train_features_df[train_features_df['cp_type']=='trt_cp']
train_features_df.drop(labels='cp_type', axis=1, inplace=True)
train_labels_df = train_labels_df.loc[index_for_trt_cp_type]

If cp_type==ctl_vehicle, there is no MoI activation


### Data-Process: Distribution of MoI for cp_dose D1 vs D2

In [47]:
'''
Two series that capture the different distribution of CP_dose feature -> Data Visualization
'''
index_for_d1_cp_dose = train_features_df[train_features_df['cp_dose']=='D1'].index
d1_dose_label_df = train_labels_df.loc[index_for_trt_cp_type]
d1_dose_label_df.sum()

index_for_d2_cp_dose = train_features_df[train_features_df['cp_dose']=='D2'].index
d2_dose_label_df = train_labels_df.loc[index_for_trt_cp_type]
d2_dose_label_df.sum()

5-alpha_reductase_inhibitor               17
11-beta-hsd1_inhibitor                    18
acat_inhibitor                            24
acetylcholine_receptor_agonist           190
acetylcholine_receptor_antagonist        301
                                        ... 
ubiquitin_specific_protease_inhibitor      6
vegfr_inhibitor                          170
vitamin_b                                 26
vitamin_d_receptor_agonist                39
wnt_inhibitor                             30
Length: 206, dtype: int64

### Encode categoricals to 1's and 0's

In [48]:
le = LabelEncoder()
categorical_cols.remove('cp_type')
train_features_df[categorical_cols] = train_features_df[categorical_cols].apply(lambda col: le.fit_transform(col))
train_features_df[categorical_cols].head(10)

Unnamed: 0_level_0,cp_dose
sig_id,Unnamed: 1_level_1
id_000644bb2,0
id_000779bfc,0
id_000a6266a,0
id_0015fd391,0
id_001626bd3,1
id_001762a82,0
id_001bd861f,1
id_0020d0484,0
id_00224bf20,0
id_0023f063e,1


### Data-Process: Standardize Feature data

In [49]:
scaler = StandardScaler()

for num_col in numeric_cols:
    train_features_df[num_col] = scaler.fit_transform(np.array(train_features_df[num_col]).reshape(-1,1))

In [50]:
# train_features_df
# train_labels_df

train_features_df = train_features_df[0:10000]
train_labels_df = train_labels_df[0:10000]

In [51]:
dt = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0,
)
dt.fit(train_features_df, train_labels_df)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [52]:
dt.feature_importances_

array([0.00011037, 0.        , 0.00074786, 0.00196563, 0.00088996,
       0.0008356 , 0.00117264, 0.00066015, 0.00149381, 0.00135663,
       0.00083369, 0.00098589, 0.00082511, 0.00079094, 0.00205784,
       0.00056236, 0.00024135, 0.00148483, 0.0008308 , 0.00023059,
       0.00078269, 0.00069711, 0.0014105 , 0.00068651, 0.0013719 ,
       0.00043791, 0.00100571, 0.00192662, 0.00078559, 0.00126336,
       0.00144094, 0.        , 0.00043549, 0.00028759, 0.00164645,
       0.00144727, 0.00037786, 0.00095613, 0.00076295, 0.        ,
       0.00056783, 0.0012867 , 0.00079014, 0.00128938, 0.00107941,
       0.00110788, 0.00081443, 0.00060135, 0.00105055, 0.00122219,
       0.00128396, 0.00054006, 0.00094976, 0.00120466, 0.00035093,
       0.00028697, 0.00109547, 0.00050478, 0.00026411, 0.00066003,
       0.00116409, 0.00024977, 0.00070857, 0.00026513, 0.0008731 ,
       0.00048563, 0.00034217, 0.00170242, 0.00173351, 0.0008105 ,
       0.00116698, 0.00104081, 0.00078766, 0.00056884, 0.00056

In [62]:
n = 874
ranked = np.argsort(dt.feature_importances_)
largest_indices = ranked[::-1][:n]
np.save(file='most_important_features_index', arr=largest_indices)