In [138]:
# Load the data, filter out columns which aren't useful,
# and convert categorial columns to a onehot representation

import pandas as pd
import numpy as np
import re

pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)

# Simplify some fields that are needlessly verbose
def simplify_field(f):
    f = f.replace("patient.cytogenetic_abnormalities.cytogenetic_abnormality",
        "patient.cytogenetic_abnormality")
    f = f.replace("patient.fish_test_component_results.fish_test_component_result",
        "patient.fish_test_component_result")
    f = f.replace("molecular_analysis_abnormality_testing_results.molecular_analysis_abnormality_testing_result_values",
        "molecular_analysis_abnormality_testing_result_values")
    f = f.replace("immunophenotype_cytochemistry_testing_results.immunophenotype_cytochemistry_testing_result_values",
        "immunophenotype_cytochemistry_testing_result")
    f = f.replace("race_list.race", "race")
    f = f.replace("patient.", "")
    return f

def clean_complex_bool_cols(df, base_name, sub_name, count, output=False):
    # Find unique values
    uniq_vals = set()
    for i in range(count):
        col = base_name + ("-%d" % (i+1) if i > 0 else "") + sub_name
        uniq_vals = uniq_vals.union(df[col].unique())
    if output:
        print(uniq_vals)

    # Turn index rows "", -2, -3 into True/False columns for each value
    new_df = pd.DataFrame()
    for val in uniq_vals:
        if pd.isnull(val):
            continue
        
        bool_vals = df[base_name + sub_name] == val
        for i in range(1, count):
            col = base_name + ("-%d" % (i+1) if i > 0 else "") + sub_name
            bool_vals = bool_vals | (df[col] == val)
        new_col = base_name + "_" + val
        new_df[new_col] = bool_vals
        if output:
            print(new_col, new_df[new_col])

    return new_df

# Get a onehot dataframe for a categorical column
def get_onehot_df(df, col):
    df = pd.get_dummies(df[col], drop_first=True)
    df.columns = [col + "_" + c for c in df.columns]    
    return df

# Create the pandas dataframe from a file
def load_dataframe(fn):
    df = pd.read_csv(open(fn), sep="\t").transpose()
    df.columns = df.iloc[0]
    df = df.drop(df.index[0]) # drop the row with the columns names

    df.columns = [simplify_field(c) for c in df.columns]
    df = df.apply(pd.to_numeric, errors='ignore') # convert object cols to numeric cols when possible

    # Remove some empty fields
    for field in ["molecular_analysis_abnormality_testing_results",
         "fish_test_component_results"]:
        del df[field]
    for field in df.columns:
        if field.startswith("admin.") or field.startswith("fish_test_component_result") or \
            "immunophenotype_cytochemistry_percent_positive" in field or \
            "molecular_analysis_abnormality_testing_percentage_value" in field:
            del df[field]

    # Clean complex columns with indices (e.g. "-2") into onehot dataframes
    ca_df = clean_complex_bool_cols(df, "cytogenetic_abnormality", "", 4)
    ictr_df = clean_complex_bool_cols(df, "immunophenotype_cytochemistry_testing_result", 
        ".immunophenotype_cytochemistry_testing_result", 21)
    ictr_df = ictr_df.drop(columns=[c for c in ictr_df.columns if "not tested" in c])
#     df = clean_complex_cols(df, "fish_test_component_result", 
#         ".fish_test_component_percentage_value", 9, True)
    maatr_df = clean_complex_bool_cols(df, "molecular_analysis_abnormality_testing_result_values", 
        ".molecular_analysis_abnormality_testing_result", 8)
    complex_dfs = [ca_df, ictr_df, maatr_df]
    
    ignore_col_prefixes = ["bcr_patient_barcode", "immunophenotype_cytochemistry_testing_result",
                           "fish_test_component_result", "molecular_analysis_abnormality_testing_result_values",
                           "cytogenetic_abnormality", "bcr_patient_uuid",
                           "days_to_death", # this one is kind of cheating
                           "patient_id"]
    noncomplex_cols = [c for c in df.columns if re.split("[\.\-]", c)[0] not in ignore_col_prefixes]
    
    final_df = pd.concat(complex_dfs, axis=1)
    for c in noncomplex_cols:
        # skip cols with no unique values
        if len(df[c].unique()) == 1:
            continue
        
        # convert categorical cols to onehot
        if df[c].dtype == object:
            new_df = get_onehot_df(df, c)
            final_df = pd.concat([final_df, new_df], axis=1)
        # skip cols with too many nans
        elif np.isnan(df[c]).sum() > len(df[c]) / 2:
            continue
        # use numeric cols as is
        elif df[c].dtype in (np.float64, np.int64):
            final_df = pd.concat([final_df, df[[c]]], axis=1)
        else:
            print("Col " + c + " is a weird type: " + str(df[c].dtype))
            
    return final_df

df = load_dataframe("LAML.merged_only_clinical_clin_format.txt")
print(list(df.columns))
print(df.shape)

['cytogenetic_abnormality_+8', 'cytogenetic_abnormality_del(7q) / 7q-', 'cytogenetic_abnormality_t(9;11)', 'cytogenetic_abnormality_complex - >= 3 distinct abnormalities', 'cytogenetic_abnormality_inv(16)', 'cytogenetic_abnormality_t(15;17) and variants', 'cytogenetic_abnormality_normal', 'cytogenetic_abnormality_del(5q) / 5q-', 'cytogenetic_abnormality_t(8;21)', 'immunophenotype_cytochemistry_testing_result_nse positive', 'immunophenotype_cytochemistry_testing_result_tdt negative', 'immunophenotype_cytochemistry_testing_result_cd34 positive', 'immunophenotype_cytochemistry_testing_result_cd117 positive', 'immunophenotype_cytochemistry_testing_result_cd7 positive', 'immunophenotype_cytochemistry_testing_result_cd14 negative', 'immunophenotype_cytochemistry_testing_result_other cd negative', 'immunophenotype_cytochemistry_testing_result_nse negative', 'immunophenotype_cytochemistry_testing_result_cd5 negative', 'immunophenotype_cytochemistry_testing_result_cd19 negative', 'immunophenoty

In [139]:
# Create our matrix of input features and vector of labels

y = df['vital_status_dead']
X = df[:]
del X['vital_status_dead']
print(df.shape, X.shape)

(200, 119) (200, 118)


In [164]:
# Impute missing values using the average and scale values to the same range

from sklearn.preprocessing import Imputer, StandardScaler
# Fit only to the training data
imp = Imputer()
Xi = imp.fit_transform(X)
print(Xi.shape)

scaler = StandardScaler()
Xis = scaler.fit_transform(Xi)
print(Xis.shape)

(200, 118)
(200, 118)


In [180]:
# Perform the classification with a few classifiers and use
# 10-fold cross validation with AUROC as the scoring metric.
# For ensemble methods, print out the top features.

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import RidgeClassifier
from sklearn.neural_network import MLPClassifier

clfs = {"NeuralNet": (MLPClassifier(max_iter=1000, hidden_layer_sizes=(118,118,118)), False),
        "RandomForest": (RandomForestClassifier(n_estimators=100), True),
        "AdaBoost": (AdaBoostClassifier(n_estimators=100), True),
        "SVC": (SVC(), False),
        "Ridge": (RidgeClassifier(), False),
        }

skf = StratifiedKFold(n_splits=10)

high_feats = set()
for clf_name, (clf, print_features) in clfs.items():
    print(clf_name)
    scores = cross_val_score(clf, Xis, y, cv=skf, scoring="roc_auc")
    print("ROC: %f +-%f" % (np.mean(scores), np.std(scores) * 2))

    clf.fit(Xis,y)
    
    print("Confusion matrix:\n", confusion_matrix(y, clf.predict(Xis)))
    
    if print_features:
        feat_importances = pd.Series(clf.feature_importances_, index=X.columns)
        feat_importances = feat_importances.sort_values(ascending=False)[:10]
        print(feat_importances)
        high_feats = high_feats.union(feat_importances.index)
    print()

print("Important features: ", " ".join(sorted(high_feats)))

NeuralNet
ROC: 0.631633 +-0.288370
Confusion matrix:
 [[ 67   0]
 [  0 133]]

RandomForest
ROC: 0.725837 +-0.238509
Confusion matrix:
 [[ 67   0]
 [  0 133]]
age_at_initial_pathologic_diagnosis                           0.074134
platelet_result_count                                         0.061695
days_to_birth                                                 0.058076
year_of_initial_pathologic_diagnosis                          0.054530
lab_procedure_blast_cell_outcome_percentage_value             0.045091
lab_procedure_leukocyte_result_unspecified_value              0.041359
lab_procedure_bone_marrow_blast_cell_outcome_percent_value    0.040913
lab_procedure_bone_marrow_lymphocyte_outcome_percent_value    0.031618
lab_procedure_bone_marrow_neutrophil_result_percent_value     0.031189
lab_procedure_monocyte_result_percent_value                   0.026708
dtype: float64

AdaBoost
ROC: 0.678388 +-0.207875
Confusion matrix:
 [[ 67   0]
 [  0 133]]
days_to_birth                           

In [None]:
# Answers to questions:
# 1. Classifiers I tried: NeuralNet, RandomForest, AdaBoost, SVC, and Ridge. RandomForest
#    provided the best average AUROC but it was still pretty variable across the different
#    validation sets.
# 2. I did a fair bit of preprocessing of the data before it was useful. I cleaned up the 
#    indexed columns (e.g. "cytogenetic_abnormality-2") into onehot columns for each possible
#    value. I also replaced other categorical data with onehot columns. Numeric columns were
#    left intact if they didn't have too man NaNs or the values weren't all the same. A few
#    other columns were removed e.g. ("patient_id") and finally "days_to_death" was removed
#    because this seemed like cheating.
# 3. I used scikit-learn methods to perform 10-fold cross validation stratified on the
#    vital_status label.
# 4. Some of the top features as returned by the ensemble methods (in alphabetical order):
#   age_at_initial_pathologic_diagnosis days_to_birth
#   lab_procedure_abnormal_lymphocyte_result_percent_value
#   lab_procedure_blast_cell_outcome_percentage_value
#   lab_procedure_bone_marrow_blast_cell_outcome_percent_value
#   lab_procedure_bone_marrow_lymphocyte_outcome_percent_value
#   lab_procedure_bone_marrow_metamyelocyte_result_value
#   lab_procedure_bone_marrow_neutrophil_result_percent_value
#   lab_procedure_bone_marrow_promonocyte_count_result_percent_value
#   lab_procedure_leukocyte_result_unspecified_value
#   lab_procedure_monocyte_result_percent_value
#   platelet_result_count year_of_initial_pathologic_diagnosis