In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import Imputer

In [2]:
# Seed NumPy's global random number generator so that any random draws are repeatable.
# NOTE(review): no cell visible here draws random numbers -- confirm the seed is still needed.
np.random.seed(1)

In [3]:
# Load NKI_cleaned.csv (path relative to the working directory) into a DataFrame and display it.
df = pd.read_csv("NKI_cleaned.csv")
df

Unnamed: 0,Patient,ID,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,s122,18,43,0,14.817248,14.817248,0,0,1,1,...,0.591103,-0.355018,0.373644,-0.760690,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,s123,19,48,0,14.261465,14.261465,0,0,0,1,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,s124,20,38,0,6.644764,6.644764,0,0,0,1,...,0.328736,-0.047571,0.084228,-0.695950,-0.402840,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,s125,21,50,0,7.748118,7.748118,0,1,0,1,...,0.648861,-0.039088,0.182182,-0.524640,0.037320,-0.167688,-0.016790,-0.285344,-0.251188,0.862710
4,s126,22,38,0,6.436687,6.318960,0,0,1,1,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.134160
5,s127,23,42,0,5.037645,2.743326,1,0,1,1,...,-0.417534,-0.141338,-0.492190,0.090633,-0.169754,-0.220211,-0.429283,-0.042797,-0.394709,-0.390144
6,s128,24,50,0,8.739220,8.739220,1,1,0,1,...,0.086751,-0.144424,-0.778273,0.024693,0.204909,-0.043497,-0.172939,-0.013997,-0.437534,0.255511
7,s129,25,43,0,7.567420,7.567420,1,0,0,1,...,-0.003150,0.043824,0.442394,-0.498541,-0.231900,0.029205,-0.078742,-0.241568,-0.841080,-0.680880
8,s130,26,47,0,7.296372,7.296372,1,0,0,1,...,-0.362921,-0.038672,-0.647650,-0.760694,0.146781,0.038366,-0.127822,-0.058059,-1.041802,-0.130038
9,s131,27,39,1,4.662560,1.114305,0,0,0,1,...,-0.845758,0.635155,-0.235659,-0.396895,-0.474251,-0.298208,-0.531806,0.091948,-0.605779,-0.607580


In [4]:
# Summarise the loaded frame: number of rows and columns, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Columns: 1570 entries, Patient to AF067420
dtypes: float64(1556), int64(13), object(1)
memory usage: 3.3+ MB


## Data Cleaning

In [5]:
# Drop the columns `ID`, `Patient` and `barcode`, as they are unrelated to the dependent variable
df = df.drop(columns=['ID','Patient','barcode'])
df

Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,43,0,14.817248,14.817248,0,0,1,1,25,0,...,0.591103,-0.355018,0.373644,-0.760690,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,48,0,14.261465,14.261465,0,0,0,1,20,0,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,38,0,6.644764,6.644764,0,0,0,1,15,0,...,0.328736,-0.047571,0.084228,-0.695950,-0.402840,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,50,0,7.748118,7.748118,0,1,0,1,15,1,...,0.648861,-0.039088,0.182182,-0.524640,0.037320,-0.167688,-0.016790,-0.285344,-0.251188,0.862710
4,38,0,6.436687,6.318960,0,0,1,1,15,0,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.134160
5,42,0,5.037645,2.743326,1,0,1,1,10,1,...,-0.417534,-0.141338,-0.492190,0.090633,-0.169754,-0.220211,-0.429283,-0.042797,-0.394709,-0.390144
6,50,0,8.739220,8.739220,1,1,0,1,25,1,...,0.086751,-0.144424,-0.778273,0.024693,0.204909,-0.043497,-0.172939,-0.013997,-0.437534,0.255511
7,43,0,7.567420,7.567420,1,0,0,1,15,3,...,-0.003150,0.043824,0.442394,-0.498541,-0.231900,0.029205,-0.078742,-0.241568,-0.841080,-0.680880
8,47,0,7.296372,7.296372,1,0,0,1,18,1,...,-0.362921,-0.038672,-0.647650,-0.760694,0.146781,0.038366,-0.127822,-0.058059,-1.041802,-0.130038
9,39,1,4.662560,1.114305,0,0,0,1,17,0,...,-0.845758,0.635155,-0.235659,-0.396895,-0.474251,-0.298208,-0.531806,0.091948,-0.605779,-0.607580


In [6]:
# Remove rows where the target (`eventdeath`) is missing
missing_row = df['eventdeath'].isna()
print('Number of rows where target are missing:')
print(missing_row.sum())

# keep only the rows whose target is present
df = df.loc[~missing_row]
df

Number of rows where target are missing:
0


Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,43,0,14.817248,14.817248,0,0,1,1,25,0,...,0.591103,-0.355018,0.373644,-0.760690,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,48,0,14.261465,14.261465,0,0,0,1,20,0,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,38,0,6.644764,6.644764,0,0,0,1,15,0,...,0.328736,-0.047571,0.084228,-0.695950,-0.402840,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,50,0,7.748118,7.748118,0,1,0,1,15,1,...,0.648861,-0.039088,0.182182,-0.524640,0.037320,-0.167688,-0.016790,-0.285344,-0.251188,0.862710
4,38,0,6.436687,6.318960,0,0,1,1,15,0,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.134160
5,42,0,5.037645,2.743326,1,0,1,1,10,1,...,-0.417534,-0.141338,-0.492190,0.090633,-0.169754,-0.220211,-0.429283,-0.042797,-0.394709,-0.390144
6,50,0,8.739220,8.739220,1,1,0,1,25,1,...,0.086751,-0.144424,-0.778273,0.024693,0.204909,-0.043497,-0.172939,-0.013997,-0.437534,0.255511
7,43,0,7.567420,7.567420,1,0,0,1,15,3,...,-0.003150,0.043824,0.442394,-0.498541,-0.231900,0.029205,-0.078742,-0.241568,-0.841080,-0.680880
8,47,0,7.296372,7.296372,1,0,0,1,18,1,...,-0.362921,-0.038672,-0.647650,-0.760694,0.146781,0.038366,-0.127822,-0.058059,-1.041802,-0.130038
9,39,1,4.662560,1.114305,0,0,0,1,17,0,...,-0.845758,0.635155,-0.235659,-0.396895,-0.474251,-0.298208,-0.531806,0.091948,-0.605779,-0.607580


We determine and drop the variables with excessive missing values from the dataset.

In [7]:
def remove_bad_columns(df,bad_column_threshold):
    """
    Drop the columns that have too many missing values.

    df: pandas data frame
    bad_column_threshold: a column is dropped when its number of missing
        values is greater than or equal to this count
    output: a new data frame without the bad columns (df itself is not modified);
        the number of dropped columns is printed
    """
    # missing-value count of every column, in column order
    null_per_column = np.array(df.isnull().sum(axis=0))

    # positions of the columns that reach the threshold
    bad_positions = np.flatnonzero(null_per_column >= bad_column_threshold)
    print('number of bad columns:',len(bad_positions))

    # translate positions into column labels and drop them
    return df.drop(df.columns[bad_positions],axis=1)

In [8]:
# Drop every column with 10 or more missing values (the threshold is a count, not a fraction),
# then display the result.
df = remove_bad_columns(df,10)
df

number of bad columns: 0


Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,43,0,14.817248,14.817248,0,0,1,1,25,0,...,0.591103,-0.355018,0.373644,-0.760690,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,48,0,14.261465,14.261465,0,0,0,1,20,0,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,38,0,6.644764,6.644764,0,0,0,1,15,0,...,0.328736,-0.047571,0.084228,-0.695950,-0.402840,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,50,0,7.748118,7.748118,0,1,0,1,15,1,...,0.648861,-0.039088,0.182182,-0.524640,0.037320,-0.167688,-0.016790,-0.285344,-0.251188,0.862710
4,38,0,6.436687,6.318960,0,0,1,1,15,0,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.134160
5,42,0,5.037645,2.743326,1,0,1,1,10,1,...,-0.417534,-0.141338,-0.492190,0.090633,-0.169754,-0.220211,-0.429283,-0.042797,-0.394709,-0.390144
6,50,0,8.739220,8.739220,1,1,0,1,25,1,...,0.086751,-0.144424,-0.778273,0.024693,0.204909,-0.043497,-0.172939,-0.013997,-0.437534,0.255511
7,43,0,7.567420,7.567420,1,0,0,1,15,3,...,-0.003150,0.043824,0.442394,-0.498541,-0.231900,0.029205,-0.078742,-0.241568,-0.841080,-0.680880
8,47,0,7.296372,7.296372,1,0,0,1,18,1,...,-0.362921,-0.038672,-0.647650,-0.760694,0.146781,0.038366,-0.127822,-0.058059,-1.041802,-0.130038
9,39,1,4.662560,1.114305,0,0,0,1,17,0,...,-0.845758,0.635155,-0.235659,-0.396895,-0.474251,-0.298208,-0.531806,0.091948,-0.605779,-0.607580


We find bad rows which contain too many missing values, then remove them.

In [9]:
def remove_bad_rows(df,bad_row_threshold):
    """
    Drop the rows that have too many missing values.

    df: pandas data frame (any index; it does not have to be 0..n-1)
    bad_row_threshold: a row is dropped when its number of missing values
        is greater than or equal to this count
    output: a new data frame without the bad rows (df itself is not modified);
        the number of dropped rows is printed
    """
    # missing-value count of every row, in row order
    null_per_row = df.isnull().sum(axis=1).to_numpy()

    # positional mask of the rows that reach the threshold
    is_bad = null_per_row >= bad_row_threshold
    print('number of bad rows:',int(is_bad.sum()))

    # Select by position with a boolean mask rather than dropping by label:
    # the bad rows are found by position, and positions only coincide with
    # index labels when the index is exactly 0..n-1 (not the case after rows
    # have been filtered out of the frame).
    return df.loc[~is_bad]

In [10]:
# Drop every row with 10 or more missing values (the threshold is a count, not a fraction),
# then display the result.
df = remove_bad_rows(df,10)
df

number of bad rows: 0


Unnamed: 0,age,eventdeath,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,43,0,14.817248,14.817248,0,0,1,1,25,0,...,0.591103,-0.355018,0.373644,-0.760690,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,48,0,14.261465,14.261465,0,0,0,1,20,0,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,38,0,6.644764,6.644764,0,0,0,1,15,0,...,0.328736,-0.047571,0.084228,-0.695950,-0.402840,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,50,0,7.748118,7.748118,0,1,0,1,15,1,...,0.648861,-0.039088,0.182182,-0.524640,0.037320,-0.167688,-0.016790,-0.285344,-0.251188,0.862710
4,38,0,6.436687,6.318960,0,0,1,1,15,0,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.134160
5,42,0,5.037645,2.743326,1,0,1,1,10,1,...,-0.417534,-0.141338,-0.492190,0.090633,-0.169754,-0.220211,-0.429283,-0.042797,-0.394709,-0.390144
6,50,0,8.739220,8.739220,1,1,0,1,25,1,...,0.086751,-0.144424,-0.778273,0.024693,0.204909,-0.043497,-0.172939,-0.013997,-0.437534,0.255511
7,43,0,7.567420,7.567420,1,0,0,1,15,3,...,-0.003150,0.043824,0.442394,-0.498541,-0.231900,0.029205,-0.078742,-0.241568,-0.841080,-0.680880
8,47,0,7.296372,7.296372,1,0,0,1,18,1,...,-0.362921,-0.038672,-0.647650,-0.760694,0.146781,0.038366,-0.127822,-0.058059,-1.041802,-0.130038
9,39,1,4.662560,1.114305,0,0,0,1,17,0,...,-0.845758,0.635155,-0.235659,-0.396895,-0.474251,-0.298208,-0.531806,0.091948,-0.605779,-0.607580


In [11]:
# Separate the target column `eventdeath` (dfy) from the remaining predictor columns (dfx)
dfy = df['eventdeath']
dfx = df.drop(columns='eventdeath')

In [12]:
# count the distinct non-missing values of each predictor column (NaN is excluded)
nu = dfx.nunique(dropna=True).to_numpy()
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[ 27 261 262 ... 272 272 272]


In [13]:
def define_variable_type(df,nu):
    """
    Assign a type code to every variable from its number of unique values.

    df: not used by this function (kept so existing calls keep working)
    nu: sequence with the number of unique values of each variable
    output: float array of the same length as nu, holding
        1 for binary      (exactly 2 unique values),
        2 for categorical (fewer than 5 unique values, but not 2),
        3 for continuous  (everything else)

    !!!! NOTE: the "fewer than 5" rule is a heuristic and is not always
    correct, depending on the data. Variables with 0 or 1 unique values also
    fall into the categorical group.
    The positions of the binary and categorical variables are printed.
    """
    counts = np.asarray(nu)

    is_binary = counts == 2
    is_category = (counts < 5) & ~is_binary

    # plain Python lists so the printed form is e.g. [3, 4, 5]
    i_binary = np.flatnonzero(is_binary).tolist()
    i_category = np.flatnonzero(is_category).tolist()
    print('i_binary:',i_binary)
    print('i_category:',i_category)

    # start from "continuous" everywhere, then mark the two special groups
    variable_type = np.full(len(counts),3.0)
    variable_type[is_binary] = 1
    variable_type[is_category] = 2

    return variable_type

In [14]:
# Label each predictor column from its unique-value count:
# 1 = binary, 2 = categorical, 3 = continuous.
variable_type = define_variable_type(dfx,nu)
print('variable type:',variable_type)

i_binary: [3, 4, 5]
i_category: [9, 10, 11]
variable type: [3. 3. 3. ... 3. 3. 3.]


In [15]:
def impute_missing(df,variable_type):
    """
    Fill the missing values of every column.

    Binary and categorical columns (type code below 3) are filled with the
    most frequent value of the column; continuous columns are filled with the
    column median.

    df: pandas data frame
    variable_type: sequence with one type code per column of df, in column order
    output: a new pandas data frame with the missing values filled
        (df itself is not modified)
    """
    imputed = df.copy()
    for position,name in enumerate(df.columns):
        column = df[name]
        if variable_type[position] < 3:
            # binary or category: first entry of mode() is the most frequent value
            fill_value = column.mode().iloc[0]
        else:
            # continuous
            fill_value = column.median()
        imputed[name] = column.fillna(fill_value)
    return imputed

In [16]:
# Fill the missing predictor values: most frequent value for binary/categorical columns,
# median for continuous columns. Then display the result.
dfx_imputed = impute_missing(dfx,variable_type)
dfx_imputed

Unnamed: 0,age,survival,timerecurrence,chemo,hormonal,amputation,histtype,diam,posnodes,grade,...,Contig36312_RC,Contig38980_RC,NM_000853,NM_000854,NM_000860,Contig29014_RC,Contig46616_RC,NM_000888,NM_000898,AF067420
0,43,14.817248,14.817248,0,0,1,1,25,0,2,...,0.591103,-0.355018,0.373644,-0.760690,-0.164025,-0.038726,0.237856,-0.087631,-0.369153,0.153795
1,48,14.261465,14.261465,0,0,0,1,20,0,3,...,-0.199829,-0.001635,-0.062922,-0.682204,-0.220934,-0.100088,-0.466537,-0.231547,-0.643019,-0.014098
2,38,6.644764,6.644764,0,0,0,1,15,0,2,...,0.328736,-0.047571,0.084228,-0.695950,-0.402840,-0.099965,0.110155,-0.114298,0.258495,-0.198911
3,50,7.748118,7.748118,0,1,0,1,15,1,2,...,0.648861,-0.039088,0.182182,-0.524640,0.037320,-0.167688,-0.016790,-0.285344,-0.251188,0.862710
4,38,6.436687,6.318960,0,0,1,1,15,0,2,...,-0.287538,-0.286893,0.057082,-0.565021,-0.105632,-0.108148,-0.405853,-0.053601,-0.677072,0.134160
5,42,5.037645,2.743326,1,0,1,1,10,1,1,...,-0.417534,-0.141338,-0.492190,0.090633,-0.169754,-0.220211,-0.429283,-0.042797,-0.394709,-0.390144
6,50,8.739220,8.739220,1,1,0,1,25,1,1,...,0.086751,-0.144424,-0.778273,0.024693,0.204909,-0.043497,-0.172939,-0.013997,-0.437534,0.255511
7,43,7.567420,7.567420,1,0,0,1,15,3,2,...,-0.003150,0.043824,0.442394,-0.498541,-0.231900,0.029205,-0.078742,-0.241568,-0.841080,-0.680880
8,47,7.296372,7.296372,1,0,0,1,18,1,3,...,-0.362921,-0.038672,-0.647650,-0.760694,0.146781,0.038366,-0.127822,-0.058059,-1.041802,-0.130038
9,39,4.662560,1.114305,0,0,0,1,17,0,3,...,-0.845758,0.635155,-0.235659,-0.396895,-0.474251,-0.298208,-0.531806,0.091948,-0.605779,-0.607580


In [17]:
# Summarise the imputed predictors: number of rows and columns, dtypes and memory usage.
dfx_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 272 entries, 0 to 271
Columns: 1566 entries, age to AF067420
dtypes: float64(1556), int64(10)
memory usage: 3.3 MB


## Data Processing

### Attributes

In [18]:
def convert_binary_and_category(x,variable_type):
    """
    convert binary to +-1, category to one hot; remain continuous.

    x: 2D array-like (n_samples, n_variables); missing values are expected to
        have been imputed already
    variable_type: sequence with one type code per column of x
        (1 = binary, 2 = category, anything else = continuous)
    output: 2D float array with n_samples rows. Each binary column gives one
        column (-1 for the smallest value, +1 otherwise), each category column
        gives one column per distinct value (ordered by sorted value), and
        each continuous column is copied unchanged. The original column order
        is kept.

    The one-hot encoding is done with NumPy so that the function does not
    depend on the `sparse` keyword of scikit-learn's OneHotEncoder, which is
    not accepted by recent scikit-learn releases.
    """
    x = np.asarray(x)

    # collect the encoded pieces and stack them once at the end
    blocks = []

    for i,i_type in enumerate(variable_type):
        column = x[:,i]

        if i_type == 1: # binary
            smallest = np.unique(column)[0]
            encoded = np.where(column == smallest,-1.,1.)
            blocks.append(encoded[:,np.newaxis])

        elif i_type == 2: # category
            # np.unique returns the sorted categories and, for every sample,
            # the position of its category in that sorted list
            categories,codes = np.unique(column,return_inverse=True)
            blocks.append(np.eye(len(categories))[codes.ravel()])

        else: # continuous
            blocks.append(column[:,np.newaxis])

    if not blocks:
        # no variables at all: keep the number of rows, zero columns
        return np.zeros((x.shape[0],0))

    return np.hstack(blocks).astype(float)

In [19]:
# convert x
x = np.array(dfx_imputed)
x_new = convert_binary_and_category(x,variable_type)

print(x_new.shape)
print(x_new)

(272, 1572)
[[ 4.3000000e+01  1.4817248e+01  1.4817248e+01 ... -8.7631000e-02
  -3.6915300e-01  1.5379500e-01]
 [ 4.8000000e+01  1.4261465e+01  1.4261465e+01 ... -2.3154700e-01
  -6.4301900e-01 -1.4098000e-02]
 [ 3.8000000e+01  6.6447640e+00  6.6447640e+00 ... -1.1429800e-01
   2.5849500e-01 -1.9891100e-01]
 ...
 [ 5.0000000e+01  2.6192000e+00  2.1492130e+00 ... -5.1088400e-01
   2.5190300e-01 -8.2279200e-01]
 [ 5.2000000e+01  2.2905000e+00  2.2094460e+00 ... -3.9653100e-01
   3.5681600e-01  3.4508800e-01]
 [ 5.2000000e+01  3.7370000e+00  2.1273100e+00 ...  7.9495200e-01
  -1.0893030e+00 -3.2619300e-01]]


### Target

In [20]:
# Show the distinct target values and how often each occurs.
y = np.array(dfy)
print(np.unique(y,return_counts=True))


# The target is used as-is, with no conversion step: y_new refers to the same
# array as y. This assumes the target is already coded as 0 and 1.
y_new = y
print(np.unique(y_new,return_counts=True))

(array([0, 1]), array([195,  77]))
(array([0, 1]), array([195,  77]))


In [21]:
# Combine X and y (the target becomes the last column) and save to a text file.
# fmt='%f' writes every value, including the target, in fixed-point notation
# with six decimal places.
xy_new = np.hstack((x_new,y_new[:,np.newaxis]))
np.savetxt('nki_processed.dat',xy_new,fmt='%f')