In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
#from sklearn.preprocessing import Imputer

In [2]:
# Seed NumPy's global random number generator so that any later use of
# np.random in this session is repeatable.
np.random.seed(1)

In [3]:
# Load the raw data set; the path is relative to the notebook's working directory.
df = pd.read_csv("paradox.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,Site,Code,SAincl,SA2,SA4,SA6,SA8,SA10,SA12,SA1416,...,TempPara5,VitD,VitD3Groups,ThreeUTR,ThreeUTRvar,INT4,INT4var,D543,Callele,Callelevar
0,1.0,401,8.3,1.0,2.8,3.3,1.2,1.1,1.1,1.1,...,,,,,,,,,,
1,1.0,402,11.3,13.3,18.9,20.1,26.2,34.7,29.9,33.0,...,,,,,,,,,,
2,1.0,403,15.4,9.9,9.5,9.5,8.3,8.7,6.4,3.9,...,,,,,,,,,,
3,1.0,404,0.5,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,1.0,405,33.0,24.1,12.3,11.8,8.7,3.3,3.1,3.1,...,,,,,,,,,,
5,1.0,406,6.4,21.4,3.9,6.0,7.1,6.7,2.8,6.4,...,,,,,,,,,,
6,1.0,407,3.3,1.3,0.2,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,1.0,408,3.6,3.3,0.6,0.5,0.2,0.0,0.0,0.0,...,,,,,,,,,,
8,1.0,409,6.4,7.9,6.0,4.8,2.6,0.3,1.8,1.0,...,,,,,,,,,,
9,1.0,410,9.1,7.9,7.9,5.7,4.1,2.0,2.8,4.1,...,,,,,,,,,,


In [4]:
# Summarise column dtypes and non-null counts to see which variables have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 49 columns):
Site                     241 non-null float64
Code                     241 non-null int64
SAincl                   241 non-null float64
SA2                      233 non-null float64
SA4                      227 non-null float64
SA6                      223 non-null float64
SA8                      232 non-null float64
SA10                     215 non-null float64
SA12                     225 non-null float64
SA1416                   223 non-null float64
SA2120                   221 non-null float64
SA2728                   211 non-null float64
studyarm                 241 non-null float64
sexe                     241 non-null float64
age                      241 non-null int64
lesionsince              240 non-null float64
BPSYST                   216 non-null float64
BPDIAST                  216 non-null float64
pulserateinclbeatsmin    232 non-null float64
tempinclCelsius          

## Data Cleaning

In [5]:
# `Code` is removed because it is considered unrelated to the dependent variable.
df = df.drop(columns=['Code'])
df

Unnamed: 0,Site,SAincl,SA2,SA4,SA6,SA8,SA10,SA12,SA1416,SA2120,...,TempPara5,VitD,VitD3Groups,ThreeUTR,ThreeUTRvar,INT4,INT4var,D543,Callele,Callelevar
0,1.0,8.3,1.0,2.8,3.3,1.2,1.1,1.1,1.1,0.0,...,,,,,,,,,,
1,1.0,11.3,13.3,18.9,20.1,26.2,34.7,29.9,33.0,33.0,...,,,,,,,,,,
2,1.0,15.4,9.9,9.5,9.5,8.3,8.7,6.4,3.9,3.7,...,,,,,,,,,,
3,1.0,0.5,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,1.0,33.0,24.1,12.3,11.8,8.7,3.3,3.1,3.1,7.1,...,,,,,,,,,,
5,1.0,6.4,21.4,3.9,6.0,7.1,6.7,2.8,6.4,5.0,...,,,,,,,,,,
6,1.0,3.3,1.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,1.0,3.6,3.3,0.6,0.5,0.2,0.0,0.0,0.0,0.0,...,,,,,,,,,,
8,1.0,6.4,7.9,6.0,4.8,2.6,0.3,1.8,1.0,0.0,...,,,,,,,,,,
9,1.0,9.1,7.9,7.9,5.7,4.1,2.0,2.8,4.1,0.0,...,,,,,,,,,,


In [6]:
# Remove rows where the target (`Paradox1`) is missing
missing_row = df['Paradox1'].isna()
print('Number of rows where target are missing:')
print(missing_row.sum())

# Keep only the rows whose target value is present.
df = df.loc[~missing_row]
df

Number of rows where target are missing:
0


Unnamed: 0,Site,SAincl,SA2,SA4,SA6,SA8,SA10,SA12,SA1416,SA2120,...,TempPara5,VitD,VitD3Groups,ThreeUTR,ThreeUTRvar,INT4,INT4var,D543,Callele,Callelevar
0,1.0,8.3,1.0,2.8,3.3,1.2,1.1,1.1,1.1,0.0,...,,,,,,,,,,
1,1.0,11.3,13.3,18.9,20.1,26.2,34.7,29.9,33.0,33.0,...,,,,,,,,,,
2,1.0,15.4,9.9,9.5,9.5,8.3,8.7,6.4,3.9,3.7,...,,,,,,,,,,
3,1.0,0.5,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,1.0,33.0,24.1,12.3,11.8,8.7,3.3,3.1,3.1,7.1,...,,,,,,,,,,
5,1.0,6.4,21.4,3.9,6.0,7.1,6.7,2.8,6.4,5.0,...,,,,,,,,,,
6,1.0,3.3,1.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,1.0,3.6,3.3,0.6,0.5,0.2,0.0,0.0,0.0,0.0,...,,,,,,,,,,
8,1.0,6.4,7.9,6.0,4.8,2.6,0.3,1.8,1.0,0.0,...,,,,,,,,,,
9,1.0,9.1,7.9,7.9,5.7,4.1,2.0,2.8,4.1,0.0,...,,,,,,,,,,


We identify the variables (columns) that have an excessive number of missing values and drop them from the dataset.

In [7]:
def remove_bad_columns(df,bad_column_threshold):
    """Return a copy of ``df`` without the columns that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    bad_column_threshold : number
        A column counts as "bad" when its number of null entries is greater
        than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the columns that were kept.

    The number of dropped columns is printed.
    """
    # Null count of each column, in column order.
    null_counts = df.isnull().sum(axis=0).to_numpy()
    is_bad = null_counts >= bad_column_threshold

    print('number of bad columns:',int(is_bad.sum()))

    # The flagged columns are removed by their labels.
    return df.drop(columns=df.columns[is_bad])

In [8]:
# Drop every column that has 41 or more missing values
# (remove_bad_columns compares the null count with `>=`).
df = remove_bad_columns(df,41)
df

number of bad columns: 15


Unnamed: 0,Site,SAincl,SA2,SA4,SA6,SA8,SA10,SA12,SA1416,SA2120,...,oedemaincl,Sitelesion,Hb,WBC,creatin,HIV,Drug,Paradox1,Paradox2,WeekParadox5
0,1.0,8.3,1.0,2.8,3.3,1.2,1.1,1.1,1.1,0.0,...,1,1.0,13.96,10.80,64.0,0.0,1.0,1.0,1.0,6.0
1,1.0,11.3,13.3,18.9,20.1,26.2,34.7,29.9,33.0,33.0,...,0,2.0,12.19,11.40,59.0,0.0,1.0,1.0,0.0,0.0
2,1.0,15.4,9.9,9.5,9.5,8.3,8.7,6.4,3.9,3.7,...,0,2.0,10.54,6.60,65.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.5,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,3.0,12.64,7.60,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,33.0,24.1,12.3,11.8,8.7,3.3,3.1,3.1,7.1,...,0,2.0,13.40,9.80,75.0,0.0,1.0,0.0,0.0,0.0
5,1.0,6.4,21.4,3.9,6.0,7.1,6.7,2.8,6.4,5.0,...,0,3.0,11.74,7.80,58.0,0.0,1.0,1.0,1.0,8.0
6,1.0,3.3,1.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2.0,12.87,10.30,65.0,0.0,1.0,0.0,0.0,0.0
7,1.0,3.6,3.3,0.6,0.5,0.2,0.0,0.0,0.0,0.0,...,0,2.0,9.70,4.90,62.0,0.0,1.0,0.0,0.0,0.0
8,1.0,6.4,7.9,6.0,4.8,2.6,0.3,1.8,1.0,0.0,...,2,1.0,10.80,13.67,64.6,0.0,1.0,0.0,0.0,0.0
9,1.0,9.1,7.9,7.9,5.7,4.1,2.0,2.8,4.1,0.0,...,2,1.0,15.92,5.60,53.0,0.0,1.0,1.0,1.0,16.0


Next, we identify the rows that contain too many missing values and remove them.

In [9]:
def remove_bad_rows(df,bad_row_threshold):
    """Return ``df`` without the rows that have too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. Its index may hold any labels
        (it does not have to be 0..n-1).
    bad_row_threshold : number
        A row counts as "bad" when its number of null entries is greater
        than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        New frame containing only the rows that were kept, with their
        original index labels.

    The number of dropped rows is printed.
    """
    # Null count of each row, in row order.
    null_counts = df.isnull().sum(axis=1).to_numpy()
    is_bad = null_counts >= bad_row_threshold

    print('number of bad rows:',int(is_bad.sum()))

    # Select by a positional boolean mask rather than df.drop(<row numbers>):
    # drop() interprets its argument as index *labels*, so row numbers only
    # match when the index is exactly 0..n-1. On a filtered or re-ordered
    # frame that would raise KeyError or remove the wrong rows.
    return df.loc[~is_bad]

In [10]:
# Drop every row that has 6 or more missing values across the remaining columns.
df = remove_bad_rows(df,6)
df

number of bad rows: 5


Unnamed: 0,Site,SAincl,SA2,SA4,SA6,SA8,SA10,SA12,SA1416,SA2120,...,oedemaincl,Sitelesion,Hb,WBC,creatin,HIV,Drug,Paradox1,Paradox2,WeekParadox5
0,1.0,8.3,1.0,2.8,3.3,1.2,1.1,1.1,1.1,0.0,...,1,1.0,13.96,10.80,64.0,0.0,1.0,1.0,1.0,6.0
1,1.0,11.3,13.3,18.9,20.1,26.2,34.7,29.9,33.0,33.0,...,0,2.0,12.19,11.40,59.0,0.0,1.0,1.0,0.0,0.0
2,1.0,15.4,9.9,9.5,9.5,8.3,8.7,6.4,3.9,3.7,...,0,2.0,10.54,6.60,65.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.5,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,3.0,12.64,7.60,0.0,0.0,1.0,0.0,0.0,0.0
4,1.0,33.0,24.1,12.3,11.8,8.7,3.3,3.1,3.1,7.1,...,0,2.0,13.40,9.80,75.0,0.0,1.0,0.0,0.0,0.0
5,1.0,6.4,21.4,3.9,6.0,7.1,6.7,2.8,6.4,5.0,...,0,3.0,11.74,7.80,58.0,0.0,1.0,1.0,1.0,8.0
6,1.0,3.3,1.3,0.2,0.0,0.0,0.0,0.0,0.0,0.0,...,0,2.0,12.87,10.30,65.0,0.0,1.0,0.0,0.0,0.0
7,1.0,3.6,3.3,0.6,0.5,0.2,0.0,0.0,0.0,0.0,...,0,2.0,9.70,4.90,62.0,0.0,1.0,0.0,0.0,0.0
8,1.0,6.4,7.9,6.0,4.8,2.6,0.3,1.8,1.0,0.0,...,2,1.0,10.80,13.67,64.6,0.0,1.0,0.0,0.0,0.0
9,1.0,9.1,7.9,7.9,5.7,4.1,2.0,2.8,4.1,0.0,...,2,1.0,15.92,5.60,53.0,0.0,1.0,1.0,1.0,16.0


In [11]:
# Split the cleaned frame into the target column (dfy) and the predictors (dfx).
# NOTE(review): Paradox2 and WeekParadox5 remain in dfx; confirm they are
# genuine predictors and not derived from the outcome.
dfy = df['Paradox1']
dfx = df.drop(columns=['Paradox1'])

In [12]:
# Number of distinct non-missing values per feature
# (DataFrame.nunique skips NaN by default).
nu = dfx.nunique().to_numpy()
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[  2 183 161 155 122 132 121 123 115  75  46   4   2  41  21  13  12  75
  33  59  79   3   5   3   4 119 108  82   2   2   2   8]


In [13]:
def define_variable_type(df,nu,category_threshold=5):
    """Classify each variable as binary, categorical or continuous.

    The decision is a heuristic based only on the number of distinct values:

    * exactly 2 distinct values            -> binary      (code 1)
    * fewer than ``category_threshold``    -> categorical (code 2)
    * otherwise                            -> continuous  (code 3)

    NOTE: the cut-off is data dependent; an integer-coded categorical
    variable with many levels is classified as continuous. Adjust
    ``category_threshold`` when the default does not fit the data.

    Parameters
    ----------
    df : any
        Unused; kept so existing calls keep working.
    nu : sequence of int
        Number of distinct values of each variable, in column order.
    category_threshold : int, optional
        Variables with fewer distinct values than this (and not exactly 2)
        are treated as categorical. Defaults to 5.

    Returns
    -------
    numpy.ndarray
        Float array with one code (1., 2. or 3.) per entry of ``nu``.

    The positions of the binary and categorical variables are printed.
    """
    nu = np.asarray(nu)
    is_binary = nu == 2
    # Binary is checked first, so a count of 2 is never classified as categorical.
    is_category = ~is_binary & (nu < category_threshold)

    # Plain Python lists so the printed output reads e.g. "[0, 12, 28]".
    i_binary = np.flatnonzero(is_binary).tolist()
    i_category = np.flatnonzero(is_category).tolist()
    print('i_binary:',i_binary)
    print('i_category:',i_category)

    variable_type = np.full(nu.shape,3.)   # continuous by default
    variable_type[is_category] = 2         # categorical
    variable_type[is_binary] = 1           # binary

    return variable_type

In [14]:
# Type code per feature column of dfx, derived from the unique-value counts in `nu`
# (1 = binary, 2 = categorical, 3 = continuous).
variable_type = define_variable_type(dfx,nu)
print('variable type:',variable_type)

i_binary: [0, 12, 28, 29, 30]
i_category: [11, 21, 23, 24]
variable type: [1. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 2. 1. 3. 3. 3. 3. 3. 3. 3. 3. 2. 3. 2.
 2. 3. 3. 3. 1. 1. 1. 3.]


In [15]:
def impute_missing(df,variable_type):
    """Fill missing values column by column.

    Binary and categorical columns (type code < 3) are filled with the most
    frequent value of the column; when several values tie, the smallest one
    returned by ``Series.mode`` is used. Continuous columns (any other code)
    are filled with the column median.

    A column that has no observed value at all is left unchanged: there is
    no mode or median to impute from.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is not modified.
    variable_type : sequence
        Type code of each column, aligned by position with ``df.columns``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the missing values filled.
    """
    df2 = df.copy()
    for i,col in enumerate(df.columns):
        column = df[col]
        if variable_type[i] < 3: # binary or category
            modes = column.mode()
            if modes.empty:
                # Every entry is missing, so modes.iloc[0] would raise IndexError.
                continue
            fill_value = modes.iloc[0]
        else: # continuous
            fill_value = column.median()
        df2[col] = column.fillna(fill_value)
    return df2

In [16]:
# Fill the remaining gaps in the features: mode for binary/categorical columns,
# median for continuous ones.
dfx_imputed = impute_missing(dfx,variable_type)
dfx_imputed

Unnamed: 0,Site,SAincl,SA2,SA4,SA6,SA8,SA10,SA12,SA1416,SA2120,...,stageincllesion,oedemaincl,Sitelesion,Hb,WBC,creatin,HIV,Drug,Paradox2,WeekParadox5
0,1.0,8.3,1.0,2.8,3.3,1.2,1.1,1.10,1.1,0.0,...,6,1,1.0,13.96,10.80,64.0,0.0,1.0,1.0,6.0
1,1.0,11.3,13.3,18.9,20.1,26.2,34.7,29.90,33.0,33.0,...,2,0,2.0,12.19,11.40,59.0,0.0,1.0,0.0,0.0
2,1.0,15.4,9.9,9.5,9.5,8.3,8.7,6.40,3.9,3.7,...,6,0,2.0,10.54,6.60,65.0,0.0,1.0,0.0,0.0
3,1.0,0.5,0.2,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,3,0,3.0,12.64,7.60,0.0,0.0,1.0,0.0,0.0
4,1.0,33.0,24.1,12.3,11.8,8.7,3.3,3.10,3.1,7.1,...,6,0,2.0,13.40,9.80,75.0,0.0,1.0,0.0,0.0
5,1.0,6.4,21.4,3.9,6.0,7.1,6.7,2.80,6.4,5.0,...,6,0,3.0,11.74,7.80,58.0,0.0,1.0,1.0,8.0
6,1.0,3.3,1.3,0.2,0.0,0.0,0.0,0.00,0.0,0.0,...,3,0,2.0,12.87,10.30,65.0,0.0,1.0,0.0,0.0
7,1.0,3.6,3.3,0.6,0.5,0.2,0.0,0.00,0.0,0.0,...,3,0,2.0,9.70,4.90,62.0,0.0,1.0,0.0,0.0
8,1.0,6.4,7.9,6.0,4.8,2.6,0.3,1.80,1.0,0.0,...,6,2,1.0,10.80,13.67,64.6,0.0,1.0,0.0,0.0
9,1.0,9.1,7.9,7.9,5.7,4.1,2.0,2.80,4.1,0.0,...,3,2,1.0,15.92,5.60,53.0,0.0,1.0,1.0,16.0


In [17]:
# Check the non-null counts again to confirm that imputation left no missing values.
dfx_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 236 entries, 0 to 240
Data columns (total 32 columns):
Site                     236 non-null float64
SAincl                   236 non-null float64
SA2                      236 non-null float64
SA4                      236 non-null float64
SA6                      236 non-null float64
SA8                      236 non-null float64
SA10                     236 non-null float64
SA12                     236 non-null float64
SA1416                   236 non-null float64
SA2120                   236 non-null float64
SA2728                   236 non-null float64
studyarm                 236 non-null float64
sexe                     236 non-null float64
age                      236 non-null int64
lesionsince              236 non-null float64
BPSYST                   236 non-null float64
BPDIAST                  236 non-null float64
pulserateinclbeatsmin    236 non-null float64
tempinclCelsius          236 non-null float64
bodyweightinclkg       

## Data Processing

### Attributes

In [18]:
def convert_binary_and_category(x,variable_type):
    """
    Encode a feature matrix column by column.

    * binary (type 1): the smallest value of the column becomes -1, every
      other value becomes +1;
    * category (type 2): one-hot encoded, one output column per distinct
      value, ordered by sorted value;
    * anything else (continuous): copied unchanged.

    The one-hot step is done with NumPy only, so it does not depend on the
    ``sparse`` / ``sparse_output`` keyword of scikit-learn's OneHotEncoder,
    which changed between releases.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_variables)
        Feature matrix without missing values.
    variable_type : sequence
        Type code of each column of ``x`` (1 binary, 2 category, other
        values continuous).

    Returns
    -------
    numpy.ndarray
        Float matrix of shape (n_samples, n_encoded_columns); output columns
        follow the order of the input columns.
    """
    x = np.asarray(x)

    # Collect the encoded pieces and stack them once at the end, instead of
    # re-allocating the whole matrix for every column.
    pieces = []
    for i,i_type in enumerate(variable_type):
        column = x[:,i]
        if i_type == 1: # binary
            levels = np.unique(column)
            # levels[:1] (not levels[0]) keeps this valid for a column with no rows.
            encoded = np.where(column == levels[:1],-1.,1.)[:,np.newaxis]

        elif i_type == 2: # category
            # codes[k] is the position of column[k] among the sorted levels.
            levels,codes = np.unique(column,return_inverse=True)
            encoded = np.eye(levels.size)[codes.ravel()]

        else: # continuous
            encoded = column[:,np.newaxis]

        pieces.append(encoded)

    if not pieces:
        # No variables to encode: keep the row count and return zero columns.
        return np.zeros((x.shape[0],0))

    return np.hstack(pieces).astype(float)

In [19]:
# Convert the imputed features to a NumPy matrix and encode them:
# binary -> -1/+1, categorical -> one-hot, continuous unchanged.
x = np.array(dfx_imputed)
x_new = convert_binary_and_category(x,variable_type)

# One-hot encoding widens the matrix, so the shape is printed as a check.
print(x_new.shape)
print(x_new)

(236, 42)
[[-1.   8.3  1.  ... -1.   1.   6. ]
 [-1.  11.3 13.3 ... -1.  -1.   0. ]
 [-1.  15.4  9.9 ... -1.  -1.   0. ]
 ...
 [ 1.  38.4 10.9 ... -1.  -1.   0. ]
 [ 1.  68.1 79.2 ... -1.   1.  10. ]
 [ 1.   4.   1.5 ...  1.  -1.   0. ]]


### Target

In [20]:
# Target as a NumPy array, with the count of each class.
y = np.array(dfy)
print(np.unique(y,return_counts=True))

# No recoding is applied: for this data set the target is already 0/1,
# so y_new is simply another name for the same array as y.
y_new = y
# Disabled recoding step for string labels, kept for reference:
#y_new[y =='No'] = 0
print(np.unique(y_new,return_counts=True))

(array([0., 1.]), array([174,  62]))
(array([0., 1.]), array([174,  62]))


In [21]:
# Combine X and y (target appended as the last column) and save to a text file.
xy_new = np.hstack((x_new,y_new[:,np.newaxis]))
# fmt='%f' writes every value as a fixed-point number with six decimals.
np.savetxt('paradox_processed.dat',xy_new,fmt='%f')