# Preparing Data for Use

In [1]:
from scipy.io import arff
import numpy as np
import pandas as pd

In [2]:
# the source dataset ships in ARFF format; loadarff returns a tuple whose
# first element holds the records, which is what the DataFrame is built from
data = arff.loadarff('data/bone-marrow.arff')
records = data[0]
df = pd.DataFrame(records)

In [3]:
# summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 37 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   donor_age                  187 non-null    float64
 1   donor_age_below_35         187 non-null    object 
 2   donor_ABO                  187 non-null    object 
 3   donor_CMV                  187 non-null    object 
 4   recipient_age              187 non-null    float64
 5   recipient_age_below_10     187 non-null    object 
 6   recipient_age_int          187 non-null    object 
 7   recipient_gender           187 non-null    object 
 8   recipient_body_mass        185 non-null    float64
 9   recipient_ABO              187 non-null    object 
 10  recipient_rh               187 non-null    object 
 11  recipient_CMV              187 non-null    object 
 12  disease                    187 non-null    object 
 13  disease_group              187 non-null    object 

In [4]:
# let's look at what the initial data frame looks like
df.head()

Unnamed: 0,donor_age,donor_age_below_35,donor_ABO,donor_CMV,recipient_age,recipient_age_below_10,recipient_age_int,recipient_gender,recipient_body_mass,recipient_ABO,...,CD3_to_CD34_ratio,ANC_recovery,PLT_recovery,acute_GvHD_II_III_IV,acute_GvHD_III_IV,time_to_acute_GvHD_III_IV,extensive_chronic_GvHD,relapse,survival_time,survival_status
0,22.830137,b'yes',b'A',b'present',9.6,b'yes',b'5_10',b'male',35.0,b'A',...,1.33876,19.0,51.0,b'yes',b'yes',32.0,b'no',b'no',999.0,b'0'
1,23.342466,b'yes',b'B',b'absent',4.0,b'yes',b'0_5',b'male',20.6,b'B',...,11.078295,16.0,37.0,b'yes',b'no',1000000.0,b'no',b'yes',163.0,b'1'
2,26.394521,b'yes',b'B',b'absent',6.6,b'yes',b'5_10',b'male',23.4,b'B',...,19.01323,23.0,20.0,b'yes',b'no',1000000.0,b'no',b'yes',435.0,b'1'
3,39.684932,b'no',b'A',b'present',18.1,b'no',b'10_20',b'female',50.0,b'AB',...,29.481647,23.0,29.0,b'yes',b'yes',19.0,b'?',b'no',53.0,b'1'
4,33.358904,b'yes',b'A',b'absent',1.3,b'yes',b'0_5',b'female',9.0,b'AB',...,3.972255,14.0,14.0,b'no',b'no',1000000.0,b'no',b'no',2043.0,b'0'


In [5]:
# convert columns that are in bytes to strings

# the ARFF loader produced bytes values (b'yes', b'A', ...) which pandas keeps
# in object-dtype columns, so every object column is a candidate for decoding
byte_columns = df.select_dtypes('object').columns

# Decode only values that really are bytes and pass everything else through.
# Series.str.decode yields NaN for non-bytes entries (e.g. strings that were
# already decoded), so with it a second run of this cell would silently wipe
# every text column. This version is safe to re-run.
for col in byte_columns:
    df[col] = df[col].map(
        lambda value: value.decode('UTF-8') if isinstance(value, bytes) else value
    )

In [6]:
# the non-numeric (object) columns should now show plain strings
# rather than b'...' byte literals
df.head()

Unnamed: 0,donor_age,donor_age_below_35,donor_ABO,donor_CMV,recipient_age,recipient_age_below_10,recipient_age_int,recipient_gender,recipient_body_mass,recipient_ABO,...,CD3_to_CD34_ratio,ANC_recovery,PLT_recovery,acute_GvHD_II_III_IV,acute_GvHD_III_IV,time_to_acute_GvHD_III_IV,extensive_chronic_GvHD,relapse,survival_time,survival_status
0,22.830137,yes,A,present,9.6,yes,5_10,male,35.0,A,...,1.33876,19.0,51.0,yes,yes,32.0,no,no,999.0,0
1,23.342466,yes,B,absent,4.0,yes,0_5,male,20.6,B,...,11.078295,16.0,37.0,yes,no,1000000.0,no,yes,163.0,1
2,26.394521,yes,B,absent,6.6,yes,5_10,male,23.4,B,...,19.01323,23.0,20.0,yes,no,1000000.0,no,yes,435.0,1
3,39.684932,no,A,present,18.1,no,10_20,female,50.0,AB,...,29.481647,23.0,29.0,yes,yes,19.0,?,no,53.0,1
4,33.358904,yes,A,absent,1.3,yes,0_5,female,9.0,AB,...,3.972255,14.0,14.0,no,no,1000000.0,no,no,2043.0,0


In [7]:
# a cell equal to '?' marks a missing entry in this dataset;
# swap it for NaN so pandas treats it as missing
df = df.replace(to_replace='?', value=np.nan)

In [8]:
# let's look at basic summary statistics
# (only the numeric columns appear in the result)
df.describe()

Unnamed: 0,donor_age,recipient_age,recipient_body_mass,CD34_x1e6_per_kg,CD3_x1e8_per_kg,CD3_to_CD34_ratio,ANC_recovery,PLT_recovery,time_to_acute_GvHD_III_IV,survival_time
count,187.0,187.0,185.0,187.0,182.0,182.0,187.0,187.0,187.0,187.0
mean,33.472068,9.931551,35.801081,11.891781,4.745714,5.385096,26752.86631,90937.919786,775408.042781,938.743316
std,8.271826,5.305639,19.650922,9.914386,3.859128,9.598716,161747.200525,288242.407688,418425.252689,849.589495
min,18.646575,0.6,6.0,0.79,0.04,0.204132,9.0,9.0,10.0,6.0
25%,27.039726,5.05,19.0,5.35,1.6875,1.786683,13.0,16.0,1000000.0,168.5
50%,33.550685,9.6,33.0,9.72,4.325,2.734462,15.0,21.0,1000000.0,676.0
75%,40.117809,14.05,50.6,15.415,6.785,5.823565,17.0,37.0,1000000.0,1604.0
max,55.553425,20.2,103.4,57.78,20.02,99.56097,1000000.0,1000000.0,1000000.0,3364.0


In [9]:
# the maximum value in columns ANC_recovery, PLT_recovery, and time_to_acute_GvHD_III_IV is 1000000 days
# 1000000 days is equivalent to over 2700 years; this value is likely a placeholder for missing values
# let's convert these into NaNs

MISSING_DAYS_SENTINEL = 1000000
sentinel_columns = ['ANC_recovery', 'PLT_recovery', 'time_to_acute_GvHD_III_IV']

# One vectorised pass over the affected columns instead of three copy-pasted
# lines: mask() puts NaN wherever the condition holds and leaves other values
# untouched. Bracket indexing is used because attribute-style assignment
# (df.col = ...) sets a plain attribute rather than a column when the name
# does not already exist, which hides typos.
df[sentinel_columns] = df[sentinel_columns].mask(
    df[sentinel_columns] == MISSING_DAYS_SENTINEL
)

In [10]:
# re-check the summary statistics: with the 1000000 placeholders turned into
# NaN, every column should now look plausible
df.describe()

Unnamed: 0,donor_age,recipient_age,recipient_body_mass,CD34_x1e6_per_kg,CD3_x1e8_per_kg,CD3_to_CD34_ratio,ANC_recovery,PLT_recovery,time_to_acute_GvHD_III_IV,survival_time
count,187.0,187.0,185.0,187.0,182.0,182.0,182.0,170.0,42.0,187.0
mean,33.472068,9.931551,35.801081,11.891781,4.745714,5.385096,15.307692,31.711765,31.047619,938.743316
std,8.271826,5.305639,19.650922,9.914386,3.859128,9.598716,3.010818,37.009744,23.777374,849.589495
min,18.646575,0.6,6.0,0.79,0.04,0.204132,9.0,9.0,10.0,6.0
25%,27.039726,5.05,19.0,5.35,1.6875,1.786683,13.0,15.0,16.25,168.5
50%,33.550685,9.6,33.0,9.72,4.325,2.734462,15.0,21.0,21.0,676.0
75%,40.117809,14.05,50.6,15.415,6.785,5.823565,17.0,29.75,35.5,1604.0
max,55.553425,20.2,103.4,57.78,20.02,99.56097,26.0,285.0,100.0,3364.0


In [11]:
# persist the cleaned frame as CSV for later use; the row index is left out
output_path = 'data/bone_marrow.csv'
df.to_csv(output_path, index=False)