In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

#needed to import utils.py
sys.path.append('../') 

import utils

import numpy as np
import pandas as pd

%matplotlib inline  

---
### Read data from the pre-processed `Patient` and the raw `ALS-History` CSV files

##### Read pre-processed `Patient` CSV file

In [2]:
# Location of the pre-processed patient table produced by earlier steps.
data_dir = os.path.abspath('../03_preprocessed_data/')
data_file = os.path.join(data_dir, '_patient.csv')

df_main = utils.read_csv(data_file)

# Fail fast when the input is not in the expected state. The last cell of this
# notebook writes its result back to this very file after `Age` has been
# dropped, so re-running the notebook on its own output would otherwise break
# much later with an obscure error.
required_cols = ['subject_id', 'Age']
missing_cols = [name for name in required_cols if name not in df_main.columns]
if missing_cols:
    raise ValueError(
        f'{data_file} is missing required column(s) {missing_cols}. '
        'This notebook saves its result to the same file without the `Age` '
        'column, so the file may already have been processed; re-generate '
        'the input file before re-running.'
    )

# Preview the first rows.
df_main.head()

Unnamed: 0,subject_id,Age,Sex,Qty_Measurements_ALSFRS,Qty_Measurements_VITALSIGNS,Qty_Measurements_FVC,Qty_Measurements_SVC,Qty_Measurements_LABS,Qty_Measurements_HANDGRIPSTRENGTH,Qty_Measurements_MUSCLESTRENGTH,Qty_Measurements
0,329,38.0,Female,12.0,13.0,8.0,0.0,465.0,0.0,0.0,498.0
1,348,52.0,Female,15.0,10.0,0.0,9.0,820.0,0.0,0.0,854.0
2,533,65.0,Female,6.0,10.0,2.0,6.0,126.0,0.0,0.0,150.0
3,586,63.0,Male,1.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0
4,649,48.0,Female,12.0,12.0,12.0,0.0,420.0,0.0,0.0,456.0


##### Read RAW `ALS-History` CSV file and show some stats

In [3]:
# Raw ALS-History export: resolve the folder first, then the file inside it.
data_dir = os.path.abspath('../01_raw_data/')
data_file = os.path.join(data_dir, 'PROACT_ALSHISTORY.csv')

df_raw = utils.read_csv(data_file)

# Per-column summary (filled rows, NaN count, unique values), followed by the
# raw table itself.
utils.show_columns_stats(df_raw)

df_raw

subject_id..................... = 12936 rows (100.0%)     0 with NaN (  0.0%) Uniques= 10271 
Site_of_Onset___Bulbar......... =  1281 rows (  9.9%) 11655 with NaN ( 90.1%) Uniques=     3 
Site_of_Onset___Limb........... =  3589 rows (27.74%)  9347 with NaN (72.26%) Uniques=     3 
Site_of_Onset___Limb_and_Bulbar =     0 rows (  0.0%) 12936 with NaN (100.0%) Uniques=     1 
Site_of_Onset___Other.......... =     0 rows (  0.0%) 12936 with NaN (100.0%) Uniques=     1 
Site_of_Onset___Other_Specify.. =     0 rows (  0.0%) 12936 with NaN (100.0%) Uniques=     1 
Site_of_Onset___Spine.......... =     0 rows (  0.0%) 12936 with NaN (100.0%) Uniques=     1 
Subject_ALS_History_Delta...... =  9583 rows (74.08%)  3353 with NaN (25.92%) Uniques=     5 
Disease_Duration............... =     0 rows (  0.0%) 12936 with NaN (100.0%) Uniques=     1 
Symptom........................ =  2656 rows (20.53%) 10280 with NaN (79.47%) Uniques=    11 
Symptom_Other_Specify.......... =    50 rows ( 0.39%) 12886 

Unnamed: 0,subject_id,Site_of_Onset___Bulbar,Site_of_Onset___Limb,Site_of_Onset___Limb_and_Bulbar,Site_of_Onset___Other,Site_of_Onset___Other_Specify,Site_of_Onset___Spine,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Onset_Delta,Diagnosis_Delta
0,89,,1.0,,,,,,,,,,,,,
1,329,,1.0,,,,,0.0,,,,,,,-1706.0,-1068.0
2,348,,,,,,,0.0,,,,,,Onset: Other,-501.0,-302.0
3,406,,1.0,,,,,,,,,,,,,
4,411,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12931,22564,,,,,,,,,,,LOWER LIMB,,Onset: Limb,-426.0,-67.0
12932,160135,,,,,,,,,,,BULBAR,,Onset: Bulbar,-651.0,-125.0
12933,304192,,,,,,,,,,,LOWER LIMB,,Onset: Limb,-838.0,-351.0
12934,820199,,,,,,,,,,,UPPER LIMB,,Onset: Limb,-155.0,-35.0


### Join the 2 datasets (renaming some columns if necessary)

NOTE: An ERROR occurred because the raw `ALS-History` data has duplicated values in the `subject_id` column

In [4]:
# The raw ALS-History table holds several rows per `subject_id`, so this join
# is expected to be rejected. The failure is caught and printed so the notebook
# still runs top to bottom ("Restart & Run All") while keeping the error
# message visible as the motivation for the de-duplication step that follows.
try:
    df = utils.join_datasets_by_key(
        df_main=df_main,
        df_to_join=df_raw,
        key_name='subject_id',
        how='left',
        raise_error=True,
    )
except NameError as err:
    # NOTE(review): the recorded output of this cell shows the helper reporting
    # duplicated keys as a NameError -- confirm against utils.py.
    print(f'{type(err).__name__}: {err}')

NameError: DF_TO_JOIN has duplicated values in KEY column. Remove duplicate keys before joining.

### Correct the duplicate `subject_id` problem in `RAW Data`


NOTE: the `subject_id` values appear to be duplicated because a subject's information is split across rows: e.g., row #1 holds the `Symptom` column while row #2 holds the `Onset_Delta` column

SOLUTION: group samples by `subject_id` column, and use `first()` function to get the first `non-NaN` value for each column

After merging the values, 10,271 samples remained, with no duplicated `subject_id`

In [5]:
# Collapse the raw table to one row per subject. `first()` keeps, for every
# column, the first non-null value found among that subject's rows, so
# information spread over several rows ends up on a single one.
df_als_history = df_raw.groupby('subject_id', as_index=False).first()
df_als_history

Unnamed: 0,subject_id,Site_of_Onset___Bulbar,Site_of_Onset___Limb,Site_of_Onset___Limb_and_Bulbar,Site_of_Onset___Other,Site_of_Onset___Other_Specify,Site_of_Onset___Spine,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Onset_Delta,Diagnosis_Delta
0,89,,1.0,,,,,,,,,,,,,
1,329,,1.0,,,,,0.0,,,,,,,-1706.0,-1068.0
2,348,,,,,,,0.0,,,,,,Onset: Other,-501.0,-302.0
3,406,,1.0,,,,,,,,,,,,,
4,411,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10266,999823,0.0,1.0,,,,,0.0,,,,,,,-492.0,-84.0
10267,999863,,1.0,,,,,,,,,,,,,
10268,999880,,1.0,,,,,0.0,,,,,,,-2865.0,-2105.0
10269,999929,,,,,,,0.0,,,,,,Onset: Limb,-411.0,-107.0


##### Check whether any samples with duplicated `subject_id` values still exist

In [6]:
# Sanity check after the de-duplication: ask the helper for rows whose
# `subject_id` occurs more than once. The recorded output is an empty table,
# i.e. no duplicates remain.
# NOTE(review): presumably returns the offending rows as a DataFrame -- confirm in utils.py.
utils.get_duplicated_rows(df=df_als_history, column='subject_id')

Unnamed: 0,subject_id,Site_of_Onset___Bulbar,Site_of_Onset___Limb,Site_of_Onset___Limb_and_Bulbar,Site_of_Onset___Other,Site_of_Onset___Other_Specify,Site_of_Onset___Spine,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Onset_Delta,Diagnosis_Delta


----
----
----
# Pre-process the ALS-History data

### Create a new column called `site_onset` with the aim of standardizing its values 
### to [`Bulbar`, `Limb/Spinal`, `Bulbar and Limb/Spinal`, and `Other`]

In [7]:
# Start with an all-missing `site_onset` column; the cells below fill it in.
# - `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# - The column is created with object dtype because it will hold text labels;
#   writing strings into a float column is deprecated in recent pandas.
df_als_history['site_onset'] = pd.Series(
    np.nan, index=df_als_history.index, dtype='object'
)
df_als_history.head(3)

Unnamed: 0,subject_id,Site_of_Onset___Bulbar,Site_of_Onset___Limb,Site_of_Onset___Limb_and_Bulbar,Site_of_Onset___Other,Site_of_Onset___Other_Specify,Site_of_Onset___Spine,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Onset_Delta,Diagnosis_Delta,site_onset
0,89,,1.0,,,,,,,,,,,,,,
1,329,,1.0,,,,,0.0,,,,,,,-1706.0,-1068.0,
2,348,,,,,,,0.0,,,,,,Onset: Other,-501.0,-302.0,


### Set to `NaN` every value other than `1` in the `Site_of_Onset___*` flag columns

In [8]:
# Flag-style onset columns: only the value 1 means "this site applies".
columns = [
    'Site_of_Onset___Bulbar',
    'Site_of_Onset___Limb',
    'Site_of_Onset___Limb_and_Bulbar',
    'Site_of_Onset___Other',
    'Site_of_Onset___Other_Specify',
    'Site_of_Onset___Spine',
]

# Blank out anything that is not exactly 1 (e.g. 0.0) so that the later
# "is this flag empty?" tests can rely on isnull() alone.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0.
for col in columns:
    df_als_history.loc[df_als_history[col] != 1, col] = np.nan

df_als_history

Unnamed: 0,subject_id,Site_of_Onset___Bulbar,Site_of_Onset___Limb,Site_of_Onset___Limb_and_Bulbar,Site_of_Onset___Other,Site_of_Onset___Other_Specify,Site_of_Onset___Spine,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Onset_Delta,Diagnosis_Delta,site_onset
0,89,,1.0,,,,,,,,,,,,,,
1,329,,1.0,,,,,0.0,,,,,,,-1706.0,-1068.0,
2,348,,,,,,,0.0,,,,,,Onset: Other,-501.0,-302.0,
3,406,,1.0,,,,,,,,,,,,,,
4,411,1.0,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10266,999823,,1.0,,,,,0.0,,,,,,,-492.0,-84.0,
10267,999863,,1.0,,,,,,,,,,,,,,
10268,999880,,1.0,,,,,0.0,,,,,,,-2865.0,-2105.0,
10269,999929,,,,,,,0.0,,,,,,Onset: Limb,-411.0,-107.0,


### 1) Set column `site_onset` for the BULBAR onset samples

##### Update samples with `Site_of_Onset`  = `"Onset: Bulbar"`

In [9]:
# Rows whose textual `Site_of_Onset` says "Onset: Bulbar" are labelled 'Bulbar'.
bulbar_text = df_als_history['Site_of_Onset'].eq('Onset: Bulbar')
df_als_history.loc[bulbar_text, 'site_onset'] = 'Bulbar'

##### Update samples with `Site_of_Onset___Bulbar`  = `1`

In [10]:
# Fallback for rows without a textual `Site_of_Onset`: accept the bulbar flag
# only when it is the sole onset indicator filled in on the row.
other_onset_cols = [
    'Site_of_Onset',
    'Site_of_Onset___Limb',
    'Site_of_Onset___Spine',
    'Site_of_Onset___Limb_and_Bulbar',
    'Site_of_Onset___Other',
    'Site_of_Onset___Other_Specify',
]
bulbar_flag_only = (
    df_als_history['Site_of_Onset___Bulbar'].eq(1)
    & df_als_history[other_onset_cols].isna().all(axis=1)
)
df_als_history.loc[bulbar_flag_only, 'site_onset'] = 'Bulbar'

### 2) Set column `site_onset` for LIMB / SPINAL onset samples

##### Update samples with `Site_of_Onset`  = `"Onset: Limb"` or `"Onset: Spine"`

In [11]:
# Textual limb and spine onsets are merged into the single 'Limb/Spinal' class.
limb_or_spine_text = df_als_history['Site_of_Onset'].isin(
    ['Onset: Limb', 'Onset: Spine']
)
df_als_history.loc[limb_or_spine_text, 'site_onset'] = 'Limb/Spinal'

##### Update samples with `Site_of_Onset___Limb`  = `1` OR `Site_of_Onset___Spine`  = `1`  

In [12]:
# Every column that can indicate a site of onset (text column + flag columns).
onset_indicator_cols = [
    'Site_of_Onset',
    'Site_of_Onset___Bulbar',
    'Site_of_Onset___Limb',
    'Site_of_Onset___Spine',
    'Site_of_Onset___Limb_and_Bulbar',
    'Site_of_Onset___Other',
    'Site_of_Onset___Other_Specify',
]

# A limb flag or a spine flag counts as 'Limb/Spinal' only when it is the one
# and only onset indicator present on the row; rows carrying any other
# indicator are left untouched here.
for flag_col in ('Site_of_Onset___Limb', 'Site_of_Onset___Spine'):
    competing_cols = [name for name in onset_indicator_cols if name != flag_col]
    flag_only = (
        df_als_history[flag_col].eq(1)
        & df_als_history[competing_cols].isna().all(axis=1)
    )
    df_als_history.loc[flag_only, 'site_onset'] = 'Limb/Spinal'

### 3) Set column `site_onset` for "Bulbar and Limb/Spine" onset samples

##### Update samples with `Site_of_Onset`  = `"Onset: Limb and Bulbar"`

In [13]:
# Textual "limb and bulbar" onset maps to the combined class label.
limb_and_bulbar_text = df_als_history['Site_of_Onset'].eq('Onset: Limb and Bulbar')
df_als_history.loc[limb_and_bulbar_text, 'site_onset'] = 'Bulbar and Limb/Spinal'

##### Update samples with `Site_of_Onset___Limb_and_Bulbar`  = `1`

In [14]:
# Fallback for rows without a textual `Site_of_Onset`: accept the combined
# limb-and-bulbar flag only when it is the sole onset indicator on the row.
other_onset_cols = [
    'Site_of_Onset',
    'Site_of_Onset___Limb',
    'Site_of_Onset___Bulbar',
    'Site_of_Onset___Spine',
    'Site_of_Onset___Other',
    'Site_of_Onset___Other_Specify',
]
limb_and_bulbar_flag_only = (
    df_als_history['Site_of_Onset___Limb_and_Bulbar'].eq(1)
    & df_als_history[other_onset_cols].isna().all(axis=1)
)

# Use the same standardized label as the text-based rule for this category.
# The previous value, 'Onset: Limb and Bulbar', was the raw spelling and would
# have introduced an extra, inconsistent category into `site_onset`.
df_als_history.loc[limb_and_bulbar_flag_only, 'site_onset'] = 'Bulbar and Limb/Spinal'

### 4) Set column `site_onset` for "Other" onset samples

##### Update samples with `Site_of_Onset`  = `"Onset: Other"`

In [15]:
# Textual "other" onset maps to the 'Other' class.
other_text = df_als_history['Site_of_Onset'].eq('Onset: Other')
df_als_history.loc[other_text, 'site_onset'] = 'Other'

##### Update samples with `Site_of_Onset___Other`  = `1` OR `Site_of_Onset___Other_Specify`  = `1` 

In [16]:
# Every column that can indicate a site of onset (text column + flag columns).
onset_indicator_cols = [
    'Site_of_Onset',
    'Site_of_Onset___Bulbar',
    'Site_of_Onset___Limb',
    'Site_of_Onset___Spine',
    'Site_of_Onset___Limb_and_Bulbar',
    'Site_of_Onset___Other',
    'Site_of_Onset___Other_Specify',
]

# Either of the two "other" flags yields 'Other', but only when that flag is
# the single onset indicator filled in on the row.
for flag_col in ('Site_of_Onset___Other', 'Site_of_Onset___Other_Specify'):
    competing_cols = [name for name in onset_indicator_cols if name != flag_col]
    flag_only = (
        df_als_history[flag_col].eq(1)
        & df_als_history[competing_cols].isna().all(axis=1)
    )
    df_als_history.loc[flag_only, 'site_onset'] = 'Other'

# Try to join the datasets again (renaming some columns if necessary)

In [17]:
# Left join: every patient of the main table is kept and receives the
# de-duplicated ALS-History columns when a matching `subject_id` exists.
df = utils.join_datasets_by_key(
    df_main=df_main,
    df_to_join=df_als_history,
    key_name='subject_id',
    how='left',
)

# Give the two columns used from here on their final names.
final_names = {
    'Onset_Delta': 'Symptoms_Onset_Delta',
    'site_onset': 'Site_Onset',
}
df.rename(columns=final_names, inplace=True)

# Row counts of: main table, ALS-History table, joined result.
for frame in (df_main, df_als_history, df):
    print(utils.get_quantity_of_rows(frame))
df

7712
10271
7712


Unnamed: 0,subject_id,Age,Sex,Qty_Measurements_ALSFRS,Qty_Measurements_VITALSIGNS,Qty_Measurements_FVC,Qty_Measurements_SVC,Qty_Measurements_LABS,Qty_Measurements_HANDGRIPSTRENGTH,Qty_Measurements_MUSCLESTRENGTH,...,Subject_ALS_History_Delta,Disease_Duration,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset
0,329,38.0,Female,12.0,13.0,8.0,0.0,465.0,0.0,0.0,...,0.0,,,,,,,-1706.0,-1068.0,Limb/Spinal
1,348,52.0,Female,15.0,10.0,0.0,9.0,820.0,0.0,0.0,...,0.0,,,,,,Onset: Other,-501.0,-302.0,Other
2,533,65.0,Female,6.0,10.0,2.0,6.0,126.0,0.0,0.0,...,0.0,,,,,,,-1023.0,-44.0,Bulbar
3,586,63.0,Male,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,,,,,,Onset: Bulbar,-715.0,-507.0,Bulbar
4,649,48.0,Female,12.0,12.0,12.0,0.0,420.0,0.0,0.0,...,0.0,,Weakness,,FACIAL,,Onset: Bulbar,-341.0,,Bulbar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,22564,63.0,Male,7.0,0.0,0.0,7.0,376.0,14.0,201.0,...,,,,,LOWER LIMB,,Onset: Limb,-426.0,-67.0,Limb/Spinal
7708,160135,67.0,Female,7.0,0.0,0.0,6.0,365.0,14.0,189.0,...,,,,,BULBAR,,Onset: Bulbar,-651.0,-125.0,Bulbar
7709,304192,59.0,Female,7.0,0.0,0.0,7.0,365.0,14.0,168.0,...,,,,,LOWER LIMB,,Onset: Limb,-838.0,-351.0,Limb/Spinal
7710,820199,49.0,Male,7.0,0.0,0.0,7.0,363.0,14.0,173.0,...,,,,,UPPER LIMB,,Onset: Limb,-155.0,-35.0,Limb/Spinal


### Create column `Diagnosis_Delay` (in months)

In [18]:
# Delay between symptom onset and diagnosis, in days. Both deltas are turned
# into magnitudes before subtracting.
# NOTE(review): this matches `Diagnosis_Delta - Symptoms_Onset_Delta` only when
# both deltas have the same sign; if the two can differ in sign the result is
# off -- confirm the value ranges in the raw data.
onset_days = np.abs(df['Symptoms_Onset_Delta'])
diagnosis_days = np.abs(df['Diagnosis_Delta'])
df['Diagnosis_Delay_in_Days'] = onset_days - diagnosis_days

# Same delay expressed in months (conversion is delegated to the helper).
diagnosis_delay = df['Diagnosis_Delay_in_Days'].apply(utils.calculate_months_from_days)
df['Diagnosis_Delay'] = diagnosis_delay

df

Unnamed: 0,subject_id,Age,Sex,Qty_Measurements_ALSFRS,Qty_Measurements_VITALSIGNS,Qty_Measurements_FVC,Qty_Measurements_SVC,Qty_Measurements_LABS,Qty_Measurements_HANDGRIPSTRENGTH,Qty_Measurements_MUSCLESTRENGTH,...,Symptom,Symptom_Other_Specify,Location,Location_Other_Specify,Site_of_Onset,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnosis_Delay_in_Days,Diagnosis_Delay
0,329,38.0,Female,12.0,13.0,8.0,0.0,465.0,0.0,0.0,...,,,,,,-1706.0,-1068.0,Limb/Spinal,638.0,21.0
1,348,52.0,Female,15.0,10.0,0.0,9.0,820.0,0.0,0.0,...,,,,,Onset: Other,-501.0,-302.0,Other,199.0,6.0
2,533,65.0,Female,6.0,10.0,2.0,6.0,126.0,0.0,0.0,...,,,,,,-1023.0,-44.0,Bulbar,979.0,32.0
3,586,63.0,Male,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,,,,,Onset: Bulbar,-715.0,-507.0,Bulbar,208.0,6.0
4,649,48.0,Female,12.0,12.0,12.0,0.0,420.0,0.0,0.0,...,Weakness,,FACIAL,,Onset: Bulbar,-341.0,,Bulbar,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,22564,63.0,Male,7.0,0.0,0.0,7.0,376.0,14.0,201.0,...,,,LOWER LIMB,,Onset: Limb,-426.0,-67.0,Limb/Spinal,359.0,11.0
7708,160135,67.0,Female,7.0,0.0,0.0,6.0,365.0,14.0,189.0,...,,,BULBAR,,Onset: Bulbar,-651.0,-125.0,Bulbar,526.0,17.0
7709,304192,59.0,Female,7.0,0.0,0.0,7.0,365.0,14.0,168.0,...,,,LOWER LIMB,,Onset: Limb,-838.0,-351.0,Limb/Spinal,487.0,16.0
7710,820199,49.0,Male,7.0,0.0,0.0,7.0,363.0,14.0,173.0,...,,,UPPER LIMB,,Onset: Limb,-155.0,-35.0,Limb/Spinal,120.0,4.0


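### Code Diagnosis_Delay
	- Short   : >18 months   Code: 0
	- Average : >8 and <=18  Code: 1
	- Long    : <=8 months   Code: 2

NOTE (to be confirmed): as defined here, the label `Short` is given to the largest delays (>18 months) and `Long` to the smallest (<=8 months), so the names look inverted with respect to the delay itself. The table above reflects what the code below does.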


In [19]:
# Categorize `Diagnosis_Delay` (months) into a text label and a numeric code.
# Rows with a missing delay match none of the conditions and keep NaN in both
# new columns.
# NOTE(review): 'Short' is assigned to the largest delays (> 18 months) and
# 'Long' to the smallest (<= 8 months); the names look inverted -- confirm the
# intended meaning before relying on these labels.
delay_classes = [
    ('Short', 0, df.Diagnosis_Delay > 18),
    ('Average', 1, (df.Diagnosis_Delay > 8) & (df.Diagnosis_Delay <= 18)),
    ('Long', 2, df.Diagnosis_Delay <= 8),
]

for delay_label, delay_code, in_class in delay_classes:
    to_update = df.loc[in_class]
    df.loc[to_update.index, 'Diagnosis_Delay_Str'] = delay_label
    df.loc[to_update.index, 'Diagnosis_Delay_Coded'] = delay_code

display(df)

Unnamed: 0,subject_id,Age,Sex,Qty_Measurements_ALSFRS,Qty_Measurements_VITALSIGNS,Qty_Measurements_FVC,Qty_Measurements_SVC,Qty_Measurements_LABS,Qty_Measurements_HANDGRIPSTRENGTH,Qty_Measurements_MUSCLESTRENGTH,...,Location,Location_Other_Specify,Site_of_Onset,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnosis_Delay_in_Days,Diagnosis_Delay,Diagnosis_Delay_Str,Diagnosis_Delay_Coded
0,329,38.0,Female,12.0,13.0,8.0,0.0,465.0,0.0,0.0,...,,,,-1706.0,-1068.0,Limb/Spinal,638.0,21.0,Short,0.0
1,348,52.0,Female,15.0,10.0,0.0,9.0,820.0,0.0,0.0,...,,,Onset: Other,-501.0,-302.0,Other,199.0,6.0,Long,2.0
2,533,65.0,Female,6.0,10.0,2.0,6.0,126.0,0.0,0.0,...,,,,-1023.0,-44.0,Bulbar,979.0,32.0,Short,0.0
3,586,63.0,Male,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,,,Onset: Bulbar,-715.0,-507.0,Bulbar,208.0,6.0,Long,2.0
4,649,48.0,Female,12.0,12.0,12.0,0.0,420.0,0.0,0.0,...,FACIAL,,Onset: Bulbar,-341.0,,Bulbar,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,22564,63.0,Male,7.0,0.0,0.0,7.0,376.0,14.0,201.0,...,LOWER LIMB,,Onset: Limb,-426.0,-67.0,Limb/Spinal,359.0,11.0,Average,1.0
7708,160135,67.0,Female,7.0,0.0,0.0,6.0,365.0,14.0,189.0,...,BULBAR,,Onset: Bulbar,-651.0,-125.0,Bulbar,526.0,17.0,Average,1.0
7709,304192,59.0,Female,7.0,0.0,0.0,7.0,365.0,14.0,168.0,...,LOWER LIMB,,Onset: Limb,-838.0,-351.0,Limb/Spinal,487.0,16.0,Average,1.0
7710,820199,49.0,Male,7.0,0.0,0.0,7.0,363.0,14.0,173.0,...,UPPER LIMB,,Onset: Limb,-155.0,-35.0,Limb/Spinal,120.0,4.0,Long,2.0


### Show variables stats

In [20]:
# Print the per-column summary of the joined table (per the recorded output:
# filled rows, rows with NaN and number of unique values for each column).
utils.show_columns_stats(df=df)

subject_id....................... =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=  7712 
Age.............................. =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=   191 
Sex.............................. =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=     2 
Qty_Measurements_ALSFRS.......... =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=    32 
Qty_Measurements_VITALSIGNS...... =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=    55 
Qty_Measurements_FVC............. =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=    20 
Qty_Measurements_SVC............. =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=    17 
Qty_Measurements_LABS............ =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=   995 
Qty_Measurements_HANDGRIPSTRENGTH =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=    38 
Qty_Measurements_MUSCLESTRENGTH.. =  7712 rows (100.0%)     0 with NaN (  0.0%) Uniques=   293 
Qty_Measurements................. =  771

----
----
----
## Create the `Age_at_Onset` column
### Calculation based on difference between the `Age` (at trial entrance) and the 
### `Symptoms_Onset_Delta` columns

In [21]:
# Create the column with a missing-value default.
# (`np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.)
df['Age_at_Onset'] = np.nan

# The age at onset can only be derived for rows that have both inputs.
has_inputs = df['Age'].notna() & df['Symptoms_Onset_Delta'].notna()
df_calc_age_onset = df.loc[has_inputs].copy()
print(f'Rows = {utils.get_quantity_of_rows(df_calc_age_onset)}')

# Row-wise computation, delegated to the helper: age at trial entry combined
# with the onset delta.
ages_calculated = df_calc_age_onset.apply(
    lambda row: utils.calculate_age_from_onset_delta(
        row['Age'],
        row['Symptoms_Onset_Delta'],
    ),
    axis=1,
)

# Write the results back; rows lacking an input keep NaN.
df.loc[df_calc_age_onset.index, 'Age_at_Onset'] = ages_calculated

df[['Age', 'Symptoms_Onset_Delta', 'Age_at_Onset']]



Rows = 7547


Unnamed: 0,Age,Symptoms_Onset_Delta,Age_at_Onset
0,38.0,-1706.0,34.0
1,52.0,-501.0,51.0
2,65.0,-1023.0,63.0
3,63.0,-715.0,61.0
4,48.0,-341.0,47.0
...,...,...,...
7707,63.0,-426.0,62.0
7708,67.0,-651.0,66.0
7709,59.0,-838.0,57.0
7710,49.0,-155.0,49.0


----
----
----
## Define a new `Age_Range_at_Onset` column (grouping samples by age range, in years)
#### Based on the `Age_at_Onset` column

In [22]:
# Single table of age bins: (label, youngest age, oldest age), both inclusive.
# The position of a bin in this table is its numeric code.
_age_bins = [
    ('0-39', 0, 39),
    ('40-49', 40, 49),
    ('50-59', 50, 59),
    ('60-69', 60, 69),
    ('70+', 70, 999),
]

# label -> [min_age, max_age]
age_ranges = {label: [low, high] for label, low, high in _age_bins}

# label -> ordinal code (0 for the youngest bin)
age_ranges_codes = {label: code for code, (label, _, _) in enumerate(_age_bins)}

age_ranges.keys()

dict_keys(['0-39', '40-49', '50-59', '60-69', '70+'])

### Create _Age_Range_at_Onset_ and _Age_Range_at_Onset_Coded_ columns

In [23]:
# Bin `Age_at_Onset` into the labelled ranges of `age_ranges` and store the
# matching numeric code from `age_ranges_codes`.
#
# Both output columns are reset to "missing" first, so re-running the cell
# never leaves stale values behind. The label column uses object dtype because
# it holds text (writing strings into a float column is deprecated in recent
# pandas), and `np.nan` replaces the `np.NAN` alias removed in NumPy 2.0.
# The frame is modified directly: the former `df_temp = df` was only a second
# name for the same object, not a copy.
df['Age_Range_at_Onset'] = pd.Series(np.nan, index=df.index, dtype='object')
df['Age_Range_at_Onset_Coded'] = np.nan

for range_label, (min_age, max_age) in age_ranges.items():
    # The upper bound is inclusive for whole years: ages in [min_age, max_age + 1)
    # fall into the bin, so a fractional age such as 39.5 still lands in '0-39'.
    in_range = (df['Age_at_Onset'] >= min_age) & (df['Age_at_Onset'] < max_age + 1)
    df.loc[in_range, 'Age_Range_at_Onset'] = range_label
    # Direct lookup: a label missing from `age_ranges_codes` raises instead of
    # silently storing None.
    df.loc[in_range, 'Age_Range_at_Onset_Coded'] = age_ranges_codes[range_label]

df


Unnamed: 0,subject_id,Age,Sex,Qty_Measurements_ALSFRS,Qty_Measurements_VITALSIGNS,Qty_Measurements_FVC,Qty_Measurements_SVC,Qty_Measurements_LABS,Qty_Measurements_HANDGRIPSTRENGTH,Qty_Measurements_MUSCLESTRENGTH,...,Symptoms_Onset_Delta,Diagnosis_Delta,Site_Onset,Diagnosis_Delay_in_Days,Diagnosis_Delay,Diagnosis_Delay_Str,Diagnosis_Delay_Coded,Age_at_Onset,Age_Range_at_Onset,Age_Range_at_Onset_Coded
0,329,38.0,Female,12.0,13.0,8.0,0.0,465.0,0.0,0.0,...,-1706.0,-1068.0,Limb/Spinal,638.0,21.0,Short,0.0,34.0,0-39,0.0
1,348,52.0,Female,15.0,10.0,0.0,9.0,820.0,0.0,0.0,...,-501.0,-302.0,Other,199.0,6.0,Long,2.0,51.0,50-59,2.0
2,533,65.0,Female,6.0,10.0,2.0,6.0,126.0,0.0,0.0,...,-1023.0,-44.0,Bulbar,979.0,32.0,Short,0.0,63.0,60-69,3.0
3,586,63.0,Male,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-715.0,-507.0,Bulbar,208.0,6.0,Long,2.0,61.0,60-69,3.0
4,649,48.0,Female,12.0,12.0,12.0,0.0,420.0,0.0,0.0,...,-341.0,,Bulbar,,,,,47.0,40-49,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7707,22564,63.0,Male,7.0,0.0,0.0,7.0,376.0,14.0,201.0,...,-426.0,-67.0,Limb/Spinal,359.0,11.0,Average,1.0,62.0,60-69,3.0
7708,160135,67.0,Female,7.0,0.0,0.0,6.0,365.0,14.0,189.0,...,-651.0,-125.0,Bulbar,526.0,17.0,Average,1.0,66.0,60-69,3.0
7709,304192,59.0,Female,7.0,0.0,0.0,7.0,365.0,14.0,168.0,...,-838.0,-351.0,Limb/Spinal,487.0,16.0,Average,1.0,57.0,50-59,2.0
7710,820199,49.0,Male,7.0,0.0,0.0,7.0,363.0,14.0,173.0,...,-155.0,-35.0,Limb/Spinal,120.0,4.0,Long,2.0,49.0,40-49,1.0


----
----
----
# Drop some irrelevant columns 

In [24]:
# Raw onset indicators: superseded by the standardized `Site_Onset` column.
raw_onset_cols = [
    'Site_of_Onset___Bulbar',
    'Site_of_Onset___Limb',
    'Site_of_Onset___Limb_and_Bulbar',
    'Site_of_Onset___Other',
    'Site_of_Onset___Other_Specify',
    'Site_of_Onset___Spine',
]

# Remaining ALS-History columns that are not carried forward, `Site_of_Onset`
# (also superseded by `Site_Onset`), plus `Age`, which `Age_at_Onset` replaces.
other_unused_cols = [
    'Subject_ALS_History_Delta',
    'Disease_Duration',
    'Symptom',
    'Symptom_Other_Specify',
    'Location',
    'Location_Other_Specify',
    'Site_of_Onset',
    'Age',
]

irrelevant_cols = raw_onset_cols + other_unused_cols

df.drop(columns=irrelevant_cols, inplace=True)

----
----
----
## Delete samples having NaN values in the following columns:
 - #### `Age_at_Onset`
 - #### `Diagnosis_Delay`
 - #### `Site_Onset`

In [25]:
# Remove, one column at a time, the rows that have no value in that column.
# The order matters only for the per-step counts reported by the helper
# (the recorded output shows one "Previous / To delete / After" line per step).
for required_col in ('Age_at_Onset', 'Diagnosis_Delay', 'Site_Onset'):
    to_delete = df.loc[df[required_col].isnull()]
    df = utils.remove_rows(df=df, to_delete=to_delete)

  - Previous=7712, To delete=165, After=7547
  - Previous=7547, To delete=2265, After=5282
  - Previous=5282, To delete=52, After=5230


In [26]:
# Per-column summary after the row filtering above; the recorded output lists
# 5230 rows and no NaN for the columns shown.
utils.show_columns_stats(df=df)

subject_id....................... =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=  5230 
Sex.............................. =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=     2 
Qty_Measurements_ALSFRS.......... =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=    32 
Qty_Measurements_VITALSIGNS...... =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=    55 
Qty_Measurements_FVC............. =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=    20 
Qty_Measurements_SVC............. =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=    17 
Qty_Measurements_LABS............ =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=   953 
Qty_Measurements_HANDGRIPSTRENGTH =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=    38 
Qty_Measurements_MUSCLESTRENGTH.. =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=   293 
Qty_Measurements................. =  5230 rows (100.0%)     0 with NaN (  0.0%) Uniques=  1087 
Symptoms_Onset_Delta............. =  523

----
----
----
# Drop unnecessary columns

In [27]:
# The day-level delay was only an intermediate value; the month-level
# `Diagnosis_Delay` and its coded variants are what is kept.
irrelevant_cols = ['Diagnosis_Delay_in_Days']

df.drop(columns=irrelevant_cols, inplace=True)


----
----
----
# Save the pre-processed data to CSV file

In [28]:
# Destination of the result.
# NOTE(review): this is the same `_patient.csv` that the notebook loads at the
# top, so saving overwrites the notebook's own input (now without `Age`).
dir_dest = os.path.abspath('../03_preprocessed_data/')
csv_dest = os.path.join(dir_dest, '_patient.csv')

utils.save_to_csv(df=df, csv_file=csv_dest, with_index=False)


5230 samples were saved


---
---
---
# OTHER TESTS