In [12]:
import pandas as pd
import numpy as np

In [13]:
# Read csv file
df = pd.read_csv(r'clinical_dataset.csv', delimiter=';')
df.head(10)

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,weight_loss,...,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001,Non frail,F,75,0,0,No,Sees moderately,Hears well,No,...,3 - About the same,4.2,> 2 h and < 5 h per week,Never smoked,1.0,6.0,31.0,5,0,5
1,1002,Pre-frail,M,73,0,1,No,Sees moderately,Hears moderately,No,...,2 - A little worse,3.3,> 2 h and < 5 h per week,Never smoked,14.0,6.0,26.0,12,0,4
2,1003,Pre-frail,M,72,0,0,No,Sees moderately,Hears moderately,No,...,3 - About the same,3.4,< 2 h per week,Past smoker (stopped at least 6 months),21.0,6.0,26.0,9,0,3
3,1004,Frail,F,88,0,0,No,Sees moderately,Hears moderately,No,...,3 - About the same,7.3,< 2 h per week,Never smoked,21.0,5.5,20.0,11,0,8
4,1005,Pre-frail,F,83,0,4,Yes,Sees moderately,Hears poorly,No,...,3 - About the same,3.0,> 5 h per week,Current smoker,0.0,6.0,30.0,12,0,4
5,1006,Non frail,M,72,0,1,No,Sees moderately,Hears well,No,...,3 - About the same,0.0,> 2 h and < 5 h per week,Never smoked,7.0,6.0,28.0,5,0,6
6,1007,Non frail,F,70,0,0,No,Sees moderately,Hears well,No,...,3 - About the same,2.4,> 2 h and < 5 h per week,Current smoker,0.0,6.0,31.0,4,0,2
7,1008,Frail,F,74,10,11,No,Sees moderately,Hears moderately,Yes,...,4 - A little better,5.0,> 2 h and < 5 h per week,Never smoked,0.0,5.0,25.0,8,2,2
8,1009,Pre-frail,F,76,0,1,No,Sees moderately,Hears well,No,...,2 - A little worse,2.0,> 5 h per week,Never smoked,0.0,6.0,31.0,10,0,6
9,1010,Frail,F,72,0,2,No,Sees moderately,Hears well,Yes,...,2 - A little worse,5.0,< 2 h per week,Past smoker (stopped at least 6 months),10.5,2.5,18.0,9,1,7


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   part_id                          540 non-null    int64  
 1   fried                            540 non-null    object 
 2   gender                           540 non-null    object 
 3   age                              540 non-null    int64  
 4   hospitalization_one_year         540 non-null    int64  
 5   hospitalization_three_years      540 non-null    int64  
 6   ortho_hypotension                540 non-null    object 
 7   vision                           540 non-null    object 
 8   audition                         540 non-null    object 
 9   weight_loss                      538 non-null    object 
 10  exhaustion_score                 540 non-null    int64  
 11  raise_chair_time                 539 non-null    float64
 12  balance_single        

## **Preprocessing the clinical dataset**


In [15]:
# Convert nominal features to numerical
df['fried'].replace(['Frail', 'Pre-frail', 'Non frail'],
                        [2, 1, 0], inplace=True)
df['gender'].replace(['F','M'],
                        [1, 0], inplace=True)

df['ortho_hypotension'].replace(['Yes','No'],
                        [1, 0], inplace=True)
df['vision'].replace(['Sees well', 'Sees moderately', 'Sees poorly'],
                        [2, 1, 0], inplace=True)

df['audition'].replace(['Hears well', 'Hears moderately', 'Hears poorly'],
                        [2, 1, 0], inplace=True)
df['weight_loss'].replace(['Yes','No'],
                        [1, 0], inplace=True)

df['balance_single'].replace(['>5 sec', '<5 sec', 'test non realizable'],
                        [2, 1, 0], inplace=True)
df['gait_speed_slower'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)

df['grip_strength_abnormal'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)
df['low_physical_activity'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)

df['memory_complain'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)
df['sleep'].replace(['Permanent sleep problem', 'Occasional sleep problem', 'No sleep problem'],
                        [2, 1, 0], inplace=True)

df['living_alone'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)
df['leisure_club'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)

df['house_suitable_participant'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)
df['house_suitable_professional'].replace(['Yes', 'No'],
                        [1, 0], inplace=True)

df['health_rate'].replace(['5-Excellent','4-Good', '3 - Medium', '2 - Bad', '1 - Very bad'],
                        [5, 4, 3, 2, 1], inplace=True)
df['health_rate_comparison'].replace(['5 - A lot better', '4 - A little better', '3 - About the same', '2 - A little worse','1 - A lot worse'],
                        [5, 4, 3, 2, 1], inplace=True)

df['activity_regular'].replace(['> 5 h per week', '> 2 h and < 5 h per week', '< 2 h per week', 'No'],
                        [3, 2, 1, 0], inplace=True)
df['smoking'].replace(['Current smoker', 'Past smoker (stopped at least 6 months)', 'Never smoked'],
                        [2, 1, 0], inplace=True)

df.head()

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,weight_loss,...,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001,0,1,75,0,0,0,1,2,0.0,...,3.0,4.2,2.0,0.0,1.0,6.0,31.0,5,0,5
1,1002,1,0,73,0,1,0,1,1,0.0,...,2.0,3.3,2.0,0.0,14.0,6.0,26.0,12,0,4
2,1003,1,0,72,0,0,0,1,1,0.0,...,3.0,3.4,1.0,1.0,21.0,6.0,26.0,9,0,3
3,1004,2,1,88,0,0,0,1,1,0.0,...,3.0,7.3,1.0,0.0,21.0,5.5,20.0,11,0,8
4,1005,1,1,83,0,4,1,1,0,0.0,...,3.0,3.0,3.0,2.0,0.0,6.0,30.0,12,0,4


In [16]:
print(df.isnull().sum())

part_id                              0
fried                                0
gender                               0
age                                  0
hospitalization_one_year             0
hospitalization_three_years          0
ortho_hypotension                    0
vision                               0
audition                             0
weight_loss                          2
exhaustion_score                     0
raise_chair_time                     1
balance_single                       1
gait_get_up                          2
gait_speed_4m                        0
gait_optional_binary                 0
gait_speed_slower                    0
grip_strength_abnormal               0
low_physical_activity                0
falls_one_year                       0
fractures_three_years                0
bmi_score                            2
bmi_body_fat                       133
waist                                3
lean_body_mass                     133
screening_score          

In [17]:
print(df.columns)

Index(['part_id', 'fried', 'gender', 'age', 'hospitalization_one_year',
       'hospitalization_three_years', 'ortho_hypotension', 'vision',
       'audition', 'weight_loss', 'exhaustion_score', 'raise_chair_time',
       'balance_single', 'gait_get_up', 'gait_speed_4m',
       'gait_optional_binary', 'gait_speed_slower', 'grip_strength_abnormal',
       'low_physical_activity', 'falls_one_year', 'fractures_three_years',
       'bmi_score', 'bmi_body_fat', 'waist', 'lean_body_mass',
       'screening_score', 'cognitive_total_score', 'memory_complain', 'sleep',
       'mmse_total_score', 'depression_total_score', 'anxiety_perception',
       'living_alone', 'leisure_out', 'leisure_club', 'social_visits',
       'social_calls', 'social_phone', 'social_skype', 'social_text',
       'house_suitable_participant', 'house_suitable_professional',
       'stairs_number', 'life_quality', 'health_rate',
       'health_rate_comparison', 'pain_perception', 'activity_regular',
       'smoking', 'alc

In [18]:
# Remove erroneous values
for i in df.columns:
    for j in range (len(df[i])):
        if df[i][j] == 999 or type(df[i][j]).__name__ == 'str':
            df.at[j,i] = None
print (df['raise_chair_time'])


0      13.00
1      16.00
2      10.00
3        NaN
4      13.00
       ...  
535    14.00
536    14.80
537     6.59
538    11.10
539     6.60
Name: raise_chair_time, Length: 540, dtype: float64


In [19]:
# Handle missing values
for i in df.columns:
    mean = df[i].mean(skipna=True)
    for j in range (len(df[i])):
        if pd.isna(df[i][j]):df.at[j,i] = mean
print (df['raise_chair_time'])


0      13.000000
1      16.000000
2      10.000000
3      13.471636
4      13.000000
         ...    
535    14.000000
536    14.800000
537     6.590000
538    11.100000
539     6.600000
Name: raise_chair_time, Length: 540, dtype: float64


In [20]:
df.gait_speed_slower.head(20)

0            0
1            1
2            1
3            1
4            1
5            0
6            0
7            1
8            1
9     0.277985
10           1
11           0
12           0
13           1
14           0
15           1
16           1
17           1
18           1
19    0.277985
Name: gait_speed_slower, dtype: object

In [21]:
df.head(10)

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,weight_loss,...,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001,0,1,75,0,0.0,0,1,2,0.0,...,3.0,4.2,2.0,0.0,1.0,6.0,31.0,5,0,5
1,1002,1,0,73,0,1.0,0,1,1,0.0,...,2.0,3.3,2.0,0.0,14.0,6.0,26.0,12,0,4
2,1003,1,0,72,0,0.0,0,1,1,0.0,...,3.0,3.4,1.0,1.0,21.0,6.0,26.0,9,0,3
3,1004,2,1,88,0,0.0,0,1,1,0.0,...,3.0,7.3,1.0,0.0,21.0,5.5,20.0,11,0,8
4,1005,1,1,83,0,4.0,1,1,0,0.0,...,3.0,3.0,3.0,2.0,0.0,6.0,30.0,12,0,4
5,1006,0,0,72,0,1.0,0,1,2,0.0,...,3.0,0.0,2.0,0.0,7.0,6.0,28.0,5,0,6
6,1007,0,1,70,0,0.0,0,1,2,0.0,...,3.0,2.4,2.0,2.0,0.0,6.0,31.0,4,0,2
7,1008,2,1,74,10,11.0,0,1,1,1.0,...,4.0,5.0,2.0,0.0,0.0,5.0,25.0,8,2,2
8,1009,1,1,76,0,1.0,0,1,2,0.0,...,2.0,2.0,3.0,0.0,0.0,6.0,31.0,10,0,6
9,1010,2,1,72,0,2.0,0,1,2,1.0,...,2.0,5.0,1.0,1.0,10.5,2.5,18.0,9,1,7


In [22]:
is_numeric = df.apply(lambda x: pd.to_numeric(x, errors='coerce').notnull().all())

# This will return a boolean series indicating whether each column contains only numeric data
print(is_numeric)


part_id                            True
fried                              True
gender                             True
age                                True
hospitalization_one_year           True
hospitalization_three_years        True
ortho_hypotension                  True
vision                             True
audition                           True
weight_loss                        True
exhaustion_score                   True
raise_chair_time                   True
balance_single                     True
gait_get_up                        True
gait_speed_4m                      True
gait_optional_binary               True
gait_speed_slower                  True
grip_strength_abnormal             True
low_physical_activity              True
falls_one_year                     True
fractures_three_years              True
bmi_score                          True
bmi_body_fat                       True
waist                              True
lean_body_mass                     True


In [23]:
print(df.isnull().sum())

part_id                            0
fried                              0
gender                             0
age                                0
hospitalization_one_year           0
hospitalization_three_years        0
ortho_hypotension                  0
vision                             0
audition                           0
weight_loss                        0
exhaustion_score                   0
raise_chair_time                   0
balance_single                     0
gait_get_up                        0
gait_speed_4m                      0
gait_optional_binary               0
gait_speed_slower                  0
grip_strength_abnormal             0
low_physical_activity              0
falls_one_year                     0
fractures_three_years              0
bmi_score                          0
bmi_body_fat                       0
waist                              0
lean_body_mass                     0
screening_score                    0
cognitive_total_score              0
m

## **Save the corrected dataset**


In [24]:
#Export and save
df.to_csv('clinical_dataset_corrected.csv', index=False)
#df.to_csv('/content/drive/My Drive/Colab Notebooks/AI_In_Health/mariarevythi/clinical_dataset_corrected.csv', index=False)