In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [2]:
beacons = pd.read_csv("beacons_dataset.csv", delimiter=';')
clinical = pd.read_csv("clinical_dataset.csv", delimiter=';')

In [3]:
beacons.head(10)

Unnamed: 0,part_id,ts_date,ts_time,room
0,3089,20170915,06:45:22,Kitchen
1,3089,20170915,06:45:33,Bedroom
2,3089,20170915,06:45:39,Outdoor
3,3089,20170915,06:45:53,Bedroom
4,3089,20170915,06:46:09,Outdoor
5,3089,20170915,06:46:23,Bedroom
6,3089,20170915,06:46:39,Outdoor
7,3089,20170915,06:46:53,Bedroom
8,3089,20170915,06:47:09,Outdoor
9,3089,20170915,06:47:23,Bedroom


In [4]:
clinical.head()

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,weight_loss,...,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001,Non frail,F,75,0,0,No,Sees moderately,Hears well,No,...,3 - About the same,4.2,> 2 h and < 5 h per week,Never smoked,1.0,6.0,31.0,5,0,5
1,1002,Pre-frail,M,73,0,1,No,Sees moderately,Hears moderately,No,...,2 - A little worse,3.3,> 2 h and < 5 h per week,Never smoked,14.0,6.0,26.0,12,0,4
2,1003,Pre-frail,M,72,0,0,No,Sees moderately,Hears moderately,No,...,3 - About the same,3.4,< 2 h per week,Past smoker (stopped at least 6 months),21.0,6.0,26.0,9,0,3
3,1004,Frail,F,88,0,0,No,Sees moderately,Hears moderately,No,...,3 - About the same,7.3,< 2 h per week,Never smoked,21.0,5.5,20.0,11,0,8
4,1005,Pre-frail,F,83,0,4,Yes,Sees moderately,Hears poorly,No,...,3 - About the same,3.0,> 5 h per week,Current smoker,0.0,6.0,30.0,12,0,4


In [5]:
beacons.columns

Index(['part_id', 'ts_date', 'ts_time', 'room'], dtype='object')

In [6]:
clinical.columns

Index(['part_id', 'fried', 'gender', 'age', 'hospitalization_one_year',
       'hospitalization_three_years', 'ortho_hypotension', 'vision',
       'audition', 'weight_loss', 'exhaustion_score', 'raise_chair_time',
       'balance_single', 'gait_get_up', 'gait_speed_4m',
       'gait_optional_binary', 'gait_speed_slower', 'grip_strength_abnormal',
       'low_physical_activity', 'falls_one_year', 'fractures_three_years',
       'bmi_score', 'bmi_body_fat', 'waist', 'lean_body_mass',
       'screening_score', 'cognitive_total_score', 'memory_complain', 'sleep',
       'mmse_total_score', 'depression_total_score', 'anxiety_perception',
       'living_alone', 'leisure_out', 'leisure_club', 'social_visits',
       'social_calls', 'social_phone', 'social_skype', 'social_text',
       'house_suitable_participant', 'house_suitable_professional',
       'stairs_number', 'life_quality', 'health_rate',
       'health_rate_comparison', 'pain_perception', 'activity_regular',
       'smoking', 'alc

In [7]:
clinical.isnull().sum()

part_id                              0
fried                                0
gender                               0
age                                  0
hospitalization_one_year             0
hospitalization_three_years          0
ortho_hypotension                    0
vision                               0
audition                             0
weight_loss                          2
exhaustion_score                     0
raise_chair_time                     1
balance_single                       1
gait_get_up                          2
gait_speed_4m                        0
gait_optional_binary                 0
gait_speed_slower                    0
grip_strength_abnormal               0
low_physical_activity                0
falls_one_year                       0
fractures_three_years                0
bmi_score                            2
bmi_body_fat                       133
waist                                3
lean_body_mass                     133
screening_score          

In [8]:
clinical.nunique()

part_id                            540
fried                                3
gender                               2
age                                 27
hospitalization_one_year             8
hospitalization_three_years         12
ortho_hypotension                    2
vision                               3
audition                             3
weight_loss                          2
exhaustion_score                     2
raise_chair_time                   277
balance_single                       3
gait_get_up                        244
gait_speed_4m                      222
gait_optional_binary                 2
gait_speed_slower                    3
grip_strength_abnormal               2
low_physical_activity                2
falls_one_year                       9
fractures_three_years                6
bmi_score                          511
bmi_body_fat                       150
waist                               72
lean_body_mass                     403
screening_score          

# Preprocessing

In [9]:
erroneous_dict = {999: np.nan, "test non realizable": np.nan, "Test not adequate": np.nan}

clinical_data_proc = clinical.replace(erroneous_dict)
clinical.head()

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,weight_loss,...,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001,Non frail,F,75,0,0,No,Sees moderately,Hears well,No,...,3 - About the same,4.2,> 2 h and < 5 h per week,Never smoked,1.0,6.0,31.0,5,0,5
1,1002,Pre-frail,M,73,0,1,No,Sees moderately,Hears moderately,No,...,2 - A little worse,3.3,> 2 h and < 5 h per week,Never smoked,14.0,6.0,26.0,12,0,4
2,1003,Pre-frail,M,72,0,0,No,Sees moderately,Hears moderately,No,...,3 - About the same,3.4,< 2 h per week,Past smoker (stopped at least 6 months),21.0,6.0,26.0,9,0,3
3,1004,Frail,F,88,0,0,No,Sees moderately,Hears moderately,No,...,3 - About the same,7.3,< 2 h per week,Never smoked,21.0,5.5,20.0,11,0,8
4,1005,Pre-frail,F,83,0,4,Yes,Sees moderately,Hears poorly,No,...,3 - About the same,3.0,> 5 h per week,Current smoker,0.0,6.0,30.0,12,0,4


In [10]:
labelen = LabelEncoder()
for col in clinical.columns:
    if clinical[col].dtypes == np.object:
        clinical_data_proc[col] = labelen.fit_transform(clinical[col])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if clinical[col].dtypes == np.object:


In [11]:
pd.set_option("display.max_columns", None)

clinical_data_procnan = clinical_data_proc.dropna(axis = 0)
clinical_data_procnan

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,weight_loss,exhaustion_score,raise_chair_time,balance_single,gait_get_up,gait_speed_4m,gait_optional_binary,gait_speed_slower,grip_strength_abnormal,low_physical_activity,falls_one_year,fractures_three_years,bmi_score,bmi_body_fat,waist,lean_body_mass,screening_score,cognitive_total_score,memory_complain,sleep,mmse_total_score,depression_total_score,anxiety_perception,living_alone,leisure_out,leisure_club,social_visits,social_calls,social_phone,social_skype,social_text,house_suitable_participant,house_suitable_professional,stairs_number,life_quality,health_rate,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001.0,1,0,75.0,0.0,0.0,0,0,2,0,1.0,13.00,1,18.00,7.00,False,0,0,0,1.0,0.0,27.587326,30.4,101.0,47.3280,14.0,25.0,0,0,30.0,1.0,5.0,1,7.0,1,3.0,7.0,210.0,0.0,0.0,1,1,0.0,7.4,2,2,4.2,1,1,1.00,6.0,31.0,5.0,0.0,5.0
1,1002.0,2,1,73.0,0.0,1.0,0,0,0,0,1.0,16.00,1,60.00,11.00,False,2,1,0,1.0,0.0,25.069362,23.4,109.0,58.1394,14.0,27.0,0,0,28.0,8.0,8.2,0,7.0,0,1.0,20.0,140.0,0.0,0.0,1,1,20.0,7.0,2,1,3.3,1,1,14.00,6.0,26.0,12.0,0.0,4.0
2,1003.0,2,1,72.0,0.0,0.0,0,0,0,0,1.0,10.00,1,21.00,11.00,False,2,1,0,1.0,0.0,34.386317,32.8,122.0,64.4448,14.0,26.0,0,1,27.0,1.0,3.4,0,7.0,1,5.0,20.0,210.0,0.0,10.0,1,1,7.0,7.8,3,2,3.4,0,2,21.00,6.0,26.0,9.0,0.0,3.0
4,1005.0,2,0,83.0,0.0,4.0,1,0,1,0,1.0,13.00,0,42.00,8.00,False,2,0,0,1.0,0.0,25.997436,29.6,88.0,45.6896,12.0,24.0,1,1,27.0,3.0,8.2,1,7.0,1,2.0,10.0,70.0,0.0,0.0,1,1,0.0,8.4,2,2,3.0,2,0,0.00,6.0,30.0,12.0,0.0,4.0
5,1006.0,1,1,72.0,0.0,1.0,0,0,2,0,1.0,12.00,1,18.00,6.00,False,0,0,0,0.0,0.0,25.432526,23.4,100.0,56.3010,13.0,26.0,0,0,29.0,0.0,1.0,0,14.0,1,5.0,20.0,420.0,0.0,21.0,1,1,10.0,9.4,4,2,0.0,1,1,7.00,6.0,28.0,5.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3593.0,2,1,75.0,0.0,1.0,1,2,2,0,1.0,14.00,1,6.20,2.30,False,0,1,0,1.0,0.0,31.909140,36.3,112.0,60.1328,11.0,27.0,0,0,27.0,2.0,0.0,1,0.0,0,0.0,8.0,240.0,0.0,3.0,1,1,0.0,10.0,3,2,1.0,2,2,0.00,5.5,31.0,8.0,1.0,10.0
536,3594.0,1,1,71.0,0.0,0.0,0,2,2,0,1.0,14.80,1,8.40,2.70,False,0,0,0,2.0,1.0,26.161896,32.3,93.0,51.7905,10.0,27.0,0,0,29.0,2.0,2.4,0,5.0,1,4.0,3.0,15.0,0.0,30.0,1,1,15.0,8.5,3,3,2.0,2,2,17.25,5.0,28.0,7.0,0.0,3.0
537,3600.0,1,0,77.0,0.0,0.0,0,2,2,0,1.0,6.59,1,6.06,2.28,False,0,0,0,0.0,0.0,921.143251,30.0,86.0,44.9400,14.0,21.0,0,2,27.0,3.0,4.0,1,7.0,0,2.0,20.0,100.0,30.0,0.0,1,1,1.0,7.4,3,1,3.9,2,1,0.50,6.0,31.0,5.0,0.0,4.0
538,3601.0,2,1,84.0,0.0,0.0,1,2,0,0,1.0,11.10,1,8.22,2.85,False,0,1,0,0.0,0.0,27.380117,25.3,102.0,67.0059,14.0,26.0,0,0,28.0,0.0,0.0,1,14.0,1,7.0,25.0,250.0,0.0,20.0,1,1,0.0,6.5,3,2,4.9,1,1,28.00,6.0,31.0,4.0,1.0,8.0


In [12]:
for col in clinical_data_proc.columns:
    clinical_data_proc_filled = clinical_data_proc.fillna(clinical_data_proc[col].median())

clinical_data_proc_filled.groupby(['fried']).size()    

fried
0    100
1    213
2    227
dtype: int64

# Feature Selection

In [13]:
variance = np.var(clinical_data_procnan)
variance

part_id                            945304.503604
fried                                   0.377198
gender                                  0.238114
age                                    29.333272
hospitalization_one_year                1.047431
hospitalization_three_years             1.572955
ortho_hypotension                       0.119750
vision                                  0.904178
audition                                0.752049
weight_loss                             0.067344
exhaustion_score                        0.108669
raise_chair_time                       30.283047
balance_single                          0.219699
gait_get_up                            33.517521
gait_speed_4m                           6.415189
gait_optional_binary                    0.088190
gait_speed_slower                       0.511052
grip_strength_abnormal                  0.223868
low_physical_activity                   0.082085
falls_one_year                          0.705862
fractures_three_year

In [14]:
vt = VarianceThreshold(threshold = 0.511)
_ = vt.fit(clinical_data_procnan)

# Get the boolean mask
mask = vt.get_support()

clinical_data_reduced = clinical_data_procnan.loc[:, mask]
clinical_data_reduced


Unnamed: 0,part_id,age,hospitalization_one_year,hospitalization_three_years,vision,audition,raise_chair_time,gait_get_up,gait_speed_4m,gait_speed_slower,falls_one_year,bmi_score,bmi_body_fat,waist,lean_body_mass,screening_score,cognitive_total_score,sleep,mmse_total_score,depression_total_score,anxiety_perception,leisure_out,social_visits,social_calls,social_phone,social_skype,social_text,stairs_number,life_quality,health_rate,health_rate_comparison,pain_perception,activity_regular,alcohol_units,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
0,1001.0,75.0,0.0,0.0,0,2,13.00,18.00,7.00,0,1.0,27.587326,30.4,101.0,47.3280,14.0,25.0,0,30.0,1.0,5.0,7.0,3.0,7.0,210.0,0.0,0.0,0.0,7.4,2,2,4.2,1,1.00,31.0,5.0,0.0,5.0
1,1002.0,73.0,0.0,1.0,0,0,16.00,60.00,11.00,2,1.0,25.069362,23.4,109.0,58.1394,14.0,27.0,0,28.0,8.0,8.2,7.0,1.0,20.0,140.0,0.0,0.0,20.0,7.0,2,1,3.3,1,14.00,26.0,12.0,0.0,4.0
2,1003.0,72.0,0.0,0.0,0,0,10.00,21.00,11.00,2,1.0,34.386317,32.8,122.0,64.4448,14.0,26.0,1,27.0,1.0,3.4,7.0,5.0,20.0,210.0,0.0,10.0,7.0,7.8,3,2,3.4,0,21.00,26.0,9.0,0.0,3.0
4,1005.0,83.0,0.0,4.0,0,1,13.00,42.00,8.00,2,1.0,25.997436,29.6,88.0,45.6896,12.0,24.0,1,27.0,3.0,8.2,7.0,2.0,10.0,70.0,0.0,0.0,0.0,8.4,2,2,3.0,2,0.00,30.0,12.0,0.0,4.0
5,1006.0,72.0,0.0,1.0,0,2,12.00,18.00,6.00,0,0.0,25.432526,23.4,100.0,56.3010,13.0,26.0,0,29.0,0.0,1.0,14.0,5.0,20.0,420.0,0.0,21.0,10.0,9.4,4,2,0.0,1,7.00,28.0,5.0,0.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,3593.0,75.0,0.0,1.0,2,2,14.00,6.20,2.30,0,1.0,31.909140,36.3,112.0,60.1328,11.0,27.0,0,27.0,2.0,0.0,0.0,0.0,8.0,240.0,0.0,3.0,0.0,10.0,3,2,1.0,2,0.00,31.0,8.0,1.0,10.0
536,3594.0,71.0,0.0,0.0,2,2,14.80,8.40,2.70,0,2.0,26.161896,32.3,93.0,51.7905,10.0,27.0,0,29.0,2.0,2.4,5.0,4.0,3.0,15.0,0.0,30.0,15.0,8.5,3,3,2.0,2,17.25,28.0,7.0,0.0,3.0
537,3600.0,77.0,0.0,0.0,2,2,6.59,6.06,2.28,0,0.0,921.143251,30.0,86.0,44.9400,14.0,21.0,2,27.0,3.0,4.0,7.0,2.0,20.0,100.0,30.0,0.0,1.0,7.4,3,1,3.9,2,0.50,31.0,5.0,0.0,4.0
538,3601.0,84.0,0.0,0.0,2,0,11.10,8.22,2.85,0,0.0,27.380117,25.3,102.0,67.0059,14.0,26.0,0,28.0,0.0,0.0,14.0,7.0,25.0,250.0,0.0,20.0,0.0,6.5,3,2,4.9,1,28.00,31.0,4.0,1.0,8.0


In [15]:
selector = SelectKBest(mutual_info_classif, k=10)
selector.fit(clinical_data_reduced, clinical_data_procnan['fried'])
# Get columns to keep and create new dataframe with those only
cols = selector.get_support(indices=True)
features_df_new = clinical_data_reduced.iloc[:,cols]
features_df_new.drop('gait_speed_slower', axis = 1).columns

Index(['hospitalization_three_years', 'vision', 'gait_get_up', 'waist',
       'depression_total_score', 'leisure_out', 'pain_perception',
       'activity_regular', 'comorbidities_count'],
      dtype='object')

# Classification

In [16]:
X = clinical_data_proc_filled[['hospitalization_one_year', 'gait_get_up', 'waist','mmse_total_score', 'social_text', 'health_rate', 'pain_perception','comorbidities_count', 'medication_count']]
# X1 = clinical_data_proc_filled[['depression_total_score', 'health_rate', 'pain_perception', 'activity_regular', 'comorbidities_count']]
y = clinical_data_proc_filled['fried']

In [17]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 42)

In [31]:
knclass = KNeighborsClassifier(n_neighbors= 5).fit(X_train, y_train)
pred1 = knclass.predict(X_test)
pred1
print('Accuracy of K Nearest Neighbors Classifier: {:.5f}'
     .format(knclass.score(X_test, y_test)))
print('Accuracy of K Nearest Neighbors Classifier: {:.5f}'
     .format(knclass.score(X_train, y_train)))     

Accuracy of K Nearest Neighbors Classifier: 0.56296
Accuracy of K Nearest Neighbors Classifier: 0.67654


In [74]:
svmclas = SVC(gamma = 2, kernel="rbf").fit(X_train, y_train)
pred2 = svmclas.predict(X_test)
print('Accuracy of Support Vector Machine: {:.5f}'
     .format(svmclas.score(X_test, y_test)))
print('Accuracy of Support Vector Machine: {:.5f}'
     .format(svmclas.score(X_train, y_train)))     

Accuracy of Support Vector Machine: 0.60000
Accuracy of Support Vector Machine: 0.65432


In [32]:
print("Before undersampling: ", Counter(y_train))
undersample = RandomUnderSampler(sampling_strategy='all')

# fit and apply the transform
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

# summarize class distribution
print("After undersampling: ", Counter(y_train_under))

Before undersampling:  Counter({2: 174, 1: 160, 0: 71})
After undersampling:  Counter({0: 71, 1: 71, 2: 71})


In [44]:
knclass_under = KNeighborsClassifier(n_neighbors= 10).fit(X_train_under, y_train_under)
pred1_under = knclass.predict(X_test)
pred1_under
print('Accuracy of K Nearest Neighbors Classifier: {:.5f}'
     .format(knclass_under.score(X_test, y_test)))
print('Accuracy of K Nearest Neighbors Classifier: {:.5f}'
     .format(knclass_under.score(X_train_under, y_train_under)))

Accuracy of K Nearest Neighbors Classifier: 0.53333
Accuracy of K Nearest Neighbors Classifier: 0.57531


In [67]:
# summarize class distribution
print("Before oversampling: ",Counter(y_train))

# define oversampling strategy
SMOTE = RandomOverSampler(sampling_strategy="all", random_state=42)

# fit and apply the transform
X_train_SMOTE, y_train_SMOTE = SMOTE.fit_resample(X_train, y_train)

# summarize class distribution
print("After oversampling: ",Counter(y_train_SMOTE))

Before oversampling:  Counter({2: 174, 1: 160, 0: 71})
After oversampling:  Counter({2: 174, 1: 174, 0: 174})


In [68]:
knclass_over = KNeighborsClassifier(n_neighbors= 14).fit(X_train_SMOTE, y_train_SMOTE)
pred1_over = knclass.predict(X_test)
pred1_over
print('Accuracy of K Nearest Neighbors Classifier: {:.5f}'
     .format(knclass_over.score(X_test, y_test)))
print('Accuracy of K Nearest Neighbors Classifier: {:.5f}'
     .format(knclass_under.score(X_train_SMOTE, y_train_SMOTE)))

Accuracy of K Nearest Neighbors Classifier: 0.52593
Accuracy of K Nearest Neighbors Classifier: 0.60153


In [92]:
model = SVC(gamma = 0.1, kernel="rbf")
# model = KNeighborsClassifier(n_neighbors= 14)
over = RandomOverSampler(sampling_strategy="not majority", random_state= 42)
under = RandomUnderSampler(sampling_strategy= "all")
steps = [('o', over), ('u', under), ('model', model)]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)
scores = pipeline.score(X_test, y_test)
score_train = pipeline.score(X_train, y_train)
print('ROC AUC score for the combined sampling method: %.5f' % scores)
print('ROC AUC score for the combined sampling method: %.5f' % score_train)

ROC AUC score for the combined sampling method: 0.53333
ROC AUC score for the combined sampling method: 0.54815


To do: 

1) Naive Bayes
2) Randrom Forest