In [100]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import graphviz 

In [99]:
# TODO: Check for class imbalance in the target so that the prediction model can be trained and evaluated correctly

In [79]:
# Load the raw clinical dataset (semicolon-separated) and discard the five
# columns listed below before any preprocessing takes place.
# NOTE(review): these look like the individual criteria from which the `fried`
# label is derived -- confirm that avoiding label leakage is why they are dropped.
data = pd.read_csv('../data/raw/clinical_dataset.csv', sep=';')
data = data.drop(columns=[
    'weight_loss',
    'exhaustion_score',
    'gait_speed_slower',
    'grip_strength_abnormal',
    'low_physical_activity',
])

## Remove outliers (replace sentinel values with NaN)

In [80]:
# Turn the dataset's "missing measurement" sentinels into real NaN values so
# the imputation step can find them:
#   * text columns    -> 'Test not adequate' / 'test non realizable'
#   * numeric columns -> 999
# Boolean columns carry no sentinel and are left untouched.
#
# Each cleaned column is assigned back to the frame. The previous
# `data[col].replace(..., inplace=True)` mutated a temporary Series, which
# pandas warns about and which has no effect once Copy-on-Write is enabled.
TEXT_SENTINELS = ['Test not adequate', 'test non realizable']
NUMERIC_SENTINEL = 999

for column in data.columns:
    series = data[column]
    if pd.api.types.is_bool_dtype(series):
        continue
    if pd.api.types.is_numeric_dtype(series):
        # Integer columns that contain the sentinel become float (NaN needs it).
        data[column] = series.replace(NUMERIC_SENTINEL, np.nan)
    else:
        data[column] = series.replace(TEXT_SENTINELS, np.nan)

# `filled_data` is only another name for `data` (same object, not a copy);
# it is kept so that any code still referring to it keeps working.
filled_data = data


In [81]:
# Impute missing values, column by column.
# TODO: Remove entries of features with missing values
# TODO: Remove features with many missing values
# Find rows where there is more than 1 null value

# Results are assigned back to the frame instead of calling
# `data[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# which pandas warns about and which does nothing under Copy-on-Write.
for column in data.columns:
    series = data[column]
    is_numeric = (pd.api.types.is_numeric_dtype(series)
                  and not pd.api.types.is_bool_dtype(series))

    if is_numeric:
        # Replace NaN values with the median of the column.
        fill_value = series.median()
    else:
        # Replace missing categorical values with the most frequent value.
        counts = series.value_counts()
        if counts.empty:
            # Column holds nothing but NaN: there is no "most frequent" value.
            continue
        fill_value = counts.index[0]

    data[column] = series.fillna(fill_value)


In [82]:
# Explicit category order for every categorical column: a value's integer code
# is its position in the list (e.g. 'Non frail' -> 0, 'Pre-frail' -> 1,
# 'Frail' -> 2).
ordered_categories = {
    'fried': ['Non frail', 'Pre-frail', 'Frail'],
    'gender': ['F', 'M'],
    'gait_optional_binary': [True, False],
    'ortho_hypotension': ['No', 'Yes'],
    'vision': ['Sees well', 'Sees moderately','Sees poorly'  ],
    'audition': ['Hears well', 'Hears moderately', 'Hears poorly' ],
    'weight_loss': ['No', 'Yes'],
    'balance_single': ['>5 sec', '<5 sec'],
    'gait_speed_slower': ['No', 'Yes'],
    'grip_strength_abnormal': ['No', 'Yes'],
    'low_physical_activity': ['No', 'Yes'],
    'memory_complain': ['No', 'Yes'],
    'sleep': ['No sleep problem', 'Occasional sleep problem', 'Permanent sleep problem'],
    'living_alone': ['No', 'Yes'],
    'leisure_club': ['Yes', 'No'],
    'house_suitable_participant': ['Yes', 'No'],
    'house_suitable_professional': ['Yes', 'No'],
    'health_rate': [  '5 - Excellent', '4 - Good', '3 - Medium', '2 - Bad', '1 - Very bad'],
    'health_rate_comparison': [ '5 - A lot better', '4 - A little better', '3 - About the same', '2 - A little worse',
  '1 - A lot worse'],
    'activity_regular': ['> 5 h per week', '> 2 h and < 5 h per week', '< 2 h per week',  'No'],
    'smoking': ['Never smoked', 'Past smoker (stopped at least 6 months)', 'Current smoker']    
}

# Encode every non-numeric (text or boolean) column with its ordered codes.
# pd.Categorical does the value -> position lookup directly. This replaces the
# earlier approach of overwriting LabelEncoder.classes_ after fit (undocumented)
# and the `np.object` / `np.bool` aliases, which no longer exist in NumPy >= 1.24.
for column in data.columns:
    series = data[column]
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        continue  # genuine numeric feature, nothing to encode

    if column not in ordered_categories:
        raise KeyError(f"No category order defined for column {column!r}")
    categories = ordered_categories[column]

    codes = pd.Categorical(series, categories=categories, ordered=True).codes
    if (codes == -1).any():
        # -1 marks a value (or NaN) that is absent from the declared categories.
        unknown = sorted(set(series[codes == -1].unique()), key=str)
        raise ValueError(
            f"Column {column!r} contains values without a defined order: {unknown}"
        )
    data[column] = codes.astype(np.int64)

# Persist the fully numeric frame for later stages.
data.to_csv('../data/preprocessed/preprocessed.csv', sep=';')

In [85]:
# Hold out the trailing rows as the test set: the first ceil(80%) rows, taken
# in their existing order (no shuffling), form the training set.
data_shape = len(data)
splitter = int(np.ceil(data_shape * 0.8))
train_data, test_data = data.iloc[:splitter], data.iloc[splitter:]

In [93]:
# Preview the first rows of the held-out test split (already encoded as numbers).
test_data.head()

Unnamed: 0,part_id,fried,gender,age,hospitalization_one_year,hospitalization_three_years,ortho_hypotension,vision,audition,raise_chair_time,...,health_rate_comparison,pain_perception,activity_regular,smoking,alcohol_units,katz_index,iadl_grade,comorbidities_count,comorbidities_significant_count,medication_count
432,3081,1,0,73,0,0.0,0,0,0,9.15,...,0,3.0,1,0,0.0,5.5,31.0,6,0,2
433,3082,0,0,74,0,0.0,0,0,0,9.47,...,2,0.2,0,0,7.5,5.5,28.0,1,0,1
434,3083,2,1,83,0,0.0,0,0,1,23.47,...,3,5.3,1,0,8.0,6.0,30.0,5,1,6
435,3084,2,0,76,0,0.0,1,0,0,17.06,...,2,4.2,0,1,0.0,6.0,31.0,5,0,2
436,3085,1,0,79,0,0.0,0,0,0,11.22,...,2,0.0,0,1,10.5,6.0,28.0,8,0,8


### Train

In [94]:
# Model: decision tree
# `fried` is the target; `part_id` is dropped as well so it cannot be used as
# a feature (presumably it is only a participant identifier -- confirm).
X = train_data.drop(['fried', 'part_id'], axis=1)

# scikit-learn permutes the candidate features at every split, so without a
# fixed seed two runs can grow different trees. Pinning `random_state` keeps
# the model and the evaluation below reproducible on Restart & Run All.
tree_classifier = tree.DecisionTreeClassifier(random_state=42)
tree_classifier.fit(X, train_data['fried'])

DecisionTreeClassifier()

In [75]:
# import graphviz 
# print(len(train_columns))
# dot_data = tree.export_graphviz(tree_classifier, 
#                                 out_file=None,
#                                 filled=True, 
#                                 rounded=True,  
#                                 special_characters=True,
#                                feature_names =X.columns) 
# graph = graphviz.Source(dot_data)
# graph

## Prediction

In [95]:
# Build the test feature matrix with the same two columns removed as for
# training (target and id), then predict a class for every held-out row.
test_without_id = test_data.drop(columns=['fried', 'part_id'])
y_predict = tree_classifier.predict(test_without_id)

### Evaluation

In [105]:
# Put participant id, true label and predicted label side by side.
results = pd.DataFrame({
    'id': test_data['part_id'],
    'Actual': test_data['fried'],
    'predicted': y_predict,
})

# Confusion matrix normalised over all test samples (its entries sum to 1):
# rows are the actual classes, columns the predicted classes.
result = confusion_matrix(
    y_true=results['Actual'],
    y_pred=results['predicted'],
    normalize='all',
)
print(result)

[[0.47222222 0.12962963 0.01851852]
 [0.16666667 0.08333333 0.08333333]
 [0.00925926 0.03703704 0.        ]]
