In [4]:
!gdown 1YAQHOBSmpDfwYeFq1dMQv94DyYPkp7Xc

zsh:1: command not found: gdown


In [5]:
import pandas as pd
import numpy as np
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
data = pd.read_csv('datasets/alzheimer.csv')

In [7]:
data

Unnamed: 0,PatientID,Domain,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
0,P000001,Orthopedics,62.0,28.09,69.0,115.0,206.0,82.78,59.68,High Risk,No,Stable
1,P000002,Orthopedics,19.0,22.83,78.0,116.0,196.0,76.00,60.13,Low Risk,No,Improved
2,P000003,Pediatrics,18.0,27.40,71.0,116.0,200.0,58.52,68.76,Low Risk,Yes,Deteriorated
3,P000004,Pediatrics,47.0,27.81,78.0,105.0,176.0,83.56,70.71,Medium Risk,Yes,Improved
4,P000005,Neurology,38.0,17.92,69.0,113.0,208.0,74.45,76.36,High Risk,No,Stable
...,...,...,...,...,...,...,...,...,...,...,...,...
29995,P029996,Cardiology,30.0,29.28,65.0,138.0,286.0,80.59,81.70,Medium Risk,No,Improved
29996,P029997,Orthopedics,42.0,34.00,68.0,117.0,190.0,62.90,69.58,Low Risk,No,Improved
29997,P029998,Pediatrics,69.0,22.87,59.0,107.0,212.0,82.72,50.81,Low Risk,No,Stable
29998,P029999,Pediatrics,42.0,29.74,74.0,115.0,191.0,74.63,63.08,High Risk,No,Improved


In [8]:
data.isna().sum()

PatientID                    0
Domain                     200
Age                        200
BMI                        200
HeartRate                   50
BloodPressure              200
Cholesterol                200
CognitiveTestScore          50
StressMobilityTestScore    200
RiskLevel                    0
FamilyHistoryDementia        0
TreatmentResponse            0
dtype: int64

In [9]:
def drop_data(data:pd.DataFrame, columns:list):
  data_copy = data.copy()
  data_copy.drop(columns, axis=1, inplace=True)
  return data_copy

In [10]:
def simple_impute_data(data:pd.DataFrame, columns:list, strategy:str):
  imputer = SimpleImputer(strategy=strategy)
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = imputer.fit_transform(data_copy[[column]])
  return data_copy

In [11]:
def label_data(data:pd.DataFrame, columns:list):
  encoder = LabelEncoder()
  data_copy = data.copy()

  for column in columns:
    data_copy[column] = encoder.fit_transform(data_copy[[column]].astype(str).values.ravel())

    # if 'nan' in encoder.classes_:
    #   data_copy.loc[data_copy[column] == -1, column] = np.nan
  return data_copy

In [12]:
# def knn_impute_data(data:pd.DataFrame, columns:list, n_neighbors:int, weights:str):
#   imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
#   data_copy = data.copy()

#   for column in columns:
#     data_copy[column] = imputer.fit_transform(data_copy[column].to_numpy().reshape(-1, 1))
#   return data_copy

In [13]:
data = drop_data(data=data, columns=['PatientID', 'Domain'])

data = simple_impute_data(data=data, columns=['Age', 'BMI', 'HeartRate', 'BloodPressure', 'Cholesterol', 'CognitiveTestScore', 'StressMobilityTestScore'], strategy='mean')
data = label_data(data=data, columns=['RiskLevel', 'FamilyHistoryDementia', 'TreatmentResponse'])

In [14]:
data.sample(5)

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,RiskLevel,FamilyHistoryDementia,TreatmentResponse
6318,75.0,33.24,95.0,122.0,216.0,60.83,73.54,0,1,2
5218,76.0,33.64,70.0,126.0,170.0,89.32,59.19,2,0,1
2507,37.0,22.43,92.0,118.0,172.0,92.16,62.33,2,0,1
18222,87.0,16.24,74.0,141.0,213.0,74.3,73.71,1,1,1
14273,75.0,28.83,51.0,99.0,148.0,76.31,77.09,2,1,0


In [15]:
data.isna().sum()

Age                        0
BMI                        0
HeartRate                  0
BloodPressure              0
Cholesterol                0
CognitiveTestScore         0
StressMobilityTestScore    0
RiskLevel                  0
FamilyHistoryDementia      0
TreatmentResponse          0
dtype: int64

In [16]:
input_data = data.copy()
input_data = drop_data(data=input_data, columns=['RiskLevel'])
input_data.sample(3)

Unnamed: 0,Age,BMI,HeartRate,BloodPressure,Cholesterol,CognitiveTestScore,StressMobilityTestScore,FamilyHistoryDementia,TreatmentResponse
5970,21.0,31.03,72.0,95.0,182.0,69.45,71.83,1,0
1281,83.0,24.86,50.0,151.0,153.0,56.95,62.73,0,1
13854,89.0,25.4,67.0,140.0,213.0,68.03,70.46,0,0


In [17]:
target_data = data.copy()['RiskLevel']
target_data.sample(3)

24179    2
9783     1
7921     2
Name: RiskLevel, dtype: int64

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(input_data, target_data, test_size=0.2)

In [19]:
model = XGBClassifier(max_depth=5, min_child_weight=1, n_estimators=100, n_jobs=-1, learning_rate=0.1)
model.fit(X_train, Y_train)

y_pred = model.predict(X_test)

In [20]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.333
[[542 795 574]
 [575 840 687]
 [583 788 616]]
              precision    recall  f1-score   support

           0       0.32      0.28      0.30      1911
           1       0.35      0.40      0.37      2102
           2       0.33      0.31      0.32      1987

    accuracy                           0.33      6000
   macro avg       0.33      0.33      0.33      6000
weighted avg       0.33      0.33      0.33      6000



In [21]:
classifier = DecisionTreeClassifier(max_depth=3)
classifier = classifier.fit(X_train,Y_train)

y_pred = classifier.predict(X_test)

In [22]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

Accuracy: 0.3455
[[ 816  951  144]
 [ 830 1133  139]
 [ 838 1025  124]]
              precision    recall  f1-score   support

           0       0.33      0.43      0.37      1911
           1       0.36      0.54      0.43      2102
           2       0.30      0.06      0.10      1987

    accuracy                           0.35      6000
   macro avg       0.33      0.34      0.30      6000
weighted avg       0.33      0.35      0.30      6000

