In [68]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay



In [69]:
alz = pd.read_csv("alzheimers_disease_data.csv")
alz.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [70]:
alz.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
       'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
       'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP',
       'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')

In [71]:
alz = alz.drop(columns = ['PatientID', 'Ethnicity', 'DoctorInCharge'])
alz.describe()

Unnamed: 0,Age,Gender,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryAlzheimers,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,74.908795,0.506282,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,7.051081,0.25221,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,8.990221,0.500077,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,1.763573,0.434382,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,60.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,4.002629,0.0,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,67.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,5.482997,0.0,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,75.0,1.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,7.115646,0.0,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,83.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,8.562521,1.0,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,90.0,1.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,9.99984,1.0,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [72]:
alz['EducationLevel'].value_counts()

EducationLevel
1    854
2    636
0    446
3    213
Name: count, dtype: int64

In [73]:
alz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 32 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        2149 non-null   int64  
 1   Gender                     2149 non-null   int64  
 2   EducationLevel             2149 non-null   int64  
 3   BMI                        2149 non-null   float64
 4   Smoking                    2149 non-null   int64  
 5   AlcoholConsumption         2149 non-null   float64
 6   PhysicalActivity           2149 non-null   float64
 7   DietQuality                2149 non-null   float64
 8   SleepQuality               2149 non-null   float64
 9   FamilyHistoryAlzheimers    2149 non-null   int64  
 10  CardiovascularDisease      2149 non-null   int64  
 11  Diabetes                   2149 non-null   int64  
 12  Depression                 2149 non-null   int64  
 13  HeadInjury                 2149 non-null   int64

In [74]:
alz.isnull().sum().sum()

np.int64(0)

In [75]:
cat_col = alz.select_dtypes(include = 'object')
cat_col.columns

Index([], dtype='object')

In [76]:
num_col = alz.select_dtypes(include = ['int64', 'float64'])
num_col.columns

Index(['Age', 'Gender', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
       'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis'],
      dtype='object')

In [77]:
num_col = alz.drop(columns = ['Diagnosis'])

In [78]:
categorical_features = []

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ))
])

In [79]:

numerical_features = ['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
       'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [80]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

In [81]:
final_columns = categorical_features + numerical_features + ['Diagnosis']
alz_preprocessed = pd.DataFrame(alz, columns = final_columns)
alz_preprocessed.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,73,0,,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,89,0,,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,73,0,,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,74,1,,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,89,0,,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,6.045039,0,0,0.014691,0,0,1,1,0,0


# Linear Regression

In [82]:

lr = LogisticRegression()

model_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr)
])

model_lr

In [83]:
X = alz_preprocessed.drop(columns='Diagnosis', axis = 1)
y = alz_preprocessed['Diagnosis']

X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
model_lr.fit(X_train,y_train)



In [85]:
result = model_lr.predict(X_test)



In [86]:
cof_metrix = confusion_matrix(y_test, result)
cm = cof_metrix.ravel()
tp = cm[0]
fp = cm[1]
fn = cm[2]
tn = cm[3]
print('True Positive:', tp)
print('False Positive:', fp)
print('False Negative:', fn)
print('True Negative:', tn)

True Positive: 248
False Positive: 29
False Negative: 44
True Negative: 109


In [87]:
accuracy_score_value = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp / (tp+fn)
f1_score = 2 * ((precision*recall)/ (precision+recall))
print('Accuracy Score:', accuracy_score_value)
print('Precision:', precision)
print('Recall:', recall)
print('f1 Score:', f1_score)

Accuracy Score: 0.8302325581395349
Precision: 0.8953068592057761
Recall: 0.8493150684931506
f1 Score: 0.8717047451669595


# Gradient Boosting Classifier

In [88]:
xgb = GradientBoostingClassifier(n_estimators=100, max_depth=3)

model_gbx = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb)
])

model_gbx

In [89]:
X = alz_preprocessed.drop(columns='Diagnosis', axis = 1)
y = alz_preprocessed['Diagnosis']

X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [90]:
model_gbx.fit(X_train,y_train)



In [91]:
result = model_gbx.predict(X_test)



In [92]:
xgb_cof_metrix = confusion_matrix(y_test, result)
xgb_cm = xgb_cof_metrix.ravel()
tp = xgb_cm[0]
fp = xgb_cm[1]
fn = xgb_cm[2]
tn = xgb_cm[3]
print('True Positive:', tp)
print('False Positive:', fp)
print('False Negative:', fn)
print('True Negative:', tn)

True Positive: 270
False Positive: 7
False Negative: 12
True Negative: 141


In [93]:
xgb_accuracy_score_value = (tp+tn)/(tp+tn+fp+fn)
xgb_precision = tp/(tp+fp)
xgb_recall = tp / (tp+fn)
xgb_f1_score = 2 * ((xgb_precision*xgb_recall)/ (xgb_precision+xgb_recall))
print('Accuracy Score:', xgb_accuracy_score_value)
print('Precision:', xgb_precision)
print('Recall:', xgb_recall)
print('f1 Score:', xgb_f1_score)

Accuracy Score: 0.9558139534883721
Precision: 0.9747292418772563
Recall: 0.9574468085106383
f1 Score: 0.9660107334525939


##### By comparing the accuracy score, precision, recall, and f1 scores, Gradient Boosting Classifier does a better job than Linear Regression. Therefore, the model will be built upon the Gradient Boosting Classifier.