In [12]:
import pandas as pd
import numpy as np

# Relative path to the dataset
file_path_Alzheimer = "../Dataset/alzheimers_disease_data.csv"
file_path_Parkinson = "../Dataset/parkinsons_disease_data.csv"

# Load the dataset
data_Alzheimer = pd.read_csv(file_path_Alzheimer)
data_Parkinson = pd.read_csv(file_path_Parkinson)

print(data_Alzheimer.info()) ## Check if there's any Null Info and check variable type.
print(data_Parkinson.info()) ## Check if there's any Null Info and check variable type.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [17]:
## Verify if Data is imbalanced or balaneced

print("Alzheimer Diagnosis Distribution:\n", data_Alzheimer['Diagnosis'].value_counts())
print("Parkinson Diagnosis Distribution:\n", data_Parkinson['Diagnosis'].value_counts())

## This step is vital because if an Algorithm is trained with most of the patients having no Alzheimer, it will 
## Tend to favour training towards that aspect

## Proportion of 0s (no Alzheimer's): 64.6 %
## Proportion of 1s (Alzheimer's): 35.4 %

## Proportion of 0s (Parkinson): 61.9 %
## Proportion of 1s (no  Parkinson's): 38.1 %

## I'm not going to balance the dataset yet so I compare the results between an imbalanced data vs balanced


Alzheimer Diagnosis Distribution:
 Diagnosis
0    1389
1     760
Name: count, dtype: int64
Parkinson Diagnosis Distribution:
 Diagnosis
1    1304
0     801
Name: count, dtype: int64


In [18]:
## Cleaning Data

## Removing Education Level and DoctorInCharge Variables
## Alzheimer Dataset
del data_Alzheimer['DoctorInCharge']
del data_Alzheimer['EducationLevel']
## Parkinson Dataset
del data_Parkinson['DoctorInCharge']
del data_Parkinson['EducationLevel']

In [19]:
from sklearn.model_selection import train_test_split

X = data_Alzheimer.drop(columns=['Diagnosis'])
y = data_Alzheimer['Diagnosis']

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Why random_state = 42 ? So anyone using that random_state will get the same splitting

X_train.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1433,6184,87,1,2,27.764232,1,16.54317,0.281379,5.923418,7.836104,...,25.399206,3.085543,0,0,6.643693,0,0,0,0,0
630,5381,70,0,0,37.098744,0,1.360202,9.24299,1.819284,5.218052,...,8.292136,5.61683,0,1,3.884562,0,0,0,0,1
78,4829,82,1,3,15.908275,0,16.329031,1.915913,6.607292,6.146166,...,21.042238,3.662461,0,0,4.013722,0,0,1,1,0
366,5117,76,1,0,30.302432,1,11.81403,6.28117,6.204349,6.825155,...,28.609438,4.648135,0,0,9.3557,1,0,1,0,1
1996,6747,61,0,0,24.565357,1,2.273373,9.976581,2.057188,4.715534,...,2.629135,9.601238,1,0,8.818932,0,1,0,0,0


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the training data, and transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## We're going to use Logistic Regression because it's used for binery classification problems as having alzheimer or not.
## Commonly used in prediction and classificiation problems.

model = LogisticRegression(max_iter=1000)  # Increase max_iter if convergence warning
# Train the model
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

# Classification report
class_report = classification_report(y_test, y_pred)
print(f'Classification Report:\n{class_report}')

Accuracy: 0.8233
Confusion Matrix:
[[245  32]
 [ 44 109]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87       277
           1       0.77      0.71      0.74       153

    accuracy                           0.82       430
   macro avg       0.81      0.80      0.80       430
weighted avg       0.82      0.82      0.82       430

