In [1]:
#libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Load the raw Alzheimer's dataset (path is relative to the working directory)
# and preview its first five rows as the cell output.
path = 'alzheimers_disease_data.csv'
data = pd.read_csv(filepath_or_buffer=path)
data.head(5)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [3]:
# Cleaning duplicates in the data set: keep one row per PatientID, then drop
# the identifier column because it carries no analytical signal.
# The guard makes the cell safe to re-run: once PatientID has been removed,
# a second execution would otherwise fail with a KeyError.
if 'PatientID' in data.columns:
    data = (
        data
        .drop_duplicates(subset='PatientID')
        .drop(columns='PatientID')
    )
print(data.columns)

Index(['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
       'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')


In [4]:
# Count missing values once per column and reuse the result for both reports.
null_counts = data.isnull().sum()

# Names of the columns that have at least one missing value.
print("Column containing missing values:")
print(null_counts.index[null_counts > 0].tolist())

# Full per-column tally of missing values.
print("\n\nNull values in data set:")
print(null_counts)

# Working frame that the later cells select their columns from.
df = pd.DataFrame(data)

Column containing missing values:
[]


Null values in data set:
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
Diffic

In [5]:
# Medical History Columns
# 0/1 medical-history flags plus the Diagnosis target, selected from `df`.
medical_history_columns = [
    'FamilyHistoryAlzheimers',
    'CardiovascularDisease',
    'Diabetes',
    'Depression',
    'HeadInjury',
    'Hypertension',
    'Diagnosis'
]

mh_columns = df[medical_history_columns].copy()

# Add a complementary indicator for every flag so the *absence* of a condition
# is also available as an item,
# e.g. FamilyHistoryAlzheimers -> NoFamilyHistoryAlzheimers (1 where the original is 0).
# A vectorised comparison replaces the per-row Python lambda.
for col in medical_history_columns:
    mh_columns[f'No{col}'] = mh_columns[col].eq(0).astype('int64')

# Prefix the original columns with "Yes". The mapping is built from the explicit
# column list instead of a "does not start with 'No'" test, so a source column
# whose own name happens to begin with "No" would still be renamed.
mh_columns = mh_columns.rename(
    columns={col: f'Yes{col}' for col in medical_history_columns}
)

print(mh_columns)

# Save the encoded frame to a CSV file.
output_file = "medical_history_columns.csv"
mh_columns.to_csv(output_file, index=False)


      YesFamilyHistoryAlzheimers  YesCardiovascularDisease  YesDiabetes  \
0                              0                         0            1   
1                              0                         0            0   
2                              1                         0            0   
3                              0                         0            0   
4                              0                         0            0   
...                          ...                       ...          ...   
2144                           0                         0            0   
2145                           0                         0            0   
2146                           0                         0            0   
2147                           0                         1            0   
2148                           0                         0            0   

      YesDepression  YesHeadInjury  YesHypertension  YesDiagnosis  \
0                 1           

In [6]:
# Medical History Not Diagnosed
# Restrict to patients without a diagnosis and cast the 0/1 indicators to
# booleans before mining.
mh_no = mh_columns[mh_columns['YesDiagnosis'] == 0].astype(bool)

# Apply apriori: keep itemsets present in at least 50% of these patients.
mh_no_frequent_itemsets = apriori(mh_no, min_support=0.5, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of mh_no (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
mh_no_rules = association_rules(
    mh_no_frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
    num_itemsets=len(mh_no),
)
# Keep only the rules whose single consequent is NoDiagnosis.
mh_no_rules = mh_no_rules[mh_no_rules['consequents'].apply(lambda x: 'NoDiagnosis' in x and len(x) == 1)]

# NOTE(review): NoDiagnosis is the complement of YesDiagnosis, so it is True for
# every row of mh_no. Any rule "X -> NoDiagnosis" mined from this subset
# therefore has confidence 1.0 and lift 1.0 regardless of X; this is presumably
# also what triggers the divide-by-zero RuntimeWarning in mlxtend's certainty
# metric. Only the antecedent supports are informative here -- confirm this is
# the intended analysis.

print(mh_no_frequent_itemsets)
print(mh_no_rules)

output_file_assoc_rules = "medical_history_NotDiagnosed.csv"
mh_no_rules.to_csv(output_file_assoc_rules, index=False)


     support                                           itemsets
0   0.737221                        (NoFamilyHistoryAlzheimers)
1   0.863931                          (NoCardiovascularDisease)
2   0.840893                                       (NoDiabetes)
3   0.797696                                     (NoDepression)
4   0.902808                                     (NoHeadInjury)
..       ...                                                ...
84  0.624190  (NoHypertension, NoDiagnosis, NoDepression, No...
85  0.512599  (NoHeadInjury, NoCardiovascularDisease, NoDiag...
86  0.557955  (NoHeadInjury, NoCardiovascularDisease, NoDiag...
87  0.538517  (NoHeadInjury, NoCardiovascularDisease, NoDiag...
88  0.516199  (NoHeadInjury, NoDiagnosis, NoDepression, NoDi...

[89 rows x 2 columns]
                                           antecedents    consequents  \
10                         (NoFamilyHistoryAlzheimers)  (NoDiagnosis)   
20                           (NoCardiovascularDisease)  (NoDiag

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [7]:
# Medical History Diagnosed
# Restrict to patients with a diagnosis and cast the 0/1 indicators to
# booleans before mining.
mh_yes = mh_columns[mh_columns['YesDiagnosis'] == 1].astype(bool)

# Apply apriori: keep itemsets present in at least 50% of these patients.
mh_yes_frequent_itemsets = apriori(mh_yes, min_support=0.5, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of mh_yes (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
mh_yes_rules = association_rules(
    mh_yes_frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
    num_itemsets=len(mh_yes),
)
# Keep only the rules whose single consequent is YesDiagnosis.
mh_yes_rules = mh_yes_rules[mh_yes_rules['consequents'].apply(lambda x: 'YesDiagnosis' in x and len(x) == 1)]

# NOTE(review): YesDiagnosis is True for every row of mh_yes (the filter above
# selects exactly those rows), so any rule "X -> YesDiagnosis" mined from this
# subset has confidence 1.0 and lift 1.0 regardless of X; this is presumably
# also what triggers the divide-by-zero RuntimeWarning in mlxtend's certainty
# metric. Only the antecedent supports are informative here -- confirm this is
# the intended analysis.

print(mh_yes_frequent_itemsets)
print(mh_yes_rules)

output_file_assoc_rules = "medical_history_YesDiagnosed.csv"
mh_yes_rules.to_csv(output_file_assoc_rules, index=False)


     support                                           itemsets
0   1.000000                                     (YesDiagnosis)
1   0.767105                        (NoFamilyHistoryAlzheimers)
2   0.840789                          (NoCardiovascularDisease)
3   0.864474                                       (NoDiabetes)
4   0.802632                                     (NoDepression)
..       ...                                                ...
92  0.548684  (NoHeadInjury, NoCardiovascularDisease, NoDepr...
93  0.506579  (NoCardiovascularDisease, NoDepression, YesDia...
94  0.559211  (NoHeadInjury, NoCardiovascularDisease, YesDia...
95  0.513158  (NoHeadInjury, NoCardiovascularDisease, NoDepr...
96  0.548684  (NoHeadInjury, NoDepression, YesDiagnosis, NoD...

[97 rows x 2 columns]
                                           antecedents     consequents  \
0                          (NoFamilyHistoryAlzheimers)  (YesDiagnosis)   
2                            (NoCardiovascularDisease)  (YesD

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [8]:
# All medical history columns
# Cast the 0/1 indicators to booleans for the whole cohort. The name is
# rebound on purpose so later uses of mh_columns keep seeing the boolean frame.
mh_columns = mh_columns.astype(bool)

# Apply apriori: keep itemsets present in at least 30% of all patients.
mh_columns_frequent_itemsets = apriori(mh_columns, min_support=0.3, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of mh_columns (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
mh_columns_rules = association_rules(
    mh_columns_frequent_itemsets,
    metric='confidence',
    min_threshold=0.6,
    num_itemsets=len(mh_columns),
)
# Keep only the rules whose single consequent is one of the diagnosis items.
mh_columns_rules = mh_columns_rules[mh_columns_rules['consequents'].apply(lambda x: ('NoDiagnosis' in x or 'YesDiagnosis' in x) and len(x) == 1)]

print(mh_columns_frequent_itemsets)
print(mh_columns_rules)

output_file_assoc_rules = "medical_history_All.csv"
mh_columns_rules.to_csv(output_file_assoc_rules, index=False)

      support                                           itemsets
0    0.353653                                     (YesDiagnosis)
1    0.747790                        (NoFamilyHistoryAlzheimers)
2    0.855747                          (NoCardiovascularDisease)
3    0.849232                                       (NoDiabetes)
4    0.799442                                     (NoDepression)
..        ...                                                ...
113  0.318753  (NoCardiovascularDisease, NoDiagnosis, NoDepre...
114  0.360633  (NoHeadInjury, NoCardiovascularDisease, NoDiag...
115  0.348069  (NoHeadInjury, NoCardiovascularDisease, NoDiag...
116  0.333644  (NoHeadInjury, NoDiagnosis, NoDepression, NoDi...
117  0.341554  (NoFamilyHistoryAlzheimers, NoHeadInjury, NoCa...

[118 rows x 2 columns]
                                           antecedents    consequents  \
12                         (NoFamilyHistoryAlzheimers)  (NoDiagnosis)   
22                           (NoCardiovascularDise

In [9]:
# Symptoms
# 0/1 symptom flags plus the Diagnosis target, selected from `df`.
symptom_and_assessment_columns = [
    'MemoryComplaints',
    'BehavioralProblems',
    'Confusion',
    'Disorientation',
    'PersonalityChanges',
    'DifficultyCompletingTasks',
    'Forgetfulness',
    'Diagnosis'
]

s_columns = df[symptom_and_assessment_columns].copy()

# Add a complementary indicator for every flag so the *absence* of a symptom
# is also available as an item,
# e.g. MemoryComplaints -> NoMemoryComplaints (1 where the original is 0).
# A vectorised comparison replaces the per-row Python lambda.
for col in symptom_and_assessment_columns:
    s_columns[f'No{col}'] = s_columns[col].eq(0).astype('int64')

# Prefix the original columns with "Yes". The mapping is built from the explicit
# column list instead of a "does not start with 'No'" test, so a source column
# whose own name happens to begin with "No" would still be renamed.
s_columns = s_columns.rename(
    columns={col: f'Yes{col}' for col in symptom_and_assessment_columns}
)

print(s_columns)

# Save the encoded frame to a CSV file.
output_file = "symptom_and_assessment_columns.csv"
s_columns.to_csv(output_file, index=False)

      YesMemoryComplaints  YesBehavioralProblems  YesConfusion  \
0                       0                      0             0   
1                       0                      0             0   
2                       0                      0             0   
3                       0                      1             0   
4                       0                      0             0   
...                   ...                    ...           ...   
2144                    0                      0             1   
2145                    0                      1             0   
2146                    0                      0             0   
2147                    0                      0             0   
2148                    0                      1             0   

      YesDisorientation  YesPersonalityChanges  YesDifficultyCompletingTasks  \
0                     0                      0                             1   
1                     0                      0 

In [10]:
# Symptoms Not Diagnosed
# Restrict to patients without a diagnosis and cast the 0/1 indicators to
# booleans before mining.
s_no = s_columns[s_columns['YesDiagnosis'] == 0].astype(bool)

# Apply apriori: keep itemsets present in at least 50% of these patients.
s_no_frequent_itemsets = apriori(s_no, min_support=0.5, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of s_no (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
s_no_rules = association_rules(
    s_no_frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
    num_itemsets=len(s_no),
)
# Keep only the rules whose single consequent is NoDiagnosis.
s_no_rules = s_no_rules[s_no_rules['consequents'].apply(lambda x: 'NoDiagnosis' in x and len(x) == 1)]

# NOTE(review): NoDiagnosis is the complement of YesDiagnosis, so it is True for
# every row of s_no. Any rule "X -> NoDiagnosis" mined from this subset
# therefore has confidence 1.0 and lift 1.0 regardless of X; this is presumably
# also what triggers the divide-by-zero RuntimeWarning in mlxtend's certainty
# metric. Only the antecedent supports are informative here -- confirm this is
# the intended analysis.

print(s_no_frequent_itemsets)
print(s_no_rules)

output_file_assoc_rules = "symptom_NotDiagnosed.csv"
s_no_rules.to_csv(output_file_assoc_rules, index=False)

      support                                           itemsets
0    0.884089                               (NoMemoryComplaints)
1    0.903528                             (NoBehavioralProblems)
2    0.789057                                      (NoConfusion)
3    0.835133                                 (NoDisorientation)
4    0.843772                             (NoPersonalityChanges)
..        ...                                                ...
130  0.502520  (NoDifficultyCompletingTasks, NoConfusion, NoM...
131  0.506120  (NoDifficultyCompletingTasks, NoConfusion, NoM...
132  0.532757  (NoDifficultyCompletingTasks, NoMemoryComplain...
133  0.504680  (NoDifficultyCompletingTasks, NoConfusion, NoB...
134  0.534917  (NoDifficultyCompletingTasks, NoBehavioralProb...

[135 rows x 2 columns]
                                           antecedents    consequents  \
12                                (NoMemoryComplaints)  (NoDiagnosis)   
23                              (NoBehavioralProbl

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [11]:
# Symptoms Diagnosed
# Restrict to patients with a diagnosis and cast the 0/1 indicators to
# booleans before mining.
s_yes = s_columns[s_columns['YesDiagnosis'] == 1].astype(bool)

# Apply apriori: keep itemsets present in at least 50% of these patients.
s_yes_frequent_itemsets = apriori(s_yes, min_support=0.5, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of s_yes (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
s_yes_rules = association_rules(
    s_yes_frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
    num_itemsets=len(s_yes),
)
# Keep only the rules whose single consequent is YesDiagnosis.
s_yes_rules = s_yes_rules[s_yes_rules['consequents'].apply(lambda x: 'YesDiagnosis' in x and len(x) == 1)]

# NOTE(review): YesDiagnosis is True for every row of s_yes (the filter above
# selects exactly those rows), so any rule "X -> YesDiagnosis" mined from this
# subset has confidence 1.0 and lift 1.0 regardless of X; this is presumably
# also what triggers the divide-by-zero RuntimeWarning in mlxtend's certainty
# metric. Only the antecedent supports are informative here -- confirm this is
# the intended analysis.

print(s_yes_rules)

output_file_assoc_rules = "symptom_YesDiagnosed.csv"
s_yes_rules.to_csv(output_file_assoc_rules, index=False)

                                           antecedents     consequents  \
0                                 (NoMemoryComplaints)  (YesDiagnosis)   
1                               (NoBehavioralProblems)  (YesDiagnosis)   
4                                        (NoConfusion)  (YesDiagnosis)   
5                                   (NoDisorientation)  (YesDiagnosis)   
8                               (NoPersonalityChanges)  (YesDiagnosis)   
9                        (NoDifficultyCompletingTasks)  (YesDiagnosis)   
11                                   (NoForgetfulness)  (YesDiagnosis)   
41              (NoDisorientation, NoMemoryComplaints)  (YesDiagnosis)   
45          (NoMemoryComplaints, NoPersonalityChanges)  (YesDiagnosis)   
47   (NoDifficultyCompletingTasks, NoMemoryComplaints)  (YesDiagnosis)   
51                 (NoBehavioralProblems, NoConfusion)  (YesDiagnosis)   
55            (NoBehavioralProblems, NoDisorientation)  (YesDiagnosis)   
61        (NoBehavioralProblems, NoPer

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [12]:
# Symptoms All
# Cast the 0/1 indicators to booleans for the whole cohort. The name is
# rebound on purpose so later uses of s_columns keep seeing the boolean frame.
s_columns = s_columns.astype(bool)

# Apply apriori: keep itemsets present in at least 30% of all patients.
s_columns_frequent_itemsets = apriori(s_columns, min_support=0.3, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of s_columns (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
s_columns_rules = association_rules(
    s_columns_frequent_itemsets,
    metric='confidence',
    min_threshold=0.6,
    num_itemsets=len(s_columns),
)
# Keep only the rules whose single consequent is one of the diagnosis items.
s_columns_rules = s_columns_rules[s_columns_rules['consequents'].apply(lambda x: ('NoDiagnosis' in x or 'YesDiagnosis' in x) and len(x) == 1)]

print(s_columns_frequent_itemsets)
print(s_columns_rules)

output_file_assoc_rules = "symptom_All.csv"
s_columns_rules.to_csv(output_file_assoc_rules, index=False)


      support                                           itemsets
0    0.301536                                 (YesForgetfulness)
1    0.353653                                     (YesDiagnosis)
2    0.791996                               (NoMemoryComplaints)
3    0.843183                             (NoBehavioralProblems)
4    0.794788                                      (NoConfusion)
..        ...                                                ...
200  0.345742  (NoDifficultyCompletingTasks, NoBehavioralProb...
201  0.328060  (NoDifficultyCompletingTasks, NoForgetfulness,...
202  0.306189  (NoDifficultyCompletingTasks, NoConfusion, NoD...
203  0.315961  (NoDifficultyCompletingTasks, NoConfusion, NoM...
204  0.308050  (NoDifficultyCompletingTasks, NoMemoryComplain...

[205 rows x 2 columns]
                                            antecedents    consequents  \
15                                 (NoMemoryComplaints)  (NoDiagnosis)   
26                               (NoBehavioralPr

In [13]:
# All 0/1 columns: medical-history flags, symptom flags and the Diagnosis
# target, selected from `df`.
yesno_columns = [
    'FamilyHistoryAlzheimers',
    'CardiovascularDisease',
    'Diabetes',
    'Depression',
    'HeadInjury',
    'Hypertension',
    'MemoryComplaints',
    'BehavioralProblems',
    'Confusion',
    'Disorientation',
    'PersonalityChanges',
    'DifficultyCompletingTasks',
    'Forgetfulness',
    'Diagnosis'
]

yn_columns = df[yesno_columns].copy()

# Add a complementary indicator for every flag so the *absence* of a condition
# is also available as an item,
# e.g. Diabetes -> NoDiabetes (1 where the original is 0).
# A vectorised comparison replaces the per-row Python lambda.
for col in yesno_columns:
    yn_columns[f'No{col}'] = yn_columns[col].eq(0).astype('int64')

# Prefix the original columns with "Yes". The mapping is built from the explicit
# column list instead of a "does not start with 'No'" test, so a source column
# whose own name happens to begin with "No" would still be renamed.
yn_columns = yn_columns.rename(
    columns={col: f'Yes{col}' for col in yesno_columns}
)

print(yn_columns)

# Save the encoded frame to a CSV file.
output_file = "alzheimers_categorical_columns.csv"
yn_columns.to_csv(output_file, index=False)

      YesFamilyHistoryAlzheimers  YesCardiovascularDisease  YesDiabetes  \
0                              0                         0            1   
1                              0                         0            0   
2                              1                         0            0   
3                              0                         0            0   
4                              0                         0            0   
...                          ...                       ...          ...   
2144                           0                         0            0   
2145                           0                         0            0   
2146                           0                         0            0   
2147                           0                         1            0   
2148                           0                         0            0   

      YesDepression  YesHeadInjury  YesHypertension  YesMemoryComplaints  \
0                 1    

In [14]:
# Categorical Columns Not Diagnosed
# Restrict to patients without a diagnosis and cast the 0/1 indicators to
# booleans before mining.
yn_no = yn_columns[yn_columns['YesDiagnosis'] == 0].astype(bool)

# Apply apriori: keep itemsets present in at least 50% of these patients.
yn_no_frequent_itemsets = apriori(yn_no, min_support=0.5, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of yn_no (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
yn_no_rules = association_rules(
    yn_no_frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
    num_itemsets=len(yn_no),
)
# Keep only the rules whose single consequent is NoDiagnosis.
yn_no_rules = yn_no_rules[yn_no_rules['consequents'].apply(lambda x: 'NoDiagnosis' in x and len(x) == 1)]

# NOTE(review): NoDiagnosis is the complement of YesDiagnosis, so it is True for
# every row of yn_no. Any rule "X -> NoDiagnosis" mined from this subset
# therefore has confidence 1.0 and lift 1.0 regardless of X; this is presumably
# also what triggers the divide-by-zero RuntimeWarning in mlxtend's certainty
# metric. Only the antecedent supports are informative here -- confirm this is
# the intended analysis.

print(yn_no_rules)

output_file_assoc_rules = "alzheimers_categorical_NotDiagnosed.csv"
yn_no_rules.to_csv(output_file_assoc_rules, index=False)

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


                                             antecedents    consequents  \
24                           (NoFamilyHistoryAlzheimers)  (NoDiagnosis)   
47                             (NoCardiovascularDisease)  (NoDiagnosis)   
69                                          (NoDiabetes)  (NoDiagnosis)   
89                                        (NoDepression)  (NoDiagnosis)   
106                                       (NoHeadInjury)  (NoDiagnosis)   
...                                                  ...            ...   
12049  (NoMemoryComplaints, NoHeadInjury, NoBehaviora...  (NoDiagnosis)   
12081  (NoDifficultyCompletingTasks, NoMemoryComplain...  (NoDiagnosis)   
12115  (NoMemoryComplaints, NoHeadInjury, NoBehaviora...  (NoDiagnosis)   
12145  (NoDifficultyCompletingTasks, NoMemoryComplain...  (NoDiagnosis)   
12178  (NoDifficultyCompletingTasks, NoMemoryComplain...  (NoDiagnosis)   

       antecedent support  consequent support   support  confidence  lift  \
24               0.737

In [15]:
# Categorical columns Diagnosed
# Restrict to patients with a diagnosis and cast the 0/1 indicators to
# booleans before mining.
yn_yes = yn_columns[yn_columns['YesDiagnosis'] == 1].astype(bool)

# Apply apriori: keep itemsets present in at least 50% of these patients.
yn_yes_frequent_itemsets = apriori(yn_yes, min_support=0.5, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of yn_yes (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
yn_yes_rules = association_rules(
    yn_yes_frequent_itemsets,
    metric='confidence',
    min_threshold=0.7,
    num_itemsets=len(yn_yes),
)
# Keep only the rules whose single consequent is YesDiagnosis.
yn_yes_rules = yn_yes_rules[yn_yes_rules['consequents'].apply(lambda x: 'YesDiagnosis' in x and len(x) == 1)]

# NOTE(review): YesDiagnosis is True for every row of yn_yes (the filter above
# selects exactly those rows), so any rule "X -> YesDiagnosis" mined from this
# subset has confidence 1.0 and lift 1.0 regardless of X; this is presumably
# also what triggers the divide-by-zero RuntimeWarning in mlxtend's certainty
# metric. Only the antecedent supports are informative here -- confirm this is
# the intended analysis.

print(yn_yes_rules)

output_file_assoc_rules = "alzheimers_categorical_YesDiagnosed.csv"

yn_yes_rules.to_csv(output_file_assoc_rules, index=False)


                                            antecedents     consequents  \
0                           (NoFamilyHistoryAlzheimers)  (YesDiagnosis)   
2                             (NoCardiovascularDisease)  (YesDiagnosis)   
5                                          (NoDiabetes)  (YesDiagnosis)   
7                                        (NoDepression)  (YesDiagnosis)   
9                                        (NoHeadInjury)  (YesDiagnosis)   
...                                                 ...             ...   
5436  (NoDisorientation, NoConfusion, NoPersonalityC...  (YesDiagnosis)   
5453  (NoDifficultyCompletingTasks, NoDisorientation...  (YesDiagnosis)   
5467  (NoDifficultyCompletingTasks, NoConfusion, NoP...  (YesDiagnosis)   
5485  (NoDifficultyCompletingTasks, NoDisorientation...  (YesDiagnosis)   
5507  (NoDifficultyCompletingTasks, NoDisorientation...  (YesDiagnosis)   

      antecedent support  consequent support   support  confidence  lift  \
0               0.76710

  cert_metric = np.where(certainty_denom == 0, 0, certainty_num / certainty_denom)


In [19]:
# Categorical columns All
# Cast the 0/1 indicators to booleans for the whole cohort. The name is
# rebound on purpose so later uses of yn_columns keep seeing the boolean frame.
yn_columns = yn_columns.astype(bool)

# Apply apriori: keep itemsets present in at least 30% of all patients.
yn_columns_frequent_itemsets = apriori(yn_columns, min_support=0.3, use_colnames=True)

# Generate rules
# num_itemsets is documented as the number of transactions in the original
# input data, so pass the row count of yn_columns (not the number of frequent
# itemsets). With no missing values this should not alter the reported metrics.
yn_columns_rules = association_rules(
    yn_columns_frequent_itemsets,
    metric='confidence',
    min_threshold=0.5,
    num_itemsets=len(yn_columns),
)
# Keep only the rules whose single consequent is one of the diagnosis items.
yn_columns_rules = yn_columns_rules[yn_columns_rules['consequents'].apply(lambda x: ('NoDiagnosis' in x or 'YesDiagnosis' in x) and len(x) == 1)]

print(yn_columns_rules)

output_file_assoc_rules = "alzheimers_categorical_All.csv"
yn_columns_rules.to_csv(output_file_assoc_rules, index=False)

                                             antecedents    consequents  \
28                           (NoFamilyHistoryAlzheimers)  (NoDiagnosis)   
52                             (NoCardiovascularDisease)  (NoDiagnosis)   
75                                          (NoDiabetes)  (NoDiagnosis)   
95                                        (NoDepression)  (NoDiagnosis)   
113                                       (NoHeadInjury)  (NoDiagnosis)   
...                                                  ...            ...   
98871  (NoDifficultyCompletingTasks, NoHeadInjury, No...  (NoDiagnosis)   
99083  (NoMemoryComplaints, NoBehavioralProblems, NoP...  (NoDiagnosis)   
99128  (NoDifficultyCompletingTasks, NoMemoryComplain...  (NoDiagnosis)   
99174  (NoDifficultyCompletingTasks, NoMemoryComplain...  (NoDiagnosis)   
99350  (NoDifficultyCompletingTasks, NoMemoryComplain...  (NoDiagnosis)   

       antecedent support  consequent support   support  confidence      lift  \
28               0