In [214]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Silence pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)  # default='warn'

In [215]:
#Load Dataset
dataset = pd.read_csv('https://raw.githubusercontent.com/gmbt18/CS176-Project/cedric-preproc/Exasens.csv')

#Removing unnecessary rows and cols
# Columns at positions 9-12 and the rows labelled 0 and 1 are discarded.
# NOTE(review): rows 0-1 look like spill-over header rows in the CSV - confirm.
# Both drops return new frames instead of mutating in place.
dataset = dataset.drop(columns=dataset.columns[[9,10,11,12]]).drop(index=[0,1])

#Renaming cols
# Build a new label list and assign it as a whole. Writing into
# `dataset.columns.values[i]` pokes at the Index's backing array, which pandas
# treats as immutable, so the rename is not guaranteed to take effect.
column_names = dataset.columns.tolist()
column_names[2:6] = ["Img Min", "Img Avg", "Real Min", "Real Avg"]
dataset.columns = column_names

#Rearranging Cols
# dataset = dataset[['Gender','Age','Smoking','Diagnosis']]

#Rearranging Rows
# Renumber the remaining rows from 0 and discard the old labels in one step.
dataset = dataset.reset_index(drop=True)

In [216]:
# Preview the first five rows of the cleaned frame.
dataset.head(n=5)

Unnamed: 0,Diagnosis,ID,Img Min,Img Avg,Real Min,Real Avg,Gender,Age,Smoking
0,COPD,301-4,-320.61,-300.5635307,-495.26,-464.1719907,1.0,77.0,2.0
1,COPD,302-3,-325.39,-314.7503595,-473.73,-469.2631404,0.0,72.0,2.0
2,COPD,303-3,-323.0,-317.4360556,-476.12,-471.8976667,1.0,73.0,3.0
3,COPD,304-4,-327.78,-317.3996698,-473.73,-468.856388,1.0,76.0,2.0
4,COPD,305-4,-325.39,-316.1557853,-478.52,-472.8697828,0.0,65.0,2.0


In [217]:
# Keep only the columns at positions 0, 6, 7 and 8 (Diagnosis, Gender, Age,
# Smoking). The explicit .copy() makes ds2 an independent frame, so the
# assignments below never write through a slice of `dataset`.
ds2 = dataset.iloc[:,[0,6,7,8]].copy()

# Group Ages
# Buckets: 1 = under 40, 2 = 40 to 59, 3 = 60 and over.
# Every condition is evaluated against the original ages and the result is
# assigned back in one step. This avoids chained in-place masking and does not
# rely on the bucket codes themselves being smaller than 40.
age = ds2['Age']
ds2['Age'] = (
    age.mask(age < 40, 1)
       .mask((age >= 40) & (age < 60), 2)
       .mask(age >= 60, 3)
)

# Change diagnosis to binary
# ds2['Diagnosis'] = ds2['Diagnosis'].replace(['Asthma','Infected','HC'],'None')

# Format the columns as string categoricals to process with pd.get_dummies()
# Casting to str first keeps the float formatting in the labels, so the dummy
# columns are named e.g. 'Age_1.0' and 'Gender_0.0'.
ds2 = ds2.astype(str).astype('category')

# One-hot encode every column into '<column>_<value>' indicator columns.
ds2 = pd.get_dummies(ds2)

In [218]:
# Preview the first five rows of the one-hot encoded frame.
ds2.head(n=5)

Unnamed: 0,Diagnosis_Asthma,Diagnosis_COPD,Diagnosis_HC,Diagnosis_Infected,Gender_0.0,Gender_1.0,Age_1.0,Age_2.0,Age_3.0,Smoking_1.0,Smoking_2.0,Smoking_3.0
0,0,1,0,0,0,1,0,0,1,0,1,0
1,0,1,0,0,1,0,0,0,1,0,1,0
2,0,1,0,0,0,1,0,0,1,0,0,1
3,0,1,0,0,0,1,0,0,1,0,1,0
4,0,1,0,0,1,0,0,0,1,0,1,0


In [219]:
# Mine itemsets that appear in at least 1% of rows, then derive the rules
# whose lift is at least 1.
frequent_itemsets = apriori(ds2, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# 'Diagnosis_None' only appears when the binary-diagnosis option is enabled.
diagnoses={'Diagnosis_COPD','Diagnosis_Infected','Diagnosis_Asthma','Diagnosis_HC','Diagnosis_None'}

# Removes unusable rules
# Rules should only have the format ('attr1',...) -> ('Diagnosis')
# The filter is built from boolean masks instead of dropping rows one by one
# from a slice of `rules`. The result is the same set of rows with their
# original index labels, without chained in-place mutation.
antecedents = rules['antecedents']
consequents = rules['consequents']

# The consequent must be exactly one item, and that item must be a diagnosis.
consequent_is_one_diagnosis = consequents.map(
    lambda items: len(items) == 1 and not items.isdisjoint(diagnoses)
).astype(bool)

# No diagnosis may appear on the left-hand side.
antecedent_has_no_diagnosis = antecedents.map(
    lambda items: items.isdisjoint(diagnoses)
).astype(bool)

# if you want only 3-set items
antecedent_has_three_items = antecedents.map(len) >= 3

keep_rule = (
    consequent_is_one_diagnosis
    & antecedent_has_no_diagnosis
    & antecedent_has_three_items
)

# Columns are selected by name, not position, so a change in mlxtend's column
# layout cannot silently pick the wrong metrics.
rules_summary = rules.loc[
    keep_rule,
    ['antecedents', 'consequents', 'support', 'confidence', 'lift'],
].copy()

In [220]:
# Ten rules with the largest support, in descending order.
rules_summary.sort_values('support', ascending=False).head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
440,"(Smoking_2.0, Gender_1.0, Age_3.0)",(Diagnosis_COPD),0.115288,0.807018,4.075949
466,"(Gender_0.0, Age_1.0, Smoking_1.0)",(Diagnosis_HC),0.092732,0.578125,1.441699
499,"(Age_2.0, Gender_0.0, Smoking_1.0)",(Diagnosis_HC),0.052632,0.538462,1.342788
532,"(Age_1.0, Smoking_1.0, Gender_1.0)",(Diagnosis_HC),0.052632,0.65625,1.636523
578,"(Gender_0.0, Age_1.0, Smoking_1.0)",(Diagnosis_Infected),0.047619,0.296875,1.480664
513,"(Age_2.0, Gender_0.0, Smoking_2.0)",(Diagnosis_HC),0.042607,0.485714,1.21125
420,"(Gender_0.0, Smoking_2.0, Age_3.0)",(Diagnosis_COPD),0.032581,0.52,2.626329
366,"(Gender_0.0, Smoking_1.0, Age_3.0)",(Diagnosis_Asthma),0.030075,0.631579,3.15
486,"(Gender_0.0, Age_1.0, Smoking_3.0)",(Diagnosis_HC),0.030075,0.48,1.197
612,"(Gender_0.0, Age_2.0, Smoking_1.0)",(Diagnosis_Infected),0.027569,0.282051,1.406731


In [221]:
# Ten rules with the largest lift, in descending order.
rules_summary.sort_values('lift', ascending=False).head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
454,"(Smoking_3.0, Gender_1.0, Age_3.0)",(Diagnosis_COPD),0.010025,1.0,5.050633
440,"(Smoking_2.0, Gender_1.0, Age_3.0)",(Diagnosis_COPD),0.115288,0.807018,4.075949
430,"(Gender_0.0, Smoking_3.0, Age_3.0)",(Diagnosis_COPD),0.010025,0.666667,3.367089
366,"(Gender_0.0, Smoking_1.0, Age_3.0)",(Diagnosis_Asthma),0.030075,0.631579,3.15
420,"(Gender_0.0, Smoking_2.0, Age_3.0)",(Diagnosis_COPD),0.032581,0.52,2.626329
380,"(Gender_0.0, Smoking_2.0, Age_3.0)",(Diagnosis_Asthma),0.025063,0.4,1.995
598,"(Gender_0.0, Age_1.0, Smoking_3.0)",(Diagnosis_Infected),0.025063,0.4,1.995
592,"(Gender_0.0, Age_1.0, Smoking_2.0)",(Diagnosis_Infected),0.010025,0.333333,1.6625
402,"(Smoking_1.0, Gender_1.0, Age_3.0)",(Diagnosis_Asthma),0.010025,0.333333,1.6625
532,"(Age_1.0, Smoking_1.0, Gender_1.0)",(Diagnosis_HC),0.052632,0.65625,1.636523


In [222]:
# Ten rules with the largest confidence, in descending order.
rules_summary.sort_values('confidence', ascending=False).head(10)

Unnamed: 0,antecedents,consequents,support,confidence,lift
454,"(Smoking_3.0, Gender_1.0, Age_3.0)",(Diagnosis_COPD),0.010025,1.0,5.050633
440,"(Smoking_2.0, Gender_1.0, Age_3.0)",(Diagnosis_COPD),0.115288,0.807018,4.075949
430,"(Gender_0.0, Smoking_3.0, Age_3.0)",(Diagnosis_COPD),0.010025,0.666667,3.367089
532,"(Age_1.0, Smoking_1.0, Gender_1.0)",(Diagnosis_HC),0.052632,0.65625,1.636523
366,"(Gender_0.0, Smoking_1.0, Age_3.0)",(Diagnosis_Asthma),0.030075,0.631579,3.15
543,"(Age_1.0, Smoking_3.0, Gender_1.0)",(Diagnosis_HC),0.012531,0.625,1.558594
552,"(Age_2.0, Smoking_1.0, Gender_1.0)",(Diagnosis_HC),0.02005,0.615385,1.534615
566,"(Age_2.0, Smoking_3.0, Gender_1.0)",(Diagnosis_HC),0.017544,0.583333,1.454688
466,"(Gender_0.0, Age_1.0, Smoking_1.0)",(Diagnosis_HC),0.092732,0.578125,1.441699
499,"(Age_2.0, Gender_0.0, Smoking_1.0)",(Diagnosis_HC),0.052632,0.538462,1.342788


In [223]:
# Preview the first five unfiltered rules with all of their metrics.
rules.head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Gender_0.0),(Diagnosis_Asthma),0.601504,0.200501,0.137845,0.229167,1.142969,0.017242,1.037188
1,(Diagnosis_Asthma),(Gender_0.0),0.200501,0.601504,0.137845,0.6875,1.142969,0.017242,1.275188
2,(Age_2.0),(Diagnosis_Asthma),0.325815,0.200501,0.067669,0.207692,1.035865,0.002343,1.009076
3,(Diagnosis_Asthma),(Age_2.0),0.200501,0.325815,0.067669,0.3375,1.035865,0.002343,1.017638
4,(Age_3.0),(Diagnosis_Asthma),0.308271,0.200501,0.092732,0.300813,1.500305,0.030923,1.143469
