In [1]:
# I found a dataset on Kaggle showing 2392 patients tested in the context of asthma indicators. The link:

# https://www.kaggle.com/datasets/rabieelkharoua/asthma-disease-dataset/data

# It is important to add that there is a misleading intruction on Kaggle's dataset's description. It says that all the patients are diagnosed with asthma, while the vast majority has value "No" in the variable Diagnose. For comparisonal purposes, we will split the dataset in two groups - 
# asthmatic and non-asthmatic minority, later on in the analysis.

In [2]:
# The purpose of this project is to get insight about the popularity of different traits combinations among asthmatic people and bad habits combinations in the general population. By general population, I mean the whole dataset (I will create asthmatic subset later on).

In [3]:
astma <- read.csv('asthma_disease_data.csv')

In [4]:
head(astma)

Unnamed: 0_level_0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,⋯,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
1,5034,63,0,1,0,15.84874,0,0.8944483,5.488696,8.701003,⋯,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
2,5035,26,1,2,2,22.75704,0,5.8973295,6.341014,5.153966,⋯,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
3,5036,57,0,2,1,18.3954,0,6.739367,9.196237,6.840647,⋯,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
4,5037,40,1,2,1,38.51528,0,1.4045027,5.826532,4.253036,⋯,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
5,5038,61,0,0,3,19.2838,0,4.6044926,3.127048,9.625799,⋯,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid
6,5039,21,0,2,0,21.81298,0,0.4700439,1.759118,9.549262,⋯,2.328191,5.898515,1,0,1,0,0,1,0,Dr_Confid


In [5]:
str(astma)

'data.frame':	2392 obs. of  29 variables:
 $ PatientID             : int  5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 ...
 $ Age                   : int  63 26 57 40 61 21 45 26 49 45 ...
 $ Gender                : int  0 1 0 1 0 0 1 0 1 1 ...
 $ Ethnicity             : int  1 2 2 2 0 2 1 0 1 1 ...
 $ EducationLevel        : int  0 2 1 1 3 0 1 1 2 1 ...
 $ BMI                   : num  15.8 22.8 18.4 38.5 19.3 ...
 $ Smoking               : int  0 0 0 0 0 0 1 1 0 0 ...
 $ PhysicalActivity      : num  0.894 5.897 6.739 1.405 4.604 ...
 $ DietQuality           : num  5.49 6.34 9.2 5.83 3.13 ...
 $ SleepQuality          : num  8.7 5.15 6.84 4.25 9.63 ...
 $ PollutionExposure     : num  7.388 1.97 1.461 0.582 0.981 ...
 $ PollenExposure        : num  2.86 7.46 1.45 7.57 3.05 ...
 $ DustExposure          : num  0.974 6.585 5.446 3.965 8.261 ...
 $ PetAllergy            : int  1 0 0 0 0 1 0 0 1 0 ...
 $ FamilyHistoryAsthma   : int  1 0 1 0 0 0 0 0 0 0 ...
 $ HistoryOfAllergies    : int 

In [6]:
# The dominant majority of the patient do not have asthma

In [7]:
table(astma$Diagnosis)


   0    1 
2268  124 

In [8]:
any(is.na(astma))

In [9]:
# No missing values. We can go directly to essential variables transformations

## Data preprocessing 

In [10]:
# Patient id column is not needed. Let's remove it

In [11]:
astma$PatientID <- NULL

In [12]:
# Age

In [13]:
summary(astma$Age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.00   23.00   42.00   42.14   61.00   79.00 

In [14]:
# Create Age Categories
astma$Age_Group <- cut(astma$Age, 
                       breaks = c(0, 17, 35, 55, Inf), 
                       labels = c("Child", "Young Adult", "Middle-Aged", "Senior"),
                       right = TRUE) 

# Check the distribution
table(astma$Age_Group)


      Child Young Adult Middle-Aged      Senior 
        414         573         617         788 

In [15]:
astma$Age <- astma$Age_Group

In [16]:
# Ethnicity

In [17]:
astma$Ethnicity <- factor(astma$Ethnicity, 
                          levels = c(0, 1, 2, 3), 
                          labels = c("Caucasian", "African American", "Asian", "Other"))

# Check the encoding
table(astma$Ethnicity)


       Caucasian African American            Asian            Other 
            1465              475              229              223 

In [18]:
# Education

In [19]:
table(astma$Education)


  0   1   2   3 
478 933 749 232 

In [20]:
astma$Education <- factor(astma$Education, 
                          levels = c(0, 1, 2, 3), 
                          labels = c("None", "High School", "Bachelor's", "Higher"))

# Check the encoding
table(astma$Education)


       None High School  Bachelor's      Higher 
        478         933         749         232 

In [21]:
astma$EducationLevel <- astma$Education

In [22]:
astma$EducationLevel <- NULL

In [23]:
# BMI

In [24]:
summary(astma$BMI)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  15.03   20.97   27.05   27.24   33.56   39.99 

In [25]:
# Convert BMI into Categories
astma$BMI_Category <- cut(astma$BMI, 
                          breaks = c(0, 18.5, 24.9, 29.9, Inf), 
                          labels = c("Underweight", "Normal", "Overweight", "Obese"),
                          right = TRUE)

# Check distribution
table(astma$BMI_Category)


Underweight      Normal  Overweight       Obese 
        344         631         477         940 

In [26]:
astma$BMI <- astma$BMI_Category

In [27]:
astma$BMI_Category <- NULL

In [28]:
# PhysicalActivity     

In [29]:
summary(astma$PhysicalActivity)    

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00174 2.57833 5.01688 5.05179 7.54023 9.99581 

In [30]:
# Convert PhysicalActivity into categorical levels
astma$PhysicalActivity <- cut(astma$PhysicalActivity, 
                              breaks = c(0, 3, 6, Inf), 
                              labels = c("Sedentary", "Moderate", "Active"),
                              right = TRUE)

# Check distribution
table(astma$PhysicalActivity)


Sedentary  Moderate    Active 
      694       727       971 

In [31]:
# Convert DietQuality into categorical levels
astma$DietQuality <- cut(astma$DietQuality, 
                         breaks = c(0, 3, 6, 8, 10), 
                         labels = c("Poor", "Average", "Good", "Excellent"),
                         right = TRUE)

# Convert SleepQuality into categorical levels
astma$SleepQuality <- cut(astma$SleepQuality, 
                          breaks = c(4, 5, 7, 10), 
                          labels = c("Low", "Medium", "High"),
                          right = TRUE)

# Convert PollutionExposure into categorical levels
astma$PollutionExposure <- cut(astma$PollutionExposure, 
                               breaks = c(0, 3, 6, 10), 
                               labels = c("Low", "Moderate", "High"),
                               right = TRUE)

# Convert PollenExposure into categorical levels
astma$PollenExposure <- cut(astma$PollenExposure, 
                            breaks = c(0, 3, 6, 10), 
                            labels = c("Low", "Moderate", "High"),
                            right = TRUE)

# Convert DustExposure into categorical levels
astma$DustExposure <- cut(astma$DustExposure, 
                          breaks = c(0, 3, 6, 10), 
                          labels = c("Low", "Moderate", "High"),
                          right = TRUE)

# Check the distribution of the new variables
table(astma$DietQuality)
table(astma$SleepQuality)
table(astma$PollutionExposure)
table(astma$PollenExposure)
table(astma$DustExposure)



     Poor   Average      Good Excellent 
      715       704       482       491 


   Low Medium   High 
   385    821   1186 


     Low Moderate     High 
     733      683      976 


     Low Moderate     High 
     689      715      988 


     Low Moderate     High 
     711      738      943 

In [32]:
# Convert LungFunctionFEV1 into categorical levels
astma$LungFunctionFEV1 <- cut(astma$LungFunctionFEV1, 
                               breaks = c(0, 2, 3, 4), 
                               labels = c("Low", "Normal", "High"),
                               right = TRUE)

# Convert LungFunctionFVC into categorical levels
astma$LungFunctionFVC <- cut(astma$LungFunctionFVC, 
                              breaks = c(1.5, 3, 4, 6), 
                              labels = c("Low", "Normal", "High"),
                              right = TRUE)

# Check the distribution of the new variables
table(astma$LungFunctionFEV1)
table(astma$LungFunctionFVC)



   Low Normal   High 
   726    816    850 


   Low Normal   High 
   791    532   1069 

In [33]:
# The name of the doctor in chare is of course confidential, so let's remove this column

In [34]:
astma$DoctorInCharge <- NULL

In [35]:
table(astma$Ethnicity)


       Caucasian African American            Asian            Other 
            1465              475              229              223 

In [36]:
# I think it is crucial to try to focus only on association rules of items that are meaningful for our analysis. For exemple, as we have overepresentation of Caucasian in Ethinicity variable, I would omit this column as well. We stay with more significant location/environmental factors,
# such as the Dust Exposure or Pollution Exposure (which vary in accordance with different locations, in our dataset caee - ethinicities)

In [37]:
astma$Ethnicity <- NULL

In [38]:
str(astma)

'data.frame':	2392 obs. of  27 variables:
 $ Age                   : Factor w/ 4 levels "Child","Young Adult",..: 4 2 4 3 4 2 3 2 3 3 ...
 $ Gender                : int  0 1 0 1 0 0 1 0 1 1 ...
 $ BMI                   : Factor w/ 4 levels "Underweight",..: 1 2 1 4 2 2 4 3 4 4 ...
 $ Smoking               : int  0 0 0 0 0 0 1 1 0 0 ...
 $ PhysicalActivity      : Factor w/ 3 levels "Sedentary","Moderate",..: 1 2 3 1 2 1 3 3 1 1 ...
 $ DietQuality           : Factor w/ 4 levels "Poor","Average",..: 2 3 4 2 2 1 3 1 2 1 ...
 $ SleepQuality          : Factor w/ 3 levels "Low","Medium",..: 3 2 2 1 3 3 2 2 2 3 ...
 $ PollutionExposure     : Factor w/ 3 levels "Low","Moderate",..: 3 1 1 1 1 1 3 3 2 1 ...
 $ PollenExposure        : Factor w/ 3 levels "Low","Moderate",..: 1 3 1 3 2 3 2 3 3 1 ...
 $ DustExposure          : Factor w/ 3 levels "Low","Moderate",..: 1 3 2 2 3 3 3 1 3 3 ...
 $ PetAllergy            : int  1 0 0 0 0 1 0 0 1 0 ...
 $ FamilyHistoryAsthma   : int  1 0 1 0 0 0 0 0 0 0 ...


In [39]:
# Convert binary variables (0/1) into factors
binary_vars <- c("Gender", "Smoking", "PetAllergy", "FamilyHistoryAsthma", 
                  "HistoryOfAllergies", "Eczema", "HayFever", "GastroesophagealReflux", 
                  "Wheezing", "ShortnessOfBreath", "ChestTightness", "Coughing", 
                  "NighttimeSymptoms", "ExerciseInduced", "Diagnosis")

astma[binary_vars] <- lapply(astma[binary_vars], function(x) factor(x, levels = c(0,1), labels = c("No", "Yes")))

In [40]:
str(astma)

'data.frame':	2392 obs. of  27 variables:
 $ Age                   : Factor w/ 4 levels "Child","Young Adult",..: 4 2 4 3 4 2 3 2 3 3 ...
 $ Gender                : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 1 2 1 2 2 ...
 $ BMI                   : Factor w/ 4 levels "Underweight",..: 1 2 1 4 2 2 4 3 4 4 ...
 $ Smoking               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 1 1 ...
 $ PhysicalActivity      : Factor w/ 3 levels "Sedentary","Moderate",..: 1 2 3 1 2 1 3 3 1 1 ...
 $ DietQuality           : Factor w/ 4 levels "Poor","Average",..: 2 3 4 2 2 1 3 1 2 1 ...
 $ SleepQuality          : Factor w/ 3 levels "Low","Medium",..: 3 2 2 1 3 3 2 2 2 3 ...
 $ PollutionExposure     : Factor w/ 3 levels "Low","Moderate",..: 3 1 1 1 1 1 3 3 2 1 ...
 $ PollenExposure        : Factor w/ 3 levels "Low","Moderate",..: 1 3 1 3 2 3 2 3 3 1 ...
 $ DustExposure          : Factor w/ 3 levels "Low","Moderate",..: 1 3 2 2 3 3 3 1 3 3 ...
 $ PetAllergy            : Factor w/ 2 levels "No","Yes": 2 

In [41]:
# Let's ensure there are no missing values

In [42]:
any(is.na(astma))

## Frequent items comparison

In [43]:
# Load required packages

In [44]:
install.packages("arules")

Instalowanie pakietu w 'C:/Users/grzes/AppData/Local/R/win-library/4.4'
(ponieważ 'lib' nie jest określony)



pakiet 'arules' został pomyślnie rozpakowany oraz sumy MD5 zostały sprawdzone

Pobrane pakiety binarne są w
	C:\Users\grzes\AppData\Local\Temp\RtmpywaCk6\downloaded_packages


In [45]:
library(arules)
library(arulesViz)

"pakiet 'arules' został zbudowany w wersji R 4.4.2"
Ładowanie wymaganego pakietu: Matrix


Dołączanie pakietu: 'arules'


Następujące obiekty zostały zakryte z 'package:base':

    abbreviate, write


"pakiet 'arulesViz' został zbudowany w wersji R 4.4.2"


In [46]:
# Let's convert the data to transactional format. This will enable the system to read the data as each row represents the transactions with different items

In [47]:
astma_trans <- as(astma, "transactions")

In [48]:
# View first 5 transactions
inspect(astma_trans[1:5])

    items                         transactionID
[1] {Age=Senior,                               
     Gender=No,                                
     BMI=Underweight,                          
     Smoking=No,                               
     PhysicalActivity=Sedentary,               
     DietQuality=Average,                      
     SleepQuality=High,                        
     PollutionExposure=High,                   
     PollenExposure=Low,                       
     DustExposure=Low,                         
     PetAllergy=Yes,                           
     FamilyHistoryAsthma=Yes,                  
     HistoryOfAllergies=No,                    
     Eczema=No,                                
     HayFever=No,                              
     GastroesophagealReflux=No,                
     LungFunctionFEV1=Low,                     
     LungFunctionFVC=High,                     
     Wheezing=No,                              
     ShortnessOfBreath=No,              

In [49]:
summary(astma_trans)

transactions as itemMatrix in sparse format with
 2392 rows (elements/itemsets/transactions) and
 71 columns (items) and a density of 0.3802817 

most frequent items:
             Diagnosis=No                Smoking=No GastroesophagealReflux=No 
                     2268                      2053                      2014 
            PetAllergy=No                 Eczema=No                   (Other) 
                     1995                      1933                     54321 

element (itemset/transaction) length distribution:
sizes
  27 
2392 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     27      27      27      27      27      27 

includes extended item information - examples:
           labels variables      levels
1       Age=Child       Age       Child
2 Age=Young Adult       Age Young Adult
3 Age=Middle-Aged       Age Middle-Aged

includes extended transaction information - examples:
  transactionID
1             1
2             2
3             3

In [50]:
# On average 38% of each row is filled out. That would indicate we may find some association rules - the density is not too high

## Let's compare the frequency of the items in the general population (astma_trans) and the subgroup with diagnosed asthma (Diagnose=Yes)

In [51]:
# Items support from non-asthmatic population

In [52]:
asthma_no <- subset(astma_trans, items %in% "Diagnosis=No")

In [53]:
sort(itemFrequency(asthma_no, type="relative"), decreasing = TRUE)

In [54]:
# Asthmatic patients

In [55]:
asthma_yes <- subset(astma_trans, items %in% "Diagnosis=Yes")

In [56]:
inspect(asthma_yes[1:5])

    items                         transactionID
[1] {Age=Young Adult,                          
     Gender=Yes,                               
     BMI=Underweight,                          
     Smoking=No,                               
     PhysicalActivity=Active,                  
     DietQuality=Poor,                         
     SleepQuality=Medium,                      
     PollutionExposure=Low,                    
     PollenExposure=High,                      
     DustExposure=Low,                         
     PetAllergy=Yes,                           
     FamilyHistoryAsthma=No,                   
     HistoryOfAllergies=Yes,                   
     Eczema=Yes,                               
     HayFever=No,                              
     GastroesophagealReflux=No,                
     LungFunctionFEV1=Low,                     
     LungFunctionFVC=Low,                      
     Wheezing=No,                              
     ShortnessOfBreath=No,              

In [57]:
sort(itemFrequency(asthma_yes, type="relative"), decreasing = TRUE)

In [58]:
# Asthmatic individuals differ most notably from non-asthmatics in respiratory symptoms. Wheezing is reported by 65.3% of asthmatics compared to 59.3% of non-asthmatics, while nighttime symptoms are significantly higher in asthmatics (55.6% vs. 39.5%). 
# Chest tightness and shortness of breath show smaller differences between groups.

# Lung function measures also set asthmatics apart. While 45.2% have high FVC, 27.4% have low FVC, compared to 44.7% and 33% in non-asthmatics, suggesting more variability in lung function among asthmatics.

# Environmental factors such as pollution and pollen exposure are similar between groups, with only slight differences in high exposure levels (43.5% vs. ~41%). Asthmatics also remain just as physically active as non-asthmatics, though they report slightly better sleep quality.

# Notably, asthmatics are less likely to smoke (11.3% vs. 14.3%) and have slightly lower rates of pet allergies, eczema, and hay fever, contradicting common assumptions. A family history of asthma appears equally in both groups (30%), reinforcing its genetic influence.

# Overall, asthma is best distinguished by respiratory symptoms and lung function, while lifestyle and environmental factors show only minor differences.

## Rules in the general population

In [59]:
frequent_general <- apriori(astma_trans, parameter = list(supp = 0.05, conf = 0.25, minlen = 2))

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
       0.25    0.1    1 none FALSE            TRUE       5    0.05      2
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 119 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[71 item(s), 2392 transaction(s)] done [0.00s].
sorting and recoding items ... [71 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10

"Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!"


 done [2.01s].
writing ... [2185809 rule(s)] done [0.38s].
creating S4 object  ... done [1.11s].


In [60]:
inspect(sort(frequent_general[1:30], by = 'lift'))

     lhs                   rhs                         support    confidence
[1]  {Education=Higher} => {Coughing=No}               0.05476589 0.5646552 
[2]  {Education=Higher} => {Gender=No}                 0.05392977 0.5560345 
[3]  {Smoking=Yes}      => {PhysicalActivity=Active}   0.06061873 0.4277286 
[4]  {Education=Higher} => {SleepQuality=High}         0.05058528 0.5215517 
[5]  {Education=Higher} => {Eczema=No}                 0.08193980 0.8448276 
[6]  {Education=Higher} => {ChestTightness=No}         0.05016722 0.5172414 
[7]  {Smoking=Yes}      => {LungFunctionFEV1=Normal}   0.05016722 0.3539823 
[8]  {Smoking=Yes}      => {BMI=Obese}                 0.05769231 0.4070796 
[9]  {Smoking=Yes}      => {PollutionExposure=High}    0.05978261 0.4218289 
[10] {Smoking=Yes}      => {Education=High School}     0.05643813 0.3982301 
[11] {Education=Higher} => {ExerciseInduced=Yes}       0.05978261 0.6163793 
[12] {Smoking=Yes}      => {Wheezing=No}               0.05811037 0.4100295 

In [61]:
# These rules do not tell us that much about the relations within our database. Lift levels are close to 1 (independent items) and many rules do not have a meaning in terms of our analysis (ex. {Education=Higher} => {Coughing=No})

In [62]:
inspect(sort(frequent_general[1:30], by = 'support'))

     lhs                   rhs                         support    confidence
[1]  {Education=Higher} => {Diagnosis=No}              0.08904682 0.9181034 
[2]  {Education=Higher} => {Eczema=No}                 0.08193980 0.8448276 
[3]  {Education=Higher} => {Smoking=No}                0.08026756 0.8275862 
[4]  {Education=Higher} => {GastroesophagealReflux=No} 0.07901338 0.8146552 
[5]  {Education=Higher} => {PetAllergy=No}             0.07483278 0.7715517 
[6]  {Smoking=Yes}      => {SleepQuality=High}         0.06981605 0.4926254 
[7]  {Education=Higher} => {HayFever=No}               0.06897993 0.7112069 
[8]  {Smoking=Yes}      => {Gender=Yes}                0.06772575 0.4778761 
[9]  {Education=Higher} => {FamilyHistoryAsthma=No}    0.06688963 0.6896552 
[10] {Smoking=Yes}      => {LungFunctionFVC=High}      0.06396321 0.4513274 
[11] {Smoking=Yes}      => {PhysicalActivity=Active}   0.06061873 0.4277286 
[12] {Education=Higher} => {ExerciseInduced=Yes}       0.05978261 0.6163793 

In [63]:
# Let's do the same think, but with higer level of itemsets

In [64]:
frequent_general_3 <- apriori(astma_trans, parameter = list(supp = 0.07, conf = 0.25, minlen = 3))

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
       0.25    0.1    1 none FALSE            TRUE       5    0.07      3
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 167 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[71 item(s), 2392 transaction(s)] done [0.00s].
sorting and recoding items ... [70 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.46s].
writing ... [726936 rule(s)] done [0.09s].
creating S4 object  ... done [0.19s].


In [65]:
inspect(sort(frequent_general_3[1:30], by = 'lift'))

     lhs                             rhs                            support confidence   coverage      lift count
[1]  {Smoking=Yes,                                                                                               
      Diagnosis=No}               => {ShortnessOfBreath=No}      0.07775920  0.5723077 0.13586957 1.1455732   186
[2]  {Smoking=Yes,                                                                                               
      PetAllergy=No}              => {NighttimeSymptoms=Yes}     0.07483278  0.6462094 0.11580268 1.0726807   179
[3]  {Smoking=Yes,                                                                                               
      HayFever=No}                => {FamilyHistoryAsthma=No}    0.07817726  0.7362205 0.10618729 1.0532532   187
[4]  {Smoking=Yes,                                                                                               
      Eczema=No}                  => {Wheezing=Yes}              0.07107023  0.6273063 0

In [73]:
# The highest rule in terms of the lift shows negative diagnosis and shortness of breath in the case of the patients who are smokers. It prooves our observation from frequent items comparison we did above - people who are already asthmatic, will take care of their health and not smoke.

In [67]:
inspect(sort(frequent_general_3[1:30], by = 'support'))

     lhs                             rhs                            support confidence   coverage      lift count
[1]  {Smoking=Yes,                                                                                               
      NighttimeSymptoms=Yes}      => {Diagnosis=No}              0.08403010  0.9571429 0.08779264 1.0094734   201
[2]  {Smoking=Yes,                                                                                               
      Diagnosis=No}               => {NighttimeSymptoms=Yes}     0.08403010  0.6184615 0.13586957 1.0266204   201
[3]  {Smoking=Yes,                                                                                               
      HistoryOfAllergies=No}      => {Diagnosis=No}              0.08193980  0.9607843 0.08528428 1.0133140   196
[4]  {Smoking=Yes,                                                                                               
      Diagnosis=No}               => {HistoryOfAllergies=No}     0.08193980  0.6030769 0

## Rules in the asthmatic subset of population

In [68]:
asthma_rules <- apriori(asthma_yes, parameter = list(supp = 0.05, conf = 0.25, minlen = 2))

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
       0.25    0.1    1 none FALSE            TRUE       5    0.05      2
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 6 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[70 item(s), 124 transaction(s)] done [0.00s].
sorting and recoding items ... [70 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 9 10

"Mining stopped (maxlen reached). Only patterns up to a length of 10 returned!"


 done [0.53s].
writing ... [5160179 rule(s)] done [1.04s].
creating S4 object  ... done [1.94s].


In [69]:
inspect(sort(asthma_rules[1:30], by = 'lift'))

     lhs                             rhs                          support   
[1]  {Smoking=Yes}                => {GastroesophagealReflux=Yes} 0.07258065
[2]  {GastroesophagealReflux=Yes} => {Smoking=Yes}                0.07258065
[3]  {SleepQuality=Low}           => {Education=Bachelor's}       0.05645161
[4]  {SleepQuality=Low}           => {ChestTightness=Yes}         0.07258065
[5]  {Smoking=Yes}                => {DustExposure=High}          0.05645161
[6]  {SleepQuality=Low}           => {HistoryOfAllergies=Yes}     0.06451613
[7]  {SleepQuality=Low}           => {LungFunctionFEV1=High}      0.05645161
[8]  {Smoking=Yes}                => {PollenExposure=High}        0.06451613
[9]  {SleepQuality=Low}           => {ShortnessOfBreath=Yes}      0.07258065
[10] {Smoking=Yes}                => {PhysicalActivity=Active}    0.05645161
[11] {Smoking=Yes}                => {ChestTightness=No}          0.08064516
[12] {SleepQuality=Low}           => {NighttimeSymptoms=No}       0.06451613

In [74]:
# Here we start observing some intresting things. Very high lift values for {Smoking=Yes} => {GastroesophagealReflux=Yes} and the inverse. Lower, but still high lifts for {SleepQuality=Low} => {Education=Bachelor's} and {SleepQuality=Low} => {ChestTightness=Yes}.
# {SleepQuality=Low} => {LungFunctionFEV1=High} rule represents as well relatively high level of the lift. These facts proove that things that we can control in our behaviour and habits have also impact on our health. Young adults (people at bachelor level, it may include students in vast majority of cases) 
# are the most vulnerable group, as they sleep less

In [71]:
inspect(sort(asthma_rules[1:30], by = 'support'))

     lhs                             rhs                          support   
[1]  {Smoking=Yes}                => {Diagnosis=Yes}              0.11290323
[2]  {Smoking=Yes}                => {Eczema=No}                  0.08870968
[3]  {Smoking=Yes}                => {PetAllergy=No}              0.08870968
[4]  {Smoking=Yes}                => {ChestTightness=No}          0.08064516
[5]  {Smoking=Yes}                => {ExerciseInduced=Yes}        0.08064516
[6]  {Smoking=Yes}                => {GastroesophagealReflux=Yes} 0.07258065
[7]  {GastroesophagealReflux=Yes} => {Smoking=Yes}                0.07258065
[8]  {Smoking=Yes}                => {SleepQuality=High}          0.07258065
[9]  {Smoking=Yes}                => {NighttimeSymptoms=Yes}      0.07258065
[10] {Smoking=Yes}                => {Wheezing=Yes}               0.07258065
[11] {Smoking=Yes}                => {FamilyHistoryAsthma=No}     0.07258065
[12] {Smoking=Yes}                => {HayFever=No}                0.07258065

In [75]:
# {Smoking=Yes} => {Diagnosis=Yes} the most frequent item in terms of support level. It may mean that smoking is in fact a risky activity, that goes together with asthma risk