In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import mlxtend.frequent_patterns
import mlxtend.preprocessing

In [2]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import proj_code_pkg.vaers_csv
import proj_code_pkg.freq_itemsets

In [4]:
# Input locations for the VAERS CSV exports, relative to the notebook's
# working directory. Change DATA_DIR or VAERS_YEAR to point all three paths
# at a different download in one edit.
DATA_DIR = './data'
VAERS_YEAR = 2021

# Kept as plain strings so the reader functions receive exactly what they
# received before.
data_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSDATA.csv'
symptoms_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSSYMPTOMS.csv'
vax_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSVAX.csv'

In [5]:
%%time
# Read each VAERS CSV with its dedicated reader (data, symptoms, vax — in that
# order) and hand the resulting frames to merge_dataframes as a single list.
merged_data = proj_code_pkg.vaers_csv.merge_dataframes([
    read_table(csv_path)
    for read_table, csv_path in (
        (proj_code_pkg.vaers_csv.read_data_file, data_path),
        (proj_code_pkg.vaers_csv.read_symptoms_file, symptoms_path),
        (proj_code_pkg.vaers_csv.read_vax_file, vax_path),
    )
])

CPU times: user 6.15 s, sys: 844 ms, total: 6.99 s
Wall time: 8.12 s


In [6]:
# Print a summary of the merged frame: index range, column names,
# non-null counts and dtypes.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 942308 entries, 916600 to 1845804
Data columns (total 51 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   RECVDATE         942308 non-null  datetime64[ns]
 1   STATE            837756 non-null  object        
 2   AGE_YRS          857303 non-null  float64       
 3   CAGE_YR          764411 non-null  float64       
 4   CAGE_MO          6394 non-null    float64       
 5   SEX              942308 non-null  object        
 6   RPT_DATE         491 non-null     datetime64[ns]
 7   SYMPTOM_TEXT     942132 non-null  object        
 8   DIED             16373 non-null   object        
 9   DATEDIED         15056 non-null   datetime64[ns]
 10  L_THREAT         23603 non-null   object        
 11  ER_VISIT         79 non-null      object        
 12  HOSPITAL         93526 non-null   object        
 13  HOSPDAYS         67476 non-null   float64       
 14  X_STAY        

In [7]:
%%time
# Apply build_basket to every row of the merged frame and collect the
# per-row results into a plain Python list.
baskets = (
    merged_data
    .apply(proj_code_pkg.freq_itemsets.build_basket, axis="columns")
    .to_list()
)

CPU times: user 50.4 s, sys: 731 ms, total: 51.2 s
Wall time: 51.2 s


In [8]:
%%time
# Turn the list of baskets into the one-hot DataFrame that is fed to fpgrowth
# below, then print its summary.
# NOTE(review): the recorded run reports an all-bool frame of ~10.9k columns
# using about 9.5 GB; a sparse encoding may be worth evaluating — confirm that
# build_one_hot_basket_dataset and the downstream mining step support it.
one_hot_baskets_df = proj_code_pkg.freq_itemsets.build_one_hot_basket_dataset(baskets)
one_hot_baskets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 942308 entries, 0 to 942307
Columns: 10882 entries, 21-hydroxylase deficiency to pH urine normal
dtypes: bool(10882)
memory usage: 9.5 GB
CPU times: user 2.01 s, sys: 1.51 s, total: 3.52 s
Wall time: 3.52 s


In [9]:
%%time
# Mine frequent itemsets from the one-hot basket frame with FP-Growth.
frequent_itemsets = mlxtend.frequent_patterns.fpgrowth(
    one_hot_baskets_df,
    min_support=0.01,   # keep itemsets occurring in at least 1% of baskets
    use_colnames=True,  # report items by column name instead of column index
)
# Summarize the result (row count, columns, dtypes).
frequent_itemsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   762 non-null    float64
 1   itemsets  762 non-null    object 
dtypes: float64(1), object(1)
memory usage: 12.0+ KB
CPU times: user 19.3 s, sys: 278 ms, total: 19.6 s
Wall time: 19.6 s


In [10]:
# Show the mined itemsets; as the cell's last expression the frame is rendered
# as a (truncated) table.
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.681140,(Female)
1,0.432232,(COVID19 (COVID19 (MODERNA)))
2,0.314323,(Recovered)
3,0.153916,(Age 19-33)
4,0.055922,(TX)
...,...,...
757,0.010837,"(COVID19 (COVID19 (JANSSEN)), Female, Age 19-33)"
758,0.010880,"(COVID19 (COVID19 (JANSSEN)), Headache, Female)"
759,0.016151,"(COVID19 (COVID19 (JANSSEN)), Female, Recovered)"
760,0.014682,"(COVID19 (COVID19 (JANSSEN)), Female, Age 34-48)"


In [11]:
%%time
# Derive association rules from the frequent itemsets, filtering on confidence.
assoc_rules = mlxtend.frequent_patterns.association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.7,  # keep only rules with confidence >= 0.7
)
# Summarize the resulting rule table (row count, columns, dtypes).
assoc_rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         215 non-null    object 
 1   consequents         215 non-null    object 
 2   antecedent support  215 non-null    float64
 3   consequent support  215 non-null    float64
 4   support             215 non-null    float64
 5   confidence          215 non-null    float64
 6   lift                215 non-null    float64
 7   leverage            215 non-null    float64
 8   conviction          215 non-null    float64
dtypes: float64(7), object(2)
memory usage: 15.2+ KB
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 10.3 ms


In [12]:
# Show the generated rules; as the cell's last expression the frame is rendered
# as a (truncated) table.
assoc_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(COVID19 (COVID19 (MODERNA))),(Female),0.432232,0.681140,0.306979,0.710218,1.042690,0.012568,1.100344
1,"(COVID19 (COVID19 (MODERNA)), Recovered)",(Female),0.148598,0.681140,0.106469,0.716493,1.051903,0.005253,1.124700
2,"(COVID19 (COVID19 (PFIZER-BIONTECH)), Recovered)",(Female),0.123969,0.681140,0.087433,0.705283,1.035444,0.002993,1.081917
3,(Age 19-33),(Female),0.153916,0.681140,0.108675,0.706066,1.036594,0.003836,1.084800
4,"(COVID19 (COVID19 (MODERNA)), Age 19-33)",(Female),0.062033,0.681140,0.045795,0.738239,1.083827,0.003542,1.218131
...,...,...,...,...,...,...,...,...,...
210,(Injection site rash),(Female),0.011732,0.681140,0.010365,0.883492,1.297077,0.002374,2.736796
211,(OR),(Female),0.013964,0.681140,0.010031,0.718346,1.054623,0.000520,1.132098
212,(Unknown Sex),(Age 79-older),0.035149,0.144175,0.028930,0.823073,5.708857,0.023862,4.837165
213,"(Unknown Sex, COVID19 (COVID19 (MODERNA)))",(Age 79-older),0.013034,0.144175,0.010815,0.829751,5.755175,0.008936,5.026899


In [13]:
# Persist the rule table to a CSV file in the current working directory
# (pandas defaults apply, so the row index is written as the first column).
assoc_rules.to_csv(path_or_buf='assoc_rules.csv')