In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import mlxtend.frequent_patterns
import mlxtend.preprocessing

In [2]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# before each cell executes, so edits to imported source files take effect
# without restarting the kernel (presumably intended for the local
# proj_code_pkg modules imported below).
%load_ext autoreload
%autoreload 2

In [3]:
import proj_code_pkg.vaers_csv
import proj_code_pkg.freq_itemsets

In [4]:
# Directory and reporting year of the VAERS CSV extracts. Change these two
# values to point the notebook at another folder or another year's files;
# the three paths below are derived from them.
DATA_DIR = './data'
VAERS_YEAR = 2021

# Paths handed to the read_data_file / read_symptoms_file / read_vax_file
# readers in the loading cell. Kept as plain strings, with the same values
# as before.
data_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSDATA.csv'
symptoms_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSSYMPTOMS.csv'
vax_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSVAX.csv'

In [5]:
%%time
merged_data = proj_code_pkg.vaers_csv.merge_dataframes(
    [proj_code_pkg.vaers_csv.read_data_file(data_path),
     proj_code_pkg.vaers_csv.read_symptoms_file(symptoms_path),
     proj_code_pkg.vaers_csv.read_vax_file(vax_path)])

CPU times: user 6.52 s, sys: 860 ms, total: 7.38 s
Wall time: 8.56 s


In [6]:
# Print a summary of the merged frame: index range, per-column non-null
# counts and dtypes.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 942308 entries, 916600 to 1845804
Data columns (total 51 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   RECVDATE         942308 non-null  datetime64[ns]
 1   STATE            837756 non-null  object        
 2   AGE_YRS          857303 non-null  float64       
 3   CAGE_YR          764411 non-null  float64       
 4   CAGE_MO          6394 non-null    float64       
 5   SEX              942308 non-null  object        
 6   RPT_DATE         491 non-null     datetime64[ns]
 7   SYMPTOM_TEXT     942132 non-null  object        
 8   DIED             16373 non-null   object        
 9   DATEDIED         15056 non-null   datetime64[ns]
 10  L_THREAT         23603 non-null   object        
 11  ER_VISIT         79 non-null      object        
 12  HOSPITAL         93526 non-null   object        
 13  HOSPDAYS         67476 non-null   float64       
 14  X_STAY        

In [7]:
%%time
# Call build_basket once for every row of the merged frame (axis=1) and
# collect the per-row results into a plain Python list.
# NOTE(review): this is a row-wise Python-level apply; the recorded run took
# about 50 s. The shape of each basket depends on build_basket, which is not
# visible here — presumably an iterable of item labels; verify in
# proj_code_pkg.freq_itemsets.
baskets = merged_data.apply(proj_code_pkg.freq_itemsets.build_basket, axis=1).tolist()

CPU times: user 49.7 s, sys: 734 ms, total: 50.5 s
Wall time: 50.5 s


In [8]:
%%time
# Convert the list of baskets into a DataFrame via the project helper and
# print its summary.
# NOTE(review): the recorded output shows a dense bool frame with 10882
# columns using 9.5 GB of memory. If memory becomes a constraint, consider
# having the helper produce a sparse encoding instead — TODO confirm the
# installed mlxtend version accepts sparse input.
one_hot_baskets_df = proj_code_pkg.freq_itemsets.build_one_hot_basket_dataset(baskets)
one_hot_baskets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 942308 entries, 0 to 942307
Columns: 10882 entries, 21-hydroxylase deficiency to pH urine normal
dtypes: bool(10882)
memory usage: 9.5 GB
CPU times: user 1.91 s, sys: 1.71 s, total: 3.62 s
Wall time: 3.62 s


In [9]:
%%time
frequent_itemsets = mlxtend.frequent_patterns.fpgrowth(one_hot_baskets_df, min_support=0.001, use_colnames=True)
frequent_itemsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14449 entries, 0 to 14448
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   14449 non-null  float64
 1   itemsets  14449 non-null  object 
dtypes: float64(1), object(1)
memory usage: 225.9+ KB
CPU times: user 28.6 s, sys: 353 ms, total: 28.9 s
Wall time: 28.9 s


In [10]:
# Display the mined itemsets with their support values (pandas truncates the
# view according to the current display.max_rows setting).
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.681140,(Female)
1,0.432232,(COVID19 (COVID19 (MODERNA)))
2,0.314323,(Recovered)
3,0.153916,(Age 19-33)
4,0.055922,(TX)
...,...,...
14444,0.005002,"(COVID19 (COVID19 (JANSSEN)), Hospitalized , F..."
14445,0.003367,"(Age 49-64, COVID19 (COVID19 (JANSSEN)), Hospi..."
14446,0.001067,"(Age 34-48, COVID19 (COVID19 (JANSSEN)), Hospi..."
14447,0.001606,"(Age 49-64, COVID19 (COVID19 (JANSSEN)), Hospi..."


In [11]:
%%time
assoc_rules = mlxtend.frequent_patterns.association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
assoc_rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1995 entries, 0 to 1994
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         1995 non-null   object 
 1   consequents         1995 non-null   object 
 2   antecedent support  1995 non-null   float64
 3   consequent support  1995 non-null   float64
 4   support             1995 non-null   float64
 5   confidence          1995 non-null   float64
 6   lift                1995 non-null   float64
 7   leverage            1995 non-null   float64
 8   conviction          1995 non-null   float64
dtypes: float64(7), object(2)
memory usage: 140.4+ KB
CPU times: user 105 ms, sys: 284 µs, total: 106 ms
Wall time: 105 ms


In [12]:
# Display the generated rules with their metrics (antecedent/consequent
# support, support, confidence, lift, leverage, conviction).
assoc_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Age 34-48, TX, Headache)",(Female),0.002000,0.68114,0.001622,0.810610,1.190078,0.000259,1.683615
1,"(Pain, TX, Age 34-48)",(Female),0.001633,0.68114,0.001319,0.807667,1.185758,0.000207,1.657855
2,"(Age 34-48, TX, Fatigue)",(Female),0.001613,0.68114,0.001299,0.805263,1.182228,0.000200,1.637387
3,"(Dysphagia, Recovered)",(Female),0.002147,0.68114,0.001793,0.835393,1.226462,0.000331,1.937096
4,"(Dysphagia, COVID19 (COVID19 (MODERNA)))",(Female),0.001928,0.68114,0.001553,0.805173,1.182096,0.000239,1.636633
...,...,...,...,...,...,...,...,...,...
1990,"(Heavy menstrual bleeding, COVID19 (COVID19 (P...",(Female),0.002572,0.68114,0.002561,0.995462,1.461464,0.000809,70.265072
1991,"(Age 19-33, Heavy menstrual bleeding)",(Female),0.001310,0.68114,0.001304,0.995948,1.462178,0.000412,78.694562
1992,"(Age 34-48, Heavy menstrual bleeding)",(Female),0.002300,0.68114,0.002289,0.995385,1.461351,0.000723,69.096886
1993,"(Age 34-48, Heavy menstrual bleeding, COVID19 ...",(Female),0.001318,0.68114,0.001314,0.996779,1.463398,0.000416,99.005922


In [22]:
# Lift pandas' display limits (rows, columns, column width) so that the rule
# tables shown afterwards are rendered in full instead of being truncated.
# The options are global: every later DataFrame display is affected too.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
)

In [27]:
# Show the 200 highest-ranked rules, ordered by leverage first and using
# confidence, lift and conviction (all descending) to break ties.
(
    assoc_rules
    .sort_values(by=['leverage', 'confidence', 'lift', 'conviction'], ascending=False)
    .head(200)
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1673,(Unknown Sex),(Age 79-older),0.035149,0.144175,0.02893,0.823073,5.708857,0.023862,4.837165
119,"(Injection site erythema, Female)",(COVID19 (COVID19 (MODERNA))),0.027377,0.432232,0.021969,0.802465,1.85656,0.010136,2.874268
1678,"(Unknown Sex, COVID19 (COVID19 (MODERNA)))",(Age 79-older),0.013034,0.144175,0.010815,0.829751,5.755175,0.008936,5.026899
269,(Injection site pruritus),(COVID19 (COVID19 (MODERNA))),0.020459,0.432232,0.017512,0.855957,1.980317,0.008669,3.941661
1603,(Death),(Died),0.008642,0.017375,0.008564,0.991035,57.036611,0.008414,109.609752
1200,"(Male, Life-threatening illness)",(Hospitalized ),0.011286,0.099252,0.009392,0.832158,8.38429,0.008272,5.366641
271,"(Injection site pruritus, Female)",(COVID19 (COVID19 (MODERNA))),0.018921,0.432232,0.016292,0.861069,1.992144,0.008114,4.086689
117,(Injection site erythema),(Female),0.030318,0.68114,0.027377,0.903007,1.325728,0.006727,3.287442
276,"(Injection site erythema, Injection site pruritus)","(COVID19 (COVID19 (MODERNA)), Female)",0.012622,0.306979,0.010454,0.828233,2.698009,0.006579,4.034649
458,(Injection site warmth),(COVID19 (COVID19 (MODERNA))),0.015402,0.432232,0.012548,0.814718,1.884907,0.005891,3.064341
