In [1]:
import numpy as np
import pandas as pd
import mlxtend.frequent_patterns
import mlxtend.preprocessing

In [2]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are re-imported before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Turn off pandas' display truncation (rows, columns and cell width) so the
# long itemset / rule tables rendered below are shown in full.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [4]:
import proj_code_pkg.vaers_csv
import proj_code_pkg.freq_itemsets

In [5]:
# Locations of the three VAERS CSV inputs read in the next cell.
# The directory and report year are named once so that pointing the notebook
# at a different year or folder is a one-line change instead of three.
DATA_DIR = './data'
VAERS_YEAR = 2021

data_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSDATA.csv'          # passed to read_data_file
symptoms_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSSYMPTOMS.csv'  # passed to read_symptoms_file
vax_path = f'{DATA_DIR}/{VAERS_YEAR}VAERSVAX.csv'            # passed to read_vax_file

In [6]:
%%time
# Read the three VAERS CSV files with their dedicated readers and hand them,
# in data / symptoms / vax order, to the project's merge helper.
# NOTE(review): the join keys and merge semantics live in
# proj_code_pkg.vaers_csv and are not visible from this notebook.
vaers_csv = proj_code_pkg.vaers_csv
merged_data = vaers_csv.merge_dataframes([
    vaers_csv.read_data_file(data_path),
    vaers_csv.read_symptoms_file(symptoms_path),
    vaers_csv.read_vax_file(vax_path),
])

CPU times: user 6.53 s, sys: 849 ms, total: 7.38 s
Wall time: 8.58 s


In [7]:
# Sanity-check the merged frame: row count, column names, non-null counts
# and dtypes.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 993374 entries, 916600 to 1901712
Data columns (total 51 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   RECVDATE         993374 non-null  datetime64[ns]
 1   STATE            882928 non-null  object        
 2   AGE_YRS          903313 non-null  float64       
 3   CAGE_YR          806672 non-null  float64       
 4   CAGE_MO          7258 non-null    float64       
 5   SEX              993374 non-null  object        
 6   RPT_DATE         532 non-null     datetime64[ns]
 7   SYMPTOM_TEXT     993133 non-null  object        
 8   DIED             17698 non-null   object        
 9   DATEDIED         16263 non-null   datetime64[ns]
 10  L_THREAT         24798 non-null   object        
 11  ER_VISIT         89 non-null      object        
 12  HOSPITAL         100673 non-null  object        
 13  HOSPDAYS         72408 non-null   float64       
 14  X_STAY        

In [8]:
%%time
# Turn every merged row into one basket via the project's build_basket helper
# and collect the per-row results into a plain Python list.
# The row-wise apply is the slow step of the notebook (see the timing below).
row_to_basket = proj_code_pkg.freq_itemsets.build_basket
baskets = merged_data.apply(row_to_basket, axis=1).tolist()

CPU times: user 52.5 s, sys: 792 ms, total: 53.3 s
Wall time: 53.3 s


In [9]:
# Peek at the first five baskets to confirm their shape: each one is a list
# of string labels.
baskets[:5]

[['TX',
  'Age 19-33',
  'Female',
  'Recovered',
  'COVID19 (COVID19 (MODERNA))',
  'Dysphagia',
  'Epiglottitis'],
 ['CA',
  'Age 65-78',
  'Female',
  'Recovered',
  'COVID19 (COVID19 (MODERNA))',
  'Anxiety',
  'Dyspnoea'],
 ['WA',
  'Age 19-33',
  'Female',
  'COVID19 (COVID19 (PFIZER-BIONTECH))',
  'Chest discomfort',
  'Dysphagia',
  'Pain in extremity',
  'Visual impairment'],
 ['WA',
  'Age 49-64',
  'Female',
  'Recovered',
  'COVID19 (COVID19 (MODERNA))',
  'Dizziness',
  'Fatigue',
  'Mobility decreased'],
 ['TX',
  'Age 34-48',
  'Female',
  'COVID19 (COVID19 (MODERNA))',
  'Injection site erythema',
  'Injection site pruritus',
  'Injection site swelling',
  'Injection site warmth']]

In [10]:
%%time
# Encode the baskets as a one-hot DataFrame (one column per distinct item),
# the input format expected by mlxtend's frequent-pattern functions.
# NOTE(review): the encoding itself happens in proj_code_pkg.freq_itemsets;
# the info() output below reports sparse boolean columns.
one_hot_baskets_df = proj_code_pkg.freq_itemsets.build_one_hot_basket_dataset(baskets)
# Report shape, dtypes and memory footprint of the encoded frame.
one_hot_baskets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993374 entries, 0 to 993373
Columns: 11074 entries, 21-hydroxylase deficiency to pH urine normal
dtypes: Sparse[bool, 0](11074)
memory usage: 35.8 MB
CPU times: user 2.52 s, sys: 19.6 ms, total: 2.54 s
Wall time: 2.56 s


In [11]:
%%time
# Mine frequent itemsets with FP-Growth.
#   min_support=0.001 keeps itemsets that occur in at least 0.1% of baskets.
#   use_colnames=True reports items by column label instead of column index.
frequent_itemsets = mlxtend.frequent_patterns.fpgrowth(
    one_hot_baskets_df,
    min_support=0.001,
    use_colnames=True,
)
# Summarise how many itemsets were found and how they are stored.
frequent_itemsets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14260 entries, 0 to 14259
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   14260 non-null  float64
 1   itemsets  14260 non-null  object 
dtypes: float64(1), object(1)
memory usage: 222.9+ KB
CPU times: user 20.8 s, sys: 239 ms, total: 21.1 s
Wall time: 21.1 s


In [12]:
# Show the first 200 frequent itemsets with their support; none of the rows
# are elided because display.max_rows was set to None earlier.
frequent_itemsets.head(200)

Unnamed: 0,support,itemsets
0,0.67722,(Female)
1,0.428552,(COVID19 (COVID19 (MODERNA)))
2,0.312031,(Recovered)
3,0.152127,(Age 19-33)
4,0.05607,(TX)
5,0.004864,(Dysphagia)
6,0.173983,(Age 65-78)
7,0.096221,(CA)
8,0.040967,(Dyspnoea)
9,0.008004,(Anxiety)


In [13]:
# Record how many items each itemset contains so itemsets can be ranked by size.
# This adds a column to frequent_itemsets in place.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)

In [14]:
# List the 20 largest itemsets (by number of items) that still meet the
# minimum support threshold.
frequent_itemsets.sort_values(by='length', ascending=False).head(20)

Unnamed: 0,support,itemsets,length
5258,0.001052,"(Pain, COVID19 (COVID19 (PFIZER-BIONTECH)), Pyrexia, Recovered, Female, Age 34-48)",6
5400,0.001066,"(Pain, COVID19 (COVID19 (PFIZER-BIONTECH)), Pyrexia, Recovered, Female, Age 49-64)",6
2482,0.001412,"(COVID19 (COVID19 (MODERNA)), Injection site swelling, Injection site pruritus, Injection site erythema, Female, Age 34-48)",6
3310,0.001057,"(Chills, Pain, Recovered, Female, Age 34-48, Headache)",6
3235,0.001429,"(COVID19 (COVID19 (MODERNA)), Chills, Age 65-78, Recovered, Female, Fatigue)",6
3314,0.00104,"(COVID19 (COVID19 (MODERNA)), Chills, Pain, Recovered, Female, Age 34-48)",6
3022,0.001444,"(Chills, COVID19 (COVID19 (PFIZER-BIONTECH)), Recovered, Female, Headache, Age 49-64)",6
2937,0.00133,"(Chills, COVID19 (COVID19 (PFIZER-BIONTECH)), Recovered, Female, Age 34-48, Headache)",6
5314,0.001315,"(COVID19 (COVID19 (MODERNA)), Pain, Recovered, Female, Headache, Age 49-64)",6
3231,0.001059,"(Chills, Age 65-78, Recovered, Female, Fatigue, Headache)",6


In [15]:
%%time
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.8.
assoc_rules = mlxtend.frequent_patterns.association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.8,
)
# Summarise the number of rules and the metric columns produced.
assoc_rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1912 entries, 0 to 1911
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   antecedents         1912 non-null   object 
 1   consequents         1912 non-null   object 
 2   antecedent support  1912 non-null   float64
 3   consequent support  1912 non-null   float64
 4   support             1912 non-null   float64
 5   confidence          1912 non-null   float64
 6   lift                1912 non-null   float64
 7   leverage            1912 non-null   float64
 8   conviction          1912 non-null   float64
dtypes: float64(7), object(2)
memory usage: 134.6+ KB
CPU times: user 103 ms, sys: 115 µs, total: 103 ms
Wall time: 101 ms


In [16]:
# Rank the rules by leverage, breaking ties by confidence, then lift, then
# conviction (all descending), and show the top 200.
assoc_rules.sort_values(by=['leverage', 'confidence', 'lift', 'conviction'], ascending=False).head(200)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1589,(Unknown Sex),(Age 79-older),0.036581,0.145562,0.030099,0.822807,5.652606,0.024775,4.822085
1594,"(COVID19 (COVID19 (MODERNA)), Unknown Sex)",(Age 79-older),0.013102,0.145562,0.01084,0.827353,5.683834,0.008933,4.949045
1515,(Death),(Died),0.008738,0.017816,0.008686,0.994009,55.792909,0.00853,163.949167
250,(Injection site pruritus),(COVID19 (COVID19 (MODERNA))),0.019756,0.428552,0.016782,0.849478,1.982206,0.008316,3.796437
1136,"(Male, Life-threatening illness)",(Hospitalized ),0.011318,0.101345,0.009365,0.827448,8.164707,0.008218,5.208033
252,"(Female, Injection site pruritus)",(COVID19 (COVID19 (MODERNA))),0.018246,0.428552,0.015612,0.855669,1.996653,0.007793,3.95929
107,(Injection site erythema),(Female),0.029544,0.67722,0.026635,0.901561,1.331266,0.006628,3.278968
257,"(Injection site erythema, Injection site pruritus)","(COVID19 (COVID19 (MODERNA)), Female)",0.012168,0.303073,0.010009,0.822619,2.71426,0.006322,3.92899
428,(Injection site warmth),(COVID19 (COVID19 (MODERNA))),0.014974,0.428552,0.012066,0.805782,1.880244,0.005649,2.942297
108,"(COVID19 (COVID19 (MODERNA)), Injection site erythema)",(Female),0.022927,0.67722,0.021132,0.921712,1.361023,0.005605,4.123
