# Data Exploration

In [1]:
import numpy as np
import pandas as pd

In [16]:
%%time
def read_vaers_data(filename):
    """Load one VAERS CSV export into a DataFrame indexed by ``VAERS_ID``.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file's contents, with the ``VAERS_ID`` column used as the index.
        The index is not required to be unique.
    """
    read_options = {
        'index_col': 'VAERS_ID',
        'encoding': 'iso-8859-1',  # decode the file as Latin-1
        'low_memory': False,       # parse the whole file at once for consistent dtype inference
        'memory_map': True,        # map the file into memory instead of streaming it
    }
    return pd.read_csv(filename, **read_options)

# Report-level table: the source of RECVDATE, STATE, AGE_YRS, DIED, ... columns.
data = read_vaers_data(filename='./data/2021VAERSDATA.csv')
# Coded symptoms: SYMPTOM1..SYMPTOM5 with their SYMPTOMVERSION columns.
symptoms = read_vaers_data(filename='./data/2021VAERSSYMPTOMS.csv')
# Vaccine details: VAX_TYPE, VAX_MANU, VAX_LOT, VAX_DOSE_SERIES, ...
vax = read_vaers_data(filename='./data/2021VAERSVAX.csv')

CPU times: user 6.19 s, sys: 3.34 s, total: 9.53 s
Wall time: 20.5 s


In [17]:
# Show row count, column names, non-null counts, and dtypes for `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 654986 entries, 916600 to 1828454
Data columns (total 34 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   RECVDATE      654986 non-null  object 
 1   STATE         572236 non-null  object 
 2   AGE_YRS       583424 non-null  float64
 3   CAGE_YR       518587 non-null  float64
 4   CAGE_MO       3182 non-null    float64
 5   SEX           654986 non-null  object 
 6   RPT_DATE      350 non-null     object 
 7   SYMPTOM_TEXT  654828 non-null  object 
 8   DIED          8536 non-null    object 
 9   DATEDIED      7550 non-null    object 
 10  L_THREAT      9957 non-null    object 
 11  ER_VISIT      52 non-null      object 
 12  HOSPITAL      39734 non-null   object 
 13  HOSPDAYS      26819 non-null   float64
 14  X_STAY        339 non-null     object 
 15  DISABLE       10371 non-null   object 
 16  RECOVD        595432 non-null  object 
 17  VAX_DATE      606079 non-null  object 
 18

In [18]:
# Show row count, column names, non-null counts, and dtypes for `symptoms`.
symptoms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 875131 entries, 916600 to 1828454
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   SYMPTOM1         875131 non-null  object 
 1   SYMPTOMVERSION1  875131 non-null  float64
 2   SYMPTOM2         683341 non-null  object 
 3   SYMPTOMVERSION2  683341 non-null  float64
 4   SYMPTOM3         526263 non-null  object 
 5   SYMPTOMVERSION3  526263 non-null  float64
 6   SYMPTOM4         399438 non-null  object 
 7   SYMPTOMVERSION4  399438 non-null  float64
 8   SYMPTOM5         297604 non-null  object 
 9   SYMPTOMVERSION5  297604 non-null  float64
dtypes: float64(5), object(5)
memory usage: 73.4+ MB


In [19]:
# Show row count, column names, non-null counts, and dtypes for `vax`.
vax.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 689290 entries, 916600 to 1828454
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAX_TYPE         689290 non-null  object
 1   VAX_MANU         689290 non-null  object
 2   VAX_LOT          472898 non-null  object
 3   VAX_DOSE_SERIES  685885 non-null  object
 4   VAX_ROUTE        519639 non-null  object
 5   VAX_SITE         501263 non-null  object
 6   VAX_NAME         689290 non-null  object
dtypes: object(7)
memory usage: 42.1+ MB


In [15]:
# List the distinct vaccine type codes that occur in the VAX_TYPE column.
vax['VAX_TYPE'].unique()

array(['COVID19', 'FLUC4', 'DTAPHEPBIP', 'HIBV', 'PNC13', 'RV1', 'UNK',
       'FLU4', 'PPV', 'FLUA3', 'VARZOS', 'MMR', 'DT', 'HPV9', 'DTAP',
       'MMRV', 'TDAP', 'FLUR4', 'DTAPIPVHIB', 'HEPA', 'MNQ', 'FLUX', 'YF',
       'ANTH', 'HEP', 'VARCEL', 'RV5', 'HPV4', 'MENB', 'IPV', 'RAB',
       'FLUA4', 'FLUN4', 'DTAPIPV', 'TYP', 'ADEN_4_7', 'CHOL', 'TTOX',
       'FLU3', 'FLUC3', 'HEPAB', 'TD', 'EBZR', 'PNC', 'DF', 'HPVX',
       'FLUX(H1N1)', 'RVX', 'DTP', 'MEN', 'JEV1', 'BCG', 'PER', 'SMALL',
       'OPV', 'TDAPIPV', 'MENHIB', 'FLUN3', 'FLU(H1N1)', 'MNQHIB',
       'DTPHEP', 'JEVX', 'DTPPVHBHPB', '6VAX-F'], dtype=object)

In [20]:
# Left-join the symptom and vaccine tables onto the reports by VAERS_ID index.
# NOTE(review): `symptoms` and `vax` hold more rows than `data`, so a VAERS_ID
# can repeat in them. Each report therefore appears to fan out into one row per
# matching (symptoms row, vax row) pair; row counts on `joined` are not report
# counts. Confirm and deduplicate or aggregate before counting.
joined = (
    data
    .join(symptoms, how='left')
    .join(vax, how='left')
)

In [21]:
# Show row count, column names, non-null counts, and dtypes for the joined frame.
joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 926331 entries, 916600 to 1828454
Data columns (total 51 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   RECVDATE         926331 non-null  object 
 1   STATE            823668 non-null  object 
 2   AGE_YRS          842540 non-null  float64
 3   CAGE_YR          751376 non-null  float64
 4   CAGE_MO          6242 non-null    float64
 5   SEX              926331 non-null  object 
 6   RPT_DATE         490 non-null     object 
 7   SYMPTOM_TEXT     926157 non-null  object 
 8   DIED             15966 non-null   object 
 9   DATEDIED         14689 non-null   object 
 10  L_THREAT         23183 non-null   object 
 11  ER_VISIT         79 non-null      object 
 12  HOSPITAL         91230 non-null   object 
 13  HOSPDAYS         66093 non-null   float64
 14  X_STAY           700 non-null     object 
 15  DISABLE          23286 non-null   object 
 16  RECOVD           851855 non-null

In [22]:
# Preview the first rows of the joined frame.
joined.head()

Unnamed: 0_level_0,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,DATEDIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
VAERS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
916600,01/01/2021,TX,33.0,33.0,,F,,Right side of epiglottis swelled up and hinder...,,,...,,,,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
916601,01/01/2021,CA,73.0,73.0,,F,,Approximately 30 min post vaccination administ...,,,...,,,,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
916602,01/01/2021,WA,23.0,23.0,,F,,"About 15 minutes after receiving the vaccine, ...",,,...,23.1,,,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
916603,01/01/2021,WA,58.0,58.0,,F,,"extreme fatigue, dizziness,. could not lift my...",,,...,,,,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
916604,01/01/2021,TX,47.0,47.0,,F,,"Injection site swelling, redness, warm to the ...",,,...,23.1,,,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))


In [23]:
# Look up one VAERS_ID by index label; when the label occurs on several rows,
# all of those rows are returned.
joined.loc[916611]

Unnamed: 0_level_0,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,DATEDIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
VAERS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
916611,01/01/2021,NC,33.0,33.0,,F,,12pm Received shot 1 pm Sore arm at injection ...,,,...,23.1,Decreased appetite,23.1,COVID19,MODERNA,039k20a,1,SYR,RA,COVID19 (COVID19 (MODERNA))
916611,01/01/2021,NC,33.0,33.0,,F,,12pm Received shot 1 pm Sore arm at injection ...,,,...,23.1,Headache,23.1,COVID19,MODERNA,039k20a,1,SYR,RA,COVID19 (COVID19 (MODERNA))
916611,01/01/2021,NC,33.0,33.0,,F,,12pm Received shot 1 pm Sore arm at injection ...,,,...,23.1,Musculoskeletal chest pain,23.1,COVID19,MODERNA,039k20a,1,SYR,RA,COVID19 (COVID19 (MODERNA))
916611,01/01/2021,NC,33.0,33.0,,F,,12pm Received shot 1 pm Sore arm at injection ...,,,...,23.1,Pyrexia,23.1,COVID19,MODERNA,039k20a,1,SYR,RA,COVID19 (COVID19 (MODERNA))
916611,01/01/2021,NC,33.0,33.0,,F,,12pm Received shot 1 pm Sore arm at injection ...,,,...,,,,COVID19,MODERNA,039k20a,1,SYR,RA,COVID19 (COVID19 (MODERNA))


In [26]:
# Keep only the rows whose DIED flag is 'Y' (missing values compare as False).
# NOTE(review): the same VAERS_ID can occupy several rows of `joined`, so this
# selects rows rather than unique reports; count distinct index values if a
# per-report figure is needed.
joined.loc[joined['DIED'].eq('Y')]

Unnamed: 0_level_0,RECVDATE,STATE,AGE_YRS,CAGE_YR,CAGE_MO,SEX,RPT_DATE,SYMPTOM_TEXT,DIED,DATEDIED,...,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
VAERS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
916803,01/01/2021,LA,78.0,78.0,,M,,respitory colase,Y,10/18/2020,...,,,,FLU4,SANOFI PASTEUR,,4,IM,,INFLUENZA (SEASONAL) (FLUZONE HIGH-DOSE QUADRI...
917117,01/01/2021,AR,82.0,82.0,,M,,"After vaccination, patient tested positive for...",Y,01/01/2021,...,,,,COVID19,MODERNA,,1,IM,AR,COVID19 (COVID19 (MODERNA))
917790,01/03/2021,AR,90.0,90.0,,F,,"At the time of vaccination, there was an outbr...",Y,01/03/2021,...,23.1,,,COVID19,MODERNA,,1,IM,AR,COVID19 (COVID19 (MODERNA))
917793,01/03/2021,AR,78.0,78.0,,F,,Prior to the administration of the COVID 19 va...,Y,01/02/2021,...,,,,COVID19,MODERNA,,1,IM,AR,COVID19 (COVID19 (MODERNA))
918065,01/04/2021,CA,64.0,64.0,,M,,1/1/2020: Residents was found unresponsive. Pr...,Y,01/01/2021,...,,,,COVID19,MODERNA,025J20-2A,1,IM,,COVID19 (COVID19 (MODERNA))
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1828059,10/29/2021,,,,,M,,cancer; Initial information was received on 22...,Y,,...,,,,FLUX,UNKNOWN MANUFACTURER,,UNK,OT,,INFLUENZA (SEASONAL) (NO BRAND NAME)
1828410,10/29/2021,PA,,,,F,,Nobody was listening to her.She couldn't take ...,Y,,...,,,,COVID19,MODERNA,,UNK,OT,,COVID19 (COVID19 (MODERNA))
1828442,10/29/2021,,98.0,98.0,,F,,After receiving both doses of Moderna COVID va...,Y,09/06/2021,...,24.1,,,COVID19,MODERNA,012L20A,1,IM,AR,COVID19 (COVID19 (MODERNA))
1828442,10/29/2021,,98.0,98.0,,F,,After receiving both doses of Moderna COVID va...,Y,09/06/2021,...,24.1,,,COVID19,MODERNA,031L20A,2,IM,AR,COVID19 (COVID19 (MODERNA))
