In [2]:
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline


In [3]:
# Locate the CSV inputs inside the `data` directory next to the working directory.
current_path = os.getcwd()
path = os.path.join(current_path, 'data')
# os.listdir returns entries in arbitrary order; sorting keeps the order in
# which files are later loaded and concatenated the same on every run.
files = sorted(os.listdir(path))
# Split the listing by file-name suffix.
vaxfiles = [f for f in files if f.endswith('VAX.csv')]
datafiles = [f for f in files if f.endswith('DATA.csv')]

In [4]:
# Read every file listed in `vaxfiles` and stack the results into `vaxes`.
vax_frames = []

for f in vaxfiles:
    fname = os.path.join(current_path, os.path.join('data', f))
    data = pd.read_csv(fname, encoding='us-ascii')
    # The first four characters of the file name are parsed as the year.
    data['fileyear'] = int(f[:4])
    vax_frames.append(data)

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every iteration; concatenate once instead. pd.concat raises on an empty
# list, so fall back to the empty frame the original loop would have produced.
vaxes = pd.concat(vax_frames) if vax_frames else pd.DataFrame()

vaxes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162634 entries, 0 to 20071
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   VAERS_ID         162634 non-null  int64 
 1   VAX_TYPE         162634 non-null  object
 2   VAX_MANU         162634 non-null  object
 3   VAX_LOT          137090 non-null  object
 4   VAX_DOSE_SERIES  162634 non-null  object
 5   VAX_ROUTE        110883 non-null  object
 6   VAX_SITE         108981 non-null  object
 7   VAX_NAME         162634 non-null  object
 8   fileyear         162634 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 12.4+ MB


In [5]:
# Read every file listed in `datafiles`, normalise a few column types, and
# stack the results into `datas`.
data_frames = []

for f in datafiles:
    fname = os.path.join(current_path, os.path.join('data', f))
    data = pd.read_csv(fname, encoding='cp850')
    # The first four characters of the file name are parsed as the year.
    data['fileyear'] = int(f[:4])
    # DataFrame.apply hands each of the two columns to pd.to_datetime in turn.
    data[['RECVDATE', 'VAX_DATE']] = data[['RECVDATE', 'VAX_DATE']].apply(pd.to_datetime)
    # Convert the whole column in one vectorised call rather than per element.
    data['AGE_YRS'] = pd.to_numeric(data['AGE_YRS'])
    data_frames.append(data)

# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame
# on every iteration; concatenate once instead. pd.concat raises on an empty
# list, so fall back to the empty frame the original loop would have produced.
datas = pd.concat(data_frames) if data_frames else pd.DataFrame()

datas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96917 entries, 0 to 12122
Data columns (total 36 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   VAERS_ID      96917 non-null  int64         
 1   RECVDATE      96917 non-null  datetime64[ns]
 2   STATE         95580 non-null  object        
 3   AGE_YRS       86618 non-null  float64       
 4   CAGE_YR       72421 non-null  float64       
 5   CAGE_MO       31723 non-null  float64       
 6   SEX           96917 non-null  object        
 7   RPT_DATE      79132 non-null  object        
 8   SYMPTOM_TEXT  96841 non-null  object        
 9   DIED          1568 non-null   object        
 10  DATEDIED      1371 non-null   object        
 11  L_THREAT      1825 non-null   object        
 12  ER_VISIT      35831 non-null  object        
 13  HOSPITAL      8475 non-null   object        
 14  HOSPDAYS      5564 non-null   float64       
 15  X_STAY        556 non-null    object

In [6]:
# Columns kept from the merged frame. `fileyear_y` is the suffixed copy of
# `fileyear` coming from the right-hand frame (`datas`).
kept_columns = [
    'VAERS_ID', 'RECVDATE', 'AGE_YRS', 'SEX', 'SYMPTOM_TEXT', 'DIED',
    'VAX_DATE', 'NUMDAYS', 'LAB_DATA',
    'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'VAX_NAME',
    'fileyear_y',
]

# Left join: every row of `vaxes` is kept and matched to `datas` rows by VAERS_ID.
bigdata = (
    vaxes
    .merge(datas, how='left', on='VAERS_ID')
    [kept_columns]
    .rename(columns={'fileyear_y': 'FILEYEAR'})
)
# Calendar year taken from VAX_DATE.
bigdata['VAXYEAR'] = pd.DatetimeIndex(bigdata['VAX_DATE']).year
bigdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162634 entries, 0 to 162633
Data columns (total 16 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   VAERS_ID         162634 non-null  int64         
 1   RECVDATE         162634 non-null  datetime64[ns]
 2   AGE_YRS          151507 non-null  float64       
 3   SEX              162634 non-null  object        
 4   SYMPTOM_TEXT     162500 non-null  object        
 5   DIED             3558 non-null    object        
 6   VAX_DATE         153099 non-null  datetime64[ns]
 7   NUMDAYS          145951 non-null  float64       
 8   LAB_DATA         89553 non-null   object        
 9   VAX_TYPE         162634 non-null  object        
 10  VAX_MANU         162634 non-null  object        
 11  VAX_LOT          137090 non-null  object        
 12  VAX_DOSE_SERIES  162634 non-null  object        
 13  VAX_NAME         162634 non-null  object        
 14  FILEYEAR         162

In [11]:
sns.set()
# Number of non-null VAERS_ID values per VAX_TYPE, returned as a two-column
# frame (VAX_TYPE, VAERS_ID).
vaxcounts = vaxes.groupby('VAX_TYPE')['VAERS_ID'].count().reset_index()
vaxcounts


Unnamed: 0,VAX_TYPE,VAERS_ID
0,ADEN,7
1,ANTH,692
2,BCG,7
3,CHOL,78
4,DT,1267
5,DTAP,6652
6,DTAPH,3
7,DTP,19494
8,DTPHIB,5826
9,DTPIPV,1
