In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Load the 2021 VAERS report table, decoding the file as ISO-8859-1.
vdata = pd.read_csv(
    filepath_or_buffer="2021VAERSDATA.csv",
    encoding="iso-8859-1",
)
# Summary of columns, non-null counts and dtypes; "deep" also counts the
# Python string objects held by object columns, not just the pointers.
vdata.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742277 entries, 0 to 742276
Data columns (total 35 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      742277 non-null  int64  
 1   RECVDATE      742277 non-null  object 
 2   STATE         637927 non-null  object 
 3   AGE_YRS       660716 non-null  float64
 4   CAGE_YR       592537 non-null  float64
 5   CAGE_MO       4188 non-null    float64
 6   SEX           742277 non-null  object 
 7   RPT_DATE      767 non-null     object 
 8   SYMPTOM_TEXT  741985 non-null  object 
 9   DIED          10484 non-null   object 
 10  DATEDIED      9217 non-null    object 
 11  L_THREAT      11085 non-null   object 
 12  ER_VISIT      118 non-null     object 
 13  HOSPITAL      47281 non-null   object 
 14  HOSPDAYS      31218 non-null   float64
 15  X_STAY        378 non-null     object 
 16  DISABLE       11921 non-null   object 
 17  RECOVD        669866 non-null  object 
 18  VAX_

In [4]:
# Per-column memory footprint: name, dtype, and size in whole MiB
# (floor division, so anything under 1 MiB prints as 0).
for name, column in vdata.items():
    size_mb = column.memory_usage(index=False, deep=True) // (1024 ** 2)
    print(name, column.dtype, size_mb)

VAERS_ID int64 5
RECVDATE object 47
STATE object 39
AGE_YRS float64 5
CAGE_YR float64 5
CAGE_MO float64 5
SEX object 41
RPT_DATE object 22
SYMPTOM_TEXT object 496
DIED object 22
DATEDIED object 22
L_THREAT object 22
ER_VISIT object 22
HOSPITAL object 23
HOSPDAYS float64 5
X_STAY object 22
DISABLE object 22
RECOVD object 39
VAX_DATE object 45
ONSET_DATE object 45
NUMDAYS float64 5
LAB_DATA object 54
V_ADMINBY object 42
V_FUNDBY object 22
OTHER_MEDS object 51
CUR_ILL object 36
HISTORY object 52
PRIOR_VAX object 25
SPLTTYPE object 32
FORM_VERS int64 5
TODAYS_DATE object 47
BIRTH_DEFECT object 22
OFC_VISIT object 26
ER_ED_VISIT object 24
ALLERGIES object 38


In [5]:
# Bytes used by DIED as loaded (object dtype), excluding the index.
vdata["DIED"].memory_usage(deep=True, index=False)

24025448

In [6]:
# Bytes used by DIED once missing values become False and the column is
# cast to bool (one byte per row), excluding the index.
died_as_bool = vdata["DIED"].fillna(False).astype(bool)
died_as_bool.memory_usage(deep=True, index=False)

742277

In [10]:
# Upper-case every state code so differently-cased entries collapse together.
vdata["STATE"] = vdata["STATE"].str.upper()
# Distinct STATE values in order of first appearance; a missing state is kept
# as a nan entry in the list.
unique_states = vdata["STATE"].unique()
states = list(unique_states)
states

['TX',
 'CA',
 'WA',
 'NV',
 'KS',
 'OH',
 'TN',
 'VA',
 'NC',
 'NY',
 'AK',
 'GA',
 'NJ',
 'LA',
 nan,
 'IL',
 'MD',
 'ME',
 'MA',
 'MI',
 'CT',
 'FL',
 'OK',
 'AR',
 'ID',
 'PA',
 'IN',
 'MN',
 'NH',
 'MO',
 'CO',
 'NE',
 'UT',
 'AZ',
 'DE',
 'AL',
 'MT',
 'RI',
 'MS',
 'IA',
 'KY',
 'HI',
 'WV',
 'WI',
 'NM',
 'OR',
 'PR',
 'ND',
 'SC',
 'VT',
 'DC',
 'SD',
 'WY',
 'XB',
 'VI',
 'GU',
 'AS',
 'MP',
 'MH',
 'FM',
 'XL',
 'XV',
 'QM']

In [12]:
# convert STATE (text) into encoded_state (number)
# Each state is encoded as its position in `states`. A dict gives one hash
# lookup per row; states.index() rescanned the list for every row and could
# only locate the missing-value entry through list.index's identity shortcut.
state_to_code = {
    state: code for code, state in enumerate(states) if pd.notna(state)
}
# Missing states keep the position nan occupies in `states` (None if absent).
missing_code = next(
    (code for code, state in enumerate(states) if pd.isna(state)), None
)
# uint8 only holds 0..255; fail loudly instead of letting astype wrap around.
if len(states) > np.iinfo(np.uint8).max + 1:
    raise ValueError(f"{len(states)} distinct states do not fit in uint8")

state_codes = vdata["STATE"].map(state_to_code)
if missing_code is not None:
    # Only genuinely missing STATE values may receive the nan code.
    state_codes = state_codes.mask(vdata["STATE"].isna(), missing_code)
# Anything still unmapped is a value that is not in `states`; the original
# list.index() lookup raised ValueError for these, so keep that contract.
unmapped = state_codes.isna()
if unmapped.any():
    examples = vdata.loc[unmapped, "STATE"].unique()[:5].tolist()
    raise ValueError(f"STATE values missing from `states`: {examples}")

vdata["encoded_state"] = state_codes.astype(np.uint8)
vdata[["encoded_state", "STATE"]].head(10)

Unnamed: 0,encoded_state,STATE
0,0,TX
1,1,CA
2,2,WA
3,2,WA
4,0,TX
5,0,TX
6,3,NV
7,4,KS
8,5,OH
9,6,TN


In [13]:
# Bytes used by the text STATE column (object dtype), excluding the index.
state_text = vdata["STATE"]
state_text.memory_usage(deep=True, index=False)

40976893

In [14]:
# compact STATE size in bytes (the uint8 encoded_state column, index excluded)
vdata["encoded_state"].memory_usage(index=False, deep=True)

742277

In [15]:
# Inspect the row index of the frame as loaded (no index_col was given to
# read_csv for this load).
vdata.index

RangeIndex(start=0, stop=742277, step=1)

In [17]:
# Read nothing but the STATE column (usecols) to save memory, upper-casing
# each value while it is parsed. The resulting list of distinct states has to
# exist before STATE can be turned into a number during a later load.
state_column = pd.read_csv(
    "vdata_sample.csv",
    usecols=["STATE"],
    converters={"STATE": str.upper},
)["STATE"]
states = list(state_column.unique())

In [19]:
# Reload the sample, shrinking columns while they are parsed:
#   * VAERS_ID becomes the index
#   * DIED becomes a bool (True only for "Y")
#   * STATE becomes its position in `states`
#   * SYMPTOM_TEXT is not loaded at all, which reduces the size further
column_converters = {
    "STATE": lambda raw_state: states.index(raw_state.upper()),
    "DIED": lambda raw_died: raw_died == "Y",
}
vdata = pd.read_csv(
    "vdata_sample.csv",
    usecols=lambda column: column != "SYMPTOM_TEXT",
    converters=column_converters,
    index_col="VAERS_ID",
)
# Store the state positions in one byte each.
# NOTE(review): assumes `states` has at most 256 entries - confirm.
vdata["STATE"] = vdata["STATE"].astype(np.uint8)
vdata.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668049 entries, 1459706 to 1282350
Data columns (total 33 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   RECVDATE      668049 non-null  object 
 1   STATE         668049 non-null  uint8  
 2   AGE_YRS       594491 non-null  float64
 3   CAGE_YR       533013 non-null  float64
 4   CAGE_MO       3723 non-null    float64
 5   SEX           668049 non-null  object 
 6   RPT_DATE      684 non-null     object 
 7   DIED          668049 non-null  bool   
 8   DATEDIED      8274 non-null    object 
 9   L_THREAT      9952 non-null    object 
 10  ER_VISIT      107 non-null     object 
 11  HOSPITAL      42441 non-null   object 
 12  HOSPDAYS      27985 non-null   float64
 13  X_STAY        346 non-null     object 
 14  DISABLE       10695 non-null   object 
 15  RECOVD        603018 non-null  object 
 16  VAX_DATE      617002 non-null  object 
 17  ONSET_DATE    608172 non-null  object 
 1