In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import gzip
import pandas as pd
from pyarrow import csv
import pyarrow.compute as pc

In [3]:
# Baseline: load the complete CSV with pandas, using an explicit ISO-8859-1 encoding.
vdata_pd = pd.read_csv(
    "2021VAERSDATA.csv",
    encoding="iso-8859-1",
)
# Keep the header names as a plain list; a later cell uses it to select Arrow columns.
columns = vdata_pd.columns.tolist()
# "deep" also counts the Python objects referenced by object-dtype columns.
vdata_pd.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742277 entries, 0 to 742276
Data columns (total 35 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      742277 non-null  int64  
 1   RECVDATE      742277 non-null  object 
 2   STATE         637927 non-null  object 
 3   AGE_YRS       660716 non-null  float64
 4   CAGE_YR       592537 non-null  float64
 5   CAGE_MO       4188 non-null    float64
 6   SEX           742277 non-null  object 
 7   RPT_DATE      767 non-null     object 
 8   SYMPTOM_TEXT  741985 non-null  object 
 9   DIED          10484 non-null   object 
 10  DATEDIED      9217 non-null    object 
 11  L_THREAT      11085 non-null   object 
 12  ER_VISIT      118 non-null     object 
 13  HOSPITAL      47281 non-null   object 
 14  HOSPDAYS      31218 non-null   float64
 15  X_STAY        378 non-null     object 
 16  DISABLE       11921 non-null   object 
 17  RECOVD        669866 non-null  object 
 18  VAX_

In [4]:
# Load the same file with Arrow's CSV reader, then add up the size of every column.
vdata_arrow = csv.read_csv("2021VAERSDATA.csv")
tot_bytes = sum(
    vdata_arrow.column(col_name).nbytes
    for col_name in vdata_arrow.column_names
)
print(f"Total {tot_bytes // (1024 ** 2)} MB")

# The printed figure is the in-memory size of the Arrow table: less than half
# of what the pandas DataFrame loaded from the same file needs.

Total 690 MB


In [5]:
# One line per column: name, Arrow type, Arrow size (MB), pandas dtype, pandas size (MB).
for name in vdata_arrow.column_names:
    arrow_col = vdata_arrow[name]
    pandas_col = vdata_pd[name]
    arrow_mb = arrow_col.nbytes // (1024 ** 2)
    # Exclude the index and measure object columns deeply so the figure is per-column data only.
    pandas_mb = pandas_col.memory_usage(index=False, deep=True) // (1024 ** 2)
    print(name, arrow_col.type, arrow_mb, pandas_col.dtype, pandas_mb)

# Arrow's type inference is more specific than pandas' (string/binary instead of
# a generic object dtype, int64 instead of float64 for some numeric columns);
# this is one of the main reasons why its memory usage is lower.

VAERS_ID int64 5 int64 5
RECVDATE string 9 object 47
STATE string 4 object 39
AGE_YRS double 5 float64 5
CAGE_YR int64 5 float64 5
CAGE_MO double 5 float64 5
SEX string 3 object 41
RPT_DATE string 2 object 22
SYMPTOM_TEXT binary 458 object 496
DIED string 2 object 22
DATEDIED string 2 object 22
L_THREAT string 2 object 22
ER_VISIT string 2 object 22
HOSPITAL string 2 object 23
HOSPDAYS int64 5 float64 5
X_STAY string 2 object 22
DISABLE string 2 object 22
RECOVD string 3 object 39
VAX_DATE string 9 object 45
ONSET_DATE string 9 object 45
NUMDAYS int64 5 float64 5
LAB_DATA binary 27 object 54
V_ADMINBY string 4 object 42
V_FUNDBY string 2 object 22
OTHER_MEDS binary 22 object 51
CUR_ILL binary 8 object 36
HISTORY binary 21 object 52
PRIOR_VAX binary 4 object 25
SPLTTYPE string 7 object 32
FORM_VERS int64 5 int64 5
TODAYS_DATE string 9 object 47
BIRTH_DEFECT string 2 object 22
OFC_VISIT string 2 object 26
ER_ED_VISIT string 2 object 24
ALLERGIES binary 9 object 38


In [6]:
# Read-time comparison on the full file: the pandas reader first, then Arrow's CSV reader.
# Both calls use the same arguments as the loads in the earlier cells.

%timeit pd.read_csv("2021VAERSDATA.csv", encoding="iso-8859-1")
%timeit csv.read_csv("2021VAERSDATA.csv")

7.57 s ± 39.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
645 ms ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Reload with pandas, this time skipping the SYMPTOM_TEXT column.
# usecols accepts a predicate on the column name; every other column is kept.

vdata_pd = pd.read_csv(
    "2021VAERSDATA.csv",
    encoding="iso-8859-1",
    usecols=lambda column_name: column_name != "SYMPTOM_TEXT",
)
vdata_pd.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742277 entries, 0 to 742276
Data columns (total 34 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      742277 non-null  int64  
 1   RECVDATE      742277 non-null  object 
 2   STATE         637927 non-null  object 
 3   AGE_YRS       660716 non-null  float64
 4   CAGE_YR       592537 non-null  float64
 5   CAGE_MO       4188 non-null    float64
 6   SEX           742277 non-null  object 
 7   RPT_DATE      767 non-null     object 
 8   DIED          10484 non-null   object 
 9   DATEDIED      9217 non-null    object 
 10  L_THREAT      11085 non-null   object 
 11  ER_VISIT      118 non-null     object 
 12  HOSPITAL      47281 non-null   object 
 13  HOSPDAYS      31218 non-null   float64
 14  X_STAY        378 non-null     object 
 15  DISABLE       11921 non-null   object 
 16  RECOVD        669866 non-null  object 
 17  VAX_DATE      685784 non-null  object 
 18  ONSE

In [8]:
# Exclude SYMPTOM_TEXT from the list of columns to load.
# The list is rebuilt rather than mutated with list.remove() so that this cell
# can be re-run safely: remove() raises ValueError once the name is already gone.
columns = [name for name in columns if name != "SYMPTOM_TEXT"]

In [9]:
# Reload with Arrow, restricted to the names in `columns`.
selected_columns = csv.ConvertOptions(include_columns=columns)
vdata_arrow = csv.read_csv("2021VAERSDATA.csv", convert_options=selected_columns)
# Size of the reduced table in bytes, displayed as the cell's output.
vdata_arrow.nbytes

242344236

In [10]:
# Read-time comparison with SYMPTOM_TEXT excluded: pandas filters columns through a
# usecols predicate, Arrow through ConvertOptions(include_columns=columns).
%timeit pd.read_csv("2021VAERSDATA.csv", encoding="iso-8859-1", usecols=lambda x: x != "SYMPTOM_TEXT")
%timeit csv.read_csv("2021VAERSDATA.csv", convert_options=csv.ConvertOptions(include_columns=columns))

5.26 s ± 409 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
679 ms ± 54.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Use Arrow as the loader and convert the resulting table into a pandas DataFrame.
# NOTE(review): in the .info() output the object columns report no missing values
# (e.g. STATE: 742277 non-null, versus 637927 when the file is read by pandas).
# Presumably Arrow's reader keeps empty fields of string columns as empty strings
# instead of nulls -- confirm before relying on isna()/dropna() for these columns.
vdata = vdata_arrow.to_pandas()
vdata.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742277 entries, 0 to 742276
Data columns (total 34 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   VAERS_ID      742277 non-null  int64  
 1   RECVDATE      742277 non-null  object 
 2   STATE         742277 non-null  object 
 3   AGE_YRS       660716 non-null  float64
 4   CAGE_YR       592537 non-null  float64
 5   CAGE_MO       4188 non-null    float64
 6   SEX           742277 non-null  object 
 7   RPT_DATE      742277 non-null  object 
 8   DIED          742277 non-null  object 
 9   DATEDIED      742277 non-null  object 
 10  L_THREAT      742277 non-null  object 
 11  ER_VISIT      742277 non-null  object 
 12  HOSPITAL      742277 non-null  object 
 13  HOSPDAYS      31218 non-null   float64
 14  X_STAY        742277 non-null  object 
 15  DISABLE       742277 non-null  object 
 16  RECOVD        742277 non-null  object 
 17  VAX_DATE      742277 non-null  object 
 18  ONSE

In [12]:
# Arrow can release its own buffers while the pandas version is being built, so
# both copies of the data do not have to coexist in memory.
# split_blocks=True makes the conversion produce one pandas block per column,
# which is the mode the pyarrow documentation pairs with self_destruct.
vdata = vdata_arrow.to_pandas(split_blocks=True, self_destruct=True)
# self_destruct is experimental and leaves the table unusable; drop the name so
# an accidental later use fails with a NameError instead of crashing the kernel.
del vdata_arrow