# Summary

Read the SAS files (extension `sas7bdat`), cast their columns to an explicit schema of nullable data types, and save the result in CSV format.

# Initialize

In [1]:
# Libraries

from glob import glob
import logging
import os
from typing import Optional

import pandas as pd

In [2]:
# Constants
# Root folder of the project data. AIRFLOW_HOME must be set in the environment;
# a KeyError is raised here otherwise.
DATA_PATH = os.path.join(os.environ['AIRFLOW_HOME'], 'resources', 'data')
# Folder that is searched for the input files
IMMIGRATION_DATA_PATH = os.path.join(DATA_PATH, '18-83510-I94-Data-2016')
# Folder that receives the converted CSV files
CSV_PATH = os.path.join(IMMIGRATION_DATA_PATH, 'csv')
# Extension of the input files; also passed as `format` to `pd.read_sas`
FORMAT = 'sas7bdat'

# Create directory to save CSV files.
# `exist_ok=True` keeps the cell re-runnable without a separate existence
# check (no check-then-create race) and missing parent folders are created too.
os.makedirs(CSV_PATH, exist_ok=True)

# Set logging: timestamped messages at INFO level and above on the root logger
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()

In [3]:
# Functions

def sas_to_csv(sas_file: str, extension: str = FORMAT, csv_path: str = CSV_PATH
              ) -> Optional[str]:
    """Build the path of the CSV file that corresponds to a SAS file.

    Parameters
    ----------
    sas_file : str
        Path of the input file.
    extension : str
        Extension that `sas_file` is expected to have, with or without the
        leading dot. The comparison is case-sensitive.
    csv_path : str
        Folder in which the CSV file is placed.

    Returns
    -------
    Optional[str]
        `csv_path` joined with the base name of `sas_file` and the `.csv`
        extension, or None (after printing a message) when `sas_file` does
        not have the expected extension.
    """
    file_path, ext = os.path.splitext(sas_file)
    # Compare against the `extension` argument, not the module-level default,
    # so that callers can actually override the expected extension.
    if ext != f'.{extension.lstrip(".")}':
        print(f'Invalid file name "{sas_file}"')
        return None
    # Only the base name is kept: the source folder is replaced by `csv_path`
    return os.path.join(csv_path, f'{os.path.basename(file_path)}.csv')

In [4]:
# Schema

# Nullable pandas extension dtypes, so that missing values survive the cast:
# https://pandas.pydata.org/pandas-docs/stable/getting_started/basics.html#dtypes
int_na = pd.Int64Dtype()
str_na = pd.StringDtype()

# Target dtype of every column, listed as (column, dtype) pairs.
# The order of the pairs is the key order of the resulting dict.
column_types = dict((
    ('cicid', int_na),
    ('i94yr', int_na),
    ('i94mon', int_na),
    ('i94cit', int_na),
    ('i94res', int_na),
    ('i94port', str_na),
    ('arrdate', int_na),
    ('i94mode', int_na),
    ('i94addr', str_na),
    ('depdate', int_na),
    ('i94bir', int_na),
    ('i94visa', int_na),
    ('count', int_na),
    ('dtadfile', str_na),  # data looks like int, but fails to convert
    ('visapost', str_na),
    ('occup', str_na),
    ('entdepa', str_na),
    ('entdepd', str_na),
    ('entdepu', str_na),
    ('matflag', str_na),
    ('biryear', int_na),
    ('dtaddto', str_na),
    ('gender', str_na),
    ('insnum', str_na),  # data looks like int, but fails to convert
    ('airline', str_na),
    ('admnum', int_na),
    ('fltno', str_na),
    ('visatype', str_na),
))

# SAS to CSV

In [5]:
# Load one file to test
# (kept commented out: reads only the first 1000 rows of one file, casts them
# to the schema and writes them to CSV)

# sas_file = os.path.join(IMMIGRATION_DATA_PATH, 'i94_jan16_sub.sas7bdat')
# it = pd.read_sas(sas_file,
#                  format=FORMAT,
#                  encoding='ISO-8859-1',
#                  chunksize=1000)
# df = next(it)
# df_types = df.astype(dtype=column_types)

# NOTE: `csv_file` is not defined in this snippet; set it first,
# e.g. csv_file = sas_to_csv(sas_file)
# df_types.to_csv(csv_file, index=False)

In [None]:
# ~2 min per file
# Convert the SAS files to CSV: load the whole file, cast it to the schema,
# and write it to the CSV folder. `df` and `df_types` are left in the
# namespace after the loop for the inspection cells.
for sas_file in glob(os.path.join(IMMIGRATION_DATA_PATH, f'*.{FORMAT:s}')):
    # NOTE(review): hard-coded filter - only this one file is converted and
    # every other match is skipped. Remove the check to convert all files.
    if os.path.basename(sas_file) != 'i94_jun16_sub.sas7bdat':
        continue

    # Resolve the output path before the expensive load, so that an invalid
    # name is reported immediately instead of failing after the conversion.
    csv_file = sas_to_csv(sas_file)
    if csv_file is None:
        logger.error('Cannot derive a CSV file name for "%s", skipping', sas_file)
        continue

    logger.info('Loading "%s"...', sas_file)
    # The file is read in chunks of 200000 rows that are concatenated into
    # a single DataFrame.
    df = pd.concat(pd.read_sas(sas_file,
                               format=FORMAT,
                               encoding='ISO-8859-1',
                               chunksize=200000))
    logger.info('Done!')

    schema_columns = list(column_types)
    if sorted(df.columns.tolist()) != sorted(schema_columns):
        # "i94_jun16_sub.sas7bdat" has extra columns that are not documented:
        # validres,delete_days,delete_mexl,delete_dup,delete_visa,delete_recdup
        logger.error('Columns do not match columns in schema: %s',
                     df.columns.tolist())
        # Keep only the schema columns, in schema order. A column that is in
        # the schema but not in the file raises a KeyError here.
        df = df.loc[:, schema_columns]

    logger.info('Converting schema...')
    df_types = df.astype(dtype=column_types)
    logger.info('Done!')

    logger.info('Saving CSV file %s...', csv_file)
    df_types.to_csv(csv_file, index=False)
    logger.info('Done!')

# Inspect one of the DataFrames

In [7]:
# (rows, columns) of the last DataFrame loaded by the conversion loop
df.shape

(2570543, 28)

In [8]:
# Memory used by each column of `df` (before the schema cast), in bytes.
# The index is excluded; deep=True requests a deep inspection of the
# column contents instead of a shallow estimate.
df_memory_usage = df.memory_usage(index=False, deep=True)
df_memory_usage

cicid        20564344
i94yr        20564344
i94mon       20564344
i94cit       20564344
i94res       20564344
i94port     154232580
arrdate      20564344
i94mode      20564344
i94addr     147632162
depdate      20564344
i94bir       20564344
i94visa      20564344
count        20564344
dtadfile    165747244
visapost    111169476
occup        82479164
entdepa     149090922
entdepd     141476146
entdepu      82259014
matflag     141476146
biryear      20564344
dtaddto     166828681
gender      141403788
insnum       86202688
airline     149808563
admnum       20564344
fltno       158981610
visatype    151772987
dtype: int64

In [9]:
# Sum the per-column byte counts of `df` and report the total in MB
# and the average per row in kB
total = df_memory_usage.sum()
print(f'Total memory usage (MB): {total/10**6}')
print(f'Memory usage per row (kB): {total/10**3/df.shape[0]}')

Total memory usage (MB): 2297.897643
Memory usage per row (kB): 0.8939347223524369


In [10]:
# Same per-column measurement (bytes, index excluded) for `df_types`,
# the DataFrame after the cast to the custom schema
df_memory_usage_types = df_types.memory_usage(index=False, deep=True)
df_memory_usage_types

cicid        23134887
i94yr        23134887
i94mon       23134887
i94cit       23134887
i94res       23134887
i94port      20564344
arrdate      23134887
i94mode      23134887
i94addr      20564344
depdate      23134887
i94bir       23134887
i94visa      23134887
count        23134887
dtadfile    165747244
visapost     20564344
occup        20564344
entdepa      20564344
entdepd      20564344
entdepu      20564344
matflag      20564344
biryear      23134887
dtaddto      20564344
gender       20564344
insnum       86202688
airline      20564344
admnum       23134887
fltno        20564344
visatype     20564344
dtype: int64

In [11]:
# ~3 times more efficient with the custom schema
# NOTE(review): in the output recorded above, most string columns report
# exactly 8 bytes per row, which suggests that the string contents themselves
# are not counted for the string dtype - confirm before relying on this ratio.

# Sum the per-column byte counts of `df_types` and report the total in MB
# and the average per row in kB (`df` and `df_types` have the same row count)
total = df_memory_usage_types.sum()
print('With custom schema:')
print(f'Total memory usage (MB): {total/10**6}')
print(f'Memory usage per row (kB): {total/10**3/df.shape[0]}')

With custom schema:
Total memory usage (MB): 820.039935
Memory usage per row (kB): 0.3190142841415219
