# Notebook Intentions

The data types used by MEPS are a bit archaic. Additionally, since 2017 several files have been formatted in such a way that they can only be accessed using SAS.

In this notebook we will attempt to unpack the data contained in the ASCII files using the data parameters intended for R developers. If we can do this successfully, we won't require the use of rpy2 or the maintenance of a second language (R) in the codebase.


In [None]:
import os
from os.path import expanduser

import sys
sys.path.append(os.path.join(expanduser("~"), "meps"))

from zipfile import ZipFile

from meps_db.components.populators import BaseComponentsPopulator as bcp
from meps_db.components.reference import FYCDF_PUF_SSP_LOOKUP
from meps_db.utilities.universal_utilities import UniversalUtilityFunctions as util

In [None]:
# Ask the populator for the archive path of the 2018 "consolidated" file
zip_path = bcp.get_zip_path(
    zip_type="consolidated",
    year=2018,
    year_lookup=FYCDF_PUF_SSP_LOOKUP,
)
print(f"Path to .dat file: {zip_path}")

In [None]:
# unzip
# Split the archive path with os.path instead of "/"-splitting followed by
# str.replace: the latter is not portable and would also rewrite any directory
# component that happens to contain the archive's filename.
zip_dir, filename = os.path.split(zip_path)
# Joining with "" keeps the trailing separator, so unzip_path has the same
# form it had before (and stays "" when zip_path has no directory part).
unzip_path = os.path.join(zip_dir, "")
# An archive named "<stem>dat.zip" is expected to contain "<stem>.dat"
# (presumably the MEPS naming convention -- TODO confirm for other years).
unzipped_filename = filename.split("dat.zip")[0] + ".dat"

# Extract every member next to the archive itself.
with ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(unzip_path)

print(f"Unzipped: {filename} to {unzip_path} as {unzipped_filename}")

In [None]:
# Read the extracted .dat file into memory as raw bytes (decoded later on)
ascii_path = os.path.join(unzip_path, unzipped_filename)
with open(ascii_path, mode="rb") as dat_file:
    ascii_text = dat_file.read()

In [None]:
# load R parameters
# The parameter file is looked up by the .dat file's base name.
# os.path.splitext removes only the extension; str.strip('.dat') instead
# removes any leading/trailing '.', 'd', 'a' or 't' characters, which can eat
# into the name itself (e.g. "data.dat" would not become "data").
puf_params_name = os.path.splitext(unzipped_filename)[0]

# NOTE(review): file_format="json" presumably makes the loader resolve the
# ".json" file for this base name -- confirm against load_data_from_file.
puf_params = util.load_data_from_file(
    file_path=os.path.join(
        expanduser("~"),
        "meps",
        "meps_dev",
        "meps_db",
        "components",
        "populator_support",
        puf_params_name,
    ),
    file_format="json",
)


In [None]:
# Parse a single record by slicing the raw bytes directly; because the offsets
# are applied to the whole file, this reads the record at the very start of it.
resp = 0
field_specs = zip(
    puf_params["position_start"],
    puf_params["postion_end"],
    puf_params["var_names"],
    puf_params["var_types"],
)

row = {}
for first_col, last_col, var_name, var_type in field_specs:
    # start positions are treated as 1-based, hence the -1 when slicing
    raw = ascii_text[first_col - 1:last_col].decode("utf-8").strip()
    # fields typed "n" become floats, everything else stays a string
    row[var_name] = float(raw) if var_type == "n" else str(raw)

# Preview the first ten parsed fields
for key, value in list(row.items())[:10]:
    print(f"{key}: {value}")

In [None]:
# test full ascii text
# The column layout is identical for every respondent, so materialise it once
# instead of re-zipping the four parameter lists for each row.
field_specs = list(
    zip(
        puf_params["position_start"],
        puf_params["postion_end"],
        puf_params["var_names"],
        puf_params["var_types"],
    )
)

# bytes.splitlines() breaks on \r\n, \n and \r alike and does not produce a
# trailing empty element, so no record is lost when the file uses LF endings
# or lacks a final line terminator (split("\r\n")[:-1] dropped data in both
# cases). Blank lines carry no respondent and are skipped.
row_data = [
    line.decode("utf-8") for line in ascii_text.splitlines() if line.strip()
]

# One dict per respondent, keyed by variable name.
data = []
for resp_row in row_data:
    resp_dict = {}
    for start, end, name, dtype in field_specs:
        # start positions are treated as 1-based, hence the -1 when slicing
        val = resp_row[start - 1:end].strip()
        # fields typed "n" become floats, everything else stays a string
        resp_dict[name] = float(val) if dtype == "n" else val
    data.append(resp_dict)