# Notebook Intentions

In this notebook we will attempt to standardize a methodology for converting ASCII data and SAS programming statements (.txt) into a list of respondent dictionaries. If successful, we can use this method for all data files across all years and map encoded values to their full form.

## Notes

Extracting variable names and their locations in the ASCII files, and building respondent dictionaries, is fairly straightforward and standardized. Converting encoded values to lookup dictionaries is feasible; however, the code is not very pretty or readable. As we move forward we will attempt to clean it up, or abandon it.

## Update

There are still many exceptions, even within a single .dat file. Since all data is stored as text and there is no clear distinction between int, float and string variables, it is difficult to decide whether leading zeros should be stripped. It doesn't help that some variables are binned floats with string exceptions. We will need to search for existing SAS-to-Python packages to assist, as this type of information is likely encoded in the txt file somewhere.

In [None]:
import os
from os.path import expanduser

import sys
sys.path.append(os.path.join(expanduser("~"), "meps", "meps_dev"))

from meps_dev.meps_db.components.populators import BaseComponentsPopulator as bcp
from meps_dev.meps_db.components.reference import FYCDF_PUF_LOOKUP

In [None]:
# Fetch the 2016 "consolidated" file: the encoded data (bytes, decoded later)
# and the accompanying SAS programming statements.
ascii_text, sas_text = bcp.unpack_data(
    folder="consolidated",
    year=2016,
    year_lookup=FYCDF_PUF_LOOKUP,
)

In [None]:
# Locate the header line that opens each section of the SAS statements;
# the sections are sliced apart by these indices further down.
std_sas_text = sas_text.split("\n")
section_headers = (
    "* INPUT STATEMENTS;",
    "* FORMAT STATEMENTS;",
    "* LABEL STATEMENTS;",
    "* VALUE STATEMENTS;",
)
input_index, format_index, label_index, value_index = map(
    std_sas_text.index, section_headers
)

In [None]:
# Collect, for every INPUT row, the variable name together with its 1-based
# start column and its width in the fixed-width data file.
var_name_place = []
for line in std_sas_text[input_index:format_index]:
    # only rows carrying an "@<column>" pointer describe a variable
    if "@" in line:
        tokens = line.strip("INPUT").split()
        field_name = tokens[1]
        # positions/widths are written like "@12" and "$8.0"; go through
        # float so a trailing ".0" does not break the int conversion
        column = int(float(tokens[0].strip("@")))
        width = int(float(tokens[2].strip("$")))
        var_name_place.append(
            {"name": field_name, "start": column, "size": width}
        )

In [None]:
# Build a lookup of variable name -> human readable label from the LABEL
# section of the SAS statements (rows shaped like "NAME = 'SOME LABEL'").
var_descriptions = {}
for row in std_sas_text[label_index:value_index]:
    # Split on the FIRST "=" only, so a label that itself contains "=" is
    # kept whole instead of being truncated at its second "=".
    name_part, separator, label_part = row.partition("=")
    if not separator:
        # headers, terminators and blank rows carry no assignment
        continue
    # Remove the leading LABEL keyword as a whole token. str.strip("LABEL")
    # strips a *set of characters*, which would also eat the leading letters
    # of an unindented variable name made of A/B/E/L.
    name_tokens = name_part.split(None, 1)
    if len(name_tokens) == 2 and name_tokens[0] == "LABEL":
        var_name = name_tokens[1].strip()
    else:
        var_name = name_part.strip()
    var_descriptions[var_name] = label_part.replace("'", "")

In [None]:
# Show the variable name -> label lookup as the cell output for inspection.
var_descriptions

In [None]:
# Slice every fixed-width record into a {variable name: raw text} dictionary.
row_data = ascii_text.decode("utf-8").split("\r\n")
data = [
    {
        # SAS column positions are 1-based, Python slices are 0-based
        field["name"]: record[
            field["start"] - 1:field["start"] - 1 + field["size"]
        ].strip()
        for field in var_name_place
    }
    for record in row_data[:-1]  # the final split element is always empty
]

In [None]:
# build map between input variable names and format variable name
# (rows in the FORMAT section are shaped like "VARNAME FORMATNAME.")
input_format_lookup = {}
for row in std_sas_text[format_index:label_index-2]:
    split_row = row.split()
    # Drop the leading FORMAT keyword as a whole token. str.strip("FORMAT")
    # strips a *set of characters* from both ends, which would also eat the
    # leading/trailing letters of names made of F/O/R/M/A/T.
    if split_row[:1] == ["FORMAT"]:
        split_row = split_row[1:]
    # Skip anything that is not a "name format" pair: blank rows (which
    # previously slipped past the length check and raised IndexError),
    # single-token rows such as terminators, and "*" comment/header rows.
    if len(split_row) < 2 or "*" in split_row:
        continue
    input_format_lookup[split_row[0]] = split_row[1].strip(".")

In [None]:
import re

In [None]:
# Build a lookup from each SAS format name to a description of its VALUE
# statement: a code -> label map for categorical formats, or the numeric
# min/max (plus any coded exceptions) for binned formats.
RANGE_PATTERN = re.compile(r"\d - \d")
NEGATIVE_RANGE_PATTERN = re.compile(r"\d - -\d")
QUOTED_LABEL_PATTERN = re.compile("'(.*)'")


def _range_bounds(statement):
    """Return the numbers left of the first "=" in a range statement as floats.

    The " - " range separator becomes a plain space and currency/quote/comma
    decoration is removed before conversion. Raises ValueError if a remaining
    token is not numeric.
    """
    bounds_text = statement.replace(" - ", " ").split("=")[0]
    for decoration in ("$", "'", ","):
        bounds_text = bounds_text.replace(decoration, "")
    return [float(val) for val in bounds_text.split()]


def _parse_binned(statement_list):
    """Summarise a binned VALUE statement as dtype, min, max and exceptions."""
    # Reset for every variable. Previously the dtype leaked over from the
    # preceding binned variable (or raised NameError for the first one) when
    # none of the markers below matched.
    dtype = "binned_unknown"
    vals = []
    encode_map = {}
    for statement in statement_list:
        # classify bin type; later checks deliberately override earlier ones
        # (e.g. "WAGE" also contains "AGE" but must end up as currency)
        if any(marker in statement for marker in ("YEAR", "AGE", "WEIGHT")):
            dtype = "binned_int"
        if any(marker in statement for marker in ("$", "WAGE")):
            dtype = "binned_currency"
        if any(marker in statement for marker in ("DUID", "DUPERSID")):
            dtype = "binned_id"

        if RANGE_PATTERN.search(statement):
            vals.extend(_range_bounds(statement))
            continue

        # Rows without a quoted label (terminators, blank rows) yield no
        # tokens and fall through every branch instead of crashing.
        quoted = QUOTED_LABEL_PATTERN.search(statement)
        label_tokens = quoted.group(1).split() if quoted else []
        # handle zero mapping to zero
        if label_tokens and label_tokens[0] in {"0.00", "0.000000"}:
            vals.append(0)
        # handle negative number bins
        elif NEGATIVE_RANGE_PATTERN.search(statement):
            vals.extend(_range_bounds(statement))
        # handle simple exceptions: "<code> <description>"
        elif label_tokens:
            encode_map[label_tokens[0]] = " ".join(label_tokens[1:])

    return {
        "dtype": dtype,
        "min": min(vals),
        "max": max(vals),
        "encode_map": encode_map,
    }


def _parse_categorical(statement_list):
    """Return the code -> label map of a categorical VALUE statement."""
    encode_map = {}
    for statement in statement_list:
        # Split on the first "=" only so labels containing "=" stay intact;
        # rows without "=" are not mappings and are skipped.
        code, separator, label = statement.partition("=")
        if not separator:
            continue
        key = code.strip().replace("'", "")
        label = label.replace("'", "").strip()
        # Labels usually repeat the code as their first word; drop it only
        # when it really is that leading word, never from inside the label
        # (e.g. key "1" must not turn "10 OR MORE" into "0 OR MORE").
        if label == key:
            label = ""
        elif label.startswith(key + " "):
            label = label[len(key):].strip()
        encode_map[key] = label
    return encode_map


# group the rows of the VALUE section by variable
value_statement_groups = []
value_statement = []
for row in std_sas_text[value_index:]:
    if "*" in row:
        continue
    # A header row starts with the VALUE keyword. Matching the first token
    # (instead of any "VALUE" substring) keeps labels that merely contain
    # the word from being mistaken for a new group.
    if row.split()[:1] == ["VALUE"]:
        # store previous group
        value_statement_groups.append(value_statement)
        value_statement = [row]
    else:
        value_statement.append(row)
# get last value statement
value_statement_groups.append(value_statement)


# build proto- map: format name -> stripped statement rows
variable_statements_map = {}
for group in value_statement_groups:
    header_tokens = group[0].split() if group else []
    # Skip the preamble before the first VALUE header (it used to be stored
    # under a junk key) and headers that are missing their format name.
    if len(header_tokens) < 2 or header_tokens[0] != "VALUE":
        continue
    # group[1:-1] drops the header row and the group's final row
    # (presumably the terminating ";" -- confirm against the source file).
    variable_statements_map[header_tokens[1]] = [
        statement.strip() for statement in group[1:-1]
    ]

var_map = {}
for var, statement_list in variable_statements_map.items():
    # binned values: at least one statement carries a "<digit> - <digit>" range
    if any(RANGE_PATTERN.search(statement) for statement in statement_list):
        var_map[var] = _parse_binned(statement_list)
    # hot encoded
    else:
        var_map[var] = {
            "dtype": "categorical",
            "encode_map": _parse_categorical(statement_list),
        }
        


In [None]:
# Show the format name -> parsed value description lookup as the cell output.
var_map

In [None]:
# Decode the categorical variables of the first respondent into their labels.
# Non-categorical variables are left out of cleaned_resp. A KeyError is still
# raised for a variable without a format, a format missing from var_map, or a
# code that has no label even after leading zeros are removed.
cleaned_resp = {}
for var, value in data[0].items():
    format_var_name = input_format_lookup[var]
    var_params = var_map[format_var_name]
    if var_params["dtype"] != "categorical":
        continue
    encode_map = var_params["encode_map"]
    try:
        cleaned_resp[var] = encode_map[value]
    except KeyError:
        # Retry without zero padding. Only LEADING zeros are removed:
        # strip("0") also removed trailing ones, so "10" was looked up as
        # "1" and silently decoded to the wrong label.
        unpadded = value.lstrip("0")
        if value and not unpadded:
            # the value consisted solely of zeros, e.g. "00" -> "0"
            unpadded = "0"
        cleaned_resp[var] = encode_map[unpadded]
