In [None]:
import pandas as pd
import numpy as np 
import os

import pyreadstat

wd = "/Users/annie.quinn/git/ford-hackathon/personal/annie/AO21"
filepath = os.path.join(wd, "AO21W1+2_CORE_RELEASE_V1(W)_NoID.zsav")
print(filepath)

# Initial read of the first 10,000 rows only; this pass exists to obtain `meta`
# (the value labels) so the market ids can be looked up by country name.
data, meta = pyreadstat.read_sav(filepath, row_limit=10000)

countries = ["Australia", "USA"]

# Value labels for the "Market" variable: {coded value -> label text}.
market_labels = meta.variable_value_labels["Market"]

# Coded values whose label is one of the selected countries. The order follows
# the label dict, not `countries`; it is only used for membership tests later.
market_ids = [code for code, label in market_labels.items() if label in countries]

# Build the country -> id mapping from the labels themselves. Zipping
# `countries` with `market_ids` pairs them by position and can attach the
# wrong id to a country when the two orders differ or a country is missing.
country_to_market_id = {
    label: code for code, label in market_labels.items() if label in countries
}
print(f"Market ids for selected countries {country_to_market_id}")

# Surface requested countries that have no matching label instead of silently
# filtering on fewer markets than intended.
countries_without_id = [name for name in countries if name not in country_to_market_id]
if countries_without_id:
    print(f"WARNING: no Market value label found for {countries_without_id}")

In [None]:
# Build a data dictionary for the extract: one row per column holding its
# label and its value-label mapping, then write it out as CSV.
col_desc = (
    pd.DataFrame([meta.column_names_to_labels])
    .melt(var_name="col", value_name="description")
)
col_values = (
    pd.DataFrame([meta.variable_value_labels])
    .melt(var_name="col", value_name="values")
)
col_desc_values = (
    col_desc
    .merge(col_values, on="col", how="outer")  # "col" is the only shared column
    .loc[lambda frame: frame["col"].isin(data.columns)]  # keep columns present in the data
    .reset_index(drop=True)
)
col_desc_values.to_csv(os.path.join(wd, "ao_col_desc_values.csv"))

In [None]:
# Read data from sav - focus on selected market ids
# The file is streamed in 10,000-row chunks; only the rows matching the filter
# below are kept from each chunk and collected in the list `df`.
df = []
reader = pyreadstat.read_file_in_chunks(pyreadstat.read_sav, filepath, chunksize=10000, multiprocess=True)
# The loop variable is `chunk` rather than `data` so the `data` frame loaded
# in the first cell (whose columns an earlier cell reads) is not overwritten.
for chunk_idx, (chunk, _) in enumerate(reader):
    print(f"Reading chunk {chunk_idx}")
    mask = (
        (chunk["Market"].isin(market_ids)) &  # In chosen market(s)
        ((chunk["PUR03_2_46"] == 1) | (chunk["PUR03_2_53"] == 1)) &  # Have bought a car in last 12-months
        (chunk["HV_Section"] == 2)  # Answered deep dive questions
    )
    # Filter first, then copy: only the surviving rows are duplicated instead
    # of deep-copying the whole chunk before discarding most of it.
    df.append(chunk.loc[mask].copy())

In [None]:
# Stack the filtered chunks into one frame with a fresh 0..n-1 index, and
# discard every column that contains nothing but missing values.
df_all = (
    pd.concat(df, ignore_index=True)
    .dropna(axis="columns", how="all")
)
df_all.shape

In [None]:
# Write the filtered survey rows to CSV in the working directory.
export_path = os.path.join(wd, "ao_data.csv")
df_all.to_csv(export_path)