In [1]:
import os
# Move the process into the sibling `quafing` directory before importing the package.
# NOTE(review): presumably this is what lets `import quafing` resolve -- confirm.
# Every relative path used after this call is resolved against the new working directory.
os.chdir('../quafing/')
# Echo the resulting directory so the saved output records where the notebook ran.
print(f"Working directory: {os.getcwd()}")
import quafing as q


import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter


# Reset Matplotlib to its default settings, then layer the notebook-wide overrides on top.
plt.rcParams.update(plt.rcParamsDefault)

RC_OVERRIDES = {
    "font.size": 15,
    "figure.dpi": 100,
    "legend.fontsize": 13,
    "grid.alpha": 0.3,
    "axes.grid": True,
    "axes.axisbelow": True,
    "figure.figsize": (6, 5),
}
plt.rcParams.update(RC_OVERRIDES)

Working directory: /Users/charlesdupont/Desktop/Thesis/code/quafing


In [2]:
def load_data(path):
    """
    Read a Stata (.dta) file into a pandas DataFrame.

    Categorical conversion is switched off (``convert_categoricals=False``),
    so value-labelled columns are not turned into pandas categoricals.

    Parameters
    ----------
    path : location of the .dta file; passed straight to ``pd.read_stata``.

    Returns
    -------
    Whatever ``pd.read_stata`` returns for that file (a DataFrame).
    """
    read_options = {"convert_categoricals": False}
    frame = pd.read_stata(path, **read_options)
    return frame

In [3]:
# Root folder of the BCCASII data, given relative to the current working directory.
DATA_DIR = "../../BCCASII/"
# Sub-folder appended to DATA_DIR; its .dta files are the ones the notebook loads.
SUB_DATA_DIR = "Household/"

In [4]:
# Collect every household id ("hhid") that occurs in any .dta file of the data folder.
all_hhid = set()
for filename in os.listdir(DATA_DIR + SUB_DATA_DIR):
    # ignore anything in the folder that is not a Stata file
    if ".dta" not in filename:
        continue
    data = load_data(DATA_DIR + SUB_DATA_DIR + filename)
    # grow the set in place; the previous `.union(set(...))` built a brand-new
    # set (copying all ids seen so far) for every file
    all_hhid.update(data["hhid"].unique())

In [9]:
timea = time.time()

# For every household id: a list that receives one single-row DataFrame per .dta file
# (files are visited in sorted filename order, so list position == file position).
all_dfs = {hhid:[] for hhid in all_hhid}

for filename in sorted(os.listdir(DATA_DIR + SUB_DATA_DIR)):

    if ".dta" in filename:

        data = load_data(DATA_DIR + SUB_DATA_DIR + filename)

        # Split the file by household ONCE. Previously the whole table was
        # re-scanned with a boolean mask for every household id.
        # Rows keep their original order inside each group; rows whose hhid is
        # NaN belong to no group, which matches the old `==` comparison.
        rows_by_hhid = {key: rows for key, rows in data.groupby("hhid", sort=False)}
        value_columns = data.columns.drop("hhid")

        dataframes = {}
        max_cols = -1
        max_col_df = None

        for hhid in all_hhid:
            rows = rows_by_hhid.get(hhid)
            if rows is None:
                # household absent from this file: stand in a single all-NaN row
                df = pd.DataFrame({col:[np.nan] for col in value_columns})
            else:
                df = rows.drop(columns=["hhid"])

            # flatten to one wide row: the i-th record contributes columns "<col>_<i>"
            series = [df.iloc[i].rename({col:f"{col}_{i}" for col in df.columns}) for i in range(len(df))]
            df = pd.DataFrame(pd.concat(series)).T

            dataframes[hhid] = df

            # keep track of df with most entries
            num_cols = df.shape[1]
            if num_cols > max_cols:
                max_cols = num_cols
                max_col_df = df

        for hhid, df in dataframes.items():
            # NaN-fill the columns this household lacks compared with the widest one
            supplementary = pd.Series({col:np.nan for col in max_col_df if col not in df}, dtype="object")
            # `df` has exactly one row. `.iloc[0]` always yields a Series, whereas
            # `.squeeze()` collapses a 1x1 frame to a scalar and makes pd.concat raise.
            all_dfs[hhid].append(pd.DataFrame(pd.concat([df.iloc[0], supplementary])).T)

print(time.time()-timea)


# check that the dataframe sizes match up
# Take the number of processed files from any household instead of assuming that
# a household with id 1 exists (and cope with an empty `all_dfs`).
num_files = len(next(iter(all_dfs.values()), []))
for f in range(num_files):
    s = None
    for hhid, dfs in all_dfs.items():
        if s is None:
            # the first household visited provides the reference shape for file f
            s = dfs[f].shape
            continue
        else:
            if dfs[f].shape != s:
                print("error")

114.65274810791016
