In [1]:
from pathlib import Path
from warnings import warn
from typing import Sequence

import pandas as pd
import pyreadstat
from pandas_weighting import weight

# Attach pandas_weighting's `weight` as a `.weight(...)` method on both pandas
# containers so later cells can call `frame.weight(weights)` directly.
# NOTE(review): presumably this expands rows according to the weights (the
# helpers below de-duplicate the index afterwards) — confirm against the
# pandas_weighting documentation.
pd.Series.weight = weight
pd.DataFrame.weight = weight

In [2]:
# Relative location of the SPSS (.sav) survey file inside the project tree.
path = Path("pyech") / "data" / "HyP_2019_Terceros.sav"
# `read_sav` yields the data frame plus a metadata object; `metadata` is used
# later by `tabulate` to look up value labels.
df, metadata = pyreadstat.read_sav(path)

In [3]:
def tabulate(data, weights, index=None, columns=None, values=None, aggfunc="mean",
             totals=True, apply_labels=True):
    """Build a pivot table from `data` after applying `weights`.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It must contain a ``"numero"`` column in addition to the
        columns named by the other arguments.
    weights : str
        Name of the column in `data` passed to ``.weight(...)``.
    index, columns : str, optional
        Column names used as the rows / columns of the table. At least one of
        them is required.
    values : str, optional
        Column to aggregate. When omitted, rows are counted instead
        (`aggfunc` is replaced by ``len`` and a warning is issued).
    aggfunc : str or callable, default "mean"
        Aggregation forwarded to ``pandas.pivot_table``.
    totals : bool, default True
        Whether to add margins, labelled ``"Total"``.
    apply_labels : bool, default True
        Whether to rename row/column categories using
        ``metadata.variable_value_labels``. This reads the module-level
        ``metadata`` object, which must exist when the option is enabled.

    Returns
    -------
    pandas.DataFrame
        The pivot table.

    Raises
    ------
    ValueError
        If neither `index` nor `columns` is given.
    """
    if not index and not columns:
        raise ValueError("must have index or columns")

    selected = [name for name in (index, columns, values) if name]
    # "numero" is always carried along so that a non-key column is left to
    # aggregate when `values` is None. It is only appended when not already
    # requested: selecting it twice would create duplicate column labels and
    # break the pivot.
    if "numero" not in selected:
        selected.append("numero")
    weighted = data[selected].weight(data[weights])

    if not values:
        warn("`aggfunc` set to `len` since `values=None`.")
        aggfunc = len

    output = pd.pivot_table(data=weighted, values=values, index=index, columns=columns,
                            aggfunc=aggfunc, margins=totals, margins_name="Total")

    if apply_labels:
        value_labels = metadata.variable_value_labels
        # Rename without `inplace` so the pivot result is never mutated in place.
        if columns in value_labels:
            output = output.rename(value_labels[columns], axis=1)
        if index in value_labels:
            output = output.rename(value_labels[index], axis=0)
    return output

In [20]:
def ptiles(data, variable, weights, n=5, labels=False, by=None, result_weighted=False):
    """Assign each row to a weighted quantile bin of `variable`.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    variable : str
        Column whose quantile bins are computed.
    weights : str
        Name of the column in `data` passed to ``.weight(...)``.
    n : int, default 5
        Number of quantiles, forwarded to ``pandas.qcut`` as ``q``.
    labels : list or False, default False
        Forwarded to ``pandas.qcut``; ``False`` yields integer bin codes.
    by : str or sequence of str, optional
        Column(s) to group by; bins are then computed within each group.
        Any non-string sequence (list, tuple, ...) is accepted.
    result_weighted : bool, default False
        If True, return the result on the weighted frame as is. Otherwise keep
        only the first occurrence of each index label, which maps the result
        back onto the rows of `data` (this relies on `data` having a unique
        index).

    Returns
    -------
    pandas.Series
        Quantile bin for each row.
    """
    # Normalise `by` to a plain list: a tuple cannot be concatenated to a list
    # and would be treated by `groupby` as a single key.
    if isinstance(by, Sequence) and not isinstance(by, str):
        group_keys = list(by)
    else:
        group_keys = [by]
    group_keys = [key for key in group_keys if key]

    selected = [col for col in [variable] + group_keys if col]
    weighted = data[selected].weight(data[weights])

    if group_keys:
        output = weighted.groupby(group_keys)[variable].transform(pd.qcut, q=n, labels=labels)
    else:
        output = pd.qcut(weighted[variable], q=n, labels=labels)

    if result_weighted:
        return output
    # Drop repeated index labels so that one value per original row remains.
    return output.loc[~output.index.duplicated(keep="first")]

In [25]:
# Weighted quintiles of `HT11`, computed separately within each `nomdpto`
# group and stored as a new column on `df`.
df["ptiles"] = ptiles(
    data=df,
    variable="HT11",
    weights="pesoano",
    by=["nomdpto"],
    labels=[f"Quintil {n+1}" for n in range(5)],
)

In [26]:
# Weighted mean of `PT2` by `pobre06` (rows) and quintile (columns).
tabulate(
    df,
    weights="pesoano",
    index="pobre06",
    columns="ptiles",
    values="PT2",
    aggfunc="mean",
)

ptiles,Quintil 1,Quintil 2,Quintil 3,Quintil 4,Quintil 5,Total
pobre06,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No pobre,6629.045782,9577.991759,12751.733048,17285.244444,30333.436546,16040.899837
Pobre,2962.684559,3641.935159,3797.465086,2435.675501,,3138.07589
Total,5443.652826,9099.24921,12490.333859,17219.926143,30333.436546,14910.483192
