In [None]:
import numpy as np
import pandas as pd

In [None]:
# General EDA

def calculate_missing(df: pd.DataFrame, show_only_missing: bool = True, format_pct: bool = True):
    """
    Summarize the missing values of each column of a DataFrame.

    Parameters:
    - df: DataFrame to inspect
    - show_only_missing: if True, keep only the columns that have at least
      one missing value, default is True
    - format_pct: if True, return a pandas Styler that renders `missing_pct`
      as a percentage; if False, return the plain DataFrame, default is True

    Returns:
    - DataFrame (or Styler when format_pct is True) indexed by `feature`,
      with columns `missing_count` and `missing_pct`, sorted by
      `missing_count` in descending order
    """
    df_missing = (
        df.isna().sum()
        .sort_values(ascending=False)
        .to_frame('missing_count')
        # Divide the count column itself (a Series) instead of the whole
        # frame, so the new column never depends on DataFrame-to-column
        # assignment semantics.
        .assign(missing_pct=lambda x: x['missing_count'] / len(df))
        .rename_axis('feature')
    )

    if show_only_missing:
        df_missing = df_missing[df_missing['missing_count'] > 0]

    if format_pct:
        # NOTE: this changes the return type to a Styler (display only).
        df_missing = df_missing.style.format('{:.2%}', subset='missing_pct')

    return df_missing


def compute_counts(df, col, by=None):
    """
    Compute absolute and relative value counts of a column, optionally per group.

    Parameters:
    - df: DataFrame holding the data
    - col: label of the column whose values are counted
    - by: optional grouping key(s) passed to `DataFrame.groupby`; None or an
      empty list/tuple means no grouping, default is None

    Returns:
    - DataFrame with columns `abs_count` (number of occurrences) and
      `rel_count` (proportion; computed within each group when `by` is
      given), indexed by the counted values (and by the group keys first
      when grouping)
    """
    # Test for None explicitly: a plain truthiness check would silently skip
    # grouping for falsy labels such as the column name 0, and would raise
    # for array-like keys whose truth value is ambiguous.
    no_grouping = by is None or (isinstance(by, (list, tuple)) and len(by) == 0)
    values = df[col] if no_grouping else df.groupby(by)[col]

    abs_count = values.value_counts(normalize=False).to_frame('abs_count')
    rel_count = values.value_counts(normalize=True).to_frame('rel_count')
    return pd.concat([abs_count, rel_count], axis=1)

In [1]:
# Association analysis

def cramers_v(x, y, bias_correction=True):
    """
    Compute Cramer's V statistic for measuring association between two categorical variables.

    Parameters:
    - x, y: two lists/arrays of categorical data
    - bias_correction: whether to apply bias correction, default is True

    Returns:
    - Cramer's V value; NaN when the statistic is undefined (one of the
      variables has a single category, or the bias-corrected denominator
      is zero)
    """
    # Imported locally so the function is self-contained.
    from scipy.stats import chi2_contingency

    contingency_table = pd.crosstab(x, y)
    # Cramer's V is defined on the plain Pearson chi-squared statistic.
    # scipy applies Yates' continuity correction to 2x2 tables by default,
    # which would keep V below 1 even for perfectly associated variables.
    chi2 = chi2_contingency(contingency_table, correction=False)[0]
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape

    # With a single row or column there is no association to measure and
    # the formulas below would divide by zero.
    if min(r, k) < 2:
        return np.nan

    phi2 = chi2 / n
    if bias_correction:
        phi2 = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)
        denominator = min(kcorr - 1, rcorr - 1)
    else:
        denominator = min(k - 1, r - 1)

    # The corrected denominator reaches zero when every observation falls
    # in its own category (n == r or n == k).
    if denominator <= 0:
        return np.nan

    return np.sqrt(phi2 / denominator)
