In [None]:
import numpy as np
import pandas as pd
from typing import Union, Optional, List
from scipy.stats import chi2_contingency

In [None]:
# General EDA

def calculate_missing(df: pd.DataFrame, show_only_missing: bool = True, format_pct: bool = True):
    """
    Summarise the missing values of every column of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The data to inspect.
    show_only_missing : bool, optional
        If True, keep only the columns that have at least one missing value.
        By default True.
    format_pct : bool, optional
        If True, return a pandas ``Styler`` that renders 'missing_pct' as a
        percentage with two decimals; if False, return the plain DataFrame.
        By default True.

    Returns
    -------
    pd.DataFrame or pandas Styler
        One row per column of `df` (index named 'feature'), sorted by
        descending 'missing_count', where 'missing_pct' is 'missing_count'
        divided by the number of rows of `df`.
    """
    n_rows = len(df)
    df_missing = (
        df.isna().sum()
        .sort_values(ascending=False)
        .to_frame('missing_count')
        # Divide the count column itself (a Series) instead of the whole
        # frame, so the assignment does not depend on pandas accepting a
        # one-column DataFrame as a column value.
        .assign(missing_pct=lambda counts: counts['missing_count'] / n_rows)
        .rename_axis('feature')
    )

    if show_only_missing:
        df_missing = df_missing[df_missing['missing_count'] > 0]

    if format_pct:
        # NOTE: this turns the result into a Styler, not a DataFrame.
        df_missing = df_missing.style.format('{:.2%}', subset='missing_pct')

    return df_missing


import numpy as np
import pandas as pd
from typing import Union, Optional, List

def compute_counts(
    data: Union[pd.DataFrame, pd.Series, np.ndarray], 
    col: Optional[str] = None, 
    by: Optional[Union[str, List[str]]] = None
) -> pd.DataFrame:
    """
    Compute the absolute and relative frequency counts

    Parameters
    ----------
    data : Union[pd.DataFrame, pd.Series, np.ndarray]
        The input data. This can be a pandas DataFrame, Series, or a numpy array
        (subclasses of these types are accepted as well).
    col : Optional[str], optional
        The column name in the DataFrame for which to compute the counts. This parameter 
        is not used if the input data is a Series or numpy array. By default None.
    by : Optional[Union[str, List[str]]], optional
        The column name or list of column names to group the DataFrame by before computing counts. 
        When given, relative counts are normalised within each group.
        This parameter is only used if the input data is a DataFrame. By default None.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the absolute ('abs_count') and relative ('rel_count') frequency 
        counts of the values in the specified column (DataFrame input) or of the
        data itself (Series / numpy array input).

    Raises
    ------
    AssertionError
        If the input data is not a DataFrame, Series, or numpy array.
        If 'col' is None when input data is a DataFrame.
    """

    # Validate with explicit raises so the checks are not stripped under
    # `python -O`; AssertionError is kept because it is the documented contract.
    if not isinstance(data, (pd.DataFrame, pd.Series, np.ndarray)):
        raise AssertionError(
            'The input data should be a pandas DataFrame, a pandas Series, or a numpy array'
        )

    # Pick the object to count: a (possibly grouped) column of the dataframe,
    # or the series / array itself.
    if isinstance(data, pd.DataFrame):
        if col is None:
            raise AssertionError(
                'The `col` parameter should not be None when the input data is a dataframe'
            )
        values = data.groupby(by)[col] if by else data[col]
    else:
        values = data if isinstance(data, pd.Series) else pd.Series(data)

    abs_count = values.value_counts(normalize=False).to_frame('abs_count')
    rel_count = values.value_counts(normalize=True).to_frame('rel_count')

    # Both frames share the same index, so a column-wise concat aligns them.
    return pd.concat([abs_count, rel_count], axis=1)

In [1]:
# Association analysis

def cramers_v(x, y, data=None, bias_correction=True):
    """
    Compute Cramer's V statistic for measuring association between two categorical variables.

    The statistic is derived from the uncorrected Pearson chi-square of the
    contingency table of `x` and `y`. Yates' continuity correction is
    deliberately disabled: with it, a perfectly associated 2x2 table would
    yield a value below 1.

    Parameters:
    - x, y: two lists/arrays of categorical data or column names if data is provided
    - data: Optional pandas DataFrame
    - bias_correction: whether to apply bias correction, default is True

    Returns:
    - Cramer's V value. NaN is returned when the statistic is undefined, i.e.
      when either variable has fewer than two observed levels or when the
      bias-corrected denominator is zero.
    """
    if data is not None:
        x = data[x]
        y = data[y]

    contingency_table = pd.crosstab(x, y)
    # correction=False: SciPy applies Yates' correction by default on 2x2
    # tables, which biases Cramer's V downwards.
    chi2 = chi2_contingency(contingency_table, correction=False)[0]
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape

    # With a single row or column min(k-1, r-1) is zero and V is undefined;
    # return NaN explicitly instead of dividing by zero.
    if min(r, k) < 2:
        return np.nan

    phi2 = chi2 / n
    if not bias_correction:
        return np.sqrt(phi2 / min(k - 1, r - 1))

    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denom = min(kcorr - 1, rcorr - 1)
    # The corrected denominator vanishes when n equals the number of rows or
    # columns (one observation per level); the statistic is undefined there.
    if denom <= 0:
        return np.nan
    return np.sqrt(phi2corr / denom)


def goodman_kruskal_lambda(df, x_col, y_col):
    """
    Compute Goodman-Kruskal's lambda for two categorical columns in a dataframe.

    Lambda is the proportional reduction in the error of predicting `y_col`
    by its modal category once `x_col` is known:

        lambda = (sum_x max_y n_xy - max_y n_.y) / (N - max_y n_.y)

    where n_xy are the cell counts of the x-by-y contingency table, n_.y the
    totals per category of y, and N the number of rows. Only rows in which
    both columns are non-missing are used.

    Parameters:
    - df: pandas dataframe containing the data
    - x_col: name of the predictor variable column
    - y_col: name of the response variable column

    Returns:
    - lambda_value: Goodman-Kruskal's lambda as a float in [0, 1]; NaN when it
      is undefined (no complete rows, or y_col has a single observed category)
    """
    x = df[x_col]
    y = df[y_col]

    # Keep complete pairs only; positional arrays avoid any index alignment.
    complete = x.notna().to_numpy() & y.notna().to_numpy()
    if not complete.any():
        return float('nan')

    table = pd.crosstab(x.to_numpy()[complete], y.to_numpy()[complete])
    n = int(table.to_numpy().sum())

    # Correct guesses when always predicting the overall modal category of y.
    modal_overall = int(table.sum(axis=0).max())
    if modal_overall == n:
        # y is constant: there is no prediction error to reduce.
        return float('nan')

    # Correct guesses when predicting the modal category of y within each
    # level of x; these are summed over the levels, not maximised.
    modal_within = int(table.max(axis=1).sum())

    lambda_value = (modal_within - modal_overall) / (n - modal_overall)
    return lambda_value


# def goodman_kruskals_lambda(x, y, data=None):
#     """
#     Compute Goodman and Kruskal's Lambda for nominal data.
    
#     Parameters:
#     - x: list/array of predictor values or column name if data is provided
#     - y: list/array of target values or column name if data is provided
#     - data: Optional pandas DataFrame
    
#     Returns:
#     - lambda value
#     """
    
#     # Determine if data is provided or individual lists/arrays
#     if data is not None:
#         x = data[x]
#         y = data[y]
    
#     # Create a crosstab
#     ct = pd.crosstab(x, y)
    
#     # Proportion of errors without using the predictor
#     total_entries = ct.sum().sum()
#     max_in_each_row = ct.max(axis=1).sum()
#     pe = (total_entries - max_in_each_row) / total_entries
    
#     # Proportion of errors using the predictor
#     total_for_each_row = ct.sum(axis=1)
#     max_proportion_for_each_row = ct.max(axis=1) / total_for_each_row
#     po = 1 - max_proportion_for_each_row.mean()
    
#     # Compute lambda
#     lambda_val = (pe - po) / pe
#     return lambda_val
