In [31]:
import numpy as np
import pandas as pd
import itertools

# Shared, seeded generator: create_member_df draws from this module-level object,
# so the synthetic member data is repeatable when the notebook is run top to bottom
# on a fresh kernel.
rng = np.random.default_rng(42)

import sys
import os
# Choose where the helper libraries live from the current working directory:
# anything outside the local checkout falls back to the two shared-drive folders.
if 'Users/derekdewald/Doc' not in os.getcwd():
    sys.path.extend([
        'K:\\INFORMATION_SYSTEMS\\Reporting and Analytics\\Derek\\BEEM_PY\\',
        'K:\\INFORMATION_SYSTEMS\\Reporting and Analytics\\Derek\\d_py_functions\\',
    ])
else:
    sys.path.append("/Users/derekdewald/Documents/Python/Github_Repo/d_py_functions")
    # Only this branch imports the helper; the shared-drive branch just extends sys.path.
    from feature_engineering import binary_complex_equivlance

In [20]:
def create_member_df(series, number_of_mbrs=180_000):
    """Build a synthetic member table with one uniform-random column per name.

    Parameters
    ----------
    series : object exposing ``.tolist()``
        Column names for the generated features (one column per entry).
    number_of_mbrs : int, default 180_000
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        ``MEMBERNBR`` (int32, 0..number_of_mbrs-1) as the first column,
        followed by one column of ``rng.uniform`` draws per name.

    Notes
    -----
    Values are drawn from the module-level ``rng`` in a single call, so the
    result depends on that generator's current state.
    """
    feature_names = series.tolist()
    n_features = len(feature_names)

    # One draw for the whole (rows x features) block.
    values = rng.uniform(size=(number_of_mbrs, n_features))
    member_ids = np.arange(number_of_mbrs, dtype=np.int32)

    members = pd.DataFrame(values, columns=feature_names)
    members.insert(0, "MEMBERNBR", member_ids)
    return members

def class_weight_combos(classes, weight_list=(0, 5, 10, 20, 40), total=100):
    """Yield every class -> weight assignment whose weights add up to ``total``.

    Each class independently takes a value from ``weight_list``; assignments
    are produced in ``itertools.product`` order and only those summing exactly
    to ``total`` are yielded, each as a ``dict`` keyed by class.
    """
    labels = list(classes)
    candidates = itertools.product(weight_list, repeat=len(labels))
    yield from (
        dict(zip(labels, weights))
        for weights in candidates
        if sum(weights) == total
    )

def within_weights_equal(features):
    """Return a ``{feature: weight}`` dict giving every feature the share ``1 / len(features)``.

    An empty ``features`` yields an empty dict (the division is never evaluated).
    """
    n_features = len(features)
    shares = {}
    for name in features:
        shares[name] = 1.0 / n_features
    return shares

def within_weights_double_one(features, f_double):
    """Return normalised within-class weights where one feature counts double.

    Every feature starts with raw weight 1.0, ``f_double`` gets 2.0, and the
    raw weights are divided by their sum so the result adds up to 1.

    Parameters
    ----------
    features : sequence of hashable
        Feature names of one class.
    f_double : hashable
        The feature to double; must be a member of ``features``.

    Returns
    -------
    dict
        ``{feature: weight}`` for every entry of ``features``.

    Raises
    ------
    ValueError
        If ``f_double`` is not in ``features``. Previously the unknown name was
        silently added to the denominator, so the returned weights no longer
        summed to 1.
    """
    if f_double not in features:
        raise ValueError(f"f_double {f_double!r} is not one of the features")

    raw = {f: (2.0 if f == f_double else 1.0) for f in features}
    denom = sum(raw.values())
    return {f: raw[f] / denom for f in features}

def build_scenario_df(
    class_attribute_dict,
    class_weight_values=(0, 5, 10, 20, 40),
    total=100,
    include_baseline=True,
    include_one_class_double=True,
):
    """Enumerate weighting scenarios as a long table, one row per (scenario, feature).

    For every class-weight combination produced by ``class_weight_combos`` the
    function emits:

    * one baseline scenario in which every class weights its features equally
      (when ``include_baseline``), and
    * one scenario per (class, feature) in which that single feature counts
      double inside its class while all other classes stay equal
      (when ``include_one_class_double``).

    Parameters
    ----------
    class_attribute_dict : dict
        Maps class name -> list of feature names.
    class_weight_values : iterable of numbers
        Candidate weights each class may take.
    total : number, default 100
        Required sum of the class weights; also the divisor used to turn a
        class weight into a fraction.
    include_baseline, include_one_class_double : bool
        Switch the two scenario families on or off.

    Returns
    -------
    pandas.DataFrame
        Columns ``scenario_id``, ``class_combo_id``, ``Class``, ``feature``,
        ``class_weight``, ``within_weight``, ``FinalWeight``, ``preset``.
        ``FinalWeight`` is ``class_weight * within_weight`` rescaled to sum to
        1 within each ``scenario_id``. An empty frame with these columns is
        returned when no scenario is produced.

    Raises
    ------
    ValueError
        If ``total`` is zero (class weights cannot be normalised), or if an
        unknown preset label is encountered.
    """
    if total == 0:
        raise ValueError("total must be non-zero to normalise class weights")

    columns = [
        "scenario_id", "class_combo_id", "Class", "feature",
        "class_weight", "within_weight", "FinalWeight", "preset",
    ]
    classes = list(class_attribute_dict.keys())

    # The preset layouts do not depend on the class-weight combination,
    # so build them once instead of once per combination.
    preset_maps = []
    if include_baseline:
        preset_maps.append({c: "equal" for c in classes})
    if include_one_class_double:
        for target_class in classes:
            for f_double in class_attribute_dict[target_class]:
                preset_by_class = {c: "equal" for c in classes}
                preset_by_class[target_class] = f"double__{f_double}"
                preset_maps.append(preset_by_class)

    def within_weights_for(feats, preset):
        # Translate a preset label into the within-class weight dict.
        if preset == "equal":
            return within_weights_equal(feats)
        if preset.startswith("double__"):
            return within_weights_double_one(feats, preset.split("double__", 1)[1])
        raise ValueError(f"Unknown preset: {preset}")

    scenario_rows = []
    scenario_id = 0
    combos = class_weight_combos(classes, weight_list=class_weight_values, total=total)
    for class_combo_id, cw in enumerate(combos):
        # Divide by ``total`` (not a fixed 100) so class_weight is a true
        # fraction for any requested total.
        cw_norm = {c: cw[c] / total for c in classes}

        for preset_by_class in preset_maps:
            for c in classes:
                feats = class_attribute_dict[c]
                preset = preset_by_class[c]
                w_in = within_weights_for(feats, preset)
                for f in feats:
                    scenario_rows.append({
                        "scenario_id": scenario_id,
                        "class_combo_id": class_combo_id,
                        "Class": c,
                        "feature": f,
                        "class_weight": cw_norm[c],
                        "within_weight": w_in[f],
                        "FinalWeight": cw_norm[c] * w_in[f],
                        "preset": preset,
                    })
            scenario_id += 1

    # Passing ``columns`` keeps the schema even when no rows were generated.
    scen = pd.DataFrame(scenario_rows, columns=columns)
    if scen.empty:
        return scen

    # Rescale so the weights of each scenario sum to 1 (NaN if a scenario's
    # weights sum to 0, e.g. when every weighted class has no features).
    scen["FinalWeight"] = scen["FinalWeight"] / scen.groupby("scenario_id")["FinalWeight"].transform("sum")
    return scen

def score_all_scenarios(df_clients, scenario_df, member_col="MEMBERNBR", feature_prefix=""):
    """Score every member under every scenario and return the result in long form.

    Parameters
    ----------
    df_clients : pandas.DataFrame
        Must contain ``member_col`` and one column ``feature_prefix + feature``
        for every feature named in ``scenario_df``.
    scenario_df : pandas.DataFrame
        Long table with columns ``scenario_id``, ``feature`` and ``FinalWeight``.
    member_col : str
        Name of the member identifier column.
    feature_prefix : str
        Prefix prepended to each feature name to find its column in ``df_clients``.

    Returns
    -------
    pandas.DataFrame
        Columns ``member_col``, ``scenario_id``, ``total_score`` with one row
        per (member, scenario); scenarios are in ascending ``scenario_id``
        order within each member.

    Notes
    -----
    The output has ``len(df_clients) * n_scenarios`` rows and the full score
    matrix is materialised first, so memory grows with that product.
    """
    all_features = scenario_df["feature"].drop_duplicates().sort_values().tolist()
    feature_cols = [feature_prefix + f for f in all_features]

    X = df_clients[feature_cols].to_numpy(dtype=np.float32)

    # aggfunc="sum": if a feature is listed more than once in a scenario its
    # weights add up (the default mean would silently shrink them).
    W_df = (
        scenario_df
        .pivot_table(index="scenario_id", columns="feature", values="FinalWeight",
                     aggfunc="sum", fill_value=0.0)
        .reindex(columns=all_features, fill_value=0.0)
        .sort_index()
    )
    W = W_df.to_numpy(dtype=np.float32)

    # Take the scenario labels from the pivot index so they line up with the
    # rows of W; scenario_df's appearance order need not match the sorted pivot.
    scen_ids = W_df.index.to_numpy()

    S = X @ W.T  # (n_members, n_scenarios)
    n_members, n_scenarios = S.shape

    out = pd.DataFrame({
        member_col: np.repeat(df_clients[member_col].to_numpy(), n_scenarios),
        "scenario_id": np.tile(scen_ids, n_members),
        "total_score": S.ravel(),
    })
    return out


In [21]:
# Class -> feature names. These strings become the column names of the synthetic
# member table and the `feature` values of the scenario table.
# NOTE(review): most liquidity features are spelled "liqudity_" while one is
# "liquidity_td_deposit"; left unchanged because the strings are used as data.
class_attribute_dict = {
    'liquidity': [
        'liqudity_ck_deposit',
        'liqudity_ck_sav_deposit',
        'liquidity_td_deposit',
        'liqudity_mutual_funds',
        'liqudity_options_trading',
    ],
    'liability': [
        'liability_loan',
        'liability_overdraft',
        'liability_credit_card',
        'liability_mortgage',
        'liability_options_trading',
    ],
    'complexity': [
        'complexity_factor1',
        'complexity_factor2',
        'complexity_factor3',
    ],
    'other_attributes': [
        'other_attributes1',
        'other_attributes2',
        'other_attributes3',
        'other_attributes4',
        'other_attributes5',
        'other_attributes6',
        'other_attributes7',
    ],
    'test_attribute': [
        'test_attributes1',
        'test_attributes2',
        'test_attributes3',
        'test_attributes4',
        'test_attributes5',
        'test_attributes6',
        'test_attributes7',
    ],
}

# Long (Class, Subclass) lookup: one row per feature.
col_df = (
    pd.Series(class_attribute_dict)
      .explode()
      .rename_axis("Class")
      .reset_index(name="Subclass")
)

# Synthetic members first, then the scenario grid.
df = create_member_df(col_df["Subclass"], number_of_mbrs=180_000)
scenario_df = build_scenario_df(class_attribute_dict)

In [71]:
# Preview the synthetic member table (MEMBERNBR plus one uniform-random column per feature).
df.head()

Unnamed: 0,MEMBERNBR,liqudity_ck_deposit,liqudity_ck_sav_deposit,liquidity_td_deposit,liqudity_mutual_funds,liqudity_options_trading,liability_loan,liability_overdraft,liability_credit_card,liability_mortgage,...,other_attributes5,other_attributes6,other_attributes7,test_attributes1,test_attributes2,test_attributes3,test_attributes4,test_attributes5,test_attributes6,test_attributes7
0,0,0.773956,0.438878,0.858598,0.697368,0.094177,0.975622,0.76114,0.786064,0.128114,...,0.063817,0.827631,0.631664,0.758088,0.354526,0.970698,0.893121,0.778383,0.194639,0.466721
1,1,0.043804,0.154289,0.683049,0.744762,0.96751,0.325825,0.37046,0.469556,0.189471,...,0.83226,0.804764,0.387478,0.288328,0.682496,0.139752,0.199908,0.007362,0.786924,0.664851
2,2,0.705165,0.780729,0.458916,0.568741,0.139797,0.11453,0.668403,0.471096,0.565236,...,0.408529,0.853403,0.233939,0.058303,0.281384,0.293594,0.661917,0.557032,0.783898,0.664314
3,3,0.406387,0.81402,0.166973,0.022712,0.090048,0.722359,0.461877,0.161272,0.501045,...,0.118006,0.961898,0.908581,0.699707,0.26587,0.969176,0.778751,0.71689,0.449362,0.272242
4,4,0.096391,0.902602,0.455776,0.202363,0.305957,0.57922,0.176773,0.856614,0.75852,...,0.493991,0.329861,0.144524,0.103403,0.587645,0.170593,0.92512,0.581061,0.34687,0.590915


In [72]:
# Preview the long scenario table: one row per (scenario_id, feature).
scenario_df.head()

Unnamed: 0,scenario_id,class_combo_id,Class,feature,class_weight,within_weight,FinalWeight,preset
0,0,0,liquidity,liqudity_ck_deposit,0.0,0.2,0.0,equal
1,0,0,liquidity,liqudity_ck_sav_deposit,0.0,0.2,0.0,equal
2,0,0,liquidity,liquidity_td_deposit,0.0,0.2,0.0,equal
3,0,0,liquidity,liqudity_mutual_funds,0.0,0.2,0.0,equal
4,0,0,liquidity,liqudity_options_trading,0.0,0.2,0.0,equal


In [30]:
# Wide view of the scenario grid: one row per (scenario_id, class_combo_id),
# one column per feature, each cell holding the summed FinalWeight.
scenario_df.pivot_table(index=['scenario_id','class_combo_id'],columns='feature',values='FinalWeight',aggfunc='sum')

Unnamed: 0_level_0,feature,complexity_factor1,complexity_factor2,complexity_factor3,liability_credit_card,liability_loan,liability_mortgage,liability_options_trading,liability_overdraft,liqudity_ck_deposit,liqudity_ck_sav_deposit,...,other_attributes5,other_attributes6,other_attributes7,test_attributes1,test_attributes2,test_attributes3,test_attributes4,test_attributes5,test_attributes6,test_attributes7
scenario_id,class_combo_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0.066667,0.066667,0.066667,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143
1,0,0.066667,0.066667,0.066667,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143
2,0,0.066667,0.066667,0.066667,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143
3,0,0.066667,0.066667,0.066667,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143
4,0,0.066667,0.066667,0.066667,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143,0.057143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3943,140,0.066667,0.066667,0.066667,0.08,0.08,0.08,0.08,0.08,0.08,0.08,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3944,140,0.066667,0.066667,0.066667,0.08,0.08,0.08,0.08,0.08,0.08,0.08,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3945,140,0.066667,0.066667,0.066667,0.08,0.08,0.08,0.08,0.08,0.08,0.08,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3946,140,0.066667,0.066667,0.066667,0.08,0.08,0.08,0.08,0.08,0.08,0.08,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [62]:
import numpy as np
import pandas as pd

def member_decile_hist_all_scenarios(df_clients, scenario_df, member_col="MEMBERNBR", feature_prefix="", batch_scenarios=64):
    """Count, per member, how often they land in each score decile across all scenarios.

    For every scenario the members are scored as the weighted sum of their
    features, ranked ascending (ties keep input order via a stable sort), and
    split into ten equal-sized rank bands. The function tallies how many
    scenarios put each member in each band.

    Parameters
    ----------
    df_clients : pandas.DataFrame
        Must contain ``member_col`` and one column ``feature_prefix + feature``
        for every feature named in ``scenario_df``.
    scenario_df : pandas.DataFrame
        Long table with columns ``scenario_id``, ``feature`` and ``FinalWeight``.
    member_col : str
        Name of the member identifier column.
    feature_prefix : str
        Prefix prepended to each feature name to find its column in ``df_clients``.
    batch_scenarios : int
        Number of scenarios scored per matrix product, bounding the size of
        the intermediate score block.

    Returns
    -------
    pandas.DataFrame
        ``member_col``, ``decile_1_count`` .. ``decile_10_count`` (1 = lowest
        scores, 10 = highest), ``n_scenarios`` and ``top_decile_rate``
        (``decile_10_count / n_scenarios``).
    """
    # Fixed feature order shared by the member matrix and the weight matrix.
    features = scenario_df["feature"].drop_duplicates().sort_values().tolist()
    X = df_clients[[feature_prefix + f for f in features]].to_numpy(dtype=np.float32)
    members = df_clients[member_col].to_numpy()

    # Weight matrix, one row per scenario in ascending scenario_id order.
    # aggfunc="sum": a feature listed more than once in a scenario contributes
    # the total of its weights (the default mean would shrink it).
    W_df = (scenario_df
            .pivot_table(index="scenario_id", columns="feature", values="FinalWeight",
                         aggfunc="sum", fill_value=0.0)
            .reindex(columns=features, fill_value=0.0)
            .sort_index())
    W = W_df.to_numpy(dtype=np.float32)

    n_members = X.shape[0]
    n_scenarios = W.shape[0]

    hist = np.zeros((n_members, 10), dtype=np.uint32)

    # Loop-invariant helpers.
    row_idx = np.arange(n_members)
    rank_values = np.arange(n_members, dtype=np.int64)

    # Score scenarios in batches to keep the (n_members x batch) block small.
    for start in range(0, n_scenarios, batch_scenarios):
        Sb = X @ W[start:start + batch_scenarios].T  # (n_members x batch)

        for j in range(Sb.shape[1]):
            order = np.argsort(Sb[:, j], kind="mergesort")
            ranks = np.empty(n_members, dtype=np.int64)
            ranks[order] = rank_values  # 0..n-1, ascending score
            # Integer binning on the 0-based rank: always within 0..9 and
            # equal-sized bands when n_members is a multiple of 10. The former
            # floor(10 * (rank + 1) / n) left the first band one member short
            # and pushed the extra member into the last band.
            dec = ranks * 10 // n_members
            hist[row_idx, dec] += 1

    out = pd.DataFrame(hist, columns=[f"decile_{i}_count" for i in range(1, 11)])
    out.insert(0, member_col, members)
    out["n_scenarios"] = n_scenarios
    out["top_decile_rate"] = out["decile_10_count"] / n_scenarios
    return out

# Tally decile placements for every member over the full scenario grid.
member_decile_hist = member_decile_hist_all_scenarios(df, scenario_df)

# Decile labels 1..10 and the matching count columns.
dec_values = np.arange(1, 11)
dec_cols = [f"decile_{i}_count" for i in range(1, 11)]

# Average decile = sum(decile * count) / number of scenarios.
member_decile_hist["avg_decile"] = (
    member_decile_hist[dec_cols].to_numpy() @ dec_values
) / member_decile_hist["n_scenarios"]


In [69]:
# NOTE(review): the saved output of this cell shows SEGMENT_INDEX and TEXT_SEGMENT
# columns that no code visible above creates; presumably they were added in place
# by a column_segmenter call run before this display. Confirm on a fresh
# Restart & Run All.
member_decile_hist

Unnamed: 0,MEMBERNBR,decile_1_count,decile_2_count,decile_3_count,decile_4_count,decile_5_count,decile_6_count,decile_7_count,decile_8_count,decile_9_count,decile_10_count,n_scenarios,top_decile_rate,avg_decile,SEGMENT_INDEX,TEXT_SEGMENT
0,0,0,0,0,0,0,0,16,196,1564,2172,3948,0.550152,9.492401,9,"10) GTE $8.89, LT $10.00"
1,1,445,910,794,614,507,386,215,72,5,0,3948,0.000000,3.566363,4,"5) GTE $3.33, LT $4.44"
2,2,1,259,636,621,718,945,709,59,0,0,3948,0.000000,4.966059,5,"6) GTE $4.44, LT $5.56"
3,3,462,864,810,692,521,396,185,18,0,0,3948,0.000000,3.497467,4,"5) GTE $3.33, LT $4.44"
4,4,455,616,664,661,609,454,352,135,2,0,3948,0.000000,3.965046,4,"5) GTE $3.33, LT $4.44"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179995,179995,0,3,132,559,737,851,748,616,298,4,3948,0.001013,6.158815,6,"7) GTE $5.56, LT $6.67"
179996,179996,965,509,515,416,474,399,210,257,200,3,3948,0.000760,3.878419,4,"5) GTE $3.33, LT $4.44"
179997,179997,0,0,0,0,27,193,600,470,885,1773,3948,0.449088,8.852077,8,"9) GTE $7.78, LT $8.89"
179998,179998,8,259,440,647,586,603,489,437,406,73,3948,0.018490,5.644630,6,"7) GTE $5.56, LT $6.67"


In [70]:
# NOTE(review): column_segmenter is neither defined nor imported in the cells shown
# here; confirm which module provides it so this cell runs on a fresh kernel.
# np.linspace(0, 10, 10) yields 10 edges, i.e. 9 bins about 1.11 wide; if unit-width
# bins were intended, 11 edges would be needed. TODO confirm.
column_segmenter(member_decile_hist,'avg_decile',bin_list=np.linspace(0,10,10))

Unnamed: 0_level_0,TEXT_SEGMENT,avg_decile,avg_decile,avg_decile,avg_decile
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,max,min
0,"10) GTE $8.89, LT $10.00",9.382814,12662,9.999747,8.889058
1,11) GT $10.00,10.0,75,10.0,10.0
2,"2) GTE $0.00, LT $1.11",1.045212,1136,1.110942,1.0
3,"3) GTE $1.11, LT $2.22",1.733416,13194,2.222138,1.111196
4,"4) GTE $2.22, LT $3.33",2.809191,21075,3.33308,2.222644
5,"5) GTE $3.33, LT $4.44",3.900035,26677,4.444276,3.333333
6,"6) GTE $4.44, LT $5.56",5.002953,29698,5.555471,4.444529
7,"7) GTE $5.56, LT $6.67",6.10787,28810,6.666413,5.555724
8,"8) GTE $6.67, LT $7.78",7.20375,26336,7.777609,6.666667
9,"9) GTE $7.78, LT $8.89",8.304441,20337,8.888804,7.777862
