In [23]:
import numpy as np
import pandas as pd
import pyarrow.feather as feather
import os
from sklearn.metrics import (
    classification_report, accuracy_score,
    roc_curve, auc, log_loss, roc_auc_score
)
from sklearn.preprocessing import label_binarize


from catboost import CatBoostClassifier


In [2]:
df_features_sample_load = pd.read_feather("data_for_students_features_sample_400cols.feather")
df_features_sample_load = df_features_sample_load.rename(columns = {'smt_decision_makerid_hash':'smt_decision_makerid'})

In [4]:
df_campaigns_load = pd.read_feather("for_students/campaigns.feather")

df_campaigns_load['report_date'] = df_campaigns_load['delivery_date'].apply(
    lambda x: x - pd.to_timedelta((x.weekday() + 1) % 7, unit='D')
)

In [31]:
df_targets_load = pd.read_feather("for_students/targets.feather")
df_targets_load['report_date'] = df_targets_load['report_date'].apply(
    lambda x: x - pd.to_timedelta((x.weekday() + 1) % 7, unit='D')
)
#df_targets_load['report_date'] = df_targets_load['report_date'].dt.date

# берём строки из duf только до 15.12, потому что таргет до конца 24
#df_targets_load = df_targets_load[df_targets_load.report_date <= pd.to_datetime('2024-12-15')]

# убираем повторяющиеся фичи юзера, которые получились в датасете в результате нескольких рассылок

df_targets_load.drop_duplicates(subset=['smt_decision_makerid', 'report_date'], inplace=True)

In [8]:
x_test_set_for_metrics_check =  pd.read_excel('x_test_set_for_metrics_check.xlsx')
x_val_set_for_params_tuning =  pd.read_excel('x_val_set_for_params_tuning.xlsx')

x_test_set_for_metrics_check['report_date'] = pd.to_datetime(x_test_set_for_metrics_check['report_date'])
x_val_set_for_params_tuning['report_date'] = pd.to_datetime(x_val_set_for_params_tuning['report_date'])
df_features_sample_load['report_date'] = pd.to_datetime(df_features_sample_load['report_date'])

# keys_metrics = set(zip(
#     x_test_set_for_metrics_check['smt_decision_makerid'],
#     x_test_set_for_metrics_check['report_date']
# ))
# keys_params = set(zip(
#     x_val_set_for_params_tuning['smt_decision_makerid'],
#     x_val_set_for_params_tuning['report_date']
# ))

# pairs = list(zip(
#     df_features_sample_load['smt_decision_makerid'],
#     df_features_sample_load['report_date']
# ))
# mask_metrics = [p in keys_metrics for p in pairs]
# mask_params  = [p in keys_params  for p in pairs]

# df_features_sample_for_metrics_check = df_features_sample_load[mask_metrics].reset_index(drop=True)
# df_features_sample_for_params_tuning  = df_features_sample_load[mask_params].reset_index(drop=True)
# df_features_sample  = df_features_sample_load[ [not (m or p) for m, p in zip(mask_metrics, mask_params)] ] \
#                     .reset_index(drop=True)

# print(f"Пересечение с metrics_check: {len(df_features_sample_for_metrics_check)} строк ({len(df_features_sample_for_metrics_check)/len(df_features_sample_load):.2%})")
# print(f"Пересечение с params_tuning: {len(df_features_sample_for_params_tuning)} строк ({len(df_features_sample_for_params_tuning)/len(df_features_sample_load):.2%})")
# print(f"Осталось для обучения     : {len(df_features_sample)} строк ({len(df_features_sample)/len(df_features_sample_load):.2%})")


df_features_sample = df_features_sample_load[df_features_sample_load.report_date < '2024-12-08']
df_features_sample_for_params_tuning = df_features_sample_load.merge(x_val_set_for_params_tuning, on = ['smt_decision_makerid', 'report_date'])

df_features_sample_for_metrics_check = df_features_sample_load.merge(x_test_set_for_metrics_check, on = ['smt_decision_makerid', 'report_date'])

print(f"Пересечение с metrics_check: {len(df_features_sample_for_metrics_check)} строк ({len(df_features_sample_for_metrics_check)/len(df_features_sample_load):.2%})")
print(f"Пересечение с params_tuning: {len(df_features_sample_for_params_tuning)} строк ({len(df_features_sample_for_params_tuning)/len(df_features_sample_load):.2%})")
print(f"Осталось для обучения     : {len(df_features_sample)} строк ({len(df_features_sample)/len(df_features_sample_load):.2%})")


Пересечение с metrics_check: 176233 строк (12.67%)
Пересечение с params_tuning: 135293 строк (9.73%)
Осталось для обучения     : 900578 строк (64.75%)


In [28]:
# мерж даатфреймов кампаний и юзеров
df_campaigns_load['report_date'] = pd.to_datetime(df_campaigns_load['report_date'], errors='coerce')
df_features_sample['report_date'] = pd.to_datetime(df_features_sample['report_date'], errors='coerce')

df_merged = pd.merge(df_campaigns_load, df_features_sample, on=["smt_decision_makerid", "report_date"], how="inner")
df_merged_test = pd.merge(df_campaigns_load, df_features_sample_for_params_tuning, on=["smt_decision_makerid", "report_date"], how="inner")
df_merged_val = pd.merge(df_campaigns_load, df_features_sample_for_metrics_check, on=["smt_decision_makerid", "report_date"], how="inner")

print("Merged dataset shape:", df_merged.shape)

# убираем скоррелированные фичи
numeric_cols = df_features_sample.select_dtypes(include=['float32']).columns
#numeric_cols = filter_high_correlation(df_merged, numeric_cols, threshold=0.95)
numeric_cols_filtered = numeric_cols

# df_merged[target_tags] = df_merged[target_tags].fillna('no_tag')
# df_merged_test[target_tags] = df_merged_test[target_tags].fillna('no_tag')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features_sample['report_date'] = pd.to_datetime(df_features_sample['report_date'], errors='coerce')


Merged dataset shape: (1087341, 379)


In [94]:
df_merged = df_merged_val

In [95]:
df_merged['delivery_date_merge'] = pd.to_datetime(df_merged['report_date'])
df_targets_load['delivery_date_targets'] = pd.to_datetime(df_targets_load['report_date'])

target_event_list = ['call', 'appointment', 'sale', 'reservation', 'deal']
df_targets_load.loc[df_targets_load.event == 'deal', 'event'] = 'sale'
df_targets_load = df_targets_load.query("event in @target_event_list")

df_join = df_merged.merge(
    df_targets_load,
    on='smt_decision_makerid',
    how='left'
)

# считаем разницу дней между событиями
df_join['delta_days'] = (
    df_join['delivery_date_targets']
    - df_join['delivery_date_merge']
).dt.days

# оставляем только события в [0,14] дня с момента delivery_date_merge
df_valid = df_join[
    (df_join['delta_days'] >= 0) &
    (df_join['delta_days'] <= 14)
].copy()

# для каждой исходной строки берём самое позднее событие (по delivery_date_targets)
df_valid = (
    df_valid
    .sort_values(['smt_decision_makerid','delivery_date_targets'], ascending=[True, False])
    .drop_duplicates(subset='smt_decision_makerid', keep='first')
)[['smt_decision_makerid', 'delivery_date_merge', 'delivery_date_targets', 'event', 'delta_days']]

df_merged_w_target = df_merged.merge(df_valid, on = ['smt_decision_makerid', 'delivery_date_merge'], how = 'left')

df_merged_w_target = df_merged_w_target.rename(columns = {'event':'event_target_14_day'})
df_merged_w_target['event_target_14_day'] = df_merged_w_target['event_target_14_day'].fillna('no_event')

# target_event_list = ['no_event', 'call', 'appointment', 'sale', 'reservation', 'deal']
# df_merged_w_target.loc[df_merged_w_target.query("event_target_14_day not in @target_event_list").index, 'event_target_14_day'] = 'no_event'
df_merged_w_target['event_target_14_day'] = df_merged_w_target['event_target_14_day'].fillna('no_event')

# сколько строк получили такое событие
matched = df_merged_w_target['delivery_date_targets'].notna().sum()
total   = len(df_merged_w_target)
print(f"Присоединились {matched} из {total} строк ({matched/total:.2%})")


Присоединились 14308 из 185111 строк (7.73%)


In [103]:
MODEL_DIR = "saved_models_timesplit"

target_tags = [
    'tag_channel',
    'tag_category',
    'tag_campaign_type',
    'tag_product',
    'tag_project_type',
    #'tag_project_region',
    'tag_discount_flag',
    'tag_chain_flag',
    'tag_model_flag',
    'target_event',
    'tag_realestate_flag',
    'event_target_14_day'
]

# тут подаются модели
models = {}
for tgt in target_tags:
    m = CatBoostClassifier()
    if tgt == 'event_target_14_day':
        MODEL_DIR = 'saved_models_timesplit'
    m.load_model(os.path.join(MODEL_DIR, f"catboost_{tgt}.cbm"))
    models[tgt] = m

df_predict = df_merged_w_target
X_pred     = df_predict[m.feature_names_]

results = df_predict.copy()

for tgt, model in models.items():
    raw_pred = model.predict(X_pred)
    pred = np.ravel(raw_pred)
    
    proba = model.predict_proba(X_pred)
    
    # Добавим колонку с предсказанным классом
    results[f"{tgt}_pred"] = pred
    
    # Добавим вероятности для каждого класса
    for cls_idx, cls in enumerate(model.classes_):
        results[f"{tgt}_proba_{cls}"] = proba[:, cls_idx]

display_cols = (
    ["smt_decision_makerid", "delivery_date"] +
    [c for c in results.columns if any(c.startswith(t) for t in target_tags)]
)
results

Unnamed: 0,smt_decision_makerid,delivery_date_x,hash,campaign_name,segment_name,ab_group,tag_channel,tag_category,tag_campaign_type,tag_product,...,target_event_proba_клик,tag_realestate_flag_pred,tag_realestate_flag_proba_not_realestate,tag_realestate_flag_proba_realestate,event_target_14_day_pred,event_target_14_day_proba_appointment,event_target_14_day_proba_call,event_target_14_day_proba_no_event,event_target_14_day_proba_reservation,event_target_14_day_proba_sale
0,340dd300a3093e8120264d701ca9f73c,2024-12-18,ef401fc6177a7592861c437cb6ce728b,[batch] [241203] [SMS] [МСК] Первичная недвижи...,CMPG-3843,control,sms,RealEstate,Active,realestate,...,0.031230,realestate,0.007569,0.992431,no_event,0.131431,0.188334,0.518386,0.069402,0.092447
1,c0be83012484d66019866e06d97b4927,2024-12-16,55129dae7f2b8bd58234d46dcbdecabb,[batch] [241205] [SMS call] Первичная недвижим...,DSML-790,target,sms,RealEstate,Sell,nbo,...,0.028490,realestate,0.000981,0.999019,no_event,0.181284,0.186912,0.537948,0.043438,0.050418
2,77873e820eb1c26b60600d39aba5ce59,2024-12-16,9c81fb5da5ee4318448829aec9dfd810,[batch] [241205] [SMS call] Первичная недвижим...,DSML-790,target,sms,RealEstate,Sell,nbo,...,0.038894,realestate,0.005171,0.994829,no_event,0.167612,0.209525,0.474349,0.055116,0.093398
3,43e8ce583566dec4c7bcd41993bfe27a,2024-12-17,94a33b125b24bcc1a782fb028856496c,[prod] [batch] [241202] [SMS] Первичная недвиж...,DSML-1466,target,sms,RealEstate,Active,mortgage,...,0.088828,realestate,0.000344,0.999656,sale,0.071009,0.064284,0.019654,0.226035,0.619019
4,7a2ce15332b5cb4c9033b4c62ce84213,2024-12-19,3ab43665ab5c68d98a4264a29ccaa04f,[batch] [241204] [SMS] ИЖС Цепочка 0-360,CMPG-3854,control,sms,RealEstate,Active,izhs,...,0.025198,realestate,0.013231,0.986769,no_event,0.183650,0.110385,0.465651,0.113997,0.126318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185106,3c8b83d1c65999cb51d43c4b2ddef0f2,2024-12-19,a8759dd372f645f880b90e0ffd21778b,batch_241216_Samolet_Bonus_Zhiteli_i_Ozhidayus...,CMPG-3913,target,email,RealEstate,Active,not_defined,...,0.338916,realestate,0.001835,0.998165,no_event,0.188253,0.149099,0.242921,0.215640,0.204087
185107,13ed88d99f4d2a37a10c7ce4ab3f6f22,2024-12-20,47fa93ae1563ba03f50728f041d9384a,[batch] [241217] [EMAIL] Первичная недвижимост...,CMPG-3687,target,email,RealEstate,Active,realestate,...,0.294699,realestate,0.001666,0.998334,no_event,0.099105,0.138221,0.571794,0.082114,0.108767
185108,e0a23b3d91acc62f324905c962d6c092,2024-12-19,f0aa55c4ab5fe4afea4630e5e44f8753,batch_241216_Samolet_Bonus_Zhiteli_i_Ozhidayus...,CMPG-3913,control,email,RealEstate,Active,not_defined,...,0.442903,realestate,0.004708,0.995292,reservation,0.098120,0.061385,0.171239,0.509378,0.159879
185109,e1b732abf935aa7b7724c16925669d94,2024-12-19,5228090e2a6e672cdee441a091d34252,batch_241216_Samolet_Bonus_Zhiteli_i_Ozhidayus...,CMPG-3913,target,email,RealEstate,Active,not_defined,...,0.453715,realestate,0.001503,0.998497,reservation,0.127523,0.101763,0.173153,0.360916,0.236645


In [104]:

target_event = 'event_target_14_day'  # имя колонки с флагом события
results[target_event] = results[target_event].fillna('no_event')

# остальные теги, без target_event
other_tags = [t for t in target_tags if t != target_event]
pred_other = [f"{t}_pred" for t in other_tags]
pred_event = f"{target_event}_pred"

# Число совпадений по другим тегам (вектор)
match_counts_other = (
    results[other_tags].values == results[pred_other].values
).sum(axis=1)
results['match_count_other'] = match_counts_other

# Печать распределения "k совпадений и более"
total = len(results)
m     = len(other_tags)

print(f"Всего строк: {total}\n")
print("Совпадения по остальным тегам (k и более):")
for k in range(1, m+1):
    cnt = np.sum(match_counts_other >= k)
    print(f" {k:2d} тегов: {cnt:6d} строк ({cnt/total:.2%})")

# Сколько правильно угадали self.target_event
event_matches = np.sum(
    results[target_event].values == results[pred_event].values
)
print(f"\nТочный прогноз события '{target_event}': {event_matches} из {total} строк "
      f"({event_matches/total:.2%})")

# Сколько строк, где и >50% остальных тегов, и событие угадано
half = m / 2
both = np.sum(
    (match_counts_other > half) &
    (results[target_event].values == results[pred_event].values)
)
print(f">50% тегов и правильно событие : {both} из {total} строк "
      f"({both/total:.2%})")


Всего строк: 185111

Совпадения по остальным тегам (k и более):
  1 тегов: 185111 строк (100.00%)
  2 тегов: 185111 строк (100.00%)
  3 тегов: 185111 строк (100.00%)
  4 тегов: 185111 строк (100.00%)
  5 тегов: 185102 строк (100.00%)
  6 тегов: 184913 строк (99.89%)
  7 тегов: 175568 строк (94.84%)
  8 тегов: 134218 строк (72.51%)
  9 тегов:  94982 строк (51.31%)
 10 тегов:  54091 строк (29.22%)

Точный прогноз события 'event_target_14_day': 125417 из 185111 строк (67.75%)
>50% тегов и правильно событие : 125327 из 185111 строк (67.70%)


In [105]:
from sklearn.metrics import classification_report

# Убедимся, что в колонке нет NaN
results['event_target_14_day'] = results['event_target_14_day'].fillna('no_event')

# Истинные и предсказанные метки
y_true = results['event_target_14_day']
y_pred = results['event_target_14_day_pred']

# Печатаем отчёт
print("Classification Report for event_target_14_day:\n")
print(classification_report(y_true, y_pred, digits=4))


Classification Report for event_target_14_day:

              precision    recall  f1-score   support

 appointment     0.0044    0.1114    0.0085       341
        call     0.2266    0.4554    0.3026     13396
    no_event     0.9659    0.6971    0.8098    170803
 reservation     0.0014    0.2167    0.0028       120
        sale     0.0226    0.4035    0.0427       451

    accuracy                         0.6775    185111
   macro avg     0.2442    0.3768    0.2333    185111
weighted avg     0.9077    0.6775    0.7692    185111



In [85]:
MODEL_DIR     = "saved_models_timesplit"
OPT_DIR       = "saved_models_timesplit"
metrics_dir   = "metrics_timesplit"

target_tags = [
    'tag_channel',
    'tag_category',
    'tag_campaign_type',
    'tag_product',
    'tag_project_type',
    #'tag_project_region',
    'tag_discount_flag',
    'tag_chain_flag',
    'tag_model_flag',
    'tag_realestate_flag',
    'event_target_14_day'
]

models = {}
for tgt in target_tags:
    model = CatBoostClassifier()
    dir_ = OPT_DIR if tgt == 'event_target_14_day' else MODEL_DIR
    model.load_model(os.path.join(dir_, f"catboost_{tgt}.cbm"))
    models[tgt] = model

df = df_merged_w_target.copy()
X_pred = df[models[target_tags[0]].feature_names_]
results = df.copy()

# делаем предсказания по всем моделям
for tgt, model in models.items():
    results[f"{tgt}_pred"] = model.predict(X_pred).ravel()
    # гарантируем порядок колонок proba по model.classes_
    proba_cols = [f"{tgt}_proba_{cls}" for cls in model.classes_]
    proba = model.predict_proba(X_pred)
    for idx, cls in enumerate(model.classes_):
        results[f"{tgt}_proba_{cls}"] = proba[:, idx]

records = []
for tgt in target_tags:
    model = models[tgt]
    y_true = results[tgt]
    y_pred = results[f"{tgt}_pred"]
    proba_cols = [f"{tgt}_proba_{cls}" for cls in model.classes_]
    y_score = results[proba_cols].values

    # Accuracy
    acc = accuracy_score(y_true, y_pred)
    # Logloss с явным указанием labels
    ll  = log_loss(y_true, y_score, labels=model.classes_)

    # ROC-AUC micro и macro
    y_true_bin = label_binarize(y_true, classes=model.classes_)
    try:
        roc_micro = roc_auc_score(y_true_bin, y_score,
                                  average="micro", multi_class="ovr")
        roc_macro = roc_auc_score(y_true_bin, y_score,
                                  average="macro", multi_class="ovr")
    except ValueError:
        roc_micro = roc_macro = None

    # Macro-F1
    cr = classification_report(y_true, y_pred, output_dict=True)
    f1_macro = cr.get("macro avg", {}).get("f1-score", None)

    records.append({
        "target": tgt,
        "accuracy": acc,
        "multiclass_logloss": ll,
        "roc_auc_micro": roc_micro,
        "roc_auc_macro": roc_macro,
        "f1_macro": f1_macro
    })

df_summary = pd.DataFrame(records).set_index("target").reindex(target_tags)

df_summary

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0_level_0,accuracy,multiclass_logloss,roc_auc_micro,roc_auc_macro,f1_macro
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tag_channel,0.833214,0.455188,,,0.302142
tag_category,0.999995,0.004929,,,0.499999
tag_campaign_type,0.864335,0.29791,,,0.447345
tag_product,0.5991,2.072481,,,0.161982
tag_project_type,0.960959,0.137591,,,0.63876
tag_discount_flag,0.991438,0.075058,,,0.49785
tag_chain_flag,0.634225,0.68435,,,0.633951
tag_model_flag,0.968354,0.129597,,,0.538963
tag_realestate_flag,0.9994,0.011182,,,0.49985
event_target_14_day,0.677523,1.129449,0.871372,0.744506,0.233279


In [52]:
results = results[results.event_target_14_day_pred != 'no_event']

target_event = 'event_target_14_day'  # имя колонки с флагом события
results[target_event] = results[target_event].fillna('no_event')

# остальные теги, без target_event
other_tags = [t for t in target_tags if t != target_event]
pred_other = [f"{t}_pred" for t in other_tags]
pred_event = f"{target_event}_pred"

# Число совпадений по другим тегам (вектор)
match_counts_other = (
    results[other_tags].values == results[pred_other].values
).sum(axis=1)
results['match_count_other'] = match_counts_other

# Печать распределения "k совпадений и более"
total = len(results)
m     = len(other_tags)

print(f"Всего строк: {total}\n")
print("Совпадения по остальным тегам (k и более):")
for k in range(1, m+1):
    cnt = np.sum(match_counts_other >= k)
    print(f" {k:2d} тегов: {cnt:6d} строк ({cnt/total:.2%})")

# Сколько правильно угадали self.target_event
event_matches = np.sum(
    results[target_event].values == results[pred_event].values
)
print(f"\nТочный прогноз события '{target_event}': {event_matches} из {total} строк "
      f"({event_matches/total:.2%})")

# Сколько строк, где и >50% остальных тегов, и событие угадано
half = m / 2
both = np.sum(
    (match_counts_other > half) &
    (results[target_event].values == results[pred_event].values)
)
print(f">50% тегов и правильно событие : {both} из {total} строк "
      f"({both/total:.2%})")


Всего строк: 3334

Совпадения по остальным тегам (k и более):
  1 тегов:   3334 строк (100.00%)
  2 тегов:   3334 строк (100.00%)
  3 тегов:   3334 строк (100.00%)
  4 тегов:   3334 строк (100.00%)
  5 тегов:   3334 строк (100.00%)
  6 тегов:   3334 строк (100.00%)
  7 тегов:   3183 строк (95.47%)
  8 тегов:   1735 строк (52.04%)
  9 тегов:    403 строк (12.09%)
 10 тегов:     14 строк (0.42%)

Точный прогноз события 'event_target_14_day': 2941 из 3334 строк (88.21%)
>50% тегов и правильно событие : 2941 из 3334 строк (88.21%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results[target_event] = results[target_event].fillna('no_event')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['match_count_other'] = match_counts_other


In [54]:
#а если предсказывать самый частый
import pandas as pd

# уже получили `results` с колонками <tgt> и <tgt>_pred
# вот ваши таргеты:
target_tags = [
    'tag_channel',
    'tag_category',
    'tag_campaign_type',
    'tag_product',
    'tag_project_type',
    'tag_discount_flag',
    'tag_chain_flag',
    'tag_model_flag',
    'target_event',
    'tag_realestate_flag',
    'event_target_14_day'
]

baselines = []
for tgt in target_tags:
    # истинные метки
    y_true = results[tgt]
    # наиболее частый класс
    mode = y_true.mode().iat[0]
    # accuracy базовой модели
    acc_base = (y_true == mode).mean()
    baselines.append({
        "target": tgt,
        "majority_class": mode,
        "baseline_accuracy": acc_base
    })

df_baseline = pd.DataFrame(baselines).set_index('target')
print(df_baseline)


                    majority_class  baseline_accuracy
target                                               
tag_channel                    sms           0.688662
tag_category            RealEstate           1.000000
tag_campaign_type           Active           0.891122
tag_product               mortgage           0.447211
tag_project_type          not_izhs           0.994301
tag_discount_flag     not_discount           1.000000
tag_chain_flag           not_chain           0.945111
tag_model_flag           not_model           0.665867
target_event           appointment           0.481104
tag_realestate_flag     realestate           1.000000
event_target_14_day           call           0.882124


In [59]:
# 1) Имя колонки с флагом события
target_event = 'event_target_14_day'

# 2) Заменяем NaN на 'no_event'
results[target_event] = results[target_event].fillna('no_event')

# 3) Остальные теги и соответствующие предикты
other_tags = [t for t in target_tags if t != target_event]
pred_other = [f"{t}_pred" for t in other_tags]
pred_event = f"{target_event}_pred"

# 4) Вычисляем, сколько совпадений по остальным тегам в каждой строке
match_counts_other = (results[other_tags].values == results[pred_other].values).sum(axis=1)
results['match_count_other'] = match_counts_other

# 5) Собираем таблицу "k совпадений и более"
total = len(results)
m     = len(other_tags)
table = []
for k in range(1, m+1):
    cnt = np.sum(match_counts_other >= k)
    pct = cnt / total * 100
    table.append({'k_or_more': k, 'count': cnt, 'percent': pct})

df_k = pd.DataFrame(table)
print("Совпадения по остальным тегам (k и более):")
print(df_k.to_string(index=False, formatters={'percent':'{:.2f}%'.format}))

# 6) Точное предсказание события
event_correct = np.sum(results[target_event] == results[pred_event])
event_pct     = event_correct / total * 100
print(f"\nТочный прогноз события '{target_event}': {event_correct} из {total} ({event_pct:.2f}%)")

# 7) >50% тегов и событие угадано
half        = m / 2
both_mask   = (match_counts_other > half) & (results[target_event] == results[pred_event])
both_count  = np.sum(both_mask)
both_pct    = both_count / total * 100
print(f">50% тегов и правильно событие: {both_count} из {total} ({both_pct:.2f}%)")


Совпадения по остальным тегам (k и более):
 k_or_more  count percent
         1 185111 100.00%
         2 185111 100.00%
         3 185111 100.00%
         4 185111 100.00%
         5 185110 100.00%
         6 184963  99.92%
         7 176420  95.30%
         8 140465  75.88%
         9  80511  43.49%
        10   9359   5.06%

Точный прогноз события 'event_target_14_day': 173474 из 185111 (93.71%)
>50% тегов и правильно событие: 173338 из 185111 (93.64%)


In [None]:
#вероятность дойти до sale

In [109]:
# Параметры задачи и финансовый эффект
price = 8_000_000  # средняя стоимость квартиры
total = 187839     # общее число примеров (support из отчёта)
support_sale = 451 # реальное число продаж
recall_sale = 0.49 # Recall для класса 'sale'
precision_sale = 0.02 # Precision для класса 'sale'

# Рассчитываем число истинно-положительных и предсказанных положительных
TP = support_sale * recall_sale
pred_pos = TP / precision_sale

# Базовая конверсия продаж при случайном обзвоне всех
baseline_rate = support_sale / total

# Выручка модели (обращаемся к TP лидам)
rev_model = TP * price

# Выручка при случайном обзвоне того же числа лидов
rev_random = pred_pos * baseline_rate * price

# Финансовый эффект
gain = rev_model - rev_random

import math
print(f"True Positives (TP)            : {TP:.0f} продаж")
print(f"Predicted Positives            : {pred_pos:.0f} лидов")
print(f"Baseline конверсия             : {baseline_rate:.4%}")
print(f"Выручка модели                 : {rev_model/1e9:.3f} млрд")
print(f"Выручка при случайном обзвоне  : {rev_random/1e9:.3f} млрд")
print(f"Дополнительный эффект (gain)   : {gain/1e9:.3f} млрд")


True Positives (TP)            : 221 продаж
Predicted Positives            : 11050 лидов
Baseline конверсия             : 0.2401%
Выручка модели                 : 1.768 млрд
Выручка при случайном обзвоне  : 0.212 млрд
Дополнительный эффект (gain)   : 1.556 млрд
