# Общий код между ноутбуками

# Дискретизация

In [1]:
def discretize(data, variables, icat, icont, contdiscstrategy="kmeans", n_bins=3):
    """Encode every column of ``data`` as integer state codes.

    Continuous columns are binned with ``KBinsDiscretizer`` and categorical
    columns are encoded with ``OrdinalEncoder``; the result is cast to int.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. Every column is transformed when ``icat`` or ``icont``
        is None; otherwise only the columns listed in ``icat`` and ``icont``.
    variables : list
        Column labels, in the order the returned frame should have.
    icat : sequence of int or None
        Positional indices of categorical columns; None means the data has
        continuous columns only.
    icont : sequence of int or None
        Positional indices of continuous columns; None means the data has
        categorical columns only.
    contdiscstrategy : str
        Passed as ``strategy`` to ``KBinsDiscretizer``.
    n_bins : int
        Passed as ``n_bins`` to ``KBinsDiscretizer``.

    Returns
    -------
    encoded_data : pandas.DataFrame
        Integer-coded data restricted and ordered by ``variables``.
    transformers_data : dict
        ``"transformer"`` holds the fitted pipeline. Depending on the branch,
        ``"cont_features"`` / ``"cont_features_edges"`` and/or
        ``"cat_features"`` / ``"cat_features_categories"`` hold the fitted
        feature names, bin edges and category lists.
    """
    transformers_data = dict()

    if icat is None:  # only continuous features (and the predictor) in the dataset
        pipeline = make_pipeline(
            KBinsDiscretizer(n_bins=n_bins, encode="ordinal", random_state=42, strategy=contdiscstrategy),
            FunctionTransformer(lambda x: x.astype("int")))
    elif icont is None:  # only categorical features (and the predictor) in the dataset
        pipeline = make_pipeline(
            OrdinalEncoder(categories="auto"),
            FunctionTransformer(lambda x: x.astype("int")))
    else:
        # Both index lists are positional, so both selectors use .iloc.
        # (The continuous selector previously used label-based .loc, which
        # fails on frames whose column labels are not the integers 0..n-1.)
        pipeline = make_pipeline(
            make_union(
                make_pipeline(FunctionTransformer(lambda x: x.iloc[:, icat]),
                              OrdinalEncoder(categories="auto")),
                make_pipeline(FunctionTransformer(lambda x: x.iloc[:, icont]),
                              KBinsDiscretizer(n_bins=n_bins, encode="ordinal", random_state=42,
                                               strategy=contdiscstrategy)),
            ),
            FunctionTransformer(lambda x: x.astype("int")))

    pipeline.fit(data)

    if icat is None or icont is None:
        out_columns = data.columns
    else:
        # The union emits categorical columns first, then continuous ones.
        # list(...) keeps this a concatenation even for tuples/arrays.
        out_columns = data.columns[list(icat) + list(icont)]
    encoded_data = pd.DataFrame(pipeline.transform(data), columns=out_columns)
    # In the general case the pipeline permutes the features; restore the requested order here.
    encoded_data = encoded_data[variables]

    # NOTE(review): `entropy` comes from the enclosing module. If it is
    # scipy.stats.entropy, it is applied here to the raw state codes rather
    # than to state frequencies - TODO confirm this is intended.
    for feat in encoded_data.columns:
        if entropy(encoded_data[feat]) < 0.5:
            print(f"Warning: feature {feat} has practically degenerate states and low entropy")
    transformers_data["transformer"] = pipeline

    first_step = pipeline.steps[0][1]
    if icat is None:
        transformers_data["cont_features"]       = list(first_step.get_feature_names_out())
        transformers_data["cont_features_edges"] = first_step.bin_edges_

    elif icont is None:
        transformers_data["cat_features"]            = list(first_step.get_feature_names_out())
        transformers_data["cat_features_categories"] = first_step.categories_

    else:
        # The two sub-pipelines of the union are looked up by the names
        # "pipeline-1" (categorical) and "pipeline-2" (continuous); step [1]
        # of each is the fitted encoder / discretizer.
        cat_encoder = first_step.named_transformers["pipeline-1"].steps[1][1]
        cont_discretizer = first_step.named_transformers["pipeline-2"].steps[1][1]
        transformers_data["cont_features"]           = list(cont_discretizer.get_feature_names_out())
        transformers_data["cont_features_edges"]     = cont_discretizer.bin_edges_
        transformers_data["cat_features"]            = list(cat_encoder.get_feature_names_out())
        transformers_data["cat_features_categories"] = cat_encoder.categories_

    return encoded_data, transformers_data

#disc_data, pipeline_data = discretize(data, variables, icat=None, icont=[0, 1])


## Сборка таблиц условных вероятностей по байесовской сети с градациями

In [None]:
def collect_all_cpds(bn_info, distributions, n_states_map):
    """Build one ``TabularCPD`` per row of ``bn_info``.

    Parameters
    ----------
    bn_info : pandas.DataFrame
        One row per node. ``row["name"]`` must expose a ``.name`` attribute
        (used as the variable name) and ``row["parents"]`` is the sized
        collection of parent names.
    distributions : dict
        ``distributions[node]["cprob"]`` is a flat sequence of probabilities
        for parentless nodes, and a mapping whose values are per-configuration
        probability sequences for nodes with parents.
    n_states_map : dict
        Number of states for each node name.

    Returns
    -------
    list
        The constructed ``TabularCPD`` objects, in row order.
    """
    tables = []
    for _, node_row in bn_info.iterrows():
        node = node_row["name"].name
        if len(node_row["parents"]) == 0:
            # No parents: the CPD is a plain distribution, one row per state.
            state_rows = [[prob] for prob in distributions[node]["cprob"]]
            table = TabularCPD(node, n_states_map[node], state_rows)
        else:
            # One probability vector per parent configuration, in mapping order.
            per_config = list(distributions[node]["cprob"].values())
            n_states = len(per_config[0])
            # Transpose so that rows are indexed by state and columns by
            # parent configuration.
            state_rows = [[config[s] for config in per_config] for s in range(n_states)]
            table = TabularCPD(
                node,
                n_states_map[node],
                state_rows,
                evidence=node_row["parents"],
                evidence_card=[n_states_map[parent] for parent in node_row["parents"]],
            )
        tables.append(table)
    return tables

In [None]:
def bootstrap_sampling(data,
                       d_dict,  # group name -> (list of features, list of states)
                       b_sample_size,
                       metrics_list, trials=1000, alpha=0, incl_random_removal=False,
                       mode='regr',
                       drop_mode='random'):
    """Bootstrap the effect of dropping selected rows on model metrics.

    For every trial a bootstrap sample is drawn from ``data``. For each group
    in ``d_dict`` (plus ``'init'`` and optionally ``'random'``), a copy of the
    model is refitted on the sample with that group's rows removed, and every
    metric is evaluated on the rows it was fitted on.

    The function reads these names from the enclosing module: ``model``,
    ``features``, ``target`` and ``pipeline_data`` (its ``"transformer"``
    entry is applied to the sample with ``target`` replaced by an
    ``"ape_error"`` column).

    Parameters
    ----------
    data : pandas.DataFrame
        Source data; rows are drawn by index label with replacement.
    d_dict : dict
        For each group ``g``, ``d_dict[g][0]`` lists discretized feature
        names and ``d_dict[g][1]`` the matching state codes. A row belongs to
        the group if any (feature, state) pair matches.
    b_sample_size : int
        Number of labels drawn per trial.
    metrics_list : list of callables
        Each is called as ``metric(y_true, y_pred)``.
    trials : int
        Number of bootstrap iterations.
    alpha : float
        Fraction of the matched rows that is kept; ``floor(n * (1 - alpha))``
        labels are selected for removal.
    incl_random_removal : bool
        If true, also evaluate a ``'random'`` group that removes as many
        randomly chosen labels as the ``'bn'`` group removed in this trial.
    mode : str
        Currently unused.
    drop_mode : {'random', 'metric'}
        ``'random'`` removes matched rows at random; ``'metric'`` removes the
        matched rows with the largest ``ape_error``.

    Returns
    -------
    dict
        ``'metrics'``: per group, one list of values per metric;
        ``'n_dropped'``: per group, number of labels selected for removal in
        each trial; ``'avg_n_dropped'``: their mean, or -1 if none recorded.

    Raises
    ------
    ValueError
        If a group from ``d_dict`` is processed with an unknown ``drop_mode``.
    """
    n_random = 0

    metrics_results = {
        k: [list() for _ in range(len(metrics_list))] for k in d_dict.keys()
    }
    metrics_results['init'] = [list() for _ in range(len(metrics_list))]

    # 'random' is inserted last so that it runs after 'bn' has set n_random.
    if incl_random_removal:
        metrics_results['random'] = [list() for _ in range(len(metrics_list))]

    n_dropped_stats = {k: list() for k in metrics_results.keys()}

    # Bootstrap the sample to assess the significance of differences in regression errors.
    np.random.seed(42)
    for _ in tqdm(range(trials)):
        indexes = np.random.choice(data.index, size=b_sample_size)
        bsample = data.loc[indexes]

        model_base = copy(model)
        model_base.fit(bsample[features], bsample[target])

        y_pred = model_base.predict(bsample[features])  # depends on task (regr/classif)

        data_errors = bsample.copy(deep=True)
        data_errors["ape_error"] = np.abs((bsample[target]-y_pred)/bsample[target])
        data_errors.drop(columns=[target], inplace=True)

        # data_errors is already row-aligned with bsample. Re-selecting it
        # with .loc[indexes] would repeat every duplicated bootstrap label
        # once more and break the one-to-one row correspondence.
        # NOTE(review): column labels assume the transformer keeps the input
        # column order - confirm for the mixed categorical/continuous case.
        bsample_disc = pd.DataFrame(pipeline_data["transformer"].transform(data_errors),
                                    columns=data_errors.columns)

        for g in metrics_results.keys():

            if g == 'init':
                samp = bsample
            elif g == 'random':
                indexes_to_drop = np.random.choice(bsample.index, size=n_random, replace=False)
                samp = bsample.drop(index=indexes_to_drop)
            else:
                # Positional boolean mask: bsample_disc has a fresh RangeIndex
                # while bsample/data_errors are indexed by the sampled labels,
                # so label alignment between them must be avoided.
                mask = np.zeros(len(bsample), dtype=bool)
                for feat, cat in zip(d_dict[g][0], d_dict[g][1]):
                    mask |= (bsample_disc[feat] == cat).to_numpy()

                if drop_mode == 'random':
                    candidates = bsample[mask]
                    indexes_to_drop = np.random.choice(
                        candidates.index,
                        size=int(np.floor(candidates.shape[0]*(1-alpha))),
                        replace=False)
                elif drop_mode == 'metric':
                    errors = data_errors.loc[mask, 'ape_error']
                    errors_sorted = errors.sort_values(ascending=False)
                    size = int(np.floor(errors.shape[0]*(1-alpha)))
                    # With size == 0 the threshold is NaN and nothing is selected.
                    thresh = errors_sorted.iloc[:size].min()
                    indexes_to_drop = errors[errors >= thresh].index
                else:
                    raise ValueError(
                        f"Unknown drop_mode {drop_mode!r}; expected 'random' or 'metric'")

                # NOTE(review): drop() removes every row carrying a selected
                # label, so duplicated bootstrap labels are removed together.
                samp = bsample.drop(index=indexes_to_drop)
                if g == 'bn':
                    n_random = indexes_to_drop.shape[0]

                n_dropped_stats[g].append(indexes_to_drop.shape[0])

            X1, y1 = samp[features], samp[target]

            model1 = copy(model)
            model1.fit(X1, y1)

            y1_pred = model1.predict(X1)

            for k, metric in enumerate(metrics_list):
                metrics_results[g][k].append(metric(y1, y1_pred))

    return {'metrics': metrics_results,
            'n_dropped': n_dropped_stats,
            'avg_n_dropped': {k: sum(v)/len(v) if len(v) > 0 else -1 for k, v in n_dropped_stats.items()}}