In [111]:
import pyreadr
import pandas as pd
import numpy as np
import tqdm
import random as rnd
from itertools import product


from sklearn.metrics import root_mean_squared_error


from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings("ignore")


from sklearn.impute import KNNImputer
from missforest import MissForest


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [112]:
result = pyreadr.read_r("spark_23.rda")

In [113]:
df = result["spark_23"]

In [114]:
df["Код основного вида деятельности (SPLIT)"] = df["Код основного вида деятельности"].apply(lambda x: x.split(".")[0] if pd.notna(x) else np.nan)

In [159]:
class RecoverMissingData:

    def __init__(self, df: pd.DataFrame, cat_cols: list[str], num_cols: list[str], share_of_missing: float):

        np.random.seed(42)

        self.df = df
        self.num_cols = num_cols # IF FILLING MISSING VALUES BY ROWS INPUT COLUMN NAMES OF ONE VARIABLE
        self.cat_cols = cat_cols # INPUT ONLY TWO OR THREE CATEGORICAL COLUMNS
        self.cols = cat_cols + num_cols

        self.num_of_cats = len(cat_cols)

        problist = [0 if share_of_missing * 100 > i else 1 for i in range(100)]
        np.random.shuffle(problist)
        self.problist = problist

# ===========================================================================================================        
# КОЛОНОЧНЫЕ МЕТОДЫ

    def TEST_fill_with_mean_by_column(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)
        
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()
        cat_cols = self.cat_cols

        if self.num_of_cats == 2:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):

                    fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j)]
                    comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j)]
                        
                    for num_col in self.num_cols:
                        m = np.nanmean(fix_in_loop_df[num_col])
                        fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 3: 
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):
                    for k in set(del_df[cat_cols[2]].to_list()):

                        fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j) & (del_df[cat_cols[2]] == k)]
                        comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j) & (new_df[cat_cols[2]] == k)]
                            
                        for num_col in self.num_cols:
                            m = np.nanmean(fix_in_loop_df[num_col])
                            fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                        fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                        comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)
        else: 
            raise ValueError("number of columns with categorical data must be 2 or 3")


        for num_col in tqdm.tqdm(self.num_cols):
            m = np.nanmean(fix_df[num_col].to_list())
            fix_df[num_col] = fix_df[num_col].apply(lambda x: x if pd.notna(x) else m)


        self.comp_df = comp_df.sort_index()
        self.fix_df = fix_df.sort_index()
        self.del_df = del_df
        pass

# ===========================================================================================================        

    def TEST_fill_with_median_by_column(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()
        cat_cols = self.cat_cols

        if self.num_of_cats == 2:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):

                    fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j)]
                    comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j)]
                        
                    for num_col in self.num_cols:
                        m = np.nanmedian(fix_in_loop_df[num_col])
                        fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 3: 
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):
                    for k in set(del_df[cat_cols[2]].to_list()):

                        fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j) & (del_df[cat_cols[2]] == k)]
                        comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j) & (new_df[cat_cols[2]] == k)]
                            
                        for num_col in self.num_cols:
                            m = np.nanmedian(fix_in_loop_df[num_col])
                            fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                        fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                        comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)
        else: 
            raise ValueError("number of columns with categorical data must be 2 or 3")

        for num_col in tqdm.tqdm(self.num_cols):
            m = np.nanmedian(fix_df[num_col].to_list())
            fix_df[num_col] = fix_df[num_col].apply(lambda x: x if pd.notna(x) else m)


        self.comp_df = comp_df.sort_index()
        self.fix_df = fix_df.sort_index()
        self.del_df = del_df
        pass

# ===========================================================================================================  
# ПОСТРОЧНЫЕ МЕТОДЫ      

    def TEST_fill_with_mean_by_row(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        col_m = np.mean(
            [j for i in self.num_cols for j in new_df[i].to_list()]
        )
        
        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        self.comp_df = new_df
        fix_dict = {}

        del_dict = del_df.to_dict("index")

        for i in tqdm.tqdm(del_dict.keys()):

            m = np.nanmean([del_dict[i][j] for j in self.num_cols]) 

            if pd.isna(m):
                m = col_m

            in_loop_dict = {i: {}}

            for col in self.cols:
                if pd.notna(del_dict[i][col]):
                    in_loop_dict[i][col] = del_dict[i][col]
                else:
                    in_loop_dict[i][col] = m

            fix_dict[i] = in_loop_dict[i]


        self.fix_df =  pd.DataFrame.from_dict(fix_dict, orient="index")
        self.del_df = del_df
        pass

# ===========================================================================================================  

    def TEST_fill_with_median_by_row(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        col_m = np.median(
            [j for i in self.num_cols for j in new_df[i].to_list()]
        )
        
        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        self.comp_df = new_df
        fix_dict = {}

        del_dict = del_df.to_dict("index")

        for i in tqdm.tqdm(del_dict.keys()):

            m = np.nanmedian([del_dict[i][j] for j in self.num_cols]) 

            if pd.isna(m):
                m = col_m

            in_loop_dict = {i: {}}

            for col in self.cols:
                if pd.notna(del_dict[i][col]):
                    in_loop_dict[i][col] = del_dict[i][col]
                else:
                    in_loop_dict[i][col] = m

            fix_dict[i] = in_loop_dict[i]


        self.fix_df =  pd.DataFrame.from_dict(fix_dict, orient="index")
        self.del_df = del_df
        pass

# ===========================================================================================================  
# МЕТОДЫ НА ОСНОВЕ МО

    def TEST_KNN_Imputer(self, n_neighbours: int, strategy_for_exceptions: str):

        if strategy_for_exceptions not in ["mean", "median"]:
            raise ValueError("Can only take 'median' or 'mean' as arguments.")
        if type(n_neighbours) != int:
            raise TypeError("n_neighbours must be 'int'")


        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()
        cat_cols = self.cat_cols

        if self.num_of_cats == 1:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):

                    fix_in_loop_df = del_df[del_df[cat_cols[0]] == i]
                    comp_in_loop_df = new_df[new_df[cat_cols[0]] == i]

                    try:
                        imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(fix_in_loop_df[self.num_cols])
                        imputed_values_index = fix_in_loop_df[self.num_cols].index
                        fix_in_loop_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols, index=imputed_values_index)  
                    except:
                        pass

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 2:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):

                    fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j)]
                    comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j)]

                    try:
                        imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(fix_in_loop_df[self.num_cols])
                        imputed_values_index = fix_in_loop_df[self.num_cols].index
                        fix_in_loop_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols, index=imputed_values_index)  
                    except:
                        pass

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 3:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):
                    for k in set(del_df[cat_cols[2]].to_list()):

                        fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j) & (del_df[cat_cols[2]] == k)]
                        comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j) & (new_df[cat_cols[2]] == k)]

                        try:
                            imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(fix_in_loop_df[self.num_cols])
                            imputed_values_index = fix_in_loop_df[self.num_cols].index
                            fix_in_loop_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols, index=imputed_values_index)  
                        except:
                            pass

                        fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                        comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        if strategy_for_exceptions == "mean":
            m_func = np.nanmean
        elif strategy_for_exceptions == "median":
            m_func = np.nanmedian

        for num_col in tqdm.tqdm(self.num_cols):
            m = m_func(fix_df[num_col].to_list())
            fix_df[num_col] = fix_df[num_col].apply(lambda x: x if pd.notna(x) else m)

        self.comp_df = comp_df.sort_index()
        self.fix_df = fix_df.sort_index()
        self.del_df = del_df
        pass 

# ===========================================================================================================     

    def TEST_KNN_wrapper(self, n_neighbours: int):

        if type(n_neighbours) != int:
            raise TypeError("n_neighbours must be 'int'")
        
        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        print("randomly deleting values")
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()

        print("imputing missing values")
        imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(del_df[self.num_cols])
        fix_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols)

        self.fix_df = fix_df
        self.comp_df = new_df[self.num_cols]
        self.del_df = del_df
        pass

# ===========================================================================================================    

    def TEST_Miss_Forest(self, classifier, regressor, early_stopping: bool, verbose: int, max_iter: int):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        print("randomly deleting values")
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()

        print("encoding categorical values")
        for i in self.cat_cols:
            encoded_col = pd.DataFrame({i: LabelEncoder().fit_transform(del_df[i])})
            del_df[i], comp_df[i] = encoded_col, encoded_col



        mf = MissForest(clf=classifier, rgr=regressor, categorical=self.cat_cols, early_stopping=early_stopping, verbose=verbose, max_iter=max_iter)
        imputed_values = mf.fit_transform(del_df[self.cols])
        fix_df[self.cols] = pd.DataFrame(fix_df, columns=self.cols)

        self.fix_df = fix_df
        self.comp_df = new_df
        self.del_df = del_df
        pass

# ===========================================================================================================    

    def TEST_Iterative_Imputer(self, estimator, missing_value, max_iter: int, n_nearest_features: int, tol: float, verbose: int, initial_strategy: str):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        print("randomly deleting values")
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        imputer = IterativeImputer(
            estimator=estimator,
            missing_values=missing_value,
            tol=tol,
            n_nearest_features=n_nearest_features,
            verbose=verbose,
            initial_strategy=initial_strategy,
            random_state=42
        )
        
        imputed_values = imputer.fit_transform(del_df[self.num_cols])


        self.fix_df = pd.DataFrame(imputed_values, columns=self.num_cols)
        self.comp_df = new_df[self.num_cols]
        self.del_df = del_df
        pass

# ===========================================================================================================       
   
    def TEST_error(self):
            
        coor_list = []

        for num_col in self.num_cols:
            for y in range(self.del_df.shape[0]):

                if pd.isna(self.del_df.loc[y, num_col]):
                    coor_list.append((y, num_col))

        true_list = []
        fill_list = []

        for coor in tqdm.tqdm(coor_list):

            comp_df_value = self.comp_df.loc[*coor]
            true_list.append(comp_df_value)

            fix_df_value = self.fix_df.loc[*coor]
            fill_list.append(fix_df_value)

        def res(col_name: str, true_list: list, fill_list: list):
            
            error_data_dict = {}

            error_data_dict["RMSE"] = "{:.8f}".format(float(root_mean_squared_error(true_list, fill_list)))
            error_data_dict["True mean"] = "{:.8f}".format(float(np.nanmean(true_list)))
            error_data_dict["Imputed mean"] = "{:.8f}".format(float(np.nanmean(fill_list)))
            error_data_dict["True median"] = "{:.8f}".format(float(np.nanmedian(true_list)))
            error_data_dict["Imputed median"] = "{:.8f}".format(float(np.nanmedian(fill_list)))
            error_data_dict["True std"] = "{:.8f}".format(float(np.std(true_list)))
            error_data_dict["Imputed std"] = "{:.8f}".format(float(np.std(fill_list)))

            error_data_df = pd.DataFrame(error_data_dict, index=[col_name], columns=error_data_dict.keys())
             
            return error_data_df

        error_data_df = pd.DataFrame()

        error_data_df = pd.concat([error_data_df, res("Вся выборка", true_list, fill_list)], ignore_index=False)

        for num_col in self.num_cols:
            
            error_data_df = pd.concat([error_data_df, res(num_col, self.comp_df[num_col].to_list(), self.fix_df[num_col].to_list())], ignore_index=False)

        
        return error_data_df.transpose()

In [160]:
share_of_misssing = 0.05

In [161]:
# df.to_markdown()

# КОЛОНОЧНЫЕ МЕТОДЫ

## СРЕДНЕЕ

In [162]:
mean_fill = RecoverMissingData(
    df=df,
    cat_cols=["Код основного вида деятельности (SPLIT)", "Размер компании"],
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

mean_fill.TEST_fill_with_mean_by_column()

100%|██████████| 20/20 [00:03<00:00,  6.62it/s]
100%|██████████| 4/4 [00:00<00:00, 18.66it/s]


In [164]:
error_df = mean_fill.TEST_error()
error_df

100%|██████████| 21216/21216 [00:00<00:00, 45331.65it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,724525045.1754906,124131308.85525584,97142540.25401792,199050981.33748096,201083879.99871236
True mean,184375318.6635558,153005575.34954363,164799499.25196153,211030183.2839865,241499337.5319732
Imputed mean,191078408.78055105,153489519.56136382,164947834.0822478,210842334.8091761,242394426.40030387
True median,18675500.0,15123000.0,17313000.0,20930000.0,23445000.0
Imputed median,31814565.3511811,17364000.0,19611000.0,23547000.0,25366000.0
True std,931955557.8478724,1231091684.355565,898382632.7889482,1556224091.6118104,2017886375.1195784
Imputed std,707108617.9794093,1225736105.8589325,894202424.7323325,1545496190.838882,2012981862.575771


In [169]:
mean_fill = RecoverMissingData(
    df=df, 
    cat_cols=["Регион регистрации", "Размер компании", "Код основного вида деятельности (SPLIT)"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

mean_fill.TEST_fill_with_mean_by_column()

100%|██████████| 84/84 [05:46<00:00,  4.12s/it]
100%|██████████| 4/4 [00:00<00:00, 18.50it/s]


In [170]:
error_df = mean_fill.TEST_error()
error_df

100%|██████████| 21216/21216 [00:00<00:00, 70978.99it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,810120183.6293031,136114766.8429329,108884766.07493916,227434032.38719463,221454327.8547341
True mean,184375318.6635558,153005575.34954363,164799499.25196153,211030183.2839865,241499337.5319732
Imputed mean,178968298.67327896,153072906.02113616,164597837.26366088,209971890.91301164,241611443.3928955
True median,18675500.0,15123000.0,17313000.0,20930000.0,23445000.0
Imputed median,31497551.94805195,16624000.0,18627285.71428571,22641000.0,24985000.0
True std,931955557.8478724,1231091684.355565,898382632.7889482,1556224091.6118104,2017886375.1195784
Imputed std,728585226.8677084,1226301530.7444735,894638823.2315477,1545281912.5383215,2014161285.3288448


## МЕДИАНА

In [172]:
median_fill = RecoverMissingData(
    df=df,
    cat_cols=["Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

median_fill.TEST_fill_with_median_by_column()

100%|██████████| 20/20 [00:03<00:00,  6.28it/s]
100%|██████████| 4/4 [00:00<00:00, 19.46it/s]


In [173]:
error_df = median_fill.TEST_error()
error_df

100%|██████████| 21216/21216 [00:00<00:00, 66804.37it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,725400856.1779953,127153572.20491192,101340425.81887916,203492233.02188283,193152885.79894948
True mean,184375318.6635558,153005575.34954363,164799499.25196153,211030183.2839865,241499337.5319732
Imputed mean,123067150.122549,150558621.3143538,162188027.2677103,207051156.70009515,238285204.39052624
True median,18675500.0,15123000.0,17313000.0,20930000.0,23445000.0
Imputed median,12474000.0,14095000.0,16080000.0,19590000.0,21734000.0
True std,931955557.8478724,1231091684.355565,898382632.7889482,1556224091.6118104,2017886375.1195784
Imputed std,433695245.97567487,1223193348.0069888,890771147.3224654,1539201055.589864,2005646818.8116827


In [177]:
median_fill = RecoverMissingData(
    df=df,
    cat_cols=["Регион регистрации", "Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

median_fill.TEST_fill_with_median_by_column()

100%|██████████| 84/84 [05:37<00:00,  4.02s/it]
100%|██████████| 4/4 [00:00<00:00, 18.59it/s]


In [178]:
error_df = median_fill.TEST_error()
error_df

100%|██████████| 21216/21216 [00:00<00:00, 55266.32it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,795545167.2982346,135985241.01435232,108909889.38777946,224148952.96186924,214192888.137794
True mean,184375318.6635558,153005575.34954363,164799499.25196153,211030183.2839865,241499337.5319732
Imputed mean,126822748.2128582,150717571.88279787,162304065.0572306,207219990.60878617,238591886.22669
True median,18675500.0,15123000.0,17313000.0,20930000.0,23445000.0
Imputed median,12885000.0,14327000.0,16307000.0,19782000.0,21948000.0
True std,931955557.8478724,1231091684.355565,898382632.7889482,1556224091.6118104,2017886375.1195784
Imputed std,551026145.7878662,1224274055.6108687,891782737.0164059,1542014316.2238526,2008102889.0741537


# ПОСТРОЧНЫЕ МЕТОДЫ 

## СРЕДНЕЕ

In [180]:
mean_by_row_fill = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

mean_by_row_fill.TEST_fill_with_mean_by_row()

100%|██████████| 111674/111674 [00:03<00:00, 36392.87it/s]


In [181]:
error_df = mean_by_row_fill.TEST_error()
error_df

100%|██████████| 22279/22279 [00:00<00:00, 71891.70it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,1148610968.3120587,497098473.8415812,60132891.70453614,76199780.84218523,81688131.82570301
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,179934624.3501784,152325392.9413209,159593624.33545175,202005239.49864253,227856657.96177185
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,19143000.0,14313500.0,15966500.0,18747000.0,20453500.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,1463913003.8824296,1335712519.3998234,883835105.9248381,1580300624.0114253,1966424435.273854


## МЕДИАНА

In [183]:
median_by_row_fill = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing 
) 

median_by_row_fill.TEST_fill_with_median_by_row()

100%|██████████| 111674/111674 [00:03<00:00, 31249.56it/s]


In [184]:
error_df = median_by_row_fill.TEST_error()
error_df

100%|██████████| 22279/22279 [00:00<00:00, 71916.82it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,406891574.30451447,123693096.02924192,58356849.58097465,83606367.61773998,85638172.10026193
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,166221389.8248575,150833815.26877788,159324056.01841968,201332686.52727133,227554562.0914268
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,17451000.0,14249500.0,15896000.0,18626500.0,20371000.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,884806901.5172191,1233047234.341076,883141873.7653435,1578872407.297444,1965943140.3569927


# МЕТОДЫ НА ОСНОВЕ МО

## ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ К-БЛИЖАЙШИХ

### К-БЛИЖАЙШИХ C РАЗБИЕНИЕМ НА КАТЕГОРИИ 

In [186]:
knn_with_median = RecoverMissingData(
    df=df, 
    cat_cols=["Регион регистрации"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)
 
knn_with_median.TEST_KNN_Imputer(n_neighbours=10, strategy_for_exceptions="median")

100%|██████████| 84/84 [00:07<00:00, 11.71it/s]
100%|██████████| 4/4 [00:00<00:00, 18.45it/s]


In [187]:
error_df = knn_with_median.TEST_error()
error_df 

100%|██████████| 22279/22279 [00:00<00:00, 74257.86it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,561696727.7913239,117165381.05207635,72599719.24430603,183960456.27603945,100514180.60939217
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,141570748.17007113,147036932.60657153,156718201.21445665,201208086.12740085,229164088.78004876
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,18494700.0,14188500.0,15775833.05,18778000.0,20780500.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,548564799.3712082,1210976906.9750245,872015967.0988938,1571542603.2431767,1965985061.3179293


In [189]:
knn_with_median = RecoverMissingData(
    df=df,
    cat_cols=["Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing    
) 

knn_with_median.TEST_KNN_Imputer(n_neighbours=3, strategy_for_exceptions="median")

100%|██████████| 20/20 [00:09<00:00,  2.02it/s]
100%|██████████| 4/4 [00:00<00:00, 18.40it/s]


In [190]:
error_df = knn_with_median.TEST_error()
error_df

100%|██████████| 21216/21216 [00:00<00:00, 74319.54it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,399920947.3690318,108230978.81285717,61443419.11425448,104665741.48232764,74276405.32441418
True mean,184375318.6635558,153005575.34954363,164799499.25196153,211030183.2839865,241499337.5319732
Imputed mean,172125064.11267692,152880743.0702831,164083733.24956492,209721546.34176528,241200529.5733989
True median,18675500.0,15123000.0,17313000.0,20930000.0,23445000.0
Imputed median,18938666.66666666,15171000.0,17288000.0,20905000.0,23472000.0
True std,931955557.8478724,1231091684.355565,898382632.7889482,1556224091.6118104,2017886375.1195784
Imputed std,838643634.2084689,1233517378.55994,896211753.2365692,1548691676.7248282,2015006098.5288928


In [192]:
knn_with_median = RecoverMissingData(
    df=df, 
    cat_cols=["Регион регистрации", "Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

knn_with_median.TEST_KNN_Imputer(n_neighbours=1, strategy_for_exceptions="median")

100%|██████████| 84/84 [06:00<00:00,  4.29s/it]
100%|██████████| 4/4 [00:00<00:00, 16.00it/s]


In [193]:
error_df = knn_with_median.TEST_error()
error_df

100%|██████████| 21216/21216 [00:00<00:00, 67591.64it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,731938360.0562851,107720792.24009994,87951173.1585137,239586137.99895737,174121377.92549834
True mean,184375318.6635558,153005575.34954363,164799499.25196153,211030183.2839865,241499337.5319732
Imputed mean,150733274.65195543,151643235.01150137,163489554.8203293,208481371.36714008,239997539.1027792
True median,18675500.0,15123000.0,17313000.0,20930000.0,23445000.0
Imputed median,16703500.0,15040000.0,17227500.0,20812000.0,23350000.0
True std,931955557.8478724,1231091684.355565,898382632.7889482,1556224091.6118104,2017886375.1195784
Imputed std,695911844.8449463,1225782641.8937647,893858485.399595,1547568581.292126,2010840692.5692167


### К-БЛИЖАЙШИХ БЕЗ РАЗБИЕНИЯ НА КАТЕГОРИИ 

In [195]:
knn = RecoverMissingData(
    df=df, 
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

knn.TEST_KNN_wrapper(n_neighbours=5)
# approximate runtime for ~106000x4 dataframe: 8:30 minutes

randomly deleting values
imputing missing values


In [196]:
error_df = knn.TEST_error()
error_df

100%|██████████| 22279/22279 [00:00<00:00, 74830.99it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,311947246.1854263,73029992.7650817,51973689.44436746,67251954.20743406,82801714.94083628
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,156157489.09753886,147828456.17518613,157549473.43207136,202215653.73328444,229443785.03586677
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,18397000.0,14174000.0,15779000.0,18756000.0,20766000.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,795122674.9804192,1215216587.5911686,875853558.3041371,1584442803.4085436,1968075546.1327353


## ПОТЕРЯННЫЙ(?) ЛЕС

In [None]:
mf = RecoverMissingData(
    df=df,
    cat_cols=["Регион регистрации", "Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

mf.TEST_Miss_Forest(
    classifier=RandomForestClassifier(n_jobs=-1), 
    regressor=RandomForestRegressor(n_jobs=-1), 
    early_stopping=True, 
    verbose=1, 
    max_iter=3
)

In [None]:
mf.TEST_error()

## ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ РАЗЛИЧНЫХ ОЦЕНЩИКОВ

### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ ГРАДИЕНТНОГО БУСТИНГА

In [198]:
gb = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

hgbr = HistGradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.1,
    random_state=42
)

gb.TEST_Iterative_Imputer(
    estimator=hgbr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=100,
    tol=0.0001,
    verbose=2,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.63
[IterativeImputer] Change: 38408891127.44298, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 2/10, elapsed time 1.29
[IterativeImputer] Change: 12973671919.16134, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 3/10, elapsed time 1.90
[IterativeImputer] Change: 1405984590.627125, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 4/10, elapsed time 2.50
[IterativeImputer] Change: 586398550.4316025, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 5/10, elapsed time 3.13
[IterativeImputer] Change: 613125167.076992, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 6/10, elapsed time 3.73
[IterativeImputer] Change: 503206227.1752751, scaled tolerance: 47314016.300000004 
[Iterative

In [199]:
error_df = gb.TEST_error()
error_df

100%|██████████| 22279/22279 [00:00<00:00, 74568.60it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,497246843.0283296,80320426.47568637,46481812.95744891,167418454.51629153,112634233.39251152
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,177892642.2123623,148458667.21544695,158225992.68417123,203190979.21502155,231497899.50950035
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,34957839.95898432,16158000.0,15979000.0,21591500.0,20967000.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,817025026.8920671,1215447638.848637,877881017.1727036,1576932716.7946737,1974764939.1256304


### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ СЛУЧАЙНОГО ЛЕСА

In [None]:
rf = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

rfr = RandomForestRegressor(
    n_estimators=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rf.TEST_Iterative_Imputer(
    estimator=rfr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=50,
    tol=0.0001,
    verbose=2,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Ending imputation round 1/10, elapsed time 71.24
[IterativeImputer] Change: 58106674100.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 2/10, elapsed time 143.75
[IterativeImputer] Change: 5391218150.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 3/10, elapsed time 217.20
[IterativeImputer] Change: 3192124380.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 4/10, elapsed time 291.58
[IterativeImputer] Change: 2568352020.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 5/10, elapsed time 364.28
[IterativeImputer] Change: 2918319814.619999, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 6/10, elapsed time 444.11
[IterativeImputer] Change: 3097893384.619999, scaled tolerance: 47314016.300000004 


### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ ЛИНЕЙНОГО МЕТОДА ОПОРНЫХ ВЕКТОРОВ

In [201]:
lsv = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

lsvr = LinearSVR(
    random_state=42
)

lsv.TEST_Iterative_Imputer(
    estimator=lsvr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=100,
    tol=0.0001,
    verbose=1,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Change: 71570588141.19485, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 10594018734.230152, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 689513757.634265, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 151838622.42597485, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 37667252.91960716, scaled tolerance: 47314016.300000004 
[IterativeImputer] Early stopping criterion reached.


In [202]:
error_df = lsv.TEST_error()
error_df

100%|██████████| 22279/22279 [00:00<00:00, 72078.35it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,686091734.083398,262768192.4921693,48157425.21751962,113519023.1828257,98266674.55676655
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,128096351.4617151,148182298.84197405,156581652.1627103,200196188.47329172,226479022.64282688
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,13496786.21116898,14077000.0,15596000.0,18464259.71339152,20131000.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,862879831.3394198,1240945845.636419,874055715.7529594,1575475871.2354279,1965907286.4556825


### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ К-БЛИЖАЙШИХ СОСЕДЕЙ

In [204]:
knn = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

knnr = KNeighborsRegressor(
    n_neighbors=10,
    algorithm="kd_tree",
    n_jobs=-1
)

knn.TEST_Iterative_Imputer(
    estimator=knnr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=1000,
    tol=0.000001,
    verbose=1,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Change: 44439533100.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5750283700.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5748791400.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5469151600.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5411445500.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5395528800.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 2607319200.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 4251017900.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5401608700.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 2607319200.0, scaled tolerance: 473140.163 


In [205]:
error_df = knn.TEST_error()
error_df

100%|██████████| 22279/22279 [00:00<00:00, 75808.69it/s]


Unnamed: 0,Вся выборка,"2019, Доходы, RUB","2020, Доходы, RUB","2021, Доходы, RUB","2022, Доходы, RUB"
RMSE,335009684.37897164,88909642.79825234,33428125.05681912,81819089.35290615,81691441.75535016
True mean,166924638.70618072,148012915.19900784,158004422.45994592,203098422.1175744,230069658.5141931
Imputed mean,172695372.24897882,148491492.90609452,158103394.78795153,203037836.83218837,230703957.01824328
True median,17058000.0,14037500.0,15763000.0,18733500.0,20622000.0
Imputed median,19767900.0,14225000.0,15855500.0,18841000.0,20829000.0
True std,861830672.3423774,1217101036.6190648,877861632.7582511,1588316407.5889184,1968458234.36784
Imputed std,846772780.3866515,1219706907.5639489,877335755.208827,1582411412.4706671,1970518046.2078123
