In [1]:
import pyreadr
import pandas as pd
import numpy as np
import tqdm
import random as rnd
from itertools import product


from sklearn.metrics import root_mean_squared_error


from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor


from sklearn.preprocessing import LabelEncoder


import warnings
warnings.filterwarnings("ignore")


from sklearn.impute import KNNImputer
from missforest import MissForest


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
result = pyreadr.read_r("spark_23.rda")

In [3]:
df = result["spark_23"]

In [4]:
df["Код основного вида деятельности (SPLIT)"] = df["Код основного вида деятельности"].apply(lambda x: x.split(".")[0] if pd.notna(x) else np.nan)

In [22]:
class RecoverMissingData:

    def __init__(self, df: pd.DataFrame, cat_cols: list[str], num_cols: list[str], share_of_missing: float):

        np.random.seed(42)

        self.df = df
        self.num_cols = num_cols # IF FILLING MISSING VALUES BY ROWS INPUT COLUMN NAMES OF ONE VARIABLE
        self.cat_cols = cat_cols # INPUT ONLY TWO OR THREE CATEGORICAL COLUMNS
        self.cols = cat_cols + num_cols

        self.num_of_cats = len(cat_cols)

        problist = [0 if share_of_missing * 100 > i else 1 for i in range(100)]
        np.random.shuffle(problist)
        self.problist = problist

# ===========================================================================================================        
# КОЛОНОЧНЫЕ МЕТОДЫ

    def TEST_fill_with_mean_by_column(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)
        
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()
        cat_cols = self.cat_cols

        if self.num_of_cats == 2:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):

                    fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j)]
                    comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j)]
                        
                    for num_col in self.num_cols:
                        m = np.nanmean(fix_in_loop_df[num_col])
                        fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 3: 
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):
                    for k in set(del_df[cat_cols[2]].to_list()):

                        fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j) & (del_df[cat_cols[2]] == k)]
                        comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j) & (new_df[cat_cols[2]] == k)]
                            
                        for num_col in self.num_cols:
                            m = np.nanmean(fix_in_loop_df[num_col])
                            fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                        fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                        comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)
        else: 
            raise ValueError("number of columns with categorical data must be 2 or 3")


        for num_col in tqdm.tqdm(self.num_cols):
            m = np.nanmean(fix_df[num_col].to_list())
            fix_df[num_col] = fix_df[num_col].apply(lambda x: x if pd.notna(x) else m)


        self.comp_df = comp_df.sort_index()
        self.fix_df = fix_df.sort_index()
        self.del_df = del_df
        pass

# ===========================================================================================================        

    def TEST_fill_with_median_by_column(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()
        cat_cols = self.cat_cols

        if self.num_of_cats == 2:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):

                    fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j)]
                    comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j)]
                        
                    for num_col in self.num_cols:
                        m = np.nanmedian(fix_in_loop_df[num_col])
                        fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 3: 
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):
                    for k in set(del_df[cat_cols[2]].to_list()):

                        fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j) & (del_df[cat_cols[2]] == k)]
                        comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j) & (new_df[cat_cols[2]] == k)]
                            
                        for num_col in self.num_cols:
                            m = np.nanmedian(fix_in_loop_df[num_col])
                            fix_in_loop_df[num_col] = fix_in_loop_df[num_col].apply(lambda x: x if pd.notna(x) else m)

                        fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                        comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)
        else: 
            raise ValueError("number of columns with categorical data must be 2 or 3")

        for num_col in tqdm.tqdm(self.num_cols):
            m = np.nanmedian(fix_df[num_col].to_list())
            fix_df[num_col] = fix_df[num_col].apply(lambda x: x if pd.notna(x) else m)


        self.comp_df = comp_df.sort_index()
        self.fix_df = fix_df.sort_index()
        self.del_df = del_df
        pass

# ===========================================================================================================  
# ПОСТРОЧНЫЕ МЕТОДЫ      

    def TEST_fill_with_mean_by_row(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        col_m = np.mean(
            [j for i in self.num_cols for j in new_df[i].to_list()]
        )
        
        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        self.comp_df = new_df
        fix_dict = {}

        del_dict = del_df.to_dict("index")

        for i in tqdm.tqdm(del_dict.keys()):

            m = np.nanmean([del_dict[i][j] for j in self.num_cols]) 

            if pd.isna(m):
                m = col_m

            in_loop_dict = {i: {}}

            for col in self.cols:
                if pd.notna(del_dict[i][col]):
                    in_loop_dict[i][col] = del_dict[i][col]
                else:
                    in_loop_dict[i][col] = m

            fix_dict[i] = in_loop_dict[i]


        self.fix_df =  pd.DataFrame.from_dict(fix_dict, orient="index")
        self.del_df = del_df
        pass

# ===========================================================================================================  

    def TEST_fill_with_median_by_row(self):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        col_m = np.median(
            [j for i in self.num_cols for j in new_df[i].to_list()]
        )
        
        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        self.comp_df = new_df
        fix_dict = {}

        del_dict = del_df.to_dict("index")

        for i in tqdm.tqdm(del_dict.keys()):

            m = np.nanmedian([del_dict[i][j] for j in self.num_cols]) 

            if pd.isna(m):
                m = col_m

            in_loop_dict = {i: {}}

            for col in self.cols:
                if pd.notna(del_dict[i][col]):
                    in_loop_dict[i][col] = del_dict[i][col]
                else:
                    in_loop_dict[i][col] = m

            fix_dict[i] = in_loop_dict[i]


        self.fix_df =  pd.DataFrame.from_dict(fix_dict, orient="index")
        self.del_df = del_df
        pass

# ===========================================================================================================  
# МЕТОДЫ НА ОСНОВЕ МО

    def TEST_KNN_Imputer(self, n_neighbours: int, strategy_for_exceptions: str):

        if strategy_for_exceptions not in ["mean", "median"]:
            raise ValueError("Can only take 'median' or 'mean' as arguments.")
        if type(n_neighbours) != int:
            raise TypeError("n_neighbours must be 'int'")


        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()
        cat_cols = self.cat_cols

        if self.num_of_cats == 1:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):

                    fix_in_loop_df = del_df[del_df[cat_cols[0]] == i]
                    comp_in_loop_df = new_df[new_df[cat_cols[0]] == i]

                    try:
                        imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(fix_in_loop_df[self.num_cols])
                        imputed_values_index = fix_in_loop_df[self.num_cols].index
                        fix_in_loop_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols, index=imputed_values_index)  
                    except:
                        pass

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 2:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):

                    fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j)]
                    comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j)]

                    try:
                        imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(fix_in_loop_df[self.num_cols])
                        imputed_values_index = fix_in_loop_df[self.num_cols].index
                        fix_in_loop_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols, index=imputed_values_index)  
                    except:
                        pass

                    fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                    comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        elif self.num_of_cats == 3:
            for i in tqdm.tqdm(list(set(del_df[cat_cols[0]].to_list()))):
                for j in set(del_df[cat_cols[1]].to_list()):
                    for k in set(del_df[cat_cols[2]].to_list()):

                        fix_in_loop_df = del_df[(del_df[cat_cols[0]] == i) & (del_df[cat_cols[1]] == j) & (del_df[cat_cols[2]] == k)]
                        comp_in_loop_df = new_df[(new_df[cat_cols[0]] == i) & (new_df[cat_cols[1]] == j) & (new_df[cat_cols[2]] == k)]

                        try:
                            imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(fix_in_loop_df[self.num_cols])
                            imputed_values_index = fix_in_loop_df[self.num_cols].index
                            fix_in_loop_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols, index=imputed_values_index)  
                        except:
                            pass

                        fix_df = pd.concat([fix_df, fix_in_loop_df], ignore_index=False)
                        comp_df = pd.concat([comp_df, comp_in_loop_df], ignore_index=False)

        if strategy_for_exceptions == "mean":
            m_func = np.nanmean
        elif strategy_for_exceptions == "median":
            m_func = np.nanmedian

        for num_col in tqdm.tqdm(self.num_cols):
            m = m_func(fix_df[num_col].to_list())
            fix_df[num_col] = fix_df[num_col].apply(lambda x: x if pd.notna(x) else m)

        self.comp_df = comp_df.sort_index()
        self.fix_df = fix_df.sort_index()
        self.del_df = del_df
        pass 

# ===========================================================================================================     

    def TEST_KNN_wrapper(self, n_neighbours: int):

        if type(n_neighbours) != int:
            raise TypeError("n_neighbours must be 'int'")
        
        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        print("randomly deleting values")
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()

        print("imputing missing values")
        imputed_values = KNNImputer(n_neighbors=n_neighbours).fit_transform(del_df[self.num_cols])
        fix_df[self.num_cols] = pd.DataFrame(imputed_values, columns=self.num_cols)

        self.fix_df = fix_df
        self.comp_df = new_df[self.num_cols]
        self.del_df = del_df
        pass

# ===========================================================================================================    

    def TEST_Miss_Forest(self, classifier, regressor, early_stopping: bool, verbose: int, max_iter: int):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        print("randomly deleting values")
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        fix_df = pd.DataFrame()
        comp_df = pd.DataFrame()

        print("encoding categorical values")
        for i in self.cat_cols:
            encoded_col = pd.DataFrame({i: LabelEncoder().fit_transform(del_df[i])})
            del_df[i], comp_df[i] = encoded_col, encoded_col



        mf = MissForest(clf=classifier, rgr=regressor, categorical=self.cat_cols, early_stopping=early_stopping, verbose=verbose, max_iter=max_iter)
        imputed_values = mf.fit_transform(del_df[self.cols])
        fix_df[self.cols] = pd.DataFrame(fix_df, columns=self.cols)

        self.fix_df = fix_df
        self.comp_df = new_df
        self.del_df = del_df
        pass

# ===========================================================================================================    

    def TEST_Iterative_Imputer(self, estimator, missing_value, max_iter: int, n_nearest_features: int, tol: float, verbose: int, initial_strategy: str):

        new_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)
        del_df = self.df[self.cols].dropna().reset_index().drop("index", axis=1)

        np.random.seed(42)

        print("randomly deleting values")
        for col in self.num_cols:

            del_df[col] = del_df[col].apply(lambda x: np.nan if np.random.choice(self.problist) == 0 else x)

        imputer = IterativeImputer(
            estimator=estimator,
            missing_values=missing_value,
            tol=tol,
            n_nearest_features=n_nearest_features,
            verbose=verbose,
            initial_strategy=initial_strategy,
            random_state=42
        )
        
        imputed_values = imputer.fit_transform(del_df[self.num_cols])


        self.fix_df = pd.DataFrame(imputed_values, columns=self.num_cols)
        self.comp_df = new_df[self.num_cols]
        self.del_df = del_df
        pass

# ===========================================================================================================       
   
    def TEST_error(self):
            
        # bool_df = self.comp_df == self.fix_df

        coor_list = []

        for num_col in self.num_cols:
            for y in range(self.del_df.shape[0]):

                if pd.isna(self.del_df.loc[y, num_col]):
                    coor_list.append((y, num_col))

        true_list = []
        fill_list = []

        for coor in tqdm.tqdm(coor_list):

            comp_df_value = self.comp_df.loc[*coor]
            true_list.append(comp_df_value)

            fix_df_value = self.fix_df.loc[*coor]
            fill_list.append(fix_df_value)

        print(root_mean_squared_error(true_list, fill_list))
        pass

In [23]:
share_of_misssing = 0.05

# КОЛОНОЧНЫЕ МЕТОДЫ

## СРЕДНЕЕ

In [24]:
mean_fill = RecoverMissingData(
    df=df,
    cat_cols=["Код основного вида деятельности (SPLIT)", "Размер компании"],
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

mean_fill.TEST_fill_with_mean_by_column()

100%|██████████| 20/20 [00:03<00:00,  6.59it/s]
100%|██████████| 4/4 [00:00<00:00, 18.55it/s]


In [25]:
mean_fill.TEST_error()

100%|██████████| 21216/21216 [00:00<00:00, 67718.90it/s]

724525045.1754906





In [26]:
mean_fill = RecoverMissingData(
    df=df, 
    cat_cols=["Регион регистрации", "Размер компании", "Код основного вида деятельности (SPLIT)"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

mean_fill.TEST_fill_with_mean_by_column()

100%|██████████| 84/84 [05:29<00:00,  3.92s/it]
100%|██████████| 4/4 [00:00<00:00, 18.97it/s]


In [27]:
mean_fill.TEST_error()

100%|██████████| 21216/21216 [00:00<00:00, 71295.80it/s]

810120183.629303





## МЕДИАНА

In [28]:
median_fill = RecoverMissingData(
    df=df,
    cat_cols=["Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

median_fill.TEST_fill_with_median_by_column()

100%|██████████| 20/20 [00:03<00:00,  6.49it/s]
100%|██████████| 4/4 [00:00<00:00, 18.42it/s]


In [29]:
median_fill.TEST_error()

100%|██████████| 21216/21216 [00:00<00:00, 57999.15it/s]

725400856.1779954





In [30]:
median_fill = RecoverMissingData(
    df=df,
    cat_cols=["Регион регистрации", "Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

median_fill.TEST_fill_with_median_by_column()

100%|██████████| 84/84 [05:27<00:00,  3.90s/it]
100%|██████████| 4/4 [00:00<00:00, 18.50it/s]


In [31]:
median_fill.TEST_error()

100%|██████████| 21216/21216 [00:00<00:00, 71173.42it/s]

795545167.2982346





# ПОСТРОЧНЫЕ МЕТОДЫ 

## СРЕДНЕЕ

In [32]:
mean_by_row_fill = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

mean_by_row_fill.TEST_fill_with_mean_by_row()

100%|██████████| 111674/111674 [00:02<00:00, 37443.73it/s]


In [33]:
mean_by_row_fill.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 60483.82it/s]

1148610968.312059





## МЕДИАНА

In [34]:
median_by_row_fill = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing 
) 

median_by_row_fill.TEST_fill_with_median_by_row()

100%|██████████| 111674/111674 [00:03<00:00, 31317.74it/s]


In [35]:
median_by_row_fill.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 70006.40it/s]

406891574.3045145





# МЕТОДЫ НА ОСНОВЕ МО

## ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ К-БЛИЖАЙШИХ

### К-БЛИЖАЙШИХ C РАЗБИЕНИЕМ НА КАТЕГОРИИ 

In [36]:
knn_with_median = RecoverMissingData(
    df=df, 
    cat_cols=["Регион регистрации"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)
 
knn_with_median.TEST_KNN_Imputer(n_neighbours=10, strategy_for_exceptions="median")

100%|██████████| 84/84 [00:06<00:00, 12.17it/s]
100%|██████████| 4/4 [00:00<00:00, 18.32it/s]


In [37]:
knn_with_median.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 70256.31it/s]

561696727.7913239





In [38]:
knn_with_median = RecoverMissingData(
    df=df,
    cat_cols=["Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing    
) 

knn_with_median.TEST_KNN_Imputer(n_neighbours=3, strategy_for_exceptions="median")

100%|██████████| 20/20 [00:09<00:00,  2.03it/s]
100%|██████████| 4/4 [00:00<00:00, 17.64it/s]


In [39]:
knn_with_median.TEST_error()

100%|██████████| 21216/21216 [00:00<00:00, 74836.43it/s]

399920947.3690318





In [40]:
knn_with_median = RecoverMissingData(
    df=df, 
    cat_cols=["Регион регистрации", "Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

knn_with_median.TEST_KNN_Imputer(n_neighbours=1, strategy_for_exceptions="median")

100%|██████████| 84/84 [05:59<00:00,  4.28s/it]
100%|██████████| 4/4 [00:00<00:00, 18.35it/s]


In [41]:
knn_with_median.TEST_error()

100%|██████████| 21216/21216 [00:00<00:00, 70692.31it/s]

731938360.0562853





### К-БЛИЖАЙШИХ БЕЗ РАЗБИЕНИЯ НА КАТЕГОРИИ 

In [42]:
knn = RecoverMissingData(
    df=df, 
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

knn.TEST_KNN_wrapper(n_neighbours=5)
# approximate runtime for ~106000x4 dataframe: 8:30 minutes

randomly deleting values
imputing missing values


In [43]:
knn.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 44012.88it/s]

311947246.1854263





## ПОТЕРЯННЫЙ(?) ЛЕС

In [None]:
mf = RecoverMissingData(
    df=df,
    cat_cols=["Регион регистрации", "Код основного вида деятельности (SPLIT)", "Размер компании"], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
) 

mf.TEST_Miss_Forest(
    classifier=RandomForestClassifier(n_jobs=-1), 
    regressor=RandomForestRegressor(n_jobs=-1), 
    early_stopping=True, 
    verbose=1, 
    max_iter=3
)

In [None]:
mf.TEST_error()

## ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ РАЗЛИЧНЫХ ОЦЕНЩИКОВ

### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ ГРАДИЕНТНОГО БУСТИНГА

In [44]:
gb = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

hgbr = HistGradientBoostingRegressor(
    max_depth=2,
    learning_rate=0.1,
    random_state=42
)

gb.TEST_Iterative_Imputer(
    estimator=hgbr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=100,
    tol=0.0001,
    verbose=2,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Ending imputation round 1/10, elapsed time 0.69
[IterativeImputer] Change: 38408891127.44298, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 2/10, elapsed time 1.35
[IterativeImputer] Change: 12973671919.16134, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 3/10, elapsed time 1.95
[IterativeImputer] Change: 1405984590.627125, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 4/10, elapsed time 2.57
[IterativeImputer] Change: 586398550.4316025, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 5/10, elapsed time 3.18
[IterativeImputer] Change: 613125167.076992, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 6/10, elapsed time 3.81
[IterativeImputer] Change: 503206227.1752751, scaled tolerance: 47314016.300000004 
[Iterative

In [45]:
gb.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 74243.87it/s]

497246843.0283296





### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ СЛУЧАЙНОГО ЛЕСА

In [None]:
rf = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

rfr = RandomForestRegressor(
    n_estimators=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    verbose=0
)

rf.TEST_Iterative_Imputer(
    estimator=rfr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=50,
    tol=0.0001,
    verbose=2,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Ending imputation round 1/10, elapsed time 71.24
[IterativeImputer] Change: 58106674100.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 2/10, elapsed time 143.75
[IterativeImputer] Change: 5391218150.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 3/10, elapsed time 217.20
[IterativeImputer] Change: 3192124380.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 4/10, elapsed time 291.58
[IterativeImputer] Change: 2568352020.0, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 5/10, elapsed time 364.28
[IterativeImputer] Change: 2918319814.619999, scaled tolerance: 47314016.300000004 
[IterativeImputer] Ending imputation round 6/10, elapsed time 444.11
[IterativeImputer] Change: 3097893384.619999, scaled tolerance: 47314016.300000004 


### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ ЛИНЕЙНОГО МЕТОДА ОПОРНЫХ ВЕКТОРОВ

In [46]:
lsv = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

lsvr = LinearSVR(
    random_state=42
)

lsv.TEST_Iterative_Imputer(
    estimator=lsvr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=100,
    tol=0.0001,
    verbose=1,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Change: 71570588141.19485, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 10594018734.230152, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 689513757.634265, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 151838622.42597485, scaled tolerance: 47314016.300000004 
[IterativeImputer] Change: 37667252.91960716, scaled tolerance: 47314016.300000004 
[IterativeImputer] Early stopping criterion reached.


In [47]:
lsv.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 66794.16it/s]

686091734.083398





### ИТЕРАТИВНОЕ ЗАПОЛНЕНИЕ С ИСПОЛЬЗОВАНИЕМ К-БЛИЖАЙШИХ СОСЕДЕЙ

In [48]:
knn = RecoverMissingData(
    df=df,
    cat_cols=[], 
    num_cols=["2019, Доходы, RUB", "2020, Доходы, RUB", "2021, Доходы, RUB", "2022, Доходы, RUB"],
    share_of_missing=share_of_misssing
)

knnr = KNeighborsRegressor(
    n_neighbors=10,
    algorithm="kd_tree",
    n_jobs=-1
)

knn.TEST_Iterative_Imputer(
    estimator=knnr,
    missing_value=np.nan,
    n_nearest_features=3,
    max_iter=1000,
    tol=0.000001,
    verbose=1,
    initial_strategy="median"
)

randomly deleting values
[IterativeImputer] Completing matrix with shape (111674, 4)
[IterativeImputer] Change: 44439533100.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5750283700.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5748791400.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5469151600.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5411445500.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5395528800.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 2607319200.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 4251017900.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 5401608700.0, scaled tolerance: 473140.163 
[IterativeImputer] Change: 2607319200.0, scaled tolerance: 473140.163 


In [49]:
knn.TEST_error()

100%|██████████| 22279/22279 [00:00<00:00, 63913.61it/s]

335009684.37897164



