# Import libs

In [2]:
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
df = pd.read_parquet(path='prepared_full.parquet')

# One-hot encoding (OHE) test

In [4]:
# Categorical columns that will be one-hot encoded (order defines the
# order of the encoder's input features).
columns_to_change = [
    'education',
    'employment_status',
    'value',
    'gender',
    'family_status',
    'snils',
    'merch_code',
    'goods_category',
]

In [5]:
# One-hot encoder (unfitted). drop='first' drops the first category of each
# feature, so a feature with k categories produces k-1 output columns.
one_hot_encoder = OneHotEncoder(drop='first')

In [6]:
# Sub-frame holding only the columns to encode (all rows).
X = df.loc[:, columns_to_change]

In [7]:
# Fit the encoder on the selected columns; nothing is transformed here.
# fit() returns the encoder itself, so ohe_fitted refers to the same
# object as one_hot_encoder.
one_hot_encoder.fit(X)
ohe_fitted = one_hot_encoder

In [8]:
# Persist the fitted encoder to disk so it can be reused without refitting.
with open('ohe_model.pkl', mode='wb') as model_file:
    pickle.dump(ohe_fitted, model_file)

In [9]:
# Restore the encoder from disk to check that it survives a save/load round trip.
# NOTE: pickle.load can execute arbitrary code; only load files you trust
# (here the file was written by the previous cell).
with open('ohe_model.pkl', mode='rb') as model_file:
    ohe_fitted_restored = pickle.load(model_file)

## OHE transform full dataset

In [10]:
# Names of the encoded output columns, as generated by the restored encoder.
column_names = ohe_fitted_restored.get_feature_names_out()
# Encode the whole dataset with the restored encoder.
ohe_transformed = ohe_fitted_restored.transform(X)

In [11]:
# Densify the encoder output and wrap it in a DataFrame with named columns.
# The resulting frame gets a default 0-based index.
ohe_result = pd.DataFrame(
    data=ohe_transformed.toarray(),
    columns=column_names,
)

In [12]:
# Combine the non-encoded original columns with the encoded ones.
# The index is reset so both frames align row-by-row on a 0-based index.
remaining_columns = df.drop(columns=columns_to_change).reset_index(drop=True)
ohe_df = pd.concat([remaining_columns, ohe_result], axis=1)

In [13]:
# Show the first row of the transformed DataFrame (ohe_df has a 0-based index).
print(ohe_df.head(1))

  birth_date job_start_date     position  month_profit  month_expense  \
0 1988-07-21     2013-09-01  начальник п        180000          90000   

   child_count bank_a_decision bank_b_decision bank_c_decision  \
0            0         success         success         success   

  bank_d_decision  ... merch_code_77  merch_code_78  merch_code_79  \
0         success  ...           1.0            0.0            0.0   

   merch_code_80  goods_category_Fitness  goods_category_Furniture  \
0            0.0                     0.0                       1.0   

   goods_category_Medical_services  goods_category_Mobile_devices  \
0                              0.0                            0.0   

   goods_category_Other  goods_category_Travel  
0                   0.0                    0.0  

[1 rows x 121 columns]


## OHE transform of a single row

In [14]:
# Create a dataset with one row only.
# Select the first row by position rather than by label: the full-dataset
# result is built on a reset 0-based index, so its row 0 is the first row of
# df whatever df's own index labels are. A label test (df.index == 0) would
# return no rows, several rows, or a different row when df's index is not a
# unique 0-based range.
# The list form [[0]] keeps the result a one-row DataFrame, not a Series.
one_df = df.iloc[[0]]
# Columns of that row to be encoded.
one_X = one_df[columns_to_change]

In [15]:
# Output column names come from the encoder, not from the data being transformed.
one_column_names = ohe_fitted_restored.get_feature_names_out()
# Encode the single row with the same restored encoder.
one_ohe_transformed = ohe_fitted_restored.transform(one_X)

In [16]:
# Densify the single-row encoder output and give it named columns.
one_ohe_result = pd.DataFrame(
    data=one_ohe_transformed.toarray(),
    columns=one_column_names,
)
# Join the non-encoded columns of the row with its encoded columns,
# aligned on a fresh 0-based index.
one_ohe_df = pd.concat(
    [one_df.drop(columns=columns_to_change).reset_index(drop=True), one_ohe_result],
    axis=1,
)

In [17]:
# Show the encoded single-row DataFrame for visual comparison with the full-dataset row.
print(one_ohe_df)

  birth_date job_start_date     position  month_profit  month_expense  \
0 1988-07-21     2013-09-01  начальник п        180000          90000   

   child_count bank_a_decision bank_b_decision bank_c_decision  \
0            0         success         success         success   

  bank_d_decision  ... merch_code_77  merch_code_78  merch_code_79  \
0         success  ...           1.0            0.0            0.0   

   merch_code_80  goods_category_Fitness  goods_category_Furniture  \
0            0.0                     0.0                       1.0   

   goods_category_Medical_services  goods_category_Mobile_devices  \
0                              0.0                            0.0   

   goods_category_Other  goods_category_Travel  
0                   0.0                    0.0  

[1 rows x 121 columns]


In [18]:
# Sanity check: the row encoded on its own must equal the same row taken
# from the full-dataset encoding.
full_first_row = ohe_df.loc[ohe_df.index == 0]
print(full_first_row.equals(one_ohe_df))

True
