In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# 5x5 toy matrix with NaN holes scattered through it; the last column is
# missing everywhere except the first row.
data = np.array(
    [
        [1.0, -1.0, 2.0, 1.0, 3.0],
        [2.0, np.nan, 0.0, 1.0, np.nan],
        [0.0, 1.0, np.nan, 1.0, np.nan],
        [np.nan, 1.0, 3.0, 1.0, np.nan],
        [3.0, 1.0, 1.0, np.nan, np.nan],
    ]
)

data

array([[ 1., -1.,  2.,  1.,  3.],
       [ 2., nan,  0.,  1., nan],
       [ 0.,  1., nan,  1., nan],
       [nan,  1.,  3.,  1., nan],
       [ 3.,  1.,  1., nan, nan]])

In [3]:
# Four numeric columns taken from `data` plus one categorical colour column.
# The original dict literal listed "col5" twice (first as data[:, 4], then as
# the colour list); Python keeps only the last value for a repeated key, so the
# numeric entry never reached the frame and has been removed as dead code.
df = pd.DataFrame({
    "col1": data[:, 0],
    "col2": data[:, 1],
    "col3": data[:, 2],
    "col4": data[:, 3],
    "col5": ['red', 'red', 'green', 'yellow', np.nan],
})
df

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,-1.0,2.0,1.0,red
1,2.0,,0.0,1.0,red
2,0.0,1.0,,1.0,green
3,,1.0,3.0,1.0,yellow
4,3.0,1.0,1.0,,


In [4]:
class CustomSimpleImputer:

    def __init__(self, strategy: str='mean', fill_value:str|float=None):
        self.strategy = strategy
        self.fill_value = fill_value

    def fit_transform(self,data: pd.DataFrame):
        cols = list(data.columns)
        num_data = data.select_dtypes(exclude='object')
        cat_data = data.select_dtypes(include='object')
        num_cols = list(num_data.columns)
        cat_cols = list(cat_data.columns)

        for col in num_cols:
            if self.strategy == "mean":
                data[col].fillna(value= data[col].mean(), inplace = True) 

            if self.strategy == "median":
                data[col].fillna(value= data[col].median(), inplace=True)

        for col in cat_cols :
            if self.strategy == 'most_frequent':
                cols= list(data[col].dropna())
                uniq = list(np.unique(cols))
                counts = [cols.count(item) for item in uniq]
                assert len(uniq) == len(counts)
                index = counts.index(max(counts))
                data[col].fillna(value = uniq[index], inplace=True)

        if self.strategy == 'constant' :
            data.fillna(value = self.fill_value, inplace=True)
        return data

In [5]:
cust_imputer = CustomSimpleImputer(strategy='mean')
cust_imputer1 = CustomSimpleImputer(strategy='most_frequent')

# Pass a copy so `df` is guaranteed to keep its NaNs for the scikit-learn
# comparison cells further down, whether or not the imputer fills in place.
clean_data = cust_imputer.fit_transform(df.copy())
clean_data

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,-1.0,2.0,1.0,red
1,2.0,0.5,0.0,1.0,red
2,0.0,1.0,1.5,1.0,green
3,1.5,1.0,3.0,1.0,yellow
4,3.0,1.0,1.0,1.0,


In [6]:
# Impute a copy so `clean_data` (mean-imputed only) stays a distinct object
# from `cleaned_data` (mean + most-frequent imputed) when this cell is re-run.
cleaned_data = cust_imputer1.fit_transform(clean_data.copy())
cleaned_data

Unnamed: 0,col1,col2,col3,col4,col5
0,1.0,-1.0,2.0,1.0,red
1,2.0,0.5,0.0,1.0,red
2,0.0,1.0,1.5,1.0,green
3,1.5,1.0,3.0,1.0,yellow
4,3.0,1.0,1.0,1.0,red


In [7]:
# Reference result: scikit-learn's mean imputation over the numeric columns.
# `SimpleImputer` is imported in the notebook's first cell with the other imports.
imputer = SimpleImputer(strategy='mean')

imp_data = imputer.fit_transform(df[['col1', 'col2', 'col3', 'col4']])

imp_data

array([[ 1. , -1. ,  2. ,  1. ],
       [ 2. ,  0.5,  0. ,  1. ],
       [ 0. ,  1. ,  1.5,  1. ],
       [ 1.5,  1. ,  3. ,  1. ],
       [ 3. ,  1. ,  1. ,  1. ]])

In [8]:
# Reference result: scikit-learn's most-frequent imputation of the colour column.
imputer1 = SimpleImputer(strategy="most_frequent")
imp_data1 = imputer1.fit_transform(df.loc[:, ["col5"]])
imp_data1

array([['red'],
       ['red'],
       ['green'],
       ['yellow'],
       ['red']], dtype=object)