# Replacing missing values with an arbitrary number

Arbitrary number imputation consists of replacing missing data with an arbitrary value. Commonly used values include 999, 9999, or -1 for positive distributions. This method is suitable for numerical variables. For categorical variables, the equivalent method is to replace missing data with an arbitrary string, as described in the *Imputing categorical variables* recipe.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.imputation import ArbitraryNumberImputer


# Load the raw dataset from disk.
data = pd.read_csv("we_need_to_clean.csv")

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=10,
)

# Columns that receive the arbitrary fill value.
arbitrary_cols = ["A2", "A3", "A8", "A11"]

# Per-column maxima of the training set -- presumably printed to check that
# the arbitrary value used below (99) lies outside the observed range.
fill = X_train[arbitrary_cols].max()
print(fill)

# Replace missing entries with 99 in both partitions.
X_train[arbitrary_cols] = X_train[arbitrary_cols].fillna(99)
X_test[arbitrary_cols] = X_test[arbitrary_cols].fillna(99)

In [2]:
# scikit-learn alternative: a constant-strategy imputer that substitutes 99
# for missing entries in the selected columns.
sklearn_cols = ["A2", "A3", "A8", "A11"]

imputer = SimpleImputer(fill_value=99, strategy="constant")

# Fitting on the training columns only; the call is the cell's last
# expression, so the fitted estimator is shown as the cell output.
imputer.fit(X_train[sklearn_cols])

In [3]:
# Apply the already-fitted imputer to both partitions, writing the result
# back into the same four columns.
transform_cols = ["A2", "A3", "A8", "A11"]

for frame in (X_train, X_test):
    frame[transform_cols] = imputer.transform(frame[transform_cols])

In [4]:
# feature_engine alternative: impute the selected columns with the arbitrary
# value 99 and get whole DataFrames back.
imputer = ArbitraryNumberImputer(
    arbitrary_number=99,
    variables=["A2", "A3", "A8", "A11"],
)

# Fit on the training set only, then reuse the fitted imputer on the test
# set. The test set must never be used for fitting: calling fit_transform on
# it would re-learn the transformer from held-out data.
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)



# Finding extreme values for imputation


In [5]:
from feature_engine.imputation import EndTailImputer

# Names of all columns whose dtype is not `object`, leaving out the label
# column "target".
non_object_columns = data.select_dtypes(exclude="O").columns.to_list()
numeric_vars = [name for name in non_object_columns if name != "target"]

numeric_vars

In [6]:
# Split the selected predictors and the label into train (70%) and
# test (30%) partitions with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data[numeric_vars],
    data["target"],
    test_size=0.3,
    random_state=0,
)

X_train.head()

In [7]:
# Inter-quartile range (IQR) of each training column: the 75th percentile
# minus the 25th percentile.
upper_quartile = X_train.quantile(0.75)
lower_quartile = X_train.quantile(0.25)
IQR = upper_quartile - lower_quartile

IQR

In [8]:
# Right-tail replacement value per column: third quartile + 1.5 * IQR,
# computed from the training set and stored as {column: value}.
right_tail_values = X_train.quantile(0.75) + 1.5 * IQR
imputation_dict = right_tail_values.to_dict()

# Fill both partitions with the values learned from the training set.
X_train = X_train.fillna(imputation_dict)
X_test = X_test.fillna(imputation_dict)



In [9]:
# feature_engine alternative: let EndTailImputer learn the end-of-tail
# replacement values. imputation_method can also be set to "gaussian", and
# tail accepts "right" or "left" to pick the side of the distribution.
# NOTE(review): fold=3 here, while the manual computation earlier in this
# notebook uses 1.5 * IQR -- confirm the difference is intended.
# NOTE(review): if the manual fillna cell was run first, X_train no longer
# contains missing values when this imputer is fitted -- verify the run order.
imputer = EndTailImputer(
    imputation_method="iqr",
    tail="right",
    fold=3,
    variables=None,
)

imputer.fit(X_train)

# Print the learned per-variable values explicitly: a bare expression in the
# middle of a cell is evaluated and silently discarded.
print(imputer.imputer_dict_)

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



# Setting up the pipeline

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from feature_engine.imputation import(AddMissingIndicator, CategoricalImputer,MeanMedianImputer)

In [11]:
# Fresh 70/30 split of all predictors and the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=24,
)

# Variables for which a missing-value indicator is created.
varnames = ["A1", "A3", "A4", "A5", "A6", "A7", "A8"]

# One indicator column name per variable, e.g. "A1_na".
indicates = [var + "_na" for var in varnames]

# Each indicator holds 1 where the source value is missing and 0 otherwise.
for frame in (X_train, X_test):
    frame[indicates] = frame[varnames].isna().astype(int)



In [12]:
# feature_engine alternative to the manual indicators. No explicit variable
# list is given (variables=None), so the transformer chooses the variables
# itself; per the feature_engine docs, missing_only=True restricts indicators
# to variables that show missing data during fit.
imputer = AddMissingIndicator(missing_only=True, variables=None)
imputer.fit(X_train)

# Variables selected during fit (shown as the cell output).
imputer.variables_


In [13]:
# Apply the fitted indicator transformer to the training partition first,
# then to the test partition.
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

In [16]:
# Imputation pipeline, applied in this order:
#   "ind" - add missing-value indicators,
#   "cat" - fill categorical variables using the "frequent" method,
#   "num" - fill numerical variables with MeanMedianImputer's default settings.
imputation_steps = [
    ("ind", AddMissingIndicator(missing_only=True)),
    ("cat", CategoricalImputer(imputation_method="frequent")),
    ("num", MeanMedianImputer()),
]

pipe = Pipeline(imputation_steps)

In [17]:
# Learn every step's parameters from the training partition (fit_transform
# runs first), then apply the fitted pipeline unchanged to the test partition.
X_train, X_test = pipe.fit_transform(X_train), pipe.transform(X_test)