# Replacing missing values with an arbitrary number

Arbitrary number imputation consists of replacing missing data with an arbitrary value. Commonly used values include 999, 9999, or -1 for positive distributions. This method is suitable for numerical variables. For categorical variables, the equivalent method is to replace missing data with an arbitrary string, as described in the "Imputing categorical variables" recipe.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.imputation import ArbitraryNumberImputer


# Load the data set that still contains missing values.
data = pd.read_csv("we_need_to_clean.csv")

# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    test_size=0.3,
    random_state=10,
)

# Column-wise maxima of the training set, printed for reference next to the
# arbitrary fill value used below.
fill = X_train[["A2", "A3", "A8", "A11"]].max()
print(fill)

# Replace the missing entries of the four columns with 99 in both partitions.
for frame in (X_train, X_test):
    frame[["A2", "A3", "A8", "A11"]] = frame[["A2", "A3", "A8", "A11"]].fillna(value=99)

A2     80.25
A3     25.00
A8     20.00
A11    67.00
dtype: float64


In [2]:
# Using scikit-learn.
# Re-create the split first: the pandas cell above has already filled these
# columns in X_train / X_test, so without fresh data this imputer would have
# nothing left to replace.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=10,
)

# strategy="constant" replaces every missing entry with fill_value.
imputer = SimpleImputer(strategy="constant", fill_value=99)

# Fit on the training columns only; both partitions are transformed in the
# following cell.
imputer.fit(X_train[["A2", "A3", "A8", "A11"]])

In [3]:
# Apply the fitted imputer to the same four columns of each partition and
# write the completed values back in place.
for frame in (X_train, X_test):
    frame[["A2", "A3", "A8", "A11"]] = imputer.transform(frame[["A2", "A3", "A8", "A11"]])

In [4]:
# Using feature_engine.
# Start from a fresh split: the cells above already replaced the missing
# values in X_train / X_test, which would leave this imputer with nothing to do.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=10,
)

imputer = ArbitraryNumberImputer(
    arbitrary_number=99,
    variables=["A2", "A3", "A8", "A11"],
)

# Fit on the training set only, then reuse the fitted imputer on the test set
# (the test set must never be used for fitting).
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)



# Finding extreme values for imputation


In [5]:
from feature_engine.imputation import EndTailImputer

# Names of all non-object (numeric) columns, leaving out the target column.
numeric_vars = [
    column
    for column in data.select_dtypes(exclude="O").columns.to_list()
    if column != "target"
]

numeric_vars

['A2', 'A3', 'A8', 'A11', 'A15', 'A16']

In [6]:
# Split the numeric predictors and the target into train (70%) and test (30%).
X_train, X_test, y_train, y_test = train_test_split(
    data[numeric_vars],
    data["target"],
    test_size=0.3,
    random_state=0,
)

X_train.head()

Unnamed: 0,A2,A3,A8,A11,A15,A16
596,46.08,3.0,2.0,8,4159,1
303,15.92,2.0,0.0,0,0,0
204,36.33,2.0,0.0,1,1187,1
351,22.17,0.0,0.0,0,0,0
118,57.83,7.0,14.0,6,1332,1


In [7]:
# Inter-quartile range of each column: 75th percentile minus 25th percentile.
IQR = X_train.quantile(0.75).sub(X_train.quantile(0.25))

IQR

A2      16.42
A3       6.50
A8       3.00
A11      3.00
A15    450.00
A16      1.00
dtype: float64

In [8]:
# Per-column fill value at the right tail: third quartile plus 1.5 x IQR.
imputation_dict = X_train.quantile(0.75).add(IQR.mul(1.5)).to_dict()

# A dict passed to fillna is applied column by column (keys are column names).
X_train = X_train.fillna(imputation_dict)
X_test = X_test.fillna(imputation_dict)



In [9]:
# Using feature_engine.
# Re-create the split so the imputer is fitted on data that still contains
# missing values; the manual fillna cell above already completed X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    data[numeric_vars],
    data["target"],
    test_size=0.3,
    random_state=0,
)

# imputation_method="iqr" derives the fill value from the quartiles and the
# inter-quartile range; `fold` is the multiplier applied to the IQR (3 here,
# whereas the manual calculation above used 1.5).
# tail="right" / tail="left" selects the side of the distribution.
# imputation_method="gaussian" is the alternative based on mean and standard
# deviation. variables=None lets the imputer pick the variables itself.
imputer = EndTailImputer(imputation_method="iqr", tail="right", fold=3, variables=None)

imputer.fit(X_train)

# Show the fill value learned for each variable (a bare expression in the
# middle of a cell is not displayed, hence the explicit print).
print(imputer.imputer_dict_)

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)



# Setting up the pipeline

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from feature_engine.imputation import(AddMissingIndicator, CategoricalImputer,MeanMedianImputer)

In [11]:
# Split every predictor column and the target into train (70%) and test (30%).
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    test_size=0.3,
    random_state=24,
)


# #we can set the variables name in a list
# varnames =["A1", "A3", "A4", "A5", "A6", "A7", "A8"]

# indicates = [f"{var}_na" for var in varnames]

# # add binary indicator columns: 1 where the value is missing, 0 where it is present
# X_train[indicates] = X_train[varnames].isna().astype(int)

# X_test[indicates] = X_test[varnames].isna().astype(int)



In [12]:
# imputer  = AddMissingIndicator(variables=None, missing_only=True)

# imputer.fit(X_train)

# imputer.variables_


In [13]:
# X_train = imputer.transform(X_train)

# X_test = imputer.transform(X_test)

In [14]:
# Two-step imputation pipeline: categorical variables are filled with their
# most frequent category, then MeanMedianImputer (default settings) fills the
# numerical ones.
pipe = Pipeline(
    steps=[
        # ("ind", AddMissingIndicator(missing_only=True)),
        ("cat", CategoricalImputer(imputation_method="frequent")),
        ("num", MeanMedianImputer()),
    ]
)


In [15]:
# Learn the imputation values from the training set and complete it.
X_train = pipe.fit_transform(X_train)

# The test set is only transformed, using the values learned from the train set.
X_test = pipe.transform(X_test)

# Count of remaining missing values per column.
X_train.isna().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A15    0
A16    0
dtype: int64

# Performing multivariate imputation by chained equations

Multivariate imputation methods, as opposed to univariate imputation, use multiple variables to estimate the missing values. In other words, the missing values of a variable are modeled based on the other variables in the dataset. Multivariate Imputation by Chained Equations (MICE) models each variable with missing values as a function of the remaining variables and uses that estimate for imputation.

In [16]:
from sklearn.linear_model import BayesianRidge
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [17]:
# Reload only the columns used for multivariate imputation.
variables = ["A2", "A3", "A8", "A11", "A15", "target"]

data = pd.read_csv(
    "we_need_to_clean.csv",
    usecols=variables,
)

data.head()

Unnamed: 0,A2,A3,A8,A11,target,A15
0,30.83,0.0,1.0,1,202.0,0
1,58.67,4.0,3.0,6,43.0,560
2,24.5,,,0,280.0,824
3,27.83,1.0,3.0,5,100.0,3
4,20.17,5.0,1.0,0,120.0,0


In [18]:
# Split the reloaded columns into train (70%) and test (30%) partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="target"),
    data["target"],
    test_size=0.3,
    random_state=0,
)

In [29]:

# Iterative imputer that uses a Bayesian ridge regressor as its estimator,
# with at most 10 imputation rounds and a fixed seed.
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=10,
    random_state=0,
)

# Fit on the training set only.
imputer.fit(X_train)

# Complete both partitions with the fitted imputer.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Wrap the result in a DataFrame to count the remaining missing values.
pd.DataFrame(X_train).isna().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64

# Estimating missing data with nearest neighbors

In [30]:
from sklearn.impute import KNNImputer


# KNN imputer: 5 neighbours, with neighbours weighted by distance.
imputer = KNNImputer(
    n_neighbors=5,
    weights="distance",
)

Set up the imputer to find the 5 closest neighbors, using Euclidean distance and weighting the neighbors so that the farthest neighbors have a smaller influence on the replacement value.

In [33]:
# Re-create the split from `data`: X_train / X_test were already completed by
# the iterative imputer above, so without fresh data the KNN imputer would
# never see a missing value.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("target", axis=1),
    data["target"],
    test_size=0.3,
    random_state=0,
)

# Fit on the training set only.
imputer.fit(X_train)

# Complete both partitions with the fitted imputer.
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Wrap the result in a DataFrame to count the remaining missing values.
pd.DataFrame(X_train).isnull().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64