In [21]:
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Lasso, ElasticNet
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Auto MPG dataset from UCI

In [29]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets 
  
# metadata 
print(auto_mpg.metadata) 
  
# variable information 
auto_mpg.variables

{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for th

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,displacement,Feature,Continuous,,,,no
1,mpg,Target,Continuous,,,,no
2,cylinders,Feature,Integer,,,,no
3,horsepower,Feature,Continuous,,,,yes
4,weight,Feature,Continuous,,,,no
5,acceleration,Feature,Continuous,,,,no
6,model_year,Feature,Integer,,,,no
7,origin,Feature,Integer,,,,no
8,car_name,ID,Categorical,,,,no


In [24]:
X.isnull().sum()

displacement    0
cylinders       0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

## Simple Imputer

In [25]:
imp = SimpleImputer(strategy='mean').set_output(transform='pandas')
auto_mpg_df = imp.fit_transform(X)
auto_mpg_df


Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
0,307.0,8.0,130.0,3504.0,12.0,70.0,1.0
1,350.0,8.0,165.0,3693.0,11.5,70.0,1.0
2,318.0,8.0,150.0,3436.0,11.0,70.0,1.0
3,304.0,8.0,150.0,3433.0,12.0,70.0,1.0
4,302.0,8.0,140.0,3449.0,10.5,70.0,1.0
...,...,...,...,...,...,...,...
393,140.0,4.0,86.0,2790.0,15.6,82.0,1.0
394,97.0,4.0,52.0,2130.0,24.6,82.0,2.0
395,135.0,4.0,84.0,2295.0,11.6,82.0,1.0
396,120.0,4.0,79.0,2625.0,18.6,82.0,1.0


In [27]:
auto_mpg_df.isnull().sum()

displacement    0
cylinders       0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [None]:

X_train, X_test, y_train, y_test = train_test_split(auto_mpg_df, y, test_size=0.3, random_state=24)
alphas = [0.01,0.1,0.5,1,1.4,2,2.5,5]
l_ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for i in alphas:
    for j in l_ratios:
        el = ElasticNet(alpha=i, l1_ratio=j)
        el.fit(X_train, y_train)
        y_pred = el.predict(X_test)
        scores.append({"r2_score": r2_score(y_test, y_pred), "alpha": i, "l_ratio": j})

yeild_df = pd.DataFrame(data=scores)
yeild_df = yeild_df.sort_values(by='r2_score', ascending=False)
yeild_df.head(1)

Unnamed: 0,r2_score,alpha,l_ratio
9,0.763432,0.1,0.9


In [30]:
imp = SimpleImputer(strategy='median').set_output(transform='pandas')
auto_mpg_df = imp.fit_transform(X)
auto_mpg_df

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
0,307.0,8.0,130.0,3504.0,12.0,70.0,1.0
1,350.0,8.0,165.0,3693.0,11.5,70.0,1.0
2,318.0,8.0,150.0,3436.0,11.0,70.0,1.0
3,304.0,8.0,150.0,3433.0,12.0,70.0,1.0
4,302.0,8.0,140.0,3449.0,10.5,70.0,1.0
...,...,...,...,...,...,...,...
393,140.0,4.0,86.0,2790.0,15.6,82.0,1.0
394,97.0,4.0,52.0,2130.0,24.6,82.0,2.0
395,135.0,4.0,84.0,2295.0,11.6,82.0,1.0
396,120.0,4.0,79.0,2625.0,18.6,82.0,1.0


In [31]:
X_train, X_test, y_train, y_test = train_test_split(auto_mpg_df, y, test_size=0.3, random_state=24)
alphas = [0.01,0.1,0.5,1,1.4,2,2.5,5]
l_ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for i in alphas:
    for j in l_ratios:
        el = ElasticNet(alpha=i, l1_ratio=j)
        el.fit(X_train, y_train)
        y_pred = el.predict(X_test)
        scores.append({"r2_score": r2_score(y_test, y_pred), "alpha": i, "l_ratio": j})

yeild_df = pd.DataFrame(data=scores)
yeild_df = yeild_df.sort_values(by='r2_score', ascending=False)
yeild_df.head(1)

Unnamed: 0,r2_score,alpha,l_ratio
9,0.763377,0.1,0.9


## Iterative Imputer

In [32]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

`IterativeImputer` is a method for imputing missing values in a dataset, available in **scikit-learn**. It is part of the `sklearn.impute` module and provides a sophisticated approach to handling missing data by using a model-based imputation strategy.

 Key Features:
- **Iterative Imputation**: The `IterativeImputer` performs imputation by modeling each feature with missing values as a function of the other features. It iteratively fills in missing values based on predictions from other features until convergence is achieved.
- **Multivariate Imputation**: Unlike simpler methods like mean or median imputation, which treat each feature independently, `IterativeImputer` uses the relationships between features to make imputation decisions.

 How it Works:
1. **Initialization**: The imputation process begins by assigning initial estimates to missing values, typically using the mean or median of the observed values for each feature.
2. **Modeling**: For each feature with missing values, `IterativeImputer` treats the other features as predictors and fits a regression model (default is a Bayesian Ridge Regression model) to predict the missing values based on the other features.
3. **Iteration**: The imputer then updates the missing values iteratively, refining the imputation in each iteration using the relationships between features. This is repeated until the imputed values converge or a maximum number of iterations is reached.
4. **Convergence**: The process stops either when the changes in the imputed values are sufficiently small or after a specified number of iterations.

 Parameters:

- **`estimator`**: The model used to predict the missing values. By default, it uses `BayesianRidge`, but you can provide any regression model (e.g., `DecisionTreeRegressor`, `LinearRegression`, etc.).
- **`max_iter`**: The maximum number of iterations to perform. Default is 10.
- **`n_nearest_features`**: The number of features to use for predicting each feature with missing values. Default is all features.
- **`initial_strategy`**: How to initialize the missing values. Can be `'mean'`, `'median'`, or `'most_frequent'`.
- **`min_value` and `max_value`**: Optionally, you can specify constraints on the imputed values.
- **`random_state`**: Seed for random number generation for reproducibility.
- **`add_indicator`**: If `True`, adds binary indicators for missing values, which could be useful for some machine learning models.

 Example of Usage:

```python
from sklearn.experimental import enable_iterative_imputer  # Import to enable IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.datasets import make_regression
import numpy as np

# Create sample data with missing values
X, y = make_regression(n_samples=100, n_features=5, noise=0.1)
X[::10, 2] = np.nan  # Introduce missing values in the 3rd column

# Initialize IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Impute missing values
X_imputed = imputer.fit_transform(X)

# Check the imputed data
print(X_imputed)
```

 Advantages of IterativeImputer:
- **Model-based approach**: Unlike simple imputation methods (like mean, median, or mode), `IterativeImputer` accounts for the relationships between features, making it more sophisticated and potentially more accurate.
- **Works for multivariate data**: It can handle missing data across multiple features simultaneously, considering their dependencies.
- **Can use different estimators**: You can change the underlying model for imputation based on the nature of your data, allowing more flexibility.

 Disadvantages:
- **Computationally intensive**: Since it involves iterating and fitting models, it can be more computationally expensive, especially for large datasets.
- **Sensitivity to model choice**: The quality of imputation depends on the underlying model (estimator) used for predictions. If the model is not well-chosen, it may lead to poor imputations.
- **Potential overfitting**: If not controlled properly (e.g., setting too many iterations or not choosing a good estimator), iterative imputation could lead to overfitting, where the imputed values overfit the observed data.

 Comparison to Other Imputation Methods:
- **Mean/Median Imputation**: Simple and fast but does not take into account relationships between features, leading to less accurate imputation.
- **KNN Imputation**: Also considers relationships between features but relies on distance metrics between samples and may be slower for large datasets.
- **MICE (Multiple Imputation by Chained Equations)**: Similar to `IterativeImputer` but specifically designed for multiple imputation to handle uncertainty in missing data.

In conclusion, **`IterativeImputer`** is a powerful tool for handling missing data, especially when there are complex relationships between features. However, it should be used with care, as it can be computationally expensive and sensitive to the choice of model.

In [33]:
it_imp = IterativeImputer(random_state=24)
X_imputed = it_imp.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_imputed,y,test_size=0.3, random_state=24)

In [34]:
alphas = [0.01 , 0.1 , 0.5 , 1 , 2.5 , 3,5]
l_ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for i in alphas:
    for j in l_ratios:
        el = ElasticNet(alpha=i, l1_ratio=j)
        el.fit(X_train, y_train)
        y_pred = el.predict(X_test)
        scores.append({"r2_score": r2_score(y_test, y_pred), "alpha": i, "l_ratio": j})

yeild_df = pd.DataFrame(data=scores)
yeild_df = yeild_df.sort_values(by='r2_score', ascending=False)
yeild_df.head(1)

Unnamed: 0,r2_score,alpha,l_ratio
9,0.762388,0.1,0.9


## Forest Fires Dataset

In [44]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
forest_fires = fetch_ucirepo(id=162) 
  
# data (as pandas dataframes) 
X = forest_fires.data.features 
y = forest_fires.data.targets 
  
# metadata 
print(forest_fires.metadata) 
  
# variable information 
print(forest_fires.variables) 


{'uci_id': 162, 'name': 'Forest Fires', 'repository_url': 'https://archive.ics.uci.edu/dataset/162/forest+fires', 'data_url': 'https://archive.ics.uci.edu/static/public/162/data.csv', 'abstract': 'This is a difficult regression task, where the aim is to predict the burned area of forest fires, in the northeast region of Portugal, by using meteorological and other data (see details at: http://www.dsi.uminho.pt/~pcortez/forestfires).', 'area': 'Climate and Environment', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 517, 'num_features': 12, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['area'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2007, 'last_updated': 'Thu Jan 11 2024', 'dataset_doi': '10.24432/C5D88D', 'creators': ['Paulo Cortez', 'Anbal Morais'], 'intro_paper': {'ID': 368, 'type': 'NATIVE', 'title': 'A data mining approach to predict forest fires using meteorological da

In [47]:
X_encoded = pd.get_dummies(X, drop_first=True)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y,test_size=0.3, random_state=24)
alphas = [0.01 , 0.1 , 0.5 , 1 , 2.5 , 3,5]
l_ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for i in alphas:
    for j in l_ratios:
        el = ElasticNet(alpha=i, l1_ratio=j)
        el.fit(X_train, y_train)
        y_pred = el.predict(X_test)
        scores.append({"r2_score": r2_score(y_test, y_pred), "alpha": i, "l_ratio": j})

yeild_df = pd.DataFrame(data=scores)
yeild_df = yeild_df.sort_values(by='r2_score', ascending=False)
yeild_df.head(1)

Unnamed: 0,r2_score,alpha,l_ratio
4,0.008284,0.01,0.9


    - In [Cortez and Morais, 2007], the output 'area' was first transformed with a ln(x+1) function.

Applying `ln(1+y)` to `y`

In [49]:
y_trn_ln = np.log1p(y_train)


In [55]:
y_pred = np.expm1(y_pred)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y,test_size=0.3, random_state=24)
alphas = [0.01 , 0.1 , 0.5 , 1 , 2.5 , 3,5]
l_ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for i in alphas:
    for j in l_ratios:
        el = ElasticNet(alpha=i, l1_ratio=j)
        el.fit(X_train, y_trn_ln)
        y_pred = el.predict(X_test)
        y_pred = np.expm1(y_pred)
        scores.append({"r2_score": r2_score(y_test, y_pred), "alpha": i, "l_ratio": j})

yeild_df = pd.DataFrame(data=scores)
yeild_df = yeild_df.sort_values(by='r2_score', ascending=False)
yeild_df.head(1)

Unnamed: 0,r2_score,alpha,l_ratio
0,-0.021953,0.01,0.001


## MPG dataset

applying `log(1+y)` to `y` for auto mpg dataset 

In [58]:
X = auto_mpg.data.features 
y = auto_mpg.data.targets
imp = SimpleImputer(strategy='mean').set_output(transform='pandas')
auto_mpg_df = imp.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(auto_mpg_df, y, test_size=0.3, random_state=24)
y_trn_ln = np.log1p(y_train)
alphas = [0.01,0.1,0.5,1,1.4,2,2.5,5]
l_ratios = [0.001,0.3,0.5,0.7,0.9]
scores = []
for i in alphas:
    for j in l_ratios:
        el = ElasticNet(alpha=i, l1_ratio=j)
        el.fit(X_train, y_trn_ln)
        y_pred = el.predict(X_test)
        y_pred = np.expm1(y_pred)
        scores.append({"r2_score": r2_score(y_test, y_pred), "alpha": i, "l_ratio": j})

yeild_df = pd.DataFrame(data=scores)
yeild_df = yeild_df.sort_values(by='r2_score', ascending=False)
yeild_df.head(1)

Unnamed: 0,r2_score,alpha,l_ratio
4,0.805925,0.01,0.9
