## How to use stastical imputation.

In [2]:
import pandas as pd
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
# Load the CSV file
df = pd.read_csv(url, header=None, na_values='?')

# Create a summary DataFrame
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': df.isnull().sum().values,
    'Percentage (%)': (df.isnull().sum().values / df.shape[0]) * 100
})

# Optionally sort by most missing values
missing_summary = missing_summary.sort_values(by='Missing Values', ascending=False).reset_index(drop=True)

print(missing_summary)


    Column  Missing Values  Percentage (%)
0       15             247       82.333333
1       21             198       66.000000
2       20             165       55.000000
3       17             118       39.333333
4       14             106       35.333333
5       13             104       34.666667
6       16             102       34.000000
7        7              69       23.000000
8        3              60       20.000000
9        5              58       19.333333
10      12              56       18.666667
11       6              56       18.666667
12      10              55       18.333333
13       8              47       15.666667
14      11              44       14.666667
15      19              33       11.000000
16       9              32       10.666667
17      18              29        9.666667
18       4              24        8.000000
19      22               1        0.333333
20       0               1        0.333333
21       1               0        0.000000
22       2 

In [3]:
df.head(6)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2
5,2.0,1,528355,,,,2.0,1.0,3.0,1.0,...,,,,,1.0,2,0,0,0,2


In [4]:
import pandas as pd
import numpy as np
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
df = pd.read_csv(url, header= None)
df.dtypes
df_converted = df.copy()
for col in df_converted.columns:
    if df_converted[col].dtype == 'object':
        df_converted[col] = pd.to_numeric(df_converted[col], errors='coerce')

# Show data types after conversion
converted_dtypes = df_converted.dtypes
converted_dtypes
df.shape

(300, 28)

In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from numpy import mean, std
from sklearn.utils.multiclass import type_of_target
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
# Load the dataset, treating '?' as NaN
converted_df = pd.read_csv(url, header=None, na_values='?')

# Drop rows where the target column is NaN
target_col = 23
converted_df_cleaned = converted_df.dropna(subset=[target_col])

# Split into input (X) and output (y)
X = converted_df_cleaned.drop(columns=[target_col])
y = converted_df_cleaned[target_col]

# Convert y to numeric and drop NaNs
y = pd.to_numeric(y, errors='coerce')
non_nan_indices = ~pd.isnull(y)
X = X.loc[non_nan_indices]
y = y.loc[non_nan_indices]

# Convert y to integer for classification
y = y.astype(int)

# Define the pipeline
imputer = SimpleImputer(strategy='mean')
model = RandomForestClassifier()
pipeline = Pipeline(steps=[('i', imputer), ('m', model)])

# Cross-validation strategy
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluate the model
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

# Report results
mean_accuracy = mean(scores)
std_accuracy = std(scores)

print(f"Mean Accuracy: {mean_accuracy:.3f}")
print(f"Standard Deviation: {std_accuracy:.3f}")


Mean Accuracy: 0.871
Standard Deviation: 0.048


In [6]:
df.dtypes

0     object
1      int64
2      int64
3     object
4     object
5     object
6     object
7     object
8     object
9     object
10    object
11    object
12    object
13    object
14    object
15    object
16    object
17    object
18    object
19    object
20    object
21    object
22    object
23     int64
24     int64
25     int64
26     int64
27     int64
dtype: object

### Imputation Using KNNeighbours 
If input variables are numeric, then regression models can be used for prediction, and this case is quite common. A range of dfferent models can be used, although a simple k-neares  neighbor (KNN) model has proven to beffeective in experiments. The use of a KNN model predict all missing values is referred to as Nearest Neighbor Imputation or KNN imputation.

In [8]:
from pandas import read_csv
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = read_csv(url , header=None, na_values='?')
# summarize the first few rows
print(dataframe.head())
# summarize the number of rows with missing values for each column
for i in range(dataframe.shape[1]):
# count number of rows with missing values
    n_miss = dataframe[[i]].isnull().sum()
    perc = n_miss / dataframe.shape[0] * 100
    print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))

    0   1        2     3      4     5    6    7    8    9   ...    18    19  \
0  2.0   1   530101  38.5   66.0  28.0  3.0  3.0  NaN  2.0  ...  45.0   8.4   
1  1.0   1   534817  39.2   88.0  20.0  NaN  NaN  4.0  1.0  ...  50.0  85.0   
2  2.0   1   530334  38.3   40.0  24.0  1.0  1.0  3.0  1.0  ...  33.0   6.7   
3  1.0   9  5290409  39.1  164.0  84.0  4.0  1.0  6.0  2.0  ...  48.0   7.2   
4  2.0   1   530255  37.3  104.0  35.0  NaN  NaN  6.0  2.0  ...  74.0   7.4   

    20   21   22  23     24  25  26  27  
0  NaN  NaN  2.0   2  11300   0   0   2  
1  2.0  2.0  3.0   2   2208   0   0   2  
2  NaN  NaN  1.0   2      0   0   0   1  
3  3.0  5.3  2.0   1   2208   0   0   1  
4  NaN  NaN  2.0   2   4300   0   0   2  

[5 rows x 28 columns]
> 0, Missing: 1 (0.3%)
> 1, Missing: 0 (0.0%)
> 2, Missing: 0 (0.0%)
> 3, Missing: 60 (20.0%)
> 4, Missing: 24 (8.0%)
> 5, Missing: 58 (19.3%)
> 6, Missing: 56 (18.7%)
> 7, Missing: 69 (23.0%)
> 8, Missing: 47 (15.7%)
> 9, Missing: 32 (10.7%)
> 10, M

  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))
  print('> %d,