- Reference: DSO 530

# Imputing missing values

## Use sklearn.impute.SimpleImputer

In [1]:
import pandas as pd
from io import StringIO
# Inline CSV sample: row 1 has no value for C and row 2 has no value for D,
# so the parsed frame ends up with two NaN cells.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Wrap the text in an in-memory buffer so read_csv can treat it like a file.
df = pd.read_csv(StringIO(csv_data), sep=',')
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [2]:
# Other options for the strategy parameter are 'median', 'most_frequent',
# or 'constant' (fills with fill_value, which defaults to 0 for numeric data).
import numpy as np
from sklearn.impute import SimpleImputer
# Mean imputation: every NaN is replaced by the mean of its column, with the
# column means learned from the same array that is being transformed.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imputed_data = imp.fit_transform(df.to_numpy())
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

# sklearn.impute.KNNImputer

- The KNNImputer class provides imputation for filling in missing values using the k-Nearest Neighbors approach.
- Each observation’s missing values are imputed using the mean value from the n_neighbors nearest neighbors found in the training set.
- Below we set n_neighbors to 2, which means that 2 neighboring observations are used for imputation. The default value of n_neighbors is 5.
- We set weights to “uniform”, which means that all points in each neighborhood are weighted equally. weights can also be set to “distance”, which weights points by the inverse of their distance.

In [4]:
import numpy as np
from sklearn.impute import KNNImputer
# Toy matrix with missing entries in rows 0 and 2.
X = [
    [1, 2, np.nan],
    [3, 4, 3],
    [np.nan, np.nan, 5],
    [8, 8, 7],
]
# Labelled view of the same data.
df = pd.DataFrame(data=X, columns=['A', 'B', 'C'])

# Two neighbours per gap, all neighbours weighted equally.
imputer = KNNImputer(weights="uniform", n_neighbors=2)
imputer.fit_transform(X)  # the result comes back as a NumPy array

array([[1. , 2. , 5. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

# map-labels

## sklearn.preprocessing.LabelEncoder

In [5]:
# Small example table: two string features, one numeric feature and a
# string class label.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [6]:
from sklearn.preprocessing import LabelEncoder
# Turn the string class labels into integer codes with scikit-learn's
# LabelEncoder; the fitted encoder is kept so the codes can be mapped back later.
class_le = LabelEncoder().fit(df['classlabel'].to_numpy())
y = class_le.transform(df['classlabel'].to_numpy())
y

array([0, 1, 0])

In [7]:
# Map the integer codes in y back to the original string class labels.
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

## sklearn.preprocessing.OneHotEncoder
## sklearn.compose.ColumnTransformer

In [13]:
# Extract the three feature columns (class label excluded) as a NumPy array.
X = df.loc[:, ['color', 'size', 'price']].to_numpy()
X

array([['green', 'M', 10.1],
       ['red', 'L', 13.5],
       ['blue', 'XL', 15.3]], dtype=object)

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 (color); remainder='passthrough' keeps the other
# columns unchanged.
# The first element of the transformer tuple ("color") is simply the name given
# to that step -- ColumnTransformer requires a name for every transformer.
ct = ColumnTransformer(
    transformers=[("color", OneHotEncoder(), [0])],
    remainder='passthrough',
)
X = ct.fit_transform(X)
X

# Alternative: one-hot encoding via pandas
# pd.get_dummies(df[['price', 'color', 'size']])

array([[0.0, 1.0, 0.0, 'M', 10.1],
       [0.0, 0.0, 1.0, 'L', 13.5],
       [1.0, 0.0, 0.0, 'XL', 15.3]], dtype=object)

- When we one-hot encode a dataset, we have to keep in mind that it introduces multicollinearity, which can be an issue for certain methods (for instance, methods that require matrix inversion).
- If features are highly correlated, matrices are computationally difficult to invert, which can lead to numerically unstable estimates. 

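- To reduce the correlation among variables, we can simply remove one feature column from the one-hot encoded array. Note that we do not lose any important information by removing a feature column: for example, if we remove the column color_blue, the information is still preserved, because color_green = 0 and color_red = 0 implies that the observation must be blue.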
- If we use the get_dummies function, we can drop the first column by passing a True argument to the drop_first parameter, as shown in the following code example:

In [15]:
# Multicollinearity guard, option 1: with drop_first=True pandas drops the first
# dummy level of every column it encodes.
# display() is required here because Jupyter only echoes the LAST expression of a
# cell; as a bare expression in the middle of the cell this table would be
# computed and silently thrown away.
display(pd.get_dummies(df[['price', 'color', 'size']], drop_first=True))

# Option 2: scikit-learn's OneHotEncoder with drop='first' drops the first
# category of column 0 (color); the remaining columns are passed through as-is.
X = df[['color', 'size', 'price']].values
ct = ColumnTransformer([("color", OneHotEncoder(drop='first'), [0])],
                       remainder='passthrough')
X = ct.fit_transform(X)
X

Unnamed: 0,price,color_green,color_red,size_M,size_XL
0,10.1,1,0,1,0
1,13.5,0,1,0,0
2,15.3,0,0,0,1


# Feature Scaling 

## sklearn.preprocessing.MinMaxScaler/StandardScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): X_train / X_test are not defined before this cell; presumably they
# come from the train_test_split cell further down -- run that one first.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test data never takes part in the fit.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [None]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): X_train / X_test are not defined before this cell; presumably they
# come from the train_test_split cell further down -- run that one first.
# Fit the standardizer on the training split only, then reuse the fitted
# scaler for the test split.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [23]:
from sklearn.preprocessing import MinMaxScaler

# scikit-learn scalers expect a 2-D array of shape (n_samples, n_features), so the
# six values 3..8 are reshaped into a single column. Passing the flat 1-D array
# (np.array([3, 4, 5, 6, 7, 8]) without the reshape) raises an error.
ex_train = np.arange(3, 9).reshape(-1, 1)

mms_ex = MinMaxScaler()
mms_ex.fit(ex_train)
ex_train_norm = mms_ex.transform(ex_train)
ex_train_norm

array([[0. ],
       [0.2],
       [0.4],
       [0.6],
       [0.8],
       [1. ]])

# Split training data

## sklearn.model_selection.train_test_split

In [None]:
# train_test_split was used without ever being imported in this notebook, which
# raises NameError on a fresh kernel; import it in the cell that needs it.
from sklearn.model_selection import train_test_split

# NOTE(review): df_wine is not defined in the cells above -- presumably the wine
# dataset with the class label in column 0; load it before running this cell.
# Column 0 becomes the target y, all remaining columns become the features X.
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values

# Hold out 30% of the rows for testing. stratify=y keeps the distribution of the
# y labels the same in the training and test splits; random_state=0 makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)