## Exercise 1: Imputer 1

In [1]:
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
# Toy matrices for the imputer exercise; NaN marks a missing entry.
train_data = np.array(
    [[7.0, 6.0, 5.0],
     [4.0, np.nan, 5.0],
     [1.0, 20.0, 8.0]]
)
test_data = np.array(
    [[np.nan, 1.0, 2.0],
     [7.0, np.nan, 9.0],
     [np.nan, 2.0, 4.0]]
)

In [3]:
# Show the raw training matrix; the NaN in the second row is the value to impute.
print("Train data:\n", train_data)

Train data:
 [[ 7.  6.  5.]
 [ 4. nan  5.]
 [ 1. 20.  8.]]


In [4]:
# Show the raw test matrix; it has one NaN per row.
print("Test data:\n", test_data)

Test data:
 [[nan  1.  2.]
 [ 7. nan  9.]
 [nan  2.  4.]]


In [5]:
# np.isnan yields a boolean mask; summing it counts NaN cells over the whole array.
print("Missing values in train_data:", np.isnan(train_data).sum())

Missing values in train_data: 1


In [6]:
# Same NaN count for the test matrix (total over all cells, not per column).
print("Missing values in test_data:", np.isnan(test_data).sum())

Missing values in test_data: 3


In [7]:
# Mean imputer fitted on the training matrix only, so every fill value
# used later (for train and test alike) comes from train_data.
imp_mean = SimpleImputer(strategy='mean')
imp_mean.fit(train_data)

In [8]:
# statistics_ holds one fill value per column: the mean of that column's
# non-missing training entries.
print("Imputer statistics_:", imp_mean.statistics_)

Imputer statistics_: [ 4. 13.  6.]


In [9]:
# Replace the missing training entry with the fitted mean of its column.
filled_train = imp_mean.transform(train_data)
print("Filled train set:\n", filled_train)

Filled train set:
 [[ 7.  6.  5.]
 [ 4. 13.  5.]
 [ 1. 20.  8.]]


In [10]:
# The test set is filled with the means learned from train_data,
# not with means computed from the test set itself.
filled_test = imp_mean.transform(test_data)
print("Filled test set:\n", filled_test)

Filled test set:
 [[ 4.  1.  2.]
 [ 7. 13.  9.]
 [ 4.  2.  4.]]


## Exercise 2: Scaler

In [11]:
from sklearn.preprocessing import StandardScaler
import numpy as np

In [12]:
# Training and test matrices (3 samples x 3 features) for the scaler exercise.
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])
X_test = np.array([
    [2.0, -1.0, 1.0],
    [3.0, 3.0, -1.0],
    [1.0, 1.0, 1.0],
])

In [13]:
# Learn the scaling parameters from X_train and standardize it in one step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
print("Scaled train set:\n", X_train_scaled)

Scaled train set:
 [[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]


In [14]:
# Sanity check: after standardization each training column should have
# mean 0 and standard deviation 1.
print("Mean of scaled train set (axis=0):", X_train_scaled.mean(axis=0))
print("Std of scaled train set (axis=0):", X_train_scaled.std(axis=0))


Mean of scaled train set (axis=0): [0. 0. 0.]
Std of scaled train set (axis=0): [1. 1. 1.]


In [15]:
# Reuse the scaler fitted on X_train; the test set is only transformed, never fitted.
X_test_scaled = scaler.transform(X_test)
print("Scaled test set:\n", X_test_scaled)

Scaled test set:
 [[ 1.22474487 -1.22474487  0.53452248]
 [ 2.44948974  3.67423461 -1.06904497]
 [ 0.          1.22474487  0.53452248]]


## Exercise 3: One-Hot Encoder

In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tabulate import tabulate

In [17]:
# One categorical feature (a single column), four samples.
X_train = np.array(['Python', 'Java', 'Java', 'C++']).reshape(-1, 1)

In [18]:
# sparse_output=False returns a dense array (convenient for building a DataFrame);
# handle_unknown='ignore' encodes categories unseen during fit as an all-zero row
# instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train)

In [19]:
# categories_[0] holds the labels learned for the single input column;
# they become the column headers of the displayed table.
df_train = pd.DataFrame(X_train_encoded, columns=encoder.categories_[0])
print("One hot encoded train set:")
# astype(int) is for display only, so cells print as 0/1.
print(tabulate(df_train.astype(int), headers='keys', tablefmt='grid'))

One hot encoded train set:
+----+-------+--------+----------+
|    |   C++ |   Java |   Python |
|  0 |     0 |      0 |        1 |
+----+-------+--------+----------+
|  1 |     0 |      1 |        0 |
+----+-------+--------+----------+
|  2 |     0 |      1 |        0 |
+----+-------+--------+----------+
|  3 |     1 |      0 |        0 |
+----+-------+--------+----------+


In [20]:
# Test samples; 'C' does not appear in the training column.
X_test = np.array(['Python', 'Java', 'C', 'C++']).reshape(-1, 1)

In [21]:
# Encode the test set with the encoder fitted on the training set.
# 'C' was not seen during fit, so its row comes out as all zeros.
X_test_encoded = encoder.transform(X_test)
df_test = pd.DataFrame(X_test_encoded, columns=encoder.categories_[0])
print("\nOne hot encoded test set:")
print(tabulate(df_test.astype(int), headers='keys', tablefmt='grid'))


One hot encoded test set:
+----+-------+--------+----------+
|    |   C++ |   Java |   Python |
|  0 |     0 |      0 |        1 |
+----+-------+--------+----------+
|  1 |     0 |      1 |        0 |
+----+-------+--------+----------+
|  2 |     0 |      0 |        0 |
+----+-------+--------+----------+
|  3 |     1 |      0 |        0 |
+----+-------+--------+----------+


## Exercise 4: Ordinal Encoder

In [22]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [23]:
# Single-column sentiment labels for the ordinal-encoder exercise.
X_train = np.array(['good', 'bad', 'neutral']).reshape(-1, 1)
X_test = np.array(['good', 'good', 'bad']).reshape(-1, 1)

In [24]:
# Spell out the category order explicitly so that bad < neutral < good
# map to 0, 1, 2.
encoder = OrdinalEncoder(categories=[['bad', 'neutral', 'good']])
encoder.fit(X_train)

In [25]:
# categories_ reflects the order that was passed to the encoder.
print("Categories:", encoder.categories_)

Categories: [array(['bad', 'neutral', 'good'], dtype=object)]


In [26]:
# Each label becomes its position in the category list; astype(int) is for display.
X_train_encoded = encoder.transform(X_train)
print("Encoded train set:\n", X_train_encoded.astype(int))

Encoded train set:
 [[2]
 [0]
 [1]]


In [27]:
# Apply the same label-to-code mapping to the test labels.
X_test_encoded = encoder.transform(X_test)
print("Encoded test set:\n", X_test_encoded.astype(int))

Encoded test set:
 [[2]
 [2]
 [0]]


## Exercise 5: Categorical variables

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [29]:
# Load the breast-cancer dataset. The file has no header row, so the column
# labels are supplied at read time rather than patched on afterwards; the
# frame never exists with placeholder integer labels.
df = pd.read_csv(
    'breast-cancer.txt',
    header=None,
    names=[
        'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig',
        'breast', 'breast-quad', 'irradiat', 'Class'
    ],
)

In [30]:
# Quick look at the first rows to confirm the column labels line up with the data.
print("First 5 rows:")
print(df.head())

First 5 rows:
     age menopause tumor-size inv-nodes node-caps  deg-malig breast  \
0  40-49   premeno      15-19       0-2       yes          3  right   
1  50-59      ge40      15-19       0-2        no          1  right   
2  50-59      ge40      35-39       0-2        no          2   left   
3  40-49   premeno      35-39       0-2       yes          3  right   
4  40-49   premeno      30-34       3-5       yes          2   left   

  breast-quad irradiat                 Class  
0     left_up       no     recurrence-events  
1     central       no  no-recurrence-events  
2    left_low       no     recurrence-events  
3    left_low      yes  no-recurrence-events  
4    right_up       no     recurrence-events  


In [31]:
# Column types: in the recorded output only deg-malig is numeric, the rest are object.
print(df.dtypes)

age            object
menopause      object
tumor-size     object
inv-nodes      object
node-caps      object
deg-malig       int64
breast         object
breast-quad    object
irradiat       object
Class          object
dtype: object


In [32]:
# Missing values per column; in the recorded output only node-caps and
# breast-quad have any.
print(df.isnull().sum())

age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
Class          0
dtype: int64


In [33]:
# List the distinct values of every column to decide how each should be encoded.
for col in df.columns:
    print(f"{col}: {df[col].unique()}")

age: ['40-49' '50-59' '60-69' '30-39' '70-79' '20-29']
menopause: ['premeno' 'ge40' 'lt40']
tumor-size: ['15-19' '35-39' '30-34' '25-29' '40-44' '10-14' '0-4' '20-24' '45-49'
 '50-54' '5-9']
inv-nodes: ['0-2' '3-5' '15-17' '6-8' '9-11' '12-14']
node-caps: ['yes' 'no' nan]
deg-malig: [3 1 2]
breast: ['right' 'left']
breast-quad: ['left_up' 'central' 'left_low' 'right_up' 'right_low' nan]
irradiat: ['no' 'yes']
Class: ['recurrence-events' 'no-recurrence-events']


In [34]:
# Separate the target column from the feature columns.
y = df['Class']
X = df.drop(columns='Class')

In [35]:
# Turn literal 'nan' strings into real NaN, drop incomplete rows, then keep
# only the labels whose rows survived.
X = X.replace('nan', np.nan).dropna()
y = y.loc[X.index]

In [36]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=43
)

In [37]:
# Count the distinct values per feature in the training split.
print("\nUnique values per feature in train set:")
print(X_train.nunique())


Unique values per feature in train set:
age             6
menopause       3
tumor-size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
dtype: int64


In [38]:
# Features with a natural order get an ordinal code ...
ordinal_cols = [
    'menopause',
    'age',
    'tumor-size',
    'inv-nodes',
    'deg-malig',
]
# ... while the remaining features are one-hot encoded.
ohe_cols = [
    'node-caps',
    'breast',
    'breast-quad',
    'irradiat',
]

In [39]:
# Category order for each ordinal feature, listed in the same order as
# ordinal_cols. Evenly spaced bins are generated; the last inv-nodes bin
# ('36-39') is wider than the others, so it is spelled out.
ordinal_categories = [
    # menopause
    ['lt40', 'premeno', 'ge40'],
    # age: ten-year bins from '10-19' to '90-99'
    [f"{lo}-{lo + 9}" for lo in range(10, 100, 10)],
    # tumor-size: five-unit bins from '0-4' to '55-59'
    [f"{lo}-{lo + 4}" for lo in range(0, 60, 5)],
    # inv-nodes: three-unit bins from '0-2' to '33-35', then '36-39'
    [f"{lo}-{lo + 2}" for lo in range(0, 36, 3)] + ['36-39'],
    # deg-malig (ascending order!)
    [1, 2, 3],
]

In [40]:
# Dense one-hot output; categories not seen during fit are ignored rather
# than raising an error.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# One explicit category list per ordinal column, so the codes follow the
# intended order.
oe = OrdinalEncoder(categories=ordinal_categories)

In [41]:
# Fit each encoder on its own column group, using the training split only.
ohe.fit(X_train[ohe_cols])
oe.fit(X_train[ordinal_cols])

In [42]:
# Encode the test split with the encoders fitted on the training split.
X_test_ohe = ohe.transform(X_test[ohe_cols])
X_test_oe = oe.transform(X_test[ordinal_cols])

In [43]:
# First ten one-hot encoded test rows (one indicator column per category).
print(X_test_ohe[:10])

[[1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1.]
 [0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1.]
 [1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0.]
 [1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0.]
 [1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1.]
 [1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0.]]


In [44]:
# Names of the generated one-hot columns, in the same order as the array columns.
print(ohe.get_feature_names_out(ohe_cols))

['node-caps_no' 'node-caps_yes' 'breast_left' 'breast_right'
 'breast-quad_central' 'breast-quad_left_low' 'breast-quad_left_up'
 'breast-quad_right_low' 'breast-quad_right_up' 'irradiat_no'
 'irradiat_yes']


In [45]:
# First ten ordinal-encoded test rows; columns follow the order of ordinal_cols.
print(X_test_oe[:10])

[[2. 5. 2. 0. 1.]
 [2. 5. 2. 0. 0.]
 [2. 5. 4. 5. 2.]
 [1. 4. 5. 1. 1.]
 [2. 5. 5. 0. 2.]
 [1. 2. 1. 0. 1.]
 [1. 2. 8. 0. 1.]
 [2. 5. 2. 0. 0.]
 [2. 5. 5. 0. 2.]
 [1. 2. 3. 0. 0.]]


In [46]:
from sklearn.compose import make_column_transformer

In [47]:
# Bundle both encoders so each is applied to its own column group in one call.
# remainder='drop' discards any column not listed in ohe_cols or ordinal_cols;
# here the two lists together cover all nine feature columns.
ct = make_column_transformer(
    (ohe, ohe_cols),
    (oe, ordinal_cols),
    remainder='drop'
)

In [48]:
# Fit on the training split, then transform the test split. Each output row is
# the one-hot columns followed by the ordinal columns.
ct.fit(X_train)
X_test_ct = ct.transform(X_test)
print(X_test_ct[:2])

[[1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 2. 5. 2. 0. 1.]
 [1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 2. 5. 2. 0. 0.]]


## Exercise 6: Pipeline

In [10]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np

In [11]:
# Load the iris dataset and pull out the feature matrix and target vector.
iris = load_iris()
X, y = iris['data'], iris['target']

In [12]:
# Punch NaN holes into the feature matrix (in place) so the pipeline's imputer
# has something to fill. Each line blanks the listed rows in one feature column;
# row 135 ends up missing in columns 0, 1 and 3.
X[[1,20,50,100,135], 0] = np.nan
X[[2,5,88,135], 1] = np.nan
X[[4,15], 2] = np.nan
X[[40,135], 3] = np.nan

In [13]:
# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=43
)


In [14]:
# Steps run in order: fill missing values with the column median, standardize,
# then classify with logistic regression.
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=200))
])

In [15]:
# Fit every step of the pipeline on the training split.
pipe.fit(X_train, y_train)

In [16]:
# Predict class labels for the held-out samples; the pipeline applies the
# fitted imputer and scaler before the classifier.
y_pred = pipe.predict(X_test)
print("Predictions on test set:")
print(y_pred)

Predictions on test set:
[0 0 2 1 2 0 2 1 1 1 0 1 2 0 1 1 0 0 2 2 0 0 0 2 2 2 0 1 0 0 1 0 1 1 2 2 1
 2 1 1 1 2 1 2 0 1 1 1 1 1]


In [17]:
# Score the fitted pipeline on the held-out split (0.98 in the recorded run).
score = pipe.score(X_test, y_test)
print("Test set score:", score)

Test set score: 0.98


In [18]:
# Score on the held-out split, shown raw and as a rounded percentage.
# Recomputed here so this cell does not depend on the previous one.
score = pipe.score(X_test, y_test)
print("Test set score:", score)
print(f"Test set score as percentage: {score * 100:.0f}%")

Test set score: 0.98
Test set score as percentage: 98%
