### Exercise 1: Imputer 1

In [274]:
import numpy as np
import pandas as pd
import sklearn

In [275]:
# Small 3x3 training matrix; the middle row is missing its second feature.
train_data = [
    [7, 6, 5],
    [4, np.nan, 5],
    [1, 20, 8],
]


##### 1. Fit the SimpleImputer on the data. Print the statistics_. Check that the statistics match np.nanmean(train_data, axis=0).

In [276]:
from sklearn.impute import SimpleImputer

# With its default settings SimpleImputer learns one fill value per column
# during fit(), computed from the non-missing entries of that column.
simp = SimpleImputer()
simp.fit(train_data)

# The exercise asks to check the learned statistics against the NaN-aware
# column means; assert_allclose raises if they ever diverge and is silent
# otherwise.
np.testing.assert_allclose(simp.statistics_, np.nanmean(train_data, axis=0))

print(f"statistic: {simp.statistics_}")
train_data

statistic: [ 4. 13.  6.]


[[7, 6, 5], [4, nan, 5], [1, 20, 8]]

##### 2. Fill the missing values in train_data using the fitted imputer and transform.

In [277]:
# Apply the fitted imputer: each NaN is replaced by the statistic learned for
# its column.
train_filled = simp.transform(train_data)
train_filled

array([[ 7.,  6.,  5.],
       [ 4., 13.,  5.],
       [ 1., 20.,  8.]])

##### 3. Fill the missing values in test_data using the fitted imputer and transform.

In [278]:
# Unseen rows with gaps in the first two columns.
test_data = [
    [np.nan, 1, 2],
    [7, np.nan, 9],
    [np.nan, 2, 4],
]

# Reuse the imputer fitted on train_data, so the fill values come from the
# training data only.
test_filled = simp.transform(test_data)
test_filled

array([[ 4.,  1.,  2.],
       [ 7., 13.,  9.],
       [ 4.,  2.,  4.]])

### Exercise 2: Scaler

In [279]:
# Training matrix for the scaler: three samples (rows), three features (columns).
X_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)


##### 1. Fit the StandardScaler on the data and scale X_train using fit_transform. Compute the mean and std on axis 0.

In [280]:
# StandardScaler's public home is sklearn.preprocessing; other sklearn modules
# only expose it because they import it for their own use.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit_transform learns the per-column mean and scale from X_train and returns
# the standardized copy.
X_train_scaled = scaler.fit_transform(X_train)

# The exercise asks for the mean and std on axis 0: after standardization each
# column should have mean ~0 and std ~1. The checks are silent when they pass.
np.testing.assert_allclose(X_train_scaled.mean(axis=0), 0, atol=1e-12)
np.testing.assert_allclose(X_train_scaled.std(axis=0), 1)

X_train_scaled


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

##### 2. Scale the test set using the StandardScaler fitted on the train set.

In [281]:
# Held-out samples to scale.
X_test = np.array(
    [
        [2.0, -1.0, 1.0],
        [3.0, 3.0, -1.0],
        [1.0, 1.0, 1.0],
    ]
)

# transform() only: the statistics fitted on the train set are reused as-is.
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[ 1.22474487, -1.22474487,  0.53452248],
       [ 2.44948974,  3.67423461, -1.06904497],
       [ 0.        ,  1.22474487,  0.53452248]])

### Exercise 3: One hot Encoder

##### 1. Using OneHotEncoder with handle_unknown='ignore', fit the One Hot Encoder and transform X_train. The expected output is:

In [282]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# One categorical feature, one row per sample.
X_train = [['Python'], ['Java'], ['Java'], ['C++']]

# handle_unknown='ignore': categories not seen during fit are encoded as an
# all-zero row at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded = encoder.fit(X_train).transform(X_train)

# Densify the encoded matrix and label the columns with the categories learned
# for the single input feature.
language_names = encoder.categories_[0]
pd.DataFrame(encoded.toarray(), columns=language_names, dtype='int')

Unnamed: 0,C++,Java,Python
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0


##### 2. Transform X_test using the fitted One Hot Encoder on the train set.

In [283]:
# 'C' was not among the values the encoder was fitted on.
X_test = [['Python'], ['Java'], ['C'], ['C++']]

encoded_test = encoder.transform(X_test)
dense_test = encoded_test.toarray()
pd.DataFrame(dense_test, columns=encoder.categories_[0], dtype='int')

Unnamed: 0,C++,Java,Python
0,0,0,1
1,0,1,0
2,0,0,0
3,1,0,0


### Exercise 4: Ordinal Encoder

##### 1. Fit the OrdinalEncoder by specifying the categories in the following order: `categories=[['bad', 'neutral', 'good']]`. Transform the train set. Print the `categories_`.

In [284]:
from sklearn.preprocessing import OrdinalEncoder

X_train = [['good'], ['bad'], ['neutral']]

# Explicit category order, so the codes follow the ranking:
# 'bad' -> 0, 'neutral' -> 1, 'good' -> 2.
rating_order = ['bad', 'neutral', 'good']
oencd = OrdinalEncoder(categories=[rating_order])
transformed = oencd.fit(X_train).transform(X_train)

print(transformed)
oencd.categories_

[[2.]
 [0.]
 [1.]]


[array(['bad', 'neutral', 'good'], dtype=object)]

##### 2. Transform the X_test using the fitted Ordinal Encoder on train set.

In [285]:
X_test = [
    ['good'],
    ['good'],
    ['bad'],
]

# Encode with the category order that was fixed when oencd was fitted.
transformed = oencd.transform(X_test)
transformed

array([[2.],
       [2.],
       [0.]])

### Exercise 5: Categorical variables

In [286]:
# Load the breast-cancer data set
from sklearn.model_selection import train_test_split

# Column names are supplied explicitly via `names=` when reading the CSV.
cols = [
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "node-caps",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
    "Class",
]
data = pd.read_csv("data/breast-cancer.csv", names=cols)

# Report the total number of missing cells across the whole frame
missing_total = data.isnull().sum().sum()
print(f"number of missing value: {missing_total}")

# Drop every row (axis=0) that contains a missing value
data = data.dropna(axis=0)

# Split into train and test sets; the "Class" column is not separated out and
# stays in both frames.
X_train, X_test = train_test_split(data, test_size=0.2, random_state=43)


number of missing value: 9


##### 1. Count the number of unique values per feature in the train set.

In [287]:
# Count distinct values per column on the training split only, as the exercise
# asks, so the held-out test rows play no part in the inspection.
X_train.nunique()

age             6
menopause       3
tumor-size     11
inv-nodes       6
node-caps       2
deg-malig       3
breast          2
breast-quad     5
irradiat        2
Class           2
dtype: int64

##### 2. Data transformation - OneHotEncoder

In [288]:
# Features one-hot encoded in this exercise (no ordering is imposed on them).
nonordcols = ['node-caps', 'breast', 'breast-quad', 'irradiat']

# Learn the categories from the training split only; values unseen at fit time
# are ignored at transform time rather than raising.
oneHotEnc = OneHotEncoder(handle_unknown='ignore')
oneHotEnc.fit(X_train[nonordcols])

nonord = X_test[nonordcols]
nord_t = oneHotEnc.transform(nonord)

# Show the first ten encoded test rows as a dense array.
nord_t.toarray()[:10]

array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.],
       [1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.]])

##### 3. Data transformation - OrdinalEncoder

In [289]:
# Allowed values of each ordinal feature, in the order used for the integer
# codes (first entry -> 0, second -> 1, ...).
menopause_levels = ['lt40', 'premeno', 'ge40']
age_levels = ['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90-99']
tumor_size_levels = ['0-4', '5-9', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59']
inv_nodes_levels = ['0-2', '3-5', '6-8', '9-11', '12-14', '15-17', '18-20', '21-23', '24-26', '27-29', '30-32', '33-35', '36-39']
deg_malig_levels = ['1', '2', '3']

# The column order here must line up with the order of the category lists
# passed to the encoder below.
ordcols = ["menopause", "age", "tumor-size", "inv-nodes", "deg-malig"]
ordenc = OrdinalEncoder(
    categories=[
        menopause_levels,
        age_levels,
        tumor_size_levels,
        inv_nodes_levels,
        deg_malig_levels,
    ],
)

# Fit on the training split, then encode the test split.
ordinal = X_train[ordcols]
ordenc.fit(ordinal)
ord_t = ordenc.transform(X_test[ordcols])
ord_t[:10]

array([[2., 5., 2., 0., 1.],
       [2., 5., 2., 0., 0.],
       [2., 5., 4., 5., 2.],
       [1., 4., 5., 1., 1.],
       [2., 5., 5., 0., 2.],
       [1., 2., 1., 0., 1.],
       [1., 2., 8., 0., 1.],
       [2., 5., 2., 0., 0.],
       [2., 5., 5., 0., 2.],
       [1., 2., 3., 0., 0.]])

##### 4. Use a make_column_transformer to combine the two Encoders.

In [290]:
from sklearn.compose import make_column_transformer

# Route the nominal columns through the one-hot encoder and the ordinal columns
# through the ordinal encoder; the encoded blocks are placed side by side in
# that order.
columnTransformer = make_column_transformer(
    (oneHotEnc, nonordcols),
    (ordenc, ordcols),
)
columnTransformer.fit(X_train)

combined = columnTransformer.transform(X_test)

# First two encoded test rows.
combined[:2]

array([[1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 5., 2., 0., 1.],
       [1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 2., 5., 2., 0., 0.]])

### Exercise 6: Pipeline

The goal of this exercise is to learn to use the Scikit-learn `Pipeline` object. The data set used for this exercise is the iris data set.

In [297]:
#Load the iris dataset
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression


iris = load_iris()
X, y = iris['data'], iris['target']

#add missing values
X[[1,20,50,100,135], 0] = np.nan
X[[2,5,88,135], 1] = np.nan
X[[4,15], 2] = np.nan
X[[40,135], 3] = np.nan

X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.33, random_state=43)
 
#implement imputer
imputer = SimpleImputer(strategy="median")
X_train_imp = imputer.fit_transform(X_train,y_train)

X_test_imp = imputer.transform(X_test)

#implement scaler
stdScaler = StandardScaler()

X_train_scaled = stdScaler.fit_transform(X_train_imp)
X_test_scaled = stdScaler.transform(X_test_imp)

#training the model
model = LogisticRegression()

model.fit(X_train_scaled,y_train)

#check the accuracy of the model
score = model.score(X_test_scaled,y_test)

score

0.98