# 5.1 Overview of the Scikit-Learn Library

## 5.1.2 Estimators, Transformers, and Predictors

Transformers and the transform() method

In [36]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Small two-column frame used to demonstrate standardization.
X = pd.DataFrame({
    'A': [1, 2, 3, 2],
    'B': [11, 1, 8, 3],
})

# fit() hands back the estimator itself, so creation and fitting can be chained.
scaler = StandardScaler().fit(X)

# Per-column mean and variance learned from X, one array per line.
print(scaler.mean_, scaler.var_, sep='\n')

[2.   5.75]
[ 0.5    15.6875]


In [37]:
# Apply the mean/variance learned by the earlier fit() call to X itself;
# the printed result is a 2-D NumPy array rather than a DataFrame.
X_scaled = scaler.transform(X)
print(X_scaled)

[[-1.41421356  1.32550825]
 [ 0.         -1.19926937]
 [ 1.41421356  0.56807496]
 [ 0.         -0.69431384]]


In [38]:
# Sanity check: after standardization each column should have
# mean ~0 and variance ~1 (up to floating-point rounding).
first_col = X_scaled[:, 0]
second_col = X_scaled[:, 1]

print(first_col.mean())
print(second_col.mean())
print(first_col.var())
print(second_col.var())

0.0
0.0
0.9999999999999998
1.0


# 5.2 Data Preprocessing with sklearn

In [39]:
import pandas as pd
import numpy as np
# Load the example dataset; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('datapreprocessing.csv')

In [40]:
# Show the frame as loaded; Color, Years, Strength and Height each contain one missing entry.
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,20 to 30 kg,Yes
1,Red,4.1,100.0,,10 to 20 kg,No
2,Blue,1.4,,412.0,0 to 10 kg,No
3,Green,,313.0,123.0,10 to 20 kg,Yes
4,,5.2,512.0,372.0,0 to 10 kg,Yes


## 5.2.1 Handling Missing Data

In [41]:
from sklearn.impute import SimpleImputer

# Numeric columns whose NaN entries are replaced by the column mean.
numeric_cols = ['Years', 'Strength', 'Height']

# One imputer handles all three columns; each gets its own mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform() learns the means and fills the gaps in a single call.
df[numeric_cols] = imp.fit_transform(df[numeric_cols])

In [42]:
# statistics_ holds the fill value learned for each fitted column
# (here the means of Years, Strength and Height, in that order).
print(imp.statistics_)

[  3.25 283.75 269.25]


In [43]:
# Re-display df: the numeric gaps now hold the column means, while the
# missing Color in row 4 is still unfilled at this point.
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous
0,Green,2.3,210.0,170.0,20 to 30 kg,Yes
1,Red,4.1,100.0,269.25,10 to 20 kg,No
2,Blue,1.4,283.75,412.0,0 to 10 kg,No
3,Green,3.25,313.0,123.0,10 to 20 kg,Yes
4,,5.2,512.0,372.0,0 to 10 kg,Yes


In [44]:
# Reuse the existing imputer, switched to mode imputation for the categorical column.
color_col = df[['Color']]

# set_params() returns the imputer, so reconfiguring and refitting can be chained.
imp.set_params(strategy='most_frequent').fit(color_col)

# Fill the missing colour with the most frequent value found by fit().
df[['Color']] = imp.transform(color_col)

## 5.2.2 Encoding Categorical Data

In [45]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers (the later output shows No -> 0, Yes -> 1).
le = LabelEncoder()
le.fit(df['Dangerous'])
df['Dangerous'] = le.transform(df['Dangerous'])

In [46]:
from sklearn.preprocessing import OrdinalEncoder

# Use the builtin int as dtype: the np.int alias was deprecated in NumPy 1.20
# and removed in 1.24, where it raises AttributeError. np.int was only an
# alias for int, so the encoded values are unchanged.
oe = OrdinalEncoder(dtype=int)

# The weight bands are encoded in sorted order of their labels, which here
# matches their natural order ('0 to 10 kg' < '10 to 20 kg' < '20 to 30 kg').
df[['Weight']] = oe.fit_transform(df[['Weight']])

In [47]:
from sklearn.preprocessing import OneHotEncoder

# dtype=int: the np.int alias was removed in NumPy 1.24.
# The `sparse` constructor flag was removed in scikit-learn 1.4, so keep the
# default sparse output and densify it explicitly with .toarray() below.
# drop='first' omits the first category so the dummy columns are not redundant.
ohe = OneHotEncoder(dtype=int, drop='first')

# Fitting on the bare NumPy values (no column labels) keeps the generic 'x0_'
# prefix in the generated names, i.e. the same 'x0_Green' / 'x0_Red' columns
# that the removed get_feature_names() used to return.
color_encoded = ohe.fit_transform(df[['Color']].to_numpy()).toarray()

# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
df2 = pd.DataFrame(color_encoded, columns=ohe.get_feature_names_out())

# NOTE: re-running this cell appends the dummy columns a second time.
df = pd.concat((df, df2), axis=1)
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous,x0_Green,x0_Red
0,Green,2.3,210.0,170.0,2,1,1,0
1,Red,4.1,100.0,269.25,1,0,0,1
2,Blue,1.4,283.75,412.0,0,0,0,0
3,Green,3.25,313.0,123.0,1,1,1,0
4,Green,5.2,512.0,372.0,0,1,1,0


## 5.2.3 Feature Scaling

In [48]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each numeric column independently to the [0, 1] range.
scale_cols = ['Years', 'Strength', 'Height']
mms = MinMaxScaler()
df[scale_cols] = mms.fit_transform(df[scale_cols])
df

Unnamed: 0,Color,Years,Strength,Height,Weight,Dangerous,x0_Green,x0_Red
0,Green,0.236842,0.26699,0.16263,2,1,1,0
1,Red,0.710526,0.0,0.506055,1,0,0,1
2,Blue,0.0,0.445995,1.0,0,0,0,0
3,Green,0.486842,0.51699,0.0,1,1,1,0
4,Green,1.0,1.0,0.861592,0,1,1,0


# 5.3 Pipeline and ColumnTransformer

## 5.3.1 Pipeline

In [49]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = pd.DataFrame([[1], [4], [np.nan], [8], [11]], columns=['A'])

from sklearn.pipeline import Pipeline

# Steps run sequentially: the scaler receives the imputer's output, so the
# missing value is filled with the mean before it is scaled.
pl = Pipeline([
   ('imp', SimpleImputer(strategy="mean")),
   ('scaler', MinMaxScaler())
])

print(pl.fit_transform(data))

[[0. ]
 [0.3]
 [0.5]
 [0.7]
 [1. ]]


## 5.3.2 ColumnTransformer

In [50]:
from sklearn.compose import ColumnTransformer

# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data = pd.DataFrame([[1], [4], [np.nan], [8], [11]], columns=['A'])

# Both transformers are pointed at the same input column 'A'; each one
# contributes its own output column to the result.
ct = ColumnTransformer([
  ('imp', SimpleImputer(strategy="mean"), ['A']),
   ('scaler', MinMaxScaler(), ['A'])
])

In [51]:
# Unlike the Pipeline above, the two transformers are not chained: the scaler
# works on the original column 'A', so its output column still contains NaN.
print(ct.fit_transform(data))

[[ 1.   0. ]
 [ 4.   0.3]
 [ 6.   nan]
 [ 8.   0.7]
 [11.   1. ]]


In [52]:
data = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['Apple', 'Orange', 'Apple', 'Banana', 'Apple'],
    'C': [11, 12, 13, 14, 15],
})

# Output columns follow the order of this list: encoded 'B' first, scaled 'A' second.
column_steps = [
    ('encode', OrdinalEncoder(), ['B']),
    ('normalize', MinMaxScaler(), ['A']),
]

# remainder='passthrough' appends the untouched column 'C' after the transformed ones.
ct2 = ColumnTransformer(column_steps, remainder='passthrough')

print(ct2.fit_transform(data))

[[ 0.    0.   11.  ]
 [ 2.    0.25 12.  ]
 [ 0.    0.5  13.  ]
 [ 1.    0.75 14.  ]
 [ 0.    1.   15.  ]]


# 5.4 Model Evaluation with sklearn

## 5.4.1 Classification metrics

Accuracy

In [53]:
from sklearn.metrics import accuracy_score

# Ground-truth labels and the model's predictions, position by position.
true = ['Cat', 'Cat', 'Dog', 'Dog', 'Cat', 'Dog']
pred = ['Cat', 'Cat', 'Cat', 'Dog', 'Cat', 'Cat']

# Accuracy = share of positions where prediction and truth agree (4 of 6 here).
score = accuracy_score(y_true=true, y_pred=pred)
print(score)

0.6666666666666666


Precision and Recall

In [54]:
from sklearn.metrics import precision_score, recall_score

true = ['Cat', 'Cat', 'Dog', 'Dog', 'Cat', 'Dog']
pred = ['Cat', 'Cat', 'Cat', 'Dog', 'Cat', 'Cat']

# Both metrics are computed with 'Dog' treated as the positive class.
positive = 'Dog'
precision = precision_score(true, pred, pos_label=positive)
recall = recall_score(true, pred, pos_label=positive)

# One value per line: precision first, then recall.
print(precision, recall, sep='\n')

1.0
0.3333333333333333


## 5.4.2 Regression metrics

In [55]:
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

pred = [2.1, 1.4, 5.6, 7.9]
true = [2.5, 1.6, 5.1, 6.8]

# The `squared=False` flag was deprecated in scikit-learn 1.4 and removed in
# 1.6; taking the square root of the MSE gives the same RMSE on every version.
RMSE = np.sqrt(mean_squared_error(true, pred))
r2 = r2_score(true, pred)
print(RMSE)
print(r2)

0.6442049363362565
0.902696365767878
