# Examples for scikit-learn 0.23
By: Jeff Hale

In [16]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression

#### Check version to make sure you are using 0.23

In [2]:
sklearn.__version__

'0.23.1'

If your aren't, `pip install -U scikit-learn`

#### Load  the diamonds dataset from seaborn for a regresssion example

In [3]:
df_diamonds = sns.load_dataset('diamonds')
df_diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


#### Create X and y
Just using a subset of features.

In [4]:
X = df_diamonds[['carat', 'depth']]
y = df_diamonds['price']

#### Split into training and test sets

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

#### Create and fit a linear regression model

In [6]:
lr = LinearRegression()

In [7]:
lr.fit(X_train, y_train)

LinearRegression()

## Classification Example: Titanic

In [8]:
df_titanic = sns.load_dataset('titanic')
df_titanic.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


#### Create X and y
Just using a subset of features.

In [9]:
X = df_titanic.loc[:, ['sex', 'fare', 'class']]
X = X.replace(dict(male=0, female=1))
y = df_titanic['survived']

#### Split into training and test sets

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
X_train[:2]

Unnamed: 0,sex,fare,class
199,1,13.0,Second
129,0,6.975,Third


In [11]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 668 entries, 199 to 168
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   sex     668 non-null    int64   
 1   fare    668 non-null    float64 
 2   class   668 non-null    category
dtypes: category(1), float64(1), int64(1)
memory usage: 16.4 KB


In [12]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 223 entries, 707 to 573
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   sex     223 non-null    int64   
 1   fare    223 non-null    float64 
 2   class   223 non-null    category
dtypes: category(1), float64(1), int64(1)
memory usage: 5.5 KB


In [13]:
X_train['class'].value_counts()

Third     365
First     162
Second    141
Name: class, dtype: int64

In [14]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [15]:
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
y_train.tail()

663    0
664    0
665    0
666    0
667    0
Name: survived, dtype: int64

#### OneHotEncode the `class` feature and stitch the data back together.

In [16]:
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
X_train_class = ohe.fit_transform(X_train['class'].to_numpy().reshape(-1, 1))
X_test_class = ohe.transform(X_test['class'].to_numpy().reshape(-1, 1))

In [17]:
X_train_class.shape

(668, 3)

In [18]:
X_train_class_df = pd.DataFrame(
    X_train_class, 
    columns=ohe.get_feature_names()
)
X_train_class_df.head()

Unnamed: 0,x0_First,x0_Second,x0_Third
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [19]:
X_train_transformed = pd.concat(
    [X_train.loc[:, ['sex', 'fare']], 
    X_train_class_df], 
    axis=1
)

In [20]:
X_train_transformed

Unnamed: 0,sex,fare,x0_First,x0_Second,x0_Third
0,1,13.0000,0.0,1.0,0.0
1,0,6.9750,0.0,0.0,1.0
2,0,8.0500,0.0,0.0,1.0
3,1,83.4750,1.0,0.0,0.0
4,0,7.7500,0.0,0.0,1.0
...,...,...,...,...,...
663,1,8.6625,0.0,0.0,1.0
664,0,8.7125,0.0,0.0,1.0
665,0,49.5042,1.0,0.0,0.0
666,0,221.7792,1.0,0.0,0.0


In [21]:
X_test_class_df = pd.DataFrame(
    X_test_class, 
    columns=ohe.get_feature_names()
)
X_test_class_df.head()

Unnamed: 0,x0_First,x0_Second,x0_Third
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [22]:
X_test_transformed = pd.concat(
    [X_test.loc[:, ['sex', 'fare']], 
    X_test_class_df], 
    axis=1
)

In [23]:
X_test_transformed.head()

Unnamed: 0,sex,fare,x0_First,x0_Second,x0_Third
0,0,26.2875,1.0,0.0,0.0
1,0,8.05,0.0,0.0,1.0
2,1,65.0,0.0,1.0,0.0
3,0,56.4958,0.0,0.0,1.0
4,1,7.925,0.0,0.0,1.0


In [24]:
y_test.head()

0    1
1    0
2    1
3    0
4    1
Name: survived, dtype: int64

#### Create a logistic regression model

In [25]:
logreg = LogisticRegression(n_jobs=-1)

#### Notice how only the changed parameters are shown in the output of fitting, by default

In [26]:
logreg.fit(X_train_transformed, y_train)

LogisticRegression(n_jobs=-1)

#### `get_params()` will still show you all the parameters 

In [27]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

#### Or, to see all the params when fitting, do the following:

In [32]:
sklearn.set_config(print_changed_only=False)

In [33]:
logreg.fit(X_train_transformed, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
logreg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### Show the number of features passed to the estimator

In [28]:
logreg.n_features_in_

5

#### Score

In [29]:
logreg.score(X_test_transformed, y_test)

0.7713004484304933

#### Let's see what's new with pipelines

In [30]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

#### Let's do different things with the different columns

Let's make a ColumnTransformer object with `make_column_transformer`

In [31]:
ct = make_column_transformer(
    (StandardScaler(), ["sex"]),
    (OneHotEncoder(), ['class']),
    remainder='passthrough'
)

#### Let's try to get the feature names

In [32]:
ct.get_feature_names

<bound method ColumnTransformer.get_feature_names of ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(), ['sex']),
                                ('onehotencoder', OneHotEncoder(), ['class'])])>

In [33]:
ct.get_params

<bound method ColumnTransformer.get_params of ColumnTransformer(remainder='passthrough',
                  transformers=[('standardscaler', StandardScaler(), ['sex']),
                                ('onehotencoder', OneHotEncoder(), ['class'])])>

#### The default output from fitting is the same.

In [34]:
pipe = make_pipeline(ct, LogisticRegression())
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(), ['sex']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['class'])])),
                ('logisticregression', LogisticRegression())])

#### After fitting we can see the number of features in the pipeline

In [35]:
pipe.n_features_in_

3

In [40]:
pipe.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(), ['sex']),
                                 ('onehotencoder', OneHotEncoder(), ['class'])]),
 'logisticregression': LogisticRegression()}

In [41]:
pipe.named_steps['columntransformer'].transformers_[1][1].get_feature_names()

# Code to see feature names from a columntransformer object
# https://stackoverflow.com/a/54648023/4590385

array(['x0_First', 'x0_Second', 'x0_Third'], dtype=object)

#### Here's the new interactive DAG
You just need to configure the display.

In [42]:
sklearn.set_config(display="diagram")

In [47]:
pipe = make_pipeline(ct, LogisticRegression())
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('standardscaler',
                                                  StandardScaler(), ['sex']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['class'])])),
                ('logisticregression', LogisticRegression())])

In [46]:
sklearn.set_config(display='text')

You can click on the name of the step in the diagram above to expand it.

#### Note that the diagram doesn't show pass-through columns!

#### If you also want information on the steps in the pipeline you can us the following methods to get it.

In [39]:
pipe.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(), ['sex']),
                                 ('onehotencoder', OneHotEncoder(), ['class'])]),
 'logisticregression': LogisticRegression()}

In [40]:
pipe.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('standardscaler', StandardScaler(), ['sex']),
                                   ('onehotencoder', OneHotEncoder(), ['class'])])),
  ('logisticregression', LogisticRegression())],
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('standardscaler', StandardScaler(), ['sex']),
                                 ('onehotencoder', OneHotEncoder(), ['class'])]),
 'logisticregression': LogisticRegression(),
 'columntransformer__n_jobs': None,
 'columntransformer__remainder': 'passthrough',
 'columntransformer__sparse_threshold': 0.3,
 'columntransformer__transformer_weights': None,
 'columntransformer__transformers': [('standardscaler',
   StandardScaler(),
   ['sex']),
  ('onehotencoder', OneHotEncoder(), ['class'])],
 'columntransformer__verbose': False,
 'columntransformer__standardscaler': Sta

In [41]:
pipe.score(X_test, y_test)

0.7713004484304933

In [42]:
pipe.named_steps['logisticregression']

## Loading Datasets

In [4]:
from sklearn.datasets import load_boston, load_breast_cancer, load_diabetes, load_digits, load_iris, load_linnerud, load_wine

#### Loading a dataset still results in a `Bunch` object, even if you pass `as_frame=True`.

In [5]:
df_d = load_diabetes(as_frame=True)
type(df_d)

sklearn.utils.Bunch

#### You must access the `Bunch`'s `data` attribute to return the DataFrame.

In [6]:
dataset_list = [
    load_boston, 
    load_breast_cancer, 
    load_diabetes, 
    load_digits, 
    load_iris, 
    load_linnerud, 
    load_wine
]

for dataset in dataset_list:
    try:
        df = dataset(as_frame=True)
        print(f"Worked for {dataset}: {type(df.data)}")
    except Exception as e:
        print(f"Didn't work for {dataset}: {e}")
        continue

Didn't work for <function load_boston at 0x1a24c19830>: load_boston() got an unexpected keyword argument 'as_frame'
Worked for <function load_breast_cancer at 0x1a24c19170>: <class 'pandas.core.frame.DataFrame'>
Worked for <function load_diabetes at 0x1a24c194d0>: <class 'pandas.core.frame.DataFrame'>
Worked for <function load_digits at 0x1a24c19320>: <class 'pandas.core.frame.DataFrame'>
Worked for <function load_iris at 0x1a24c13f80>: <class 'pandas.core.frame.DataFrame'>
Worked for <function load_linnerud at 0x1a24c19680>: <class 'pandas.core.frame.DataFrame'>
Worked for <function load_wine at 0x1a24c13dd0>: <class 'pandas.core.frame.DataFrame'>


#### Looks like the Boston Housing dataset doesn't have the `as_frame` parameter yet.

### .frame vs .data
Note that you the `.frame` attribute returns the target column included in the DataFrame. The `.data` attribute does not. Hat tip to Kevin Markham for pointing that out.

In [15]:
df = load_breast_cancer(as_frame=True)

In [13]:
df.frame

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [14]:
df.data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


# The End!