# Model validation
- Cross validation (KFold)
- GridSearch CV 
- Pipeline

## Cross validation (KFold)

In [1]:
from sklearn import datasets
from collections import Counter

In [2]:
iris = datasets.load_iris(as_frame=True)

In [3]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [4]:
iris.data['category'] = iris.target.map({i:n for i, n in enumerate(iris.target_names)})
iris.data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),category
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [5]:
X_, y_ = datasets.load_iris(return_X_y=True)
X_.shape, y_.shape, Counter(y_)

((150, 4), (150,), Counter({0: 50, 1: 50, 2: 50}))

- Only select the data of first two categories

In [7]:
sample_idx = iris.target[iris.target != 2].index
X = X_[sample_idx]
y = y_[sample_idx]
X.shape, y.shape, Counter(y)

((100, 4), (100,), Counter({0: 50, 1: 50}))

- Tain/test split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, 
                                                    random_state=0, shuffle=True)

In [10]:
X_train.shape, y_train.shape, Counter(y_train)

((60, 4), (60,), Counter({0: 33, 1: 27}))

In [11]:
X_test.shape, y_test.shape, Counter(y_test)

((40, 4), (40,), Counter({0: 17, 1: 23}))

- Fit a classifier on training data and test on testing data

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
clf = LogisticRegression(random_state=42).fit(X_train, y_train)
clf.score(X_test, y_test)

1.0

### Option-1: directly apply cross validation to get the model validation scores

In [21]:
from sklearn.model_selection import cross_validate, cross_val_score

In [22]:
clf = LogisticRegression(random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=5, scoring='recall')

In [23]:
cv_scores

array([1., 1., 1., 1., 1.])

### Option-2: apply KFold to split the data into K-folds and then do cross validation on the k-fold data

In [24]:
from sklearn.model_selection import KFold

In [25]:
# example usage of KFold
X_example = ['a','b','c','d']
kf = KFold(n_splits=4, shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(X_example):
    print("Train samples: %s \nTest samples: %s\n" % (train_idx, test_idx))

Train samples: [0 2 3] 
Test samples: [1]

Train samples: [0 1 2] 
Test samples: [3]

Train samples: [1 2 3] 
Test samples: [0]

Train samples: [0 1 3] 
Test samples: [2]



In [18]:
# apply KFold split on the iris dataset
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(X):
    print("Train samples: %s \nTest samples: %s\n" % (train_idx, test_idx))

Train samples: [ 1  2  3  5  6  7  8  9 11 13 14 15 16 17 19 20 21 23 24 25 26 27 28 29
 32 34 35 36 37 38 40 41 42 43 46 47 48 49 50 51 52 54 55 56 57 58 59 60
 61 62 63 64 65 66 67 68 69 71 72 74 75 78 79 81 82 84 85 86 87 88 89 91
 92 93 94 95 96 97 98 99] 
Test samples: [ 0  4 10 12 18 22 30 31 33 39 44 45 53 70 73 76 77 80 83 90]

Train samples: [ 0  1  2  3  4  6  7  8 10 12 13 14 17 18 19 20 21 22 23 24 25 27 29 30
 31 32 33 34 36 37 38 39 41 43 44 45 46 48 49 50 51 52 53 54 56 57 58 59
 60 61 62 63 64 67 68 70 71 73 74 75 76 77 78 79 80 81 82 83 84 86 87 89
 90 91 92 94 95 97 98 99] 
Test samples: [ 5  9 11 15 16 26 28 35 40 42 47 55 65 66 69 72 85 88 93 96]

Train samples: [ 0  1  2  4  5  9 10 11 12 14 15 16 18 20 21 22 23 26 28 29 30 31 32 33
 35 37 39 40 41 42 43 44 45 46 47 48 50 51 52 53 54 55 56 57 58 59 60 61
 63 65 66 67 68 69 70 71 72 73 74 75 76 77 79 80 82 83 84 85 86 87 88 90
 91 92 93 94 96 97 98 99] 
Test samples: [ 3  6  7  8 13 17 19 24 25 27 34 36 38 49 62 64 

In [19]:
clf = LogisticRegression(random_state=42)
cv_scores = cross_val_score(clf, X, y, cv=kf, scoring='recall')
cv_scores

array([1., 1., 1., 1., 1.])

### Option-3: apply stratified KFold to keep the class distribution

In [20]:
Counter(y[train_idx]), Counter(y[test_idx])

(Counter({0: 42, 1: 38}), Counter({0: 8, 1: 12}))

In [21]:
from sklearn.model_selection import StratifiedKFold

In [22]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(X, y):
    print("Train samples: %s \nTest samples: %s\n" % (train_idx, test_idx))
    break

Train samples: [ 0  1  2  3  4  5  6  7  8  9 10 12 16 17 18 19 21 22 23 24 26 27 28 30
 31 32 34 36 37 38 39 40 41 42 43 44 46 47 48 49 50 51 53 54 55 56 57 58
 60 61 62 63 64 66 67 68 69 70 71 72 73 74 75 76 77 79 80 81 82 83 84 86
 88 89 90 91 93 95 96 98] 
Test samples: [11 13 14 15 20 25 29 33 35 45 52 59 65 78 85 87 92 94 97 99]



In [23]:
Counter(y[train_idx]), Counter(y[test_idx])

(Counter({0: 40, 1: 40}), Counter({0: 10, 1: 10}))

### Option-4: Leave One Out strategy, useful when don't have enough data

In [24]:
from sklearn.model_selection import LeaveOneOut

In [25]:
X_example = [1, 2, 3, 4]
loo = LeaveOneOut()
for train_idx, test_idx in loo.split(X_example):
    print("%s %s" % (train_idx, test_idx))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


In [26]:
for train_idx, test_idx in loo.split(X):
    print("Train samples: %s \nTest samples: %s\n" % (train_idx, test_idx))
    break

Train samples: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
 97 98 99] 
Test samples: [0]



## GridSearchCV
- search for the best hyperparameter settings

In [14]:
from sklearn.model_selection import GridSearchCV

- Example for Iris dataset

In [15]:
lr = LogisticRegression(penalty='l1',solver='liblinear',random_state=42,max_iter=2000)
lr.fit(X, y)

In [16]:
lr = LogisticRegression(random_state=42)
parameters = {'penalty':('l1', 'l2'), 
              'C':[0.1, 1, 10],
              'solver': ['liblinear']} # 2*3*1 = 6 combinations

grid_cv = GridSearchCV(estimator = lr, param_grid = parameters, cv=5)

grid_cv.fit(X, y)

In [17]:
sorted(grid_cv.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_penalty',
 'param_solver',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [18]:
grid_cv.cv_results_['mean_fit_time']

array([0.00096149, 0.00056734, 0.00060425, 0.0005538 , 0.00062399,
       0.00052662])

In [19]:
grid_cv.cv_results_['mean_test_score']

array([1., 1., 1., 1., 1., 1.])

In [20]:
grid_cv.cv_results_['split0_test_score']

array([1., 1., 1., 1., 1., 1.])

In [34]:
grid_cv.best_estimator_ 

In [36]:
grid_cv.best_params_

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [35]:
grid_cv.best_score_

1.0

## Pipeline

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,
                                                    random_state=42)
X_train.shape, X_test.shape

((60, 4), (40, 4))

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [44]:
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('clf', LogisticRegression())])

In [45]:
pipe.fit(X_train, y_train)

In [46]:
pipe.score(X_test, y_test)

1.0

## Hyper-parameter tunning using GridSearchCV and pipeline 
- example with text classification
- jointly find the best hyperparameters in each step of the pipeline

In [47]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time

- Load data

In [48]:
data = fetch_20newsgroups(subset="train", 
                      categories = ["alt.atheism","talk.religion.misc"])
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))
print()

857 documents
2 categories



In [49]:
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [50]:
data.target_names

['alt.atheism', 'talk.religion.misc']

In [51]:
print(data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [52]:
print(data.data[0])

From: mangoe@cs.umd.edu (Charley Wingate)
Subject: Benediktine Metaphysics
Lines: 24

Benedikt Rosenau writes, with great authority:

>     IF IT IS CONTRADICTORY IT CANNOT EXIST.

"Contradictory" is a property of language.  If I correct this to


      THINGS DEFINED BY CONTRADICTORY LANGUAGE DO NOT EXIST

I will object to definitions as reality.  If you then amend it to

      THINGS DESCRIBED BY CONTRADICTORY LANGUAGE DO NOT EXIST

then we've come to something which is plainly false.  Failures in
description are merely failures in description.

(I'm not an objectivist, remember.)


-- 
C. Wingate        + "The peace of God, it is no peace,
                  +    but strife closed in the sod.
mangoe@cs.umd.edu +  Yet, brothers, pray for but one thing:
tove!mangoe       +    the marv'lous peace of God."



### Define a pipeline: text feature extraction + classifier 

In [53]:
pipeline = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", LogisticRegression()),
    ]
)

In [54]:
pipeline.steps

[('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('clf', LogisticRegression())]

### Explore the parameters: search over different values

In [56]:
parameters = {
    "vect__max_df": (0.5, 0.75, 1.0),
    "vect__ngram_range": ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__norm': ('l1', 'l2'),
    "clf__penalty": ("l1", "l2"),
#     "clf__C": (1.0, ),
#     'clf__solver': ('liblinear'),
} # 24

### Jointly find the best parameters for both feature extraction and the classifier

In [57]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data.data, data.target)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__penalty': ('l1', 'l2'),
 'tfidf__norm': ('l1', 'l2'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 24 candidates, totalling 120 fits


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/zhaowang/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/zhaowang/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/zhaowang/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/zhaowang/opt/anaconda3/lib/python3.8/site-packages/sklearn/base

done in 20.241s

Best score: 0.931
Best parameters set:
	clf__penalty: 'l2'
	tfidf__norm: 'l2'
	vect__max_df: 0.5
	vect__ngram_range: (1, 1)
