# Cross validating Scikit-Learn models

In [None]:
import os

import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit

from faculty_xval.validation import JobsCrossValidator
from faculty_xval.utilities import job_name_to_job_id

In [None]:
# Placeholder values: replace both before running the notebook, following
# the installation-dependent note below.
JOB_NAME = "bar"  # name of the job, resolved to a job ID further down
REFERENCE_DIR = "foo"  # directory in which cross-validation results are saved

**Note:** In the code cell above, the definitions of `REFERENCE_DIR` and `JOB_NAME` change depending on how `faculty-xval` was installed:

* If you followed the **User installation instructions**, set `REFERENCE_DIR` to any existing directory in which the results of cross validation should be saved (the notebook raises an error if the path is not a directory). Running this notebook will create a sub-directory labelled with the current date and time, so that the output data are never overwritten. Make sure to set `JOB_NAME` to `"cross_validation"`.

* If you followed the **Developer installation instructions**, create a directory for saving the results of cross validation in your personal directory (`/project/{USER_NAME}/temp`). Then select `REFERENCE_DIR` and `JOB_NAME` as follows:
```python
REFERENCE_DIR = "/project/{}/temp/".format(
    os.environ["USER_NAME"]
)
JOB_NAME = "cross_validation_{}".format(
    os.environ["USER_NAME"]
)
```

In [None]:
# Fail early, before any work is done, if the output location is missing
# or is not a directory.
if not os.path.isdir(REFERENCE_DIR):
    raise OSError(
        "Path {} cannot be found or is not a directory".format(REFERENCE_DIR)
    )

In [None]:
# Settings for the ShuffleSplit generator created further down.
TEST_SIZE = 0.25  # passed as `test_size`
N_SPLITS = 10  # passed as `n_splits`

# Hyper-parameters of the RandomForestClassifier defined further down.
N_ESTIMATORS = 200  # passed as `n_estimators`
MIN_SAMPLES_LEAF = 20  # passed as `min_samples_leaf`

# Passed as the final positional argument of `cross_validator.run`.
NUM_SUBRUNS = 3

## Initialise the cross validator

In [None]:
# Translate the job name into the job ID that the cross validator is
# constructed with.
job_id = job_name_to_job_id(JOB_NAME)
# The validator receives the job ID and the reference directory
# (the directory used for saving the cross-validation results, per the
# note on REFERENCE_DIR above).
cross_validator = JobsCrossValidator(job_id, REFERENCE_DIR)

## Load the data

In [None]:
# Load the digits dataset and pull out the feature matrix and the labels.
dataset = load_digits()
features, targets = dataset.data, dataset.target

In [None]:
# Show the fourth sample (index 3): its feature vector is reshaped to an
# 8x8 grid for display, followed by the corresponding label.
print("Features:")
plt.imshow(features[3].reshape((8, 8)))
plt.show()

print("Target: {}".format(targets[3]))

In [None]:
# Generator of train/test splits over the feature matrix; it is handed to
# the cross validator below.
split_generator = ShuffleSplit(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
).split(features)

## Define the Scikit-Learn model

In [None]:
# Random forest configured from the constants defined at the top of the
# notebook; it is passed to the cross validator below.
model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    min_samples_leaf=MIN_SAMPLES_LEAF,
)

## Perform cross validation

In [None]:
# Run the cross validation of `model` on the digits data, using the splits
# produced by `split_generator`.
# NOTE(review): features and targets are each wrapped in a single-element
# list — presumably `run` accepts several feature/target arrays; confirm
# against the faculty_xval documentation.
# NOTE(review): NUM_SUBRUNS is passed positionally as the last argument;
# its exact meaning is defined by faculty_xval — confirm there.
cross_validator.run(
    model,
    [features],
    [targets],
    split_generator,
    NUM_SUBRUNS
)