## Split the dataset into validation and test sets (50:50)

The certificate digests of each split are stored in a JSON file. The dataset can then be filtered down to the validation set as follows:

```python
from sec_certs.model.evaluation import get_validation_dgsts
from sec_certs.dataset.common_criteria import CCDataset

dset = CCDataset.from_json() # or call FIPSDataset.from_json()
validation_dgsts = get_validation_dgsts('/path/to/validation_set.json')
validation_certs = [x for x in dset if x.dgst in validation_dgsts]
y_valid = [(x.heuristics.verified_cpe_matches) for x in validation_certs]
```

In [2]:
import json
from typing import Union
from pathlib import Path

from sklearn.model_selection import train_test_split
from sec_certs.dataset.common_criteria import CCDataset
from sec_certs.dataset.fips import FIPSDataset

In [3]:
def _split_in_half(certs: list, random_state: Union[int, None] = None) -> tuple:
    """Randomly split ``certs`` into two halves, tolerating groups too small to split."""
    if len(certs) < 2:
        # train_test_split raises ValueError when one side would end up empty,
        # so a group of zero or one certificates goes entirely to the first half.
        return list(certs), []
    first_half, second_half = train_test_split(certs, test_size=0.5, random_state=random_state)
    return first_half, second_half


def split_dataset(dset_type: str, path: Union[str, Path], validation_outpath: Union[str, Path] = './validation_set.json', test_outpath: Union[str, Path] = './test_set.json', random_state: Union[int, None] = None) -> None:
    """Split a certificate dataset 50:50 into validation and test sets and save their digests.

    Certificates with and without verified CPE matches are split separately, so
    both output sets receive roughly half of each group. Only the digests
    (``dgst``) of the certificates are written, as JSON lists.

    Args:
        dset_type: Which dataset class to load, either ``'cc'`` or ``'fips'``.
        path: Path to the JSON file the dataset is loaded from.
        validation_outpath: Where the validation-set digests are written.
        test_outpath: Where the test-set digests are written.
        random_state: Seed forwarded to ``train_test_split``. Pass an int to get
            a reproducible split; ``None`` (the default) gives a different
            split on every call.

    Raises:
        ValueError: If ``dset_type`` is neither ``'cc'`` nor ``'fips'``.
    """
    dset: Union[CCDataset, FIPSDataset]
    if dset_type == 'cc':
        dset = CCDataset.from_json(path)
    elif dset_type == 'fips':
        dset = FIPSDataset.from_json(path)
    else:
        raise ValueError(f'type variable must be cc or fips, {dset_type} was given')

    # Partition in a single pass; the two groups are split independently below.
    cpe_rich_certs = []
    cpe_free_certs = []
    for cert in dset:
        if cert.heuristics.verified_cpe_matches:
            cpe_rich_certs.append(cert)
        else:
            cpe_free_certs.append(cert)

    x_valid_cpe_rich, x_test_cpe_rich = _split_in_half(cpe_rich_certs, random_state)
    x_valid_cpe_free, x_test_cpe_free = _split_in_half(cpe_free_certs, random_state)

    validation_set = [x.dgst for x in x_valid_cpe_rich + x_valid_cpe_free]
    test_set = [x.dgst for x in x_test_cpe_rich + x_test_cpe_free]

    with Path(validation_outpath).open('w') as handle:
        json.dump(validation_set, handle, indent=4)

    with Path(test_outpath).open('w') as handle:
        json.dump(test_set, handle, indent=4)

In [4]:
# Write the validation/test digest files for the FIPS dataset to the default output paths.
split_dataset(dset_type='fips', path='/path/to/fips_dataset.json')