/
sklearn_model.py
152 lines (132 loc) · 5.55 KB
/
sklearn_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
Code for processing datasets using scikit-learn.
"""
import inspect
import logging
from typing import List, Optional
import numpy as np
from sklearn.base import BaseEstimator
from deepchem.models import Model
from deepchem.data import Dataset
from deepchem.trans import Transformer
from deepchem.utils.data_utils import load_from_disk, save_to_disk
from deepchem.utils.typing import OneOrMany
logger = logging.getLogger(__name__)
class SklearnModel(Model):
    """Wrapper class that wraps scikit-learn models as DeepChem models.

    When you're working with scikit-learn and DeepChem, at times it can
    be useful to wrap a scikit-learn model as a DeepChem model. The
    reason for this might be that you want to do an apples-to-apples
    comparison of a scikit-learn model to another DeepChem model, or
    perhaps you want to use the hyperparameter tuning capabilities in
    `dc.hyper`. The `SklearnModel` class provides a wrapper around scikit-learn
    models that allows scikit-learn models to be trained on `Dataset` objects
    and evaluated with the same metrics as other DeepChem models.

    Examples
    --------
    >>> import deepchem as dc
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> # Generating a random data and creating a dataset
    >>> X, y = np.random.randn(5, 1), np.random.randn(5)
    >>> dataset = dc.data.NumpyDataset(X, y)
    >>> # Wrapping a Sklearn Linear Regression model using DeepChem models API
    >>> sklearn_model = LinearRegression()
    >>> dc_model = dc.models.SklearnModel(sklearn_model)
    >>> dc_model.fit(dataset) # fitting dataset

    Notes
    -----
    All `SklearnModels` perform learning solely in memory. This means that it
    may not be possible to train `SklearnModel` on large `Dataset`s.
    """

    def __init__(self,
                 model: Optional[BaseEstimator] = None,
                 model_dir: Optional[str] = None,
                 **kwargs):
        """
        Parameters
        ----------
        model: BaseEstimator, optional (default None)
            The model instance which inherits a scikit-learn `BaseEstimator`
            Class. May be None when the estimator is going to be restored
            later with `reload()`, or when the deprecated `model_instance`
            keyword is used instead.
        model_dir: str, optional (default None)
            If specified the model will be stored in this directory. Else, a
            temporary directory will be used.
        model_instance: BaseEstimator (DEPRECATED)
            The model instance which inherits a scikit-learn `BaseEstimator`
            Class.
        kwargs: dict
            kwargs['use_weights'] is a bool which determines if we pass
            weights into self.model.fit(). Defaults to True.

        Raises
        ------
        ValueError
            If both `model` and the deprecated `model_instance` are given.
        """
        if 'model_instance' in kwargs:
            model_instance = kwargs['model_instance']
            if model is not None:
                raise ValueError(
                    "Can not use both model and model_instance argument at the same time."
                )
            logger.warning(
                "model_instance argument is deprecated and will be removed in a future version of DeepChem. "
                "Use model argument instead.")
            model = model_instance

        # The code below reads `self.model` right after this call, i.e. it
        # relies on the base-class constructor storing the estimator there.
        super(SklearnModel, self).__init__(model, model_dir, **kwargs)

        self.use_weights = kwargs.get('use_weights', True)

        # model is None when reloading a model; nothing to inspect in that case.
        if self.use_weights and self.model is not None:
            if not self._fit_accepts_sample_weight(self.model):
                self.use_weights = False
                logger.info(
                    "The model does not support training with weights. "
                    "Hence, not using weight of datapoint for training")

    @staticmethod
    def _fit_accepts_sample_weight(estimator) -> bool:
        """Return True if `estimator.fit` declares a `sample_weight` parameter.

        `inspect.signature` is used instead of `inspect.getfullargspec(...).args`
        because it follows `functools.wraps` chains (so a decorated `fit` still
        exposes its real parameters) and also reports keyword-only parameters.
        """
        return 'sample_weight' in inspect.signature(estimator.fit).parameters

    @staticmethod
    def _squeeze_keep_samples(values) -> np.ndarray:
        """Drop length-1 axes from `values`, except the leading (sample) axis.

        For more than one sample this is identical to `np.squeeze`. For a
        single-sample array such as shape `(1, 1)` it yields `(1,)` rather
        than a 0-d array, so the number of samples is preserved.
        """
        arr = np.asarray(values)
        unit_axes = tuple(
            axis for axis in range(1, arr.ndim) if arr.shape[axis] == 1)
        if not unit_axes:
            return arr
        return np.squeeze(arr, axis=unit_axes)

    def fit(self, dataset: Dataset) -> None:
        """Fits scikit-learn model to data.

        Parameters
        ----------
        dataset: Dataset
            The `Dataset` to train this model on.
        """
        X = dataset.X
        y = self._squeeze_keep_samples(dataset.y)
        # Some scikit-learn models don't use weights.
        if self.use_weights:
            w = self._squeeze_keep_samples(dataset.w)
            # Pass the weights by keyword: `sample_weight` is not guaranteed
            # to be the third positional parameter of every estimator's fit().
            self.model.fit(X, y, sample_weight=w)
            return
        self.model.fit(X, y)

    def predict_on_batch(self, X: np.typing.ArrayLike) -> np.ndarray:
        """Makes predictions on batch of data.

        Parameters
        ----------
        X: np.ndarray
            A numpy array of features.

        Returns
        -------
        np.ndarray
            The value is a return value of `predict_proba` or `predict` method
            of the scikit-learn model. If the scikit-learn model has both
            methods, the value is always a return value of `predict_proba`.
        """
        # An AttributeError (raised on lookup or during the call) means
        # probabilities are unavailable; fall back to plain predictions.
        try:
            return self.model.predict_proba(X)
        except AttributeError:
            return self.model.predict(X)

    def predict(
            self,
            X: Dataset,
            transformers: Optional[List[Transformer]] = None
    ) -> OneOrMany[np.ndarray]:
        """Makes predictions on dataset.

        Parameters
        ----------
        X: Dataset
            Dataset to make prediction on.
        transformers: List[Transformer], optional (default None)
            Transformers that the input data has been transformed by. The
            output is passed through these transformers to undo the
            transformations. None is treated as an empty list.
        """
        # Avoid a shared mutable default; hand the base class a fresh list.
        if transformers is None:
            transformers = []
        return super(SklearnModel, self).predict(X, transformers)

    def save(self):
        """Saves scikit-learn model to disk using joblib."""
        save_to_disk(self.model, self.get_model_filename(self.model_dir))

    def reload(self):
        """Loads scikit-learn model from joblib file on disk."""
        self.model = load_from_disk(self.get_model_filename(self.model_dir))