# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""The phishing dataset contains a slightly synthetic dataset of urls - some regular and some used for phishing.
The phishing url dataset contains slightly synthetic dataset of urls - some regular and some used for phishing.
The dataset is based on the `great project <https://github.com/Rohith-2/url_classification_dl>`_ by
`Rohith Ramakrishnan <https://www.linkedin.com/in/rohith-ramakrishnan-54094a1a0/>`_ and others, accompanied by
a `blog post <https://medium.com/nerd-for-tech/url-feature-engineering-and-classification-66c0512fb34d>`_.
The authors have released it under an open license per our request, and for that we are very grateful to them.
This dataset is licensed under the `Creative Commons Zero v1.0 Universal (CC0 1.0)
<https://creativecommons.org/publicdomain/zero/1.0/>`_.
The typical ML task in this dataset is to build a model that predicts the if the url is part of a phishing attack.

Dataset Shape:
    .. list-table:: Dataset Shape
       :widths: 50 50
       :header-rows: 1

       * - Property
         - Value
       * - Samples Total
         - 11.35K
       * - Dimensionality
         - 25
       * - Features
         - real, string
       * - Targets
         - boolean

Description:
    .. list-table:: Dataset Description
       :widths: 50 50 50
       :header-rows: 1

       * - Column name
         - Column Role
         - Description
       * - target
         - Label
         - 0 if the URL is benign, 1 if it is related to phishing
       * - month
         - Data
         - The month this URL was first encountered, as an int
       * - scrape_date
         - Date
         - The exact date this URL was first encountered
       * - ext
         - Feature
         - The domain extension
       * - urlLength
         - Feature
         - The number of characters in the URL
       * - numDigits
         - Feature
         - The number of digits in the URL
       * - numParams
         - Feature
         - The number of query parameters in the URL
       * - num_%20
         - Feature
         - The number of '%20' substrings in the URL
       * - num_@
         - Feature
         - The number of @ characters in the URL
       * - entropy
         - Feature
         - The entropy of the URL
       * - has_ip
         - Feature
         - True if the URL string contains an IP address
       * - hasHttp
         - Feature
         - True if the URL's domain supports http
       * - hasHttps
         - Feature
         - True if the URL's domain supports https
       * - urlIsLive
         - Feature
         - True if the URL was live at the time of scraping
       * - dsr
         - Feature
         - The number of days since domain registration
       * - dse
         - Feature
         - The number of days since domain registration expired
       * - bodyLength
         - Feature
         - The number of characters in the URL's web page
       * - numTitles
         - Feature
         - The number of HTML titles (H1/H2/...) in the page
       * - numImages
         - Feature
         - The number of images in the page
       * - numLinks
         - Feature
         - The number of links in the page
       * - specialChars
         - Feature
         - The number of special characters in the page
       * - scriptLength
         - Feature
         - The number of characters in scripts embedded in the page
       * - sbr
         - Feature
         - The ratio of scriptLength to bodyLength (`= scriptLength / bodyLength`)
       * - bscr
         - Feature
         - The ratio of specialChars to bodyLength (`= specialChars / bodyLength`)
       * - sscr
         - Feature
         - The ratio of scriptLength to specialChars (`= scriptLength / specialChars`)
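
A minimal usage sketch (assuming this module is importable as
``deepchecks.tabular.datasets.classification.phishing`` and the figshare URLs below are reachable)::

    from deepchecks.tabular.datasets.classification.phishing import load_data, load_fitted_model

    train, test = load_data(data_format='Dataset', as_train_test=True)
    model = load_fitted_model()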
"""
import typing as t
from urllib.request import urlopen
import joblib
import pandas as pd
import sklearn
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from deepchecks.tabular.dataset import Dataset
__all__ = ['load_data', 'load_fitted_model']
_MODEL_URL = 'https://figshare.com/ndownloader/files/35122765'
_FULL_DATA_URL = 'https://figshare.com/ndownloader/files/33079757'
_TRAIN_DATA_URL = 'https://ndownloader.figshare.com/files/33079781'
_TEST_DATA_URL = 'https://ndownloader.figshare.com/files/33079787'
_MODEL_VERSION = '1.0.2'
_target = 'target'
_CAT_FEATURES = ['ext']
_NON_FEATURES = ['month', 'has_ip', 'urlIsLive']
_NUM_FEATURES = ['urlLength', 'numDigits', 'numParams', 'num_%20', 'num_@', 'entropy', 'hasHttp', 'hasHttps', 'dsr',
                 'dse', 'bodyLength', 'numTitles', 'numImages', 'numLinks', 'specialChars', 'scriptLength', 'sbr',
                 'bscr', 'sscr']
_DATE_COL = 'scrape_date'
def load_data(data_format: str = 'Dataset', as_train_test: bool = True) -> \
        t.Union[t.Tuple, t.Union[Dataset, pd.DataFrame]]:
    """Load and return the phishing url dataset (classification).

    Parameters
    ----------
    data_format : str , default: Dataset
        Represent the format of the returned value. Can be 'Dataset'|'Dataframe'
        'Dataset' will return the data as a Dataset object
        'Dataframe' will return the data as a pandas Dataframe object
    as_train_test : bool , default: True
        If True, the returned data is split into train and test exactly like the toy model
        was trained. The first return value is the train data and the second is the test data.
        In order to get this model, call the load_fitted_model() function.
        Otherwise, returns a single object.

    Returns
    -------
    dataset : Union[deepchecks.Dataset, pd.DataFrame]
        the data object, corresponding to the data_format attribute.
    train, test : Tuple[Union[deepchecks.Dataset, pd.DataFrame], Union[deepchecks.Dataset, pd.DataFrame]]
        tuple if as_train_test = True. Tuple of two objects representing the dataset split into train and test sets.
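
    Examples
    --------
    A minimal sketch, assuming the figshare URLs above are reachable::

        train_ds, test_ds = load_data()
        full_df = load_data(data_format='Dataframe', as_train_test=False)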
"""
    if not as_train_test:
        dataset = pd.read_csv(_FULL_DATA_URL, index_col=0)
        if data_format == 'Dataset':
            dataset = Dataset(dataset, label=_target, cat_features=_CAT_FEATURES, datetime_name=_DATE_COL)
        return dataset
    else:
        train = pd.read_csv(_TRAIN_DATA_URL, index_col=0)
        test = pd.read_csv(_TEST_DATA_URL, index_col=0)
        if data_format == 'Dataset':
            train = Dataset(train, label=_target, cat_features=_CAT_FEATURES, datetime_name=_DATE_COL)
            test = Dataset(test, label=_target, cat_features=_CAT_FEATURES, datetime_name=_DATE_COL)
        return train, test
def load_fitted_model(pretrained=True):
"""Load and return a fitted regression model to predict the target in the phishing dataset.
Returns
-------
model : Joblib
the model/pipeline that was trained on the phishing dataset.
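
    Examples
    --------
    A minimal sketch, assuming the figshare model URL is reachable and the local scikit-learn
    version matches the one the model was pickled with (otherwise a fresh pipeline is fit)::

        model = load_fitted_model()
        train, test = load_data()
        predictions = model.predict(test.data[test.features])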
"""
    if sklearn.__version__ == _MODEL_VERSION and pretrained:
        with urlopen(_MODEL_URL) as f:
            model = joblib.load(f)
    else:
        model = _build_model()
        train, _ = load_data()
        model.fit(train.data[train.features], train.data[train.label_name])
    return model
class UrlDatasetProcessor:
"""A custom processing pipeline for the phishing URLs dataset."""
def _cols_to_scale(self, df: pd.DataFrame) -> t.List[object]:
return [
i
for i, x in df.dtypes.items()
if pd.api.types.is_numeric_dtype(x) and i != _target
]
def _shared_preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
df[_DATE_COL] = pd.to_datetime(
df[_DATE_COL], format='%Y-%m-%d')
df = df.set_index(keys=_DATE_COL, drop=True)
df = df.drop(_NON_FEATURES, axis=1)
df = pd.get_dummies(df, columns=['ext'])
return df
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""Fit this preprossor on the input dataframe and transform it."""
df = self._shared_preprocess(df)
self.scaler = sklearn.preprocessing.StandardScaler()
self.scale_cols = self._cols_to_scale(df)
df[self.scale_cols] = self.scaler.fit_transform(df[self.scale_cols])
return df
def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""Transform the input dataframe using this fitted preprossor."""
df = self._shared_preprocess(df)
try:
df[self.scale_cols] = self.scaler.transform(df[self.scale_cols])
return df
except AttributeError as e:
raise Exception(
'UrlDatasetProcessor is unfitted! Call fit_transform() first!'
) from e
def get_url_preprocessor():
"""Return a data processor object for the phishing URL dataset."""
return UrlDatasetProcessor()
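
# A minimal usage sketch for the processor above (kept as comments so nothing runs on
# import; variable names are illustrative and load_data() must succeed first):
#
#     processor = get_url_preprocessor()
#     train_df, test_df = load_data(data_format='Dataframe')
#     train_processed = processor.fit_transform(train_df)
#     test_processed = processor.transform(test_df)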
def _build_model():
"""Build the model to fit."""
return Pipeline(steps=[
('preprocessing',
ColumnTransformer(transformers=[('num', SimpleImputer(),
_NUM_FEATURES),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('encoder',
OneHotEncoder())]),
_CAT_FEATURES)])),
('model',
RandomForestClassifier(criterion='entropy', n_estimators=40, random_state=0))])
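

if __name__ == '__main__':
    # A small optional demo, sketched under the assumption that the figshare URLs above
    # are reachable: fit a freshly built pipeline on the train split and report accuracy
    # on the test split (this mirrors the fallback branch of load_fitted_model).
    train, test = load_data()
    model = _build_model()
    model.fit(train.data[train.features], train.data[train.label_name])
    accuracy = model.score(test.data[test.features], test.data[test.label_name])
    print(f'Test accuracy: {accuracy:.3f}')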