Merged
349 changes: 0 additions & 349 deletions dragnet/data_processing.py

Large diffs are not rendered by default.

147 changes: 77 additions & 70 deletions dragnet/extractor.py
@@ -4,8 +4,9 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier

-from .compat import string_, str_cast
+from .compat import string_, str_cast, unicode_
from .util import get_and_union_features
+from .blocks import TagCountNoCSSReadabilityBlockifier


class Extractor(BaseEstimator, ClassifierMixin):
@@ -36,7 +37,7 @@ class Extractor(BaseEstimator, ClassifierMixin):
``predict_proba()`` method.
"""

-def __init__(self, blockifier,
+def __init__(self, blockifier=TagCountNoCSSReadabilityBlockifier,
features=('kohlschuetter', 'weninger', 'readability'),
model=None,
to_extract='content', prob_threshold=0.5, max_block_weight=200):
@@ -65,7 +66,7 @@ def features(self):
def features(self, feats):
self._features = get_and_union_features(feats)

-def fit(self, blocks, labels, weights=None):
+def fit(self, documents, labels, weights=None):
"""
Fit :class:`Extractor` features and model to a training dataset.

@@ -77,16 +78,25 @@ def fit(self, blocks, labels, weights=None):
Returns:
:class:`Extractor`
"""
-features_mat = self.features.fit_transform(blocks)
+block_groups = np.array([self.blockifier.blockify(doc) for doc in documents])
+mask = [self._has_enough_blocks(blocks) for blocks in block_groups]
+block_groups = block_groups[mask]
+labels = np.concatenate(np.array(labels)[mask])
+
+# TODO: This only 'fit's one doc at a time. No feature fitting actually
+# happens for now, but this might be important if the features change
+features_mat = np.concatenate([self.features.fit_transform(blocks)
+                               for blocks in block_groups])
if weights is None:
self.model.fit(features_mat, labels)
else:
+weights = np.concatenate(np.array(weights)[mask])
self.model.fit(features_mat, labels, sample_weight=weights)
return self

-def concatenate_data(self, data):
+def get_html_labels_weights(self, data):
"""
-Concatenate the blocks, labels, and weights of many files' data.
+Gather the html, labels, and weights of many files' data.
Primarily useful for training/testing an :class:`Extractor`.

Args:
@@ -96,19 +106,16 @@ def concatenate_data(self, data):
Tuple[np.array(str), np.array(int), np.array(int)]: All html, all
labels, and all weights, respectively.
"""
-all_blocks = []
-all_labels = np.empty(0, dtype=int)
-all_weights = np.empty(0, dtype=int)
+all_html = []
+all_labels = []
+all_weights = []
for html, content, comments in data:
-blocks = self.blockifier.blockify(html)
-if not self._has_enough_blocks(blocks):
-continue
-all_blocks.extend(blocks)
-labels, weights, _ = self._get_labels_and_weights(
+all_html.append(html)
+labels, weights = self._get_labels_and_weights(
content, comments)
-all_labels = np.hstack((all_labels, labels))
-all_weights = np.hstack((all_weights, weights))
-return all_blocks, all_labels, all_weights
+all_labels.append(labels)
+all_weights.append(weights)
+return np.array(all_html), np.array(all_labels), np.array(all_weights)

def _has_enough_blocks(self, blocks):
if len(blocks) < 3:
@@ -126,29 +133,22 @@ def _get_labels_and_weights(self, content, comments):
Returns:
Tuple[np.array[int], np.array[int], List[str]]
"""
-# TODO: get rid of the third element here and elsewhere?
+# extract content and comments
if 'content' in self.to_extract and 'comments' in self.to_extract:
-if self.max_block_weight is None:
-return (np.logical_or(content[0], comments[0]).astype(int),
-content[1],
-content[2] + comments[2])
-else:
-return (np.logical_or(content[0], comments[0]).astype(int),
-np.minimum(content[1], self.max_block_weight),
-content[2] + comments[2])
+labels = np.logical_or(content[0], comments[0]).astype(int)
+weights = content[1]  # no trailing comma: a stray "," here would make weights a 1-tuple
# extract content only
elif 'content' in self.to_extract:
-if self.max_block_weight is None:
-return content
-else:
-return (content[0], np.minimum(content[1], self.max_block_weight), content[2])
+labels = content[0]
+weights = content[1]
# extract comments only
else:
-if self.max_block_weight is None:
-return comments
-else:
-return (comments[0], np.minimum(comments[1], self.max_block_weight), comments[2])
+labels = comments[0]
+weights = comments[1]
+if self.max_block_weight is not None:  # "is None" here would pass None into np.minimum
+weights = np.minimum(weights, self.max_block_weight)

+return labels, weights

def extract(self, html, encoding=None, as_blocks=False):
"""
@@ -166,55 +166,62 @@ def extract(self, html, encoding=None, as_blocks=False):
Returns:
str or List[Block]
"""
-blocks = self.blockifier.blockify(html, encoding=encoding)
-return self.extract_from_blocks(blocks, as_blocks=as_blocks)
+preds, blocks = self.predict(html, encoding=encoding, return_blocks=True)
+if as_blocks is False:
+return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds)))
+else:
+return [blocks[ind] for ind in np.flatnonzero(preds)]


-def extract_from_blocks(self, blocks, as_blocks=False):
+def predict(self, documents, **kwargs):
"""
-Extract the main content and/or comments from a sequence of (all) blocks
-and return it as a string or as a sequence of block objects.
+Predict class (content=1 or not-content=0) of the blocks in one or many
+HTML document(s).

Args:
-blocks (List[Block]): Blockify'd HTML document.
-as_blocks (bool): If False, return the main content as a combined
-string; if True, return the content-holding blocks as a list of
-block objects.
+documents (str or List[str]): HTML document(s)

Returns:
-str or List[Block]
+``np.ndarray`` or List[``np.ndarray``]: array of binary predictions
+for content (1) or not-content (0).
"""
-if not self._has_enough_blocks(blocks):
-if as_blocks is False:
-return ''
-else:
-return []
-features_mat = self.features.transform(blocks)
-if self.prob_threshold is None:
-preds = self.model.predict(features_mat)
-else:
-self._positive_idx = (
-self._positive_idx or list(self.model.classes_).index(1))
-preds = (self.model.predict_proba(features_mat) > self.prob_threshold)[:, self._positive_idx]
-if as_blocks is False:
-return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds)))
+if isinstance(documents, (str, bytes, unicode_, np.unicode_)):
+return self._predict_one(documents, **kwargs)
else:
-return [blocks[ind] for ind in np.flatnonzero(preds)]
+return np.concatenate([self._predict_one(doc, **kwargs) for doc in documents])

-def predict(self, blocks):
+def _predict_one(self, document, encoding=None, return_blocks=False):
"""
-Predict class (content=1 or not-content=0) of each block in a sequence.
+Predict class (content=1 or not-content=0) of each block in an HTML
+document.

Args:
-blocks (List[Block]): Blockify'd HTML document.
+document (str): HTML document

Returns:
-``np.ndarray``: 1D array of block-level, binary predictions for
-content (1) or not-content (0).
+``np.ndarray``: array of binary predictions for content (1) or
+not-content (0).
"""
-features_mat = self.features.transform(blocks)
-if self.prob_threshold is None:
-return self.model.predict(features_mat)
+# blockify
+blocks = self.blockifier.blockify(document, encoding=encoding)
+# get features
+try:
+features = self.features.transform(blocks)
+except ValueError:  # Can't make features, predict no content
+preds = np.zeros(len(blocks))
+# make predictions
else:
-self._positive_idx = (
-self._positive_idx or list(self.model.classes_).index(1))
-return (self.model.predict_proba(features_mat) > self.prob_threshold)[:, self._positive_idx].astype(int)
+if self.prob_threshold is None:
+preds = self.model.predict(features)
+else:
+self._positive_idx = (
+self._positive_idx or list(self.model.classes_).index(1))
+preds = self.model.predict_proba(features) > self.prob_threshold
+preds = preds[:, self._positive_idx].astype(int)

+if return_blocks:
+return preds, blocks
+else:
+return preds
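
Taken together, the extractor changes move the public API from pre-blockified input to raw HTML: fit() and predict() now accept one or many HTML documents, and extract() becomes a thin wrapper around predict(..., return_blocks=True). A minimal usage sketch under that reading (train_html and train_labels are hypothetical stand-ins for data prepared as in model_training.py):

    from dragnet.extractor import Extractor

    # A bare Extractor() is usable now that blockifier has a default;
    # train_html is a list of HTML strings, train_labels a list of
    # per-document block-level 0/1 arrays (both assumed here).
    extractor = Extractor(to_extract='content', prob_threshold=0.5)
    extractor.fit(train_html, train_labels)

    # predict() dispatches on input type: one HTML string yields one array of
    # block-level predictions; a list of documents yields a concatenated array.
    block_preds = extractor.predict(train_html[0])

    content = extractor.extract(train_html[0])                 # joined text of content blocks
    blocks = extractor.extract(train_html[0], as_blocks=True)  # the Block objects themselves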

2 changes: 1 addition & 1 deletion dragnet/features/weninger.py
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@ class WeningerFeatures(BaseEstimator, TransformerMixin):
__name__ = 'weninger'

def __init__(self, sigma=1.0):
-self.sigma = 1.0
+self.sigma = sigma

def fit(self, blocks, y=None):
"""
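
This one-line fix is easy to undersell: __init__() previously ignored its sigma argument and hard-coded 1.0, so values passed directly or via scikit-learn's set_params()/cloning (e.g. during a grid search) were silently discarded. A quick sanity check of the corrected behavior (a sketch, assuming this PR's package layout):

    from dragnet.features.weninger import WeningerFeatures

    feats = WeningerFeatures(sigma=2.5)
    assert feats.sigma == 2.5                  # was 1.0 before this fix
    assert feats.get_params()['sigma'] == 2.5  # sklearn param plumbing round-trips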
62 changes: 43 additions & 19 deletions dragnet/model_training.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import logging
import os
import pprint
+import numpy as np

from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -29,6 +30,13 @@ def evaluate_model_predictions(y_true, y_pred, weights=None):
Returns:
Dict[str, float]
"""
+if isinstance(y_pred[0], np.ndarray):
+y_pred = np.concatenate(y_pred)
+if isinstance(y_true[0], np.ndarray):
+y_true = np.concatenate(y_true)
+if (weights is not None) and (isinstance(weights[0], np.ndarray)):
+weights = np.concatenate(weights)

accuracy = accuracy_score(
y_true, y_pred, normalize=True, sample_weight=weights)
precision = precision_score(
@@ -85,25 +93,27 @@ def train_model(extractor, data_dir, output_dir=None):
# set up directories and file naming
output_dir, fname_prefix = _set_up_output_dir_and_fname_prefix(output_dir, extractor)

-# prepare, split, and concatenate the data
+# prepare and split the data
logging.info('preparing, splitting, and concatenating the data...')
data = prepare_all_data(data_dir)
training_data, test_data = train_test_split(
data, test_size=0.2, random_state=42)
-train_blocks, train_labels, train_weights = extractor.concatenate_data(training_data)
-test_blocks, test_labels, test_weights = extractor.concatenate_data(test_data)
+train_html, train_labels, train_weights = extractor.get_html_labels_weights(training_data)
+test_html, test_labels, test_weights = extractor.get_html_labels_weights(test_data)

# fit the extractor on training data
# then evaluate it on train and test data
logging.info('fitting and evaluating the extractor features and model...')
try:
-extractor.fit(train_blocks, train_labels, weights=train_weights)
+extractor.fit(train_html, train_labels, weights=train_weights)
except (TypeError, ValueError):
-extractor.fit(train_blocks, train_labels)
+extractor.fit(train_html, train_labels)
train_eval = evaluate_model_predictions(
-train_labels, extractor.predict(train_blocks))
+np.concatenate(train_labels), extractor.predict(train_html),
+np.concatenate(train_weights))
test_eval = evaluate_model_predictions(
-test_labels, extractor.predict(test_blocks))
+np.concatenate(test_labels), extractor.predict(test_html),
+np.concatenate(test_weights))

# report model performance
_report_model_performance(output_dir, fname_prefix, train_eval, test_eval)
@@ -149,36 +159,50 @@ def train_many_models(extractor, param_grid, data_dir, output_dir=None,
# set up directories and file naming
output_dir, fname_prefix = _set_up_output_dir_and_fname_prefix(output_dir, extractor)

-# prepare, split, and concatenate the data
-logging.info('preparing, splitting, and concatenating the data...')
+# prepare and split the data
+logging.info('preparing and splitting the data...')
data = prepare_all_data(data_dir)
training_data, test_data = train_test_split(
data, test_size=0.2, random_state=42)
-train_blocks, train_labels, train_weights = extractor.concatenate_data(training_data)
-test_blocks, test_labels, test_weights = extractor.concatenate_data(test_data)
+train_html, train_labels, train_weights = extractor.get_html_labels_weights(training_data)
+test_html, test_labels, test_weights = extractor.get_html_labels_weights(test_data)

+# filter docs we can't get features from
+train_blocks = np.array([extractor.blockifier.blockify(doc)
+for doc in train_html])
+train_mask = [extractor._has_enough_blocks(blocks) for blocks in train_blocks]
+train_blocks = train_blocks[train_mask]
+train_labels = np.concatenate(train_labels[train_mask])
+train_weights = np.concatenate(train_weights[train_mask])
+test_labels = np.concatenate(test_labels)
+test_weights = np.concatenate(test_weights)
+# get features
+# TODO: This only 'fit's one doc at a time. No feature fitting actually
+# happens for now, but this might be important if the features change
+train_features = np.concatenate([extractor.features.fit_transform(blocks)
+for blocks in train_blocks])

# fit many models
gscv = GridSearchCV(
-extractor, param_grid, fit_params={'weights': train_weights},
+extractor.model, param_grid, fit_params={'sample_weight': train_weights},
scoring=kwargs.get('scoring', 'f1'), cv=kwargs.get('cv', 5),
n_jobs=kwargs.get('n_jobs', 1), verbose=kwargs.get('verbose', 1))
-gscv = gscv.fit(train_blocks, train_labels)
+gscv = gscv.fit(train_features, train_labels)

logging.info('Score of the best model, on left-out data: %s', gscv.best_score_)
logging.info('Params of the best model: %s', gscv.best_params_)

# evaluate best model on train and test data
-best_extractor = gscv.best_estimator_
+extractor.model = gscv.best_estimator_
train_eval = evaluate_model_predictions(
-train_labels, best_extractor.predict(train_blocks))
+train_labels, extractor.predict(train_html[train_mask]), weights=train_weights)
test_eval = evaluate_model_predictions(
-test_labels, best_extractor.predict(test_blocks))
-_report_model_performance(output_dir, fname_prefix, train_eval, test_eval)
+test_labels, extractor.predict(test_html), weights=test_weights)

# pickle the final model
-_write_model_to_disk(output_dir, fname_prefix, best_extractor)
+_write_model_to_disk(output_dir, fname_prefix, extractor)

-return best_extractor
+return extractor


def _set_up_output_dir_and_fname_prefix(output_dir, extractor):
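
For orientation, a hedged sketch of driving the reworked training entry point end to end (the directory paths are hypothetical, and the on-disk layout expected by prepare_all_data() is assumed):

    import logging

    from dragnet.extractor import Extractor
    from dragnet.model_training import train_model

    logging.basicConfig(level=logging.INFO)

    # train_model() now blockifies internally: it splits the prepared data
    # 80/20, fits the extractor on raw HTML, reports block-level accuracy/
    # precision/recall/f1 for both splits, and pickles the fitted model.
    extractor = Extractor(to_extract='content', prob_threshold=0.5)
    train_model(extractor, 'path/to/dragnet_data', output_dir='path/to/models')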
@@ -0,0 +1,10 @@
Training errors for final model (block level):
{'accuracy': 0.8888867678537985,
'f1': 0.8135523962386007,
'precision': 0.8147783353737416,
'recall': 0.8123301407281905}
Test errors for final model (block level):
{'accuracy': 0.9155827171056652,
'f1': 0.8185729821740981,
'precision': 0.8098381446406853,
'recall': 0.8274983004758668}
@@ -0,0 +1,10 @@
Training errors for final model (block level):
{'accuracy': 0.9205167323012377,
'f1': 0.9404319043625426,
'precision': 0.9184194567439528,
'recall': 0.9635254409178062}
Test errors for final model (block level):
{'accuracy': 0.8960239564871967,
'f1': 0.9161795830853575,
'precision': 0.8729650256683729,
'recall': 0.9638954889057517}
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
Training errors for final model (block level):
{'accuracy': 0.9049718380999011,
'f1': 0.8650992736751364,
'precision': 0.8685604952034155,
'recall': 0.8616655286517149}
Test errors for final model (block level):
{'accuracy': 0.9211562671881685,
'f1': 0.8931811181654694,
'precision': 0.8712090865626898,
'recall': 0.9162901000930941}
Binary file not shown.