Merged
349 changes: 0 additions & 349 deletions dragnet/data_processing.py

Large diffs are not rendered by default.

147 changes: 77 additions & 70 deletions dragnet/extractor.py
@@ -4,8 +4,9 @@
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import ExtraTreesClassifier

-from .compat import string_, str_cast
+from .compat import string_, str_cast, unicode_
from .util import get_and_union_features
+from .blocks import TagCountNoCSSReadabilityBlockifier


class Extractor(BaseEstimator, ClassifierMixin):
@@ -36,7 +37,7 @@ class Extractor(BaseEstimator, ClassifierMixin):
``predict_proba()`` method.
"""

-def __init__(self, blockifier,
+def __init__(self, blockifier=TagCountNoCSSReadabilityBlockifier,
features=('kohlschuetter', 'weninger', 'readability'),
model=None,
to_extract='content', prob_threshold=0.5, max_block_weight=200):
@@ -65,7 +66,7 @@ def features(self):
def features(self, feats):
self._features = get_and_union_features(feats)

-def fit(self, blocks, labels, weights=None):
+def fit(self, documents, labels, weights=None):
"""
Fit :class:`Extractor` features and model to a training dataset.

@@ -77,16 +78,25 @@ def fit(self, blocks, labels, weights=None):
Returns:
:class:`Extractor`
"""
-features_mat = self.features.fit_transform(blocks)
+block_groups = np.array([self.blockifier.blockify(doc) for doc in documents])
+mask = [self._has_enough_blocks(blocks) for blocks in block_groups]
+block_groups = block_groups[mask]
+labels = np.concatenate(np.array(labels)[mask])
+
+# TODO: This only 'fit's one doc at a time. No feature fitting actually
+# happens for now, but this might be important if the features change
+features_mat = np.concatenate([self.features.fit_transform(blocks)
+                               for blocks in block_groups])
if weights is None:
self.model.fit(features_mat, labels)
else:
+weights = np.concatenate(np.array(weights)[mask])
self.model.fit(features_mat, labels, sample_weight=weights)
return self

-def concatenate_data(self, data):
+def get_html_labels_weights(self, data):
"""
-Concatenate the blocks, labels, and weights of many files' data.
+Gather the html, labels, and weights of many files' data.
Primarily useful for training/testing an :class:`Extractor`.

Args:
@@ -96,19 +106,16 @@ def concatenate_data(self, data):
Tuple[np.array(str), np.array(int), np.array(int)]: All html, all
labels, and all weights, respectively.
"""
-all_blocks = []
-all_labels = np.empty(0, dtype=int)
-all_weights = np.empty(0, dtype=int)
+all_html = []
+all_labels = []
+all_weights = []
for html, content, comments in data:
-blocks = self.blockifier.blockify(html)
-if not self._has_enough_blocks(blocks):
-continue
-all_blocks.extend(blocks)
-labels, weights, _ = self._get_labels_and_weights(
+all_html.append(html)
+labels, weights = self._get_labels_and_weights(
content, comments)
-all_labels = np.hstack((all_labels, labels))
-all_weights = np.hstack((all_weights, weights))
-return all_blocks, all_labels, all_weights
+all_labels.append(labels)
+all_weights.append(weights)
+return np.array(all_html), np.array(all_labels), np.array(all_weights)

def _has_enough_blocks(self, blocks):
if len(blocks) < 3:
@@ -126,29 +133,22 @@ def _get_labels_and_weights(self, content, comments):
Returns:
Tuple[np.array[int], np.array[int], List[str]]
"""
-# TODO: get rid of the third element here and elsewhere?
+# extract content and comments
if 'content' in self.to_extract and 'comments' in self.to_extract:
-if self.max_block_weight is None:
-return (np.logical_or(content[0], comments[0]).astype(int),
-content[1],
-content[2] + comments[2])
-else:
-return (np.logical_or(content[0], comments[0]).astype(int),
-np.minimum(content[1], self.max_block_weight),
-content[2] + comments[2])
+labels = np.logical_or(content[0], comments[0]).astype(int)
+weights = content[1]  # no trailing comma: a stray "," here would make weights a 1-tuple
# extract content only
elif 'content' in self.to_extract:
-if self.max_block_weight is None:
-return content
-else:
-return (content[0], np.minimum(content[1], self.max_block_weight), content[2])
+labels = content[0]
+weights = content[1]
# extract comments only
else:
-if self.max_block_weight is None:
-return comments
-else:
-return (comments[0], np.minimum(comments[1], self.max_block_weight), comments[2])
+labels = comments[0]
+weights = comments[1]
+if self.max_block_weight is not None:  # "is None" here would pass None into np.minimum
+weights = np.minimum(weights, self.max_block_weight)

+return labels, weights

def extract(self, html, encoding=None, as_blocks=False):
"""
@@ -166,55 +166,62 @@ def extract(self, html, encoding=None, as_blocks=False):
Returns:
str or List[Block]
"""
-blocks = self.blockifier.blockify(html, encoding=encoding)
-return self.extract_from_blocks(blocks, as_blocks=as_blocks)
+preds, blocks = self.predict(html, encoding=encoding, return_blocks=True)
+if as_blocks is False:
+return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds)))
+else:
+return [blocks[ind] for ind in np.flatnonzero(preds)]


-def extract_from_blocks(self, blocks, as_blocks=False):
+def predict(self, documents, **kwargs):
"""
-Extract the main content and/or comments from a sequence of (all) blocks
-and return it as a string or as a sequence of block objects.
+Predict class (content=1 or not-content=0) of the blocks in one or many
+HTML document(s).

Args:
-blocks (List[Block]): Blockify'd HTML document.
-as_blocks (bool): If False, return the main content as a combined
-string; if True, return the content-holding blocks as a list of
-block objects.
+documents (str or List[str]): HTML document(s)

Returns:
-str or List[Block]
+``np.ndarray`` or List[``np.ndarray``]: array of binary predictions
+for content (1) or not-content (0).
"""
-if not self._has_enough_blocks(blocks):
-if as_blocks is False:
-return ''
-else:
-return []
-features_mat = self.features.transform(blocks)
-if self.prob_threshold is None:
-preds = self.model.predict(features_mat)
-else:
-self._positive_idx = (
-self._positive_idx or list(self.model.classes_).index(1))
-preds = (self.model.predict_proba(features_mat) > self.prob_threshold)[:, self._positive_idx]
-if as_blocks is False:
-return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds)))
+if isinstance(documents, (str, bytes, unicode_, np.unicode_)):
+return self._predict_one(documents, **kwargs)
else:
-return [blocks[ind] for ind in np.flatnonzero(preds)]
+return np.concatenate([self._predict_one(doc, **kwargs) for doc in documents])

-def predict(self, blocks):
+def _predict_one(self, document, encoding=None, return_blocks=False):
"""
-Predict class (content=1 or not-content=0) of each block in a sequence.
+Predict class (content=1 or not-content=0) of each block in an HTML
+document.

Args:
-blocks (List[Block]): Blockify'd HTML document.
+document (str): HTML document

Returns:
-``np.ndarray``: 1D array of block-level, binary predictions for
-content (1) or not-content (0).
+``np.ndarray``: array of binary predictions for content (1) or
+not-content (0).
"""
-features_mat = self.features.transform(blocks)
-if self.prob_threshold is None:
-return self.model.predict(features_mat)
+# blockify
+blocks = self.blockifier.blockify(document, encoding=encoding)
+# get features
+try:
+features = self.features.transform(blocks)
+except ValueError:  # Can't make features, predict no content
+preds = np.zeros(len(blocks))
+# make predictions
else:
-self._positive_idx = (
-self._positive_idx or list(self.model.classes_).index(1))
-return (self.model.predict_proba(features_mat) > self.prob_threshold)[:, self._positive_idx].astype(int)
+if self.prob_threshold is None:
+preds = self.model.predict(features)
+else:
+self._positive_idx = (
+self._positive_idx or list(self.model.classes_).index(1))
+preds = self.model.predict_proba(features) > self.prob_threshold
+preds = preds[:, self._positive_idx].astype(int)

+if return_blocks:
+return preds, blocks
+else:
+return preds
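
Taken together, the extractor changes move the public API from pre-blockified input to raw HTML: fit() and predict() now accept one or many HTML documents, and extract() becomes a thin wrapper around predict(..., return_blocks=True). A minimal usage sketch under that reading (train_html and train_labels are hypothetical stand-ins for data prepared as in model_training.py):

    from dragnet.extractor import Extractor

    # A bare Extractor() is usable now that blockifier has a default;
    # train_html is a list of HTML strings, train_labels a list of
    # per-document block-level 0/1 arrays (both assumed here).
    extractor = Extractor(to_extract='content', prob_threshold=0.5)
    extractor.fit(train_html, train_labels)

    # predict() dispatches on input type: one HTML string yields one array of
    # block-level predictions; a list of documents yields a concatenated array.
    block_preds = extractor.predict(train_html[0])

    content = extractor.extract(train_html[0])                 # joined text of content blocks
    blocks = extractor.extract(train_html[0], as_blocks=True)  # the Block objects themselves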

2 changes: 1 addition & 1 deletion dragnet/features/weninger.py
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@ class WeningerFeatures(BaseEstimator, TransformerMixin):
__name__ = 'weninger'

def __init__(self, sigma=1.0):
-self.sigma = 1.0
+self.sigma = sigma

def fit(self, blocks, y=None):
"""
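
This one-line fix is easy to undersell: __init__() previously ignored its sigma argument and hard-coded 1.0, so values passed directly or via scikit-learn's set_params()/cloning (e.g. during a grid search) were silently discarded. A quick sanity check of the corrected behavior (a sketch, assuming this PR's package layout):

    from dragnet.features.weninger import WeningerFeatures

    feats = WeningerFeatures(sigma=2.5)
    assert feats.sigma == 2.5                  # was 1.0 before this fix
    assert feats.get_params()['sigma'] == 2.5  # sklearn param plumbing round-trips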
62 changes: 43 additions & 19 deletions dragnet/model_training.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import logging
import os
import pprint
+import numpy as np

from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
@@ -29,6 +30,13 @@ def evaluate_model_predictions(y_true, y_pred, weights=None):
Returns:
Dict[str, float]
"""
+if isinstance(y_pred[0], np.ndarray):
+y_pred = np.concatenate(y_pred)
+if isinstance(y_true[0], np.ndarray):
+y_true = np.concatenate(y_true)
+if (weights is not None) and (isinstance(weights[0], np.ndarray)):
+weights = np.concatenate(weights)

accuracy = accuracy_score(
y_true, y_pred, normalize=True, sample_weight=weights)
precision = precision_score(
@@ -85,25 +93,27 @@ def train_model(extractor, data_dir, output_dir=None):
# set up directories and file naming
output_dir, fname_prefix = _set_up_output_dir_and_fname_prefix(output_dir, extractor)

-# prepare, split, and concatenate the data
+# prepare and split the data
logging.info('preparing, splitting, and concatenating the data...')
data = prepare_all_data(data_dir)
training_data, test_data = train_test_split(
data, test_size=0.2, random_state=42)
-train_blocks, train_labels, train_weights = extractor.concatenate_data(training_data)
-test_blocks, test_labels, test_weights = extractor.concatenate_data(test_data)
+train_html, train_labels, train_weights = extractor.get_html_labels_weights(training_data)
+test_html, test_labels, test_weights = extractor.get_html_labels_weights(test_data)

# fit the extractor on training data
# then evaluate it on train and test data
logging.info('fitting and evaluating the extractor features and model...')
try:
-extractor.fit(train_blocks, train_labels, weights=train_weights)
+extractor.fit(train_html, train_labels, weights=train_weights)
except (TypeError, ValueError):
-extractor.fit(train_blocks, train_labels)
+extractor.fit(train_html, train_labels)
train_eval = evaluate_model_predictions(
-train_labels, extractor.predict(train_blocks))
+np.concatenate(train_labels), extractor.predict(train_html),
+np.concatenate(train_weights))
test_eval = evaluate_model_predictions(
-test_labels, extractor.predict(test_blocks))
+np.concatenate(test_labels), extractor.predict(test_html),
+np.concatenate(test_weights))

# report model performance
_report_model_performance(output_dir, fname_prefix, train_eval, test_eval)
@@ -149,36 +159,50 @@ def train_many_models(extractor, param_grid, data_dir, output_dir=None,
# set up directories and file naming
output_dir, fname_prefix = _set_up_output_dir_and_fname_prefix(output_dir, extractor)

-# prepare, split, and concatenate the data
-logging.info('preparing, splitting, and concatenating the data...')
+# prepare and split the data
+logging.info('preparing and splitting the data...')
data = prepare_all_data(data_dir)
training_data, test_data = train_test_split(
data, test_size=0.2, random_state=42)
-train_blocks, train_labels, train_weights = extractor.concatenate_data(training_data)
-test_blocks, test_labels, test_weights = extractor.concatenate_data(test_data)
+train_html, train_labels, train_weights = extractor.get_html_labels_weights(training_data)
+test_html, test_labels, test_weights = extractor.get_html_labels_weights(test_data)

+# filter docs we can't get features from
+train_blocks = np.array([extractor.blockifier.blockify(doc)
+for doc in train_html])
+train_mask = [extractor._has_enough_blocks(blocks) for blocks in train_blocks]
+train_blocks = train_blocks[train_mask]
+train_labels = np.concatenate(train_labels[train_mask])
+train_weights = np.concatenate(train_weights[train_mask])
+test_labels = np.concatenate(test_labels)
+test_weights = np.concatenate(test_weights)
+# get features
+# TODO: This only 'fit's one doc at a time. No feature fitting actually
+# happens for now, but this might be important if the features change
+train_features = np.concatenate([extractor.features.fit_transform(blocks)
+for blocks in train_blocks])

# fit many models
gscv = GridSearchCV(
-extractor, param_grid, fit_params={'weights': train_weights},
+extractor.model, param_grid, fit_params={'sample_weight': train_weights},
scoring=kwargs.get('scoring', 'f1'), cv=kwargs.get('cv', 5),
n_jobs=kwargs.get('n_jobs', 1), verbose=kwargs.get('verbose', 1))
-gscv = gscv.fit(train_blocks, train_labels)
+gscv = gscv.fit(train_features, train_labels)

logging.info('Score of the best model, on left-out data: %s', gscv.best_score_)
logging.info('Params of the best model: %s', gscv.best_params_)

# evaluate best model on train and test data
-best_extractor = gscv.best_estimator_
+extractor.model = gscv.best_estimator_
train_eval = evaluate_model_predictions(
-train_labels, best_extractor.predict(train_blocks))
+train_labels, extractor.predict(train_html[train_mask]), weights=train_weights)
test_eval = evaluate_model_predictions(
-test_labels, best_extractor.predict(test_blocks))
-_report_model_performance(output_dir, fname_prefix, train_eval, test_eval)
+test_labels, extractor.predict(test_html), weights=test_weights)

# pickle the final model
-_write_model_to_disk(output_dir, fname_prefix, best_extractor)
+_write_model_to_disk(output_dir, fname_prefix, extractor)

-return best_extractor
+return extractor


def _set_up_output_dir_and_fname_prefix(output_dir, extractor):
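
For orientation, a hedged sketch of driving the reworked training entry point end to end (the directory paths are hypothetical, and the on-disk layout expected by prepare_all_data() is assumed):

    import logging

    from dragnet.extractor import Extractor
    from dragnet.model_training import train_model

    logging.basicConfig(level=logging.INFO)

    # train_model() now blockifies internally: it splits the prepared data
    # 80/20, fits the extractor on raw HTML, reports block-level accuracy/
    # precision/recall/f1 for both splits, and pickles the fitted model.
    extractor = Extractor(to_extract='content', prob_threshold=0.5)
    train_model(extractor, 'path/to/dragnet_data', output_dir='path/to/models')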
@@ -0,0 +1,10 @@
Training errors for final model (block level):
{'accuracy': 0.8888867678537985,
'f1': 0.8135523962386007,
'precision': 0.8147783353737416,
'recall': 0.8123301407281905}
Test errors for final model (block level):
{'accuracy': 0.9155827171056652,
'f1': 0.8185729821740981,
'precision': 0.8098381446406853,
'recall': 0.8274983004758668}
@@ -0,0 +1,10 @@
Training errors for final model (block level):
{'accuracy': 0.9205167323012377,
'f1': 0.9404319043625426,
'precision': 0.9184194567439528,
'recall': 0.9635254409178062}
Test errors for final model (block level):
{'accuracy': 0.8960239564871967,
'f1': 0.9161795830853575,
'precision': 0.8729650256683729,
'recall': 0.9638954889057517}
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
Training errors for final model (block level):
{'accuracy': 0.9049718380999011,
'f1': 0.8650992736751364,
'precision': 0.8685604952034155,
'recall': 0.8616655286517149}
Test errors for final model (block level):
{'accuracy': 0.9211562671881685,
'f1': 0.8931811181654694,
'precision': 0.8712090865626898,
'recall': 0.9162901000930941}
Binary file not shown.