diff --git a/dragnet/data_processing.py b/dragnet/data_processing.py index a43c4e9..95acf46 100644 --- a/dragnet/data_processing.py +++ b/dragnet/data_processing.py @@ -401,352 +401,3 @@ def prepare_all_data(data_dir, block_pct_tokens_thresh=0.1): return [prepare_data(data_dir, fileroot, block_pct_tokens_thresh) for fileroot in gs_blocks_fileroots] - -# class DragnetModelData(object): -# """ -# the data needed to train a model -# includes the html, the gold standard tokens -# -# a datadir with the training data directory structure -# each training data document has a number of files with a common -# "fileroot" and a set of additional files in subdirectories -# HTML / fileroot.html -# Corrected / fileroot.html.corrected.txt = cut and paste content -# from the HTML -# block_corrected / fileroot.block_corrected.txt -# source = one of 'all', 'domain_list', 'technoratti', 'reader' -# """ -# def __init__(self, datadir, block_percent_threshold=0.1, source='all'): -# # set the re_source = a regex that can be used on fileroot -# # to eliminate files based on source -# if source == 'technoratti': -# re_keep = '^T[0-9]+' -# elif source == 'domain_list': -# re_keep = '(^[0-9])|(^[a-zA-Z]{2})' -# elif source == 'reader': -# re_keep = '^R[0-9]+' -# elif source == 'all': -# re_keep = '' # match anything -# else: -# raise ValueError("Invalid source") -# self._re_source = re.compile(re_keep) -# self._source = source -# -# # now read in all the data -# self._read_all_data(datadir, block_percent_threshold, source) -# -# def _read_all_data(self, datadir, block_percent_threshold, source): -# """ -# block_percent_threshold = the cut-off percent of all tokens in a block -# that are in the gold standard, above which the block is -# classified as content -# stores attributes .training_data, .test_data where each is a list -# of tuples: -# (raw_html_string, -# content_gold_standard, comments_gold_standard, encoding) -# where content/comments gold_standard = -# (list of block 0/1 flag, list of # tokens, all tokens as a list) -# encoding is the encoding from tag for cleaneval, -# otherwise it is None -# stores attributes .training_files, .test_files where each is a list -# of the file names -# """ -# self.training_data = [] -# self.test_data = [] -# self.training_files = [] -# self.test_files = [] -# -# training_fileroot = set(open(datadir + '/training.txt', 'r').read().strip().split()) -# print("Reading the training and test data...") -# for file, fileroot in get_list_all_corrected_files(datadir): -# if self._re_source.match(fileroot): -# html, encoding = read_HTML_file(datadir, fileroot) -# block_corrected_file = open( -# '%s/block_corrected/%s.block_corrected.txt' % -# (datadir, fileroot), 'r') -# blocks = block_corrected_file.read()[:-1].split('\n') -# -# content = [] -# comments = [] -# for block in blocks: -# block_split = block.split('\t') -# # will store the weights as the total number of tokens in the document -# content.append((float(block_split[0]), len(block_split[2].strip().split()), block_split[3].strip().split())) -# comments.append((float(block_split[1]), len(block_split[2].strip().split()), block_split[4].strip().split())) -# -# ret = [] -# for content_comments in [content, comments]: -# extracted_flag = (np.array([ele[0] for ele in content_comments]) > block_percent_threshold).astype(np.int) -# extracted_flag[np.array([ele[0] for ele in content_comments]) == -1] = -1 -# counts = np.array([ele[1] for ele in content_comments]) -# tokens = [] -# for this_block_tokens in [ele[2] for ele in content_comments if ele[1] > 0]: -# tokens.extend(this_block_tokens) -# ret.append((extracted_flag, counts, tokens)) -# -# if fileroot in training_fileroot: -# self.training_data.append((html, ret[0], ret[1], encoding)) -# self.training_files.append(fileroot) -# else: -# self.test_data.append((html, ret[0], ret[1], encoding)) -# self.test_files.append(fileroot) -# -# print("..done!") -# print("Got %s training, %s test documents" % (len(self.training_data), len(self.test_data))) -# -# @staticmethod -# def diagnose_css(datadir, plotdir): -# data = DragnetModelData(datadir, source='all') -# -# # get a list of all the css tokens extracted as content and not content -# # ONLY USE TRAINING DATA -# content_css = [] -# no_content_css = [] -# for datum in data.training_data: -# blocks = Blockifier.blockify(datum[0], encoding=datum[3]) -# extracted = np.logical_or(datum[1][0], datum[2][0]) -# assert len(blocks) == len(extracted) -# content_css.extend([blocks[k].css for k in range_(len(blocks)) if extracted[k]]) -# no_content_css.extend([blocks[k].css for k in range_(len(blocks)) if not extracted[k]]) -# -# # make a list of the most popular tokens -# from collections import defaultdict -# popular_tokens = {} -# for c, d in [('content', content_css), ('no_content', no_content_css)]: -# popular_tokens[c] = {} -# for tag in ['id', 'class']: -# popular_tokens[c][tag] = defaultdict(lambda: 0) -# for block in d: -# for tag in ['id', 'class']: -# for token in re.split('\W+|_', block[tag]): -# popular_tokens[c][tag][token] += 1 -# -# # sort tokens by most popular -# popular_tokens_sorted = {} -# for c in ['content', 'no_content']: -# popular_tokens_sorted[c] = {} -# for tag in ['id', 'class']: -# popular_tokens_sorted[c][tag] = [(v, k) for k, v in popular_tokens[c][tag].iteritems()] -# popular_tokens_sorted[c][tag].sort(reverse=True) -# -# # write to a file with percent of total -# for c in ['content', 'no_content']: -# for tag in ['id', 'class']: -# total_tokens = np.sum([ele[0] for ele in popular_tokens_sorted[c][tag]]) -# with open(plotdir + '/css_token_count_%s_%s.tsv' % (c, tag), 'w') as f: -# f.write("Token\tCount\tPercent Total\tCum Total\n") -# cumcount = 0 -# for count, token in popular_tokens_sorted[c][tag]: -# cumcount += count -# f.write("%s\t%s\t%s\t%s\n" % (count, -# token, -# float(count) / total_tokens, -# float(cumcount) / total_tokens)) -# -# # take the ratio of token count in content vs no content -# # for the tokens in the specified list -# css_tokens = open("dragnet_css_tokens.txt", 'r').read().strip().split('\n') -# content_no_content_ratio = {} -# no_content_block_count = len(no_content_css) -# content_block_count = len(content_css) -# for tag in ['id', 'class']: -# content_no_content_ratio[tag] = [] -# for token in css_tokens: -# content_count_percent = np.sum([re.search(token, block[tag].lower()) is not None for block in content_css]) / float(content_block_count) -# no_content_count_percent = np.sum([re.search(token, block[tag].lower()) is not None for block in no_content_css]) / float(no_content_block_count) -# -# if no_content_count_percent > 0: -# ratio = content_count_percent / no_content_count_percent -# else: -# ratio = np.inf -# -# content_no_content_ratio[tag].append((ratio, token, content_count_percent, no_content_count_percent)) -# -# content_no_content_ratio[tag].sort() -# -# # dump ratios to a file -# with open(plotdir + '/css_popular_token_ratio.txt', 'w') as f: -# f.write("Ratio of appearence frequency in content vs non-content blocks\n") -# f.write("Ratio------token-----percent of content blocks present-----percent of non-content blocks present\n") -# for tag in ['id', 'class']: -# f.write("\n%s\n" % tag) -# for t in content_no_content_ratio[tag]: -# f.write("%s\t%s\t%s\t%s\n" % t) -# -# @staticmethod -# def diagnose_data(datadir, plotdir, training_or_test='both'): -# """Do some diagnosis if the data set -# -# Plotdir = output plots to this directory""" -# import pylab as plt -# -# # we will accumulate the percent extracted for some histograms -# percent_extracted = [] -# for s, t in [('all', 'All data'), -# ('technoratti', 'Technoratti'), -# ('domain_list', "Domain list"), -# ('reader', "Popular RSS on Google Reader")]: -# -# data = DragnetModelData(datadir, source=s) -# data._diagnose_data_one_source(plotdir, t, training_or_test='both') -# -# percent_extracted.append((t, data._get_percent_tokens_extracted_in_block(datadir))) -# -# # plot percent extracted -# fig = plt.figure(3) -# fig.clf() -# k = 0 -# for ti, d in percent_extracted: -# plt.subplot(221 + k) -# plt.hist(d, 30) -# plt.title(ti) -# k += 1 -# fig.show() -# fig.savefig(plotdir + '/percent_tokens_extracted.png') -# -# def _get_percent_tokens_extracted_in_block(self, datadir): -# ret = [] -# for file, fileroot in get_list_all_corrected_files(datadir): -# if self._re_source.match(fileroot): -# # a histogram of block frequency -# with open(os.path.join(datadir, -# 'block_corrected/%s.block_corrected.txt' % fileroot), -# 'r') as block_corrected_file: -# blocks = block_corrected_file.read()[:-1].split('\n') -# -# for block in blocks: -# block_split = block.split('\t') -# ret.append(float(block_split[0])) -# -# return np.asarray(ret) -# -# def _diagnose_data_one_source(self, plotdir, ti, training_or_test='both'): -# """Make some plots and do some exploratory analyis on training data -# training_or_test is one of "training", "test", "both" -# """ -# import pylab as plt -# from mozsci.histogram import Histogram1DFast -# -# if training_or_test == 'training': -# plot_data = self.training_data -# files = self.training_files -# elif training_or_test == 'test': -# plot_data = self.test_data -# files = self.test_files -# elif training_or_test == 'both': -# plot_data = self.training_data + self.test_data -# files = self.training_files + self.test_files -# else: -# raise ValueError("Invalid training_or_test") -# -# # block_level_aggreate = holds block count of # extracted as -# # content, comments and total -# block_level_aggregate = {'content': [], 'comments': [], 'total': []} -# for datum in plot_data: -# k = 1 -# block_level_aggregate['total'].append(len(datum[1][1])) -# for c in ['content', 'comments']: -# extracted_flag, overall_token_count, tokens = datum[k] -# block_level_aggregate[c].append(np.sum(extracted_flag)) -# k += 1 -# -# # plot -# block_level_aggregate['total'] = np.array(block_level_aggregate['total']).astype(np.float) -# fig = plt.figure(1) -# fig.clf() -# -# plt.subplot(221) -# plt.hist(block_level_aggregate['total'], 30) -# plt.title("Block count across files") -# -# plt.subplot(222) -# plt.hist(block_level_aggregate['content'] / block_level_aggregate['total'], 30) -# plt.title("Percent of blocks that are content across files") -# -# plt.subplot(223) -# plt.hist(block_level_aggregate['comments'] / block_level_aggregate['total'], 30) -# plt.title("Percent of blocks that are comments across files") -# -# txt = "Total blocks: %s " % int(np.sum(block_level_aggregate['total'])) -# for s in ['content', 'comments']: -# txt += "\nTotal %s %s (%s %%)" % (s, int(np.sum(block_level_aggregate[s])), np.sum(block_level_aggregate[s]) / np.sum(block_level_aggregate['total']) * 100) -# plt.figtext(0.6, 0.4, txt) -# -# add_plot_title(ti + '\nBlock level, training + test') -# -# fig.show() -# fig.savefig(plotdir + '/' + self._source + '_block_level.png') -# -# # percent extracted as content vs block number -# bins = 20 -# content_percent_vs_block_percent = { -# 'content': np.zeros((len(plot_data), bins)), -# 'comments': np.zeros((len(plot_data), bins))} -# -# # number of tokens in block vs block number -# block_length_vs_block_percent = np.zeros((len(plot_data), bins)) -# -# for datum_number in range_(len(plot_data)): -# datum = plot_data[datum_number] -# k = 1 -# for c in ['content', 'comments']: -# extracted_flag, overall_token_count, tokens = datum[k] -# block_percent = np.arange(len(extracted_flag)) / float(len(extracted_flag)) -# -# # count of extracted blocks in each bin -# h = Histogram1DFast(bins, 0, 1) -# h.update_counts(block_percent, extracted_flag) -# extracted_counts = h.bin_count -# -# # overall count -# h = Histogram1DFast(bins, 0, 1) -# h.update(block_percent) -# total_counts = h.bin_count -# -# # number of tokens in block -# if c == 'content': # token count same for content, comments -# h = Histogram1DFast(bins, 0, 1) -# h.update_counts(block_percent, overall_token_count) -# token_count = h.bin_count -# block_length_vs_block_percent[datum_number, :] = token_count.astype(np.float) / total_counts -# -# content_percent_vs_block_percent[c][datum_number, :] = extracted_counts.astype(np.float) / total_counts -# k += 1 -# -# # plot -# fig = plt.figure(2) -# fig.clf() -# -# plt.subplot(311) -# c = 'content' -# masked_data = np.ma.masked_array(content_percent_vs_block_percent[c], np.isnan(content_percent_vs_block_percent[c])) -# np.mean(masked_data, axis=0) -# plt.plot(np.linspace(0, 1, bins), np.mean(masked_data, axis=0)) -# plt.title("Content") -# plt.ylabel("Percent extracted") -# -# plt.subplot(312) -# c = 'comments' -# masked_data = np.ma.masked_array(content_percent_vs_block_percent[c], np.isnan(content_percent_vs_block_percent[c])) -# np.mean(masked_data, axis=0) -# plt.plot(np.linspace(0, 1, bins), np.mean(masked_data, axis=0)) -# plt.title("Comments") -# plt.ylabel("Percent extracted") -# -# plt.subplot(313) -# masked_data = np.ma.masked_array(block_length_vs_block_percent, np.isnan(block_length_vs_block_percent)) -# np.mean(masked_data, axis=0) -# plt.plot(np.linspace(0, 1, bins), np.mean(masked_data, axis=0)) -# plt.title("All tokens") -# plt.xlabel("Block position in document") -# plt.ylabel("# tokens in block") -# -# add_plot_title(ti + '\nPercent of blocks extracted, # tokens in doc, training + test') -# fig.show() -# fig.savefig(plotdir + '/' + self._source + '_block_level_block_position.png') -# -# -# def add_plot_title(ti_str): -# """Add a string as a title on top of a subplot""" -# import pylab as plt -# plt.figtext(0.5, 0.94, ti_str, ha='center', color='black', weight='bold', size='large') diff --git a/dragnet/extractor.py b/dragnet/extractor.py index 231ba03..7e3f372 100644 --- a/dragnet/extractor.py +++ b/dragnet/extractor.py @@ -4,8 +4,9 @@ from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.ensemble import ExtraTreesClassifier -from .compat import string_, str_cast +from .compat import string_, str_cast, unicode_ from .util import get_and_union_features +from .blocks import TagCountNoCSSReadabilityBlockifier class Extractor(BaseEstimator, ClassifierMixin): @@ -36,7 +37,7 @@ class Extractor(BaseEstimator, ClassifierMixin): ``predict_proba()`` method. """ - def __init__(self, blockifier, + def __init__(self, blockifier=TagCountNoCSSReadabilityBlockifier, features=('kohlschuetter', 'weninger', 'readability'), model=None, to_extract='content', prob_threshold=0.5, max_block_weight=200): @@ -65,7 +66,7 @@ def features(self): def features(self, feats): self._features = get_and_union_features(feats) - def fit(self, blocks, labels, weights=None): + def fit(self, documents, labels, weights=None): """ Fit :class`Extractor` features and model to a training dataset. @@ -77,16 +78,25 @@ def fit(self, blocks, labels, weights=None): Returns: :class`Extractor` """ - features_mat = self.features.fit_transform(blocks) + block_groups = np.array([self.blockifier.blockify(doc) for doc in documents]) + mask = [self._has_enough_blocks(blocks) for blocks in block_groups] + block_groups = block_groups[mask] + labels = np.concatenate(np.array(labels)[mask]) + + # TODO: This only 'fit's one doc at a time. No feature fitting actually + # happens for now, but this might be important if the features change + features_mat = np.concatenate([self.features.fit_transform(blocks) + for blocks in block_groups]) if weights is None: self.model.fit(features_mat, labels) else: + weights = np.concatenate(np.array(weights)[mask]) self.model.fit(features_mat, labels, sample_weight=weights) return self - def concatenate_data(self, data): + def get_html_labels_weights(self, data): """ - Concatenate the blocks, labels, and weights of many files' data. + Gather the html, labels, and weights of many files' data. Primarily useful for training/testing an :class`Extractor`. Args: @@ -96,19 +106,16 @@ def concatenate_data(self, data): Tuple[List[Block], np.array(int), np.array(int)]: All blocks, all labels, and all weights, respectively. """ - all_blocks = [] - all_labels = np.empty(0, dtype=int) - all_weights = np.empty(0, dtype=int) + all_html = [] + all_labels = [] + all_weights = [] for html, content, comments in data: - blocks = self.blockifier.blockify(html) - if not self._has_enough_blocks(blocks): - continue - all_blocks.extend(blocks) - labels, weights, _ = self._get_labels_and_weights( + all_html.append(html) + labels, weights = self._get_labels_and_weights( content, comments) - all_labels = np.hstack((all_labels, labels)) - all_weights = np.hstack((all_weights, weights)) - return all_blocks, all_labels, all_weights + all_labels.append(labels) + all_weights.append(weights) + return np.array(all_html), np.array(all_labels), np.array(all_weights) def _has_enough_blocks(self, blocks): if len(blocks) < 3: @@ -126,29 +133,22 @@ def _get_labels_and_weights(self, content, comments): Returns: Tuple[np.array[int], np.array[int], List[str]] """ - # TODO: get rid of the third element here and elsewhere? # extract content and comments if 'content' in self.to_extract and 'comments' in self.to_extract: - if self.max_block_weight is None: - return (np.logical_or(content[0], comments[0]).astype(int), - content[1], - content[2] + comments[2]) - else: - return (np.logical_or(content[0], comments[0]).astype(int), - np.minimum(content[1], self.max_block_weight), - content[2] + comments[2]) + labels = np.logical_or(content[0], comments[0]).astype(int) + weights = content[1], # extract content only elif 'content' in self.to_extract: - if self.max_block_weight is None: - return content - else: - return (content[0], np.minimum(content[1], self.max_block_weight), content[2]) + labels = content[0] + weights = content[1] # extract comments only else: - if self.max_block_weight is None: - return comments - else: - return (comments[0], np.minimum(comments[1], self.max_block_weight), comments[2]) + labels = comments[0] + weights = comments[1] + if self.max_block_weight is None: + weights = np.minimum(weights, self.max_block_weight) + + return labels, weights def extract(self, html, encoding=None, as_blocks=False): """ @@ -166,55 +166,62 @@ def extract(self, html, encoding=None, as_blocks=False): Returns: str or List[Block] """ - blocks = self.blockifier.blockify(html, encoding=encoding) - return self.extract_from_blocks(blocks, as_blocks=as_blocks) + preds, blocks = self.predict(html, encoding=encoding, return_blocks=True) + if as_blocks is False: + return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds))) + else: + return [blocks[ind] for ind in np.flatnonzero(preds)] + - def extract_from_blocks(self, blocks, as_blocks=False): + def predict(self, documents, **kwargs): """ - Extract the main content and/or comments from a sequence of (all) blocks - and return it as a string or as a sequence of block objects. + Predict class (content=1 or not-content=0) of the blocks in one or many + HTML document(s). Args: - blocks (List[Block]): Blockify'd HTML document. - as_blocks (bool): If False, return the main content as a combined - string; if True, return the content-holding blocks as a list of - block objects. + documents (str or List[str]): HTML document(s) Returns: - str or List[Block] + ``np.ndarray`` or List[``np.ndarray``]: array of binary predictions + for content (1) or not-content (0). """ - if not self._has_enough_blocks(blocks): - if as_blocks is False: - return '' - else: - return [] - features_mat = self.features.transform(blocks) - if self.prob_threshold is None: - preds = self.model.predict(features_mat) - else: - self._positive_idx = ( - self._positive_idx or list(self.model.classes_).index(1)) - preds = (self.model.predict_proba(features_mat) > self.prob_threshold)[:, self._positive_idx] - if as_blocks is False: - return str_cast(b'\n'.join(blocks[ind].text for ind in np.flatnonzero(preds))) + if isinstance(documents, (str, bytes, unicode_, np.unicode_)): + return self._predict_one(documents, **kwargs) else: - return [blocks[ind] for ind in np.flatnonzero(preds)] + return np.concatenate([self._predict_one(doc, **kwargs) for doc in documents]) - def predict(self, blocks): + + def _predict_one(self, document, encoding=None, return_blocks=False): """ - Predict class (content=1 or not-content=0) of each block in a sequence. + Predict class (content=1 or not-content=0) of each block in an HTML + document. Args: - blocks (List[Block]): Blockify'd HTML document. + documents (str): HTML document Returns: - ``np.ndarray``: 1D array of block-level, binary predictions for - content (1) or not-content (0). + ``np.ndarray``: array of binary predictions for content (1) or + not-content (0). """ - features_mat = self.features.transform(blocks) - if self.prob_threshold is None: - return self.model.predict(features_mat) + # blockify + blocks = self.blockifier.blockify(document, encoding=encoding) + # get features + try: + features = self.features.transform(blocks) + except ValueError: # Can't make features, predict no content + preds = np.zeros((len(blocks))) + # make predictions else: - self._positive_idx = ( - self._positive_idx or list(self.model.classes_).index(1)) - return (self.model.predict_proba(features_mat) > self.prob_threshold)[:, self._positive_idx].astype(int) + if self.prob_threshold is None: + preds = self.model.predict(features) + else: + self._positive_idx = ( + self._positive_idx or list(self.model.classes_).index(1)) + preds = self.model.predict_proba(features) > self.prob_threshold + preds = preds[:, self._positive_idx].astype(int) + + if return_blocks: + return preds, blocks + else: + return preds + diff --git a/dragnet/features/weninger.py b/dragnet/features/weninger.py index 0ad7937..ed383ab 100644 --- a/dragnet/features/weninger.py +++ b/dragnet/features/weninger.py @@ -20,7 +20,7 @@ class WeningerFeatures(BaseEstimator, TransformerMixin): __name__ = 'weninger' def __init__(self, sigma=1.0): - self.sigma = 1.0 + self.sigma = sigma def fit(self, blocks, y=None): """ diff --git a/dragnet/model_training.py b/dragnet/model_training.py index 4e4838f..ee8d888 100644 --- a/dragnet/model_training.py +++ b/dragnet/model_training.py @@ -4,6 +4,7 @@ import logging import os import pprint +import numpy as np from sklearn.externals import joblib from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score @@ -29,6 +30,13 @@ def evaluate_model_predictions(y_true, y_pred, weights=None): Returns: Dict[str, float] """ + if isinstance(y_pred[0], np.ndarray): + y_pred = np.concatenate(y_pred) + if isinstance(y_true[0], np.ndarray): + y_true = np.concatenate(y_true) + if (weights is not None) and (isinstance(weights[0], np.ndarray)): + weights = np.concatenate(weights) + accuracy = accuracy_score( y_true, y_pred, normalize=True, sample_weight=weights) precision = precision_score( @@ -85,25 +93,27 @@ def train_model(extractor, data_dir, output_dir=None): # set up directories and file naming output_dir, fname_prefix = _set_up_output_dir_and_fname_prefix(output_dir, extractor) - # prepare, split, and concatenate the data + # prepare and split the data logging.info('preparing, splitting, and concatenating the data...') data = prepare_all_data(data_dir) training_data, test_data = train_test_split( data, test_size=0.2, random_state=42) - train_blocks, train_labels, train_weights = extractor.concatenate_data(training_data) - test_blocks, test_labels, test_weights = extractor.concatenate_data(test_data) + train_html, train_labels, train_weights = extractor.get_html_labels_weights(training_data) + test_html, test_labels, test_weights = extractor.get_html_labels_weights(test_data) # fit the extractor on training data # then evaluate it on train and test data logging.info('fitting and evaluating the extractor features and model...') try: - extractor.fit(train_blocks, train_labels, weights=train_weights) + extractor.fit(train_html, train_labels, weights=train_weights) except (TypeError, ValueError): - extractor.fit(train_blocks, train_labels) + extractor.fit(train_html, train_labels) train_eval = evaluate_model_predictions( - train_labels, extractor.predict(train_blocks)) + np.concatenate(train_labels), extractor.predict(train_html), + np.concatenate(train_weights)) test_eval = evaluate_model_predictions( - test_labels, extractor.predict(test_blocks)) + np.concatenate(test_labels), extractor.predict(test_html), + np.concatenate(test_weights)) # report model performance _report_model_performance(output_dir, fname_prefix, train_eval, test_eval) @@ -149,36 +159,50 @@ def train_many_models(extractor, param_grid, data_dir, output_dir=None, # set up directories and file naming output_dir, fname_prefix = _set_up_output_dir_and_fname_prefix(output_dir, extractor) - # prepare, split, and concatenate the data - logging.info('preparing, splitting, and concatenating the data...') + # prepare and split the data + logging.info('preparing and splitting the data...') data = prepare_all_data(data_dir) training_data, test_data = train_test_split( data, test_size=0.2, random_state=42) - train_blocks, train_labels, train_weights = extractor.concatenate_data(training_data) - test_blocks, test_labels, test_weights = extractor.concatenate_data(test_data) + train_html, train_labels, train_weights = extractor.get_html_labels_weights(training_data) + test_html, test_labels, test_weights = extractor.get_html_labels_weights(test_data) + + # filter docs we can't get features from + train_blocks = np.array([extractor.blockifier.blockify(doc) + for doc in train_html]) + train_mask = [extractor._has_enough_blocks(blocks) for blocks in train_blocks] + train_blocks = train_blocks[train_mask] + train_labels = np.concatenate(train_labels[train_mask]) + train_weights = np.concatenate(train_weights[train_mask]) + test_labels = np.concatenate(test_labels) + test_weights = np.concatenate(test_weights) + # get features + # TODO: This only 'fit's one doc at a time. No feature fitting actually + # happens for now, but this might be important if the features change + train_features = np.concatenate([extractor.features.fit_transform(blocks) + for blocks in train_blocks]) # fit many models gscv = GridSearchCV( - extractor, param_grid, fit_params={'weights': train_weights}, + extractor.model, param_grid, fit_params={'sample_weight': train_weights}, scoring=kwargs.get('scoring', 'f1'), cv=kwargs.get('cv', 5), n_jobs=kwargs.get('n_jobs', 1), verbose=kwargs.get('verbose', 1)) - gscv = gscv.fit(train_blocks, train_labels) + gscv = gscv.fit(train_features, train_labels) logging.info('Score of the best model, on left-out data: %s', gscv.best_score_) logging.info('Params of the best model: %s', gscv.best_params_) # evaluate best model on train and test data - best_extractor = gscv.best_estimator_ + extractor.model = gscv.best_estimator_ train_eval = evaluate_model_predictions( - train_labels, best_extractor.predict(train_blocks)) + train_labels, extractor.predict(train_html[train_mask]), weights=train_weights) test_eval = evaluate_model_predictions( - test_labels, best_extractor.predict(test_blocks)) - _report_model_performance(output_dir, fname_prefix, train_eval, test_eval) + test_labels, extractor.predict(test_html), weights=test_weights) # pickle the final model - _write_model_to_disk(output_dir, fname_prefix, best_extractor) + _write_model_to_disk(output_dir, fname_prefix, extractor) - return best_extractor + return extractor def _set_up_output_dir_and_fname_prefix(output_dir, extractor): diff --git a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt new file mode 100644 index 0000000..a6a9cf3 --- /dev/null +++ b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.8888867678537985, + 'f1': 0.8135523962386007, + 'precision': 0.8147783353737416, + 'recall': 0.8123301407281905} +Test errors for final model (block level): +{'accuracy': 0.9155827171056652, + 'f1': 0.8185729821740981, + 'precision': 0.8098381446406853, + 'recall': 0.8274983004758668} \ No newline at end of file diff --git a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt new file mode 100644 index 0000000..acc8b86 --- /dev/null +++ b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9205167323012377, + 'f1': 0.9404319043625426, + 'precision': 0.9184194567439528, + 'recall': 0.9635254409178062} +Test errors for final model (block level): +{'accuracy': 0.8960239564871967, + 'f1': 0.9161795830853575, + 'precision': 0.8729650256683729, + 'recall': 0.9638954889057517} \ No newline at end of file diff --git a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz index 521ebe8..f4d8bdf 100644 Binary files a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz and b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz index e7cb31b..80c294e 100644 Binary files a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz and b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz differ diff --git a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt new file mode 100644 index 0000000..66b365c --- /dev/null +++ b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9049718380999011, + 'f1': 0.8650992736751364, + 'precision': 0.8685604952034155, + 'recall': 0.8616655286517149} +Test errors for final model (block level): +{'accuracy': 0.9211562671881685, + 'f1': 0.8931811181654694, + 'precision': 0.8712090865626898, + 'recall': 0.9162901000930941} \ No newline at end of file diff --git a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz index fc34f6f..ddfda23 100644 Binary files a/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz and b/dragnet/pickled_models/py2_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_block_errors.txt b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_block_errors.txt new file mode 100644 index 0000000..6380ab1 --- /dev/null +++ b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.8880447742482034, + 'f1': 0.811423824082008, + 'precision': 0.8157638153549375, + 'recall': 0.8071297672971197} +Test errors for final model (block level): +{'accuracy': 0.9135073030617857, + 'f1': 0.8150270281573802, + 'precision': 0.8024665939179312, + 'recall': 0.827986913664174} \ No newline at end of file diff --git a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_block_errors.txt b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_block_errors.txt new file mode 100644 index 0000000..5376898 --- /dev/null +++ b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9181331474618691, + 'f1': 0.9387730533370016, + 'precision': 0.9149869162618192, + 'recall': 0.963828893783574} +Test errors for final model (block level): +{'accuracy': 0.8929707266393693, + 'f1': 0.9138891019551617, + 'precision': 0.8692234593397384, + 'recall': 0.9633937494039252} \ No newline at end of file diff --git a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz index 77fe70d..5b18125 100644 Binary files a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz and b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz index cdfe8e1..81043cf 100644 Binary files a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz and b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz differ diff --git a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_block_errors.txt b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_block_errors.txt new file mode 100644 index 0000000..4e9c77b --- /dev/null +++ b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9055403901546596, + 'f1': 0.8659474221784542, + 'precision': 0.869143906134945, + 'recall': 0.8627743637329225} +Test errors for final model (block level): +{'accuracy': 0.9199633319073519, + 'f1': 0.8913886502283953, + 'precision': 0.8708065436067614, + 'recall': 0.9129672539972683} \ No newline at end of file diff --git a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz index ffdb5aa..a28771b 100644 Binary files a/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz and b/dragnet/pickled_models/py2_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt index 05c85ad..14e0bd5 100644 --- a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt +++ b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_block_errors.txt @@ -1,11 +1,10 @@ Training errors for final model (block level): -{'accuracy': 0.99960252445444053, - 'f1': 0.99919343733401533, - 'precision': 0.99933105103687092, - 'recall': 0.99905586152635717} - +{'accuracy': 0.886500603377158, + 'f1': 0.8087575250472692, + 'precision': 0.8133641076752258, + 'recall': 0.8042028283945921} Test errors for final model (block level): -{'accuracy': 0.89056809905316825, - 'f1': 0.64719694746110934, - 'precision': 0.77070954211814047, - 'recall': 0.55780419934227166} +{'accuracy': 0.9117667909307584, + 'f1': 0.8100755614489277, + 'precision': 0.8026590198123045, + 'recall': 0.8176304384772264} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt index 350457e..70ecd61 100644 --- a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt +++ b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_block_errors.txt @@ -1,11 +1,10 @@ Training errors for final model (block level): -{'accuracy': 0.99948619014842321, - 'f1': 0.99939039118482653, - 'precision': 1.0, - 'recall': 0.99878152516265495} - +{'accuracy': 0.9204290246339881, + 'f1': 0.9404201712810462, + 'precision': 0.9176159900970319, + 'recall': 0.964386676596891} Test errors for final model (block level): -{'accuracy': 0.91469410050983246, - 'f1': 0.87341259119156978, - 'precision': 0.88187150456963581, - 'recall': 0.86511441188277804} +{'accuracy': 0.8981140377681355, + 'f1': 0.9177836561506915, + 'precision': 0.8752737205679842, + 'recall': 0.9646335850324057} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz new file mode 100644 index 0000000..6fed511 Binary files /dev/null and b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz new file mode 100644 index 0000000..f74f01d Binary files /dev/null and b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_comments_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt index 90b9add..30e68d5 100644 --- a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt +++ b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_block_errors.txt @@ -1,11 +1,10 @@ Training errors for final model (block level): -{'accuracy': 0.9989820748223478, - 'f1': 0.99711578079934071, - 'precision': 0.9943026186041416, - 'recall': 0.99994490661671531} - +{'accuracy': 0.9054717718032232, + 'f1': 0.8658442469426517, + 'precision': 0.8690781718068796, + 'recall': 0.8626343003542436} Test errors for final model (block level): -{'accuracy': 0.87859613983976692, - 'f1': 0.69106915324916018, - 'precision': 0.58421464943204071, - 'recall': 0.8457612702013042} +{'accuracy': 0.9201588950681415, + 'f1': 0.8918900132071536, + 'precision': 0.8694900158764989, + 'recall': 0.9154746777382902} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz new file mode 100644 index 0000000..cc118a3 Binary files /dev/null and b/dragnet/pickled_models/py3_sklearn_0.15.2_0.17.1/kohlschuetter_readability_weninger_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_block_errors.txt b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_block_errors.txt new file mode 100644 index 0000000..1567b66 --- /dev/null +++ b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.8857617952624446, + 'f1': 0.8090955569752635, + 'precision': 0.8069922589144594, + 'recall': 0.8112098475156461} +Test errors for final model (block level): +{'accuracy': 0.9138055368819898, + 'f1': 0.8146103452264529, + 'precision': 0.8065278500780844, + 'recall': 0.8228564751869476} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_block_errors.txt b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_block_errors.txt new file mode 100644 index 0000000..1a98308 --- /dev/null +++ b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.9203423488216474, + 'f1': 0.9403472982866902, + 'precision': 0.917661608068621, + 'recall': 0.964183054177825} +Test errors for final model (block level): +{'accuracy': 0.8932151805903563, + 'f1': 0.9139981611675603, + 'precision': 0.8701278254676313, + 'recall': 0.9625271084462247} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz index a9c8416..9c13e60 100644 Binary files a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz and b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_content_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz index f556114..baf41c7 100644 Binary files a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz and b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_comments_model.pkl.gz differ diff --git a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_block_errors.txt b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_block_errors.txt new file mode 100644 index 0000000..e14fd9d --- /dev/null +++ b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_block_errors.txt @@ -0,0 +1,10 @@ +Training errors for final model (block level): +{'accuracy': 0.904718001792332, + 'f1': 0.8644470492730987, + 'precision': 0.8697976287363944, + 'recall': 0.8591618957578304} +Test errors for final model (block level): +{'accuracy': 0.9226914380003667, + 'f1': 0.8950468762963577, + 'precision': 0.8747291809914246, + 'recall': 0.9163308712108342} \ No newline at end of file diff --git a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz index 0c41d75..888a054 100644 Binary files a/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz and b/dragnet/pickled_models/py3_sklearn_0.18.0/kohlschuetter_readability_weninger_content_model.pkl.gz differ