# Extract Articles and Sentences HowTo

* save RAM by processing the documents with an Iterable (OJCorpus)
* parallelize the sentence segmentation using joblib

In [1]:
import os
import codecs
import nltk
from lxml import etree

# Directory that holds the corpus HTML files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
OJCORPUS_DIR = "/home/kuhn/Data/ojc_joint_set/" # roughly 340k documents
# Module-level lxml HTML parser; OJCorpus passes it to etree.parse().
HTML_PARSER = etree.HTMLParser()

# Path of a single file inside the corpus directory
# (presumably kept for quick manual tests; not used in the cells shown here).
OJ_TEST_ARTICLE = os.path.join(OJCORPUS_DIR, "448048.html")

In [10]:
from segtok.segmenter import split_single, split_multi
from segtok import segmenter

def segtok_sent_generator(document):
    """
    Lazily yield the sentences of a document, one string at a time.

    Every segment is split with ``segmenter.split_multi``; results that are
    empty or consist only of whitespace are dropped. Sentences that are kept
    are yielded unchanged (they are not stripped).

    Parameters
    ----------
    document : iterable of str
        a plain text document given as a sequence of its text segments
        (extracted from their corresponding HTML elements)

    Yields
    ------
    str
        the next non-blank sentence, in document order
    """
    for text_segment in document:
        for candidate in segmenter.split_multi(text_segment):
            # skip blank results instead of handing them to the caller
            if not candidate.strip():
                continue
            yield candidate

In [11]:
class OJCorpus(object):
    """
    This class represents a corpus of openjur.de court decision HTML files
    as a one-shot iterator over parsed documents.

    Each parsed document is represented by a ``(file_name, sentences)`` tuple,
    where ``sentences`` is a lazy generator (not a list) over the sentence
    strings found in the text nodes below the document's ``<article>``
    element(s).

    The directory listing is read once in ``__init__`` and consumed while
    iterating, so an instance can be traversed only once; create a new
    instance for another pass over the corpus.

    Parameters
    ----------
    corpus_path : str
        directory containing the HTML files; every directory entry is
        handed to the HTML parser
    """
    def __init__(self, corpus_path):
        self.corpus_path = os.path.abspath(corpus_path)
        self.file_names = iter(os.listdir(self.corpus_path))

    def __iter__(self):
        return self

    def __next__(self):
        """
        Parse the next file and return ``(file_name, sentence generator)``.

        Raises
        ------
        StopIteration
            when all files of the directory listing have been consumed
        """
        # built-in next() works on Python 2.6+ and Python 3, unlike the
        # Python-2-only ``iterator.next()`` method call
        file_name = next(self.file_names)
        tree = etree.parse(os.path.join(self.corpus_path, file_name), parser=HTML_PARSER)
        return file_name, segtok_sent_generator(tree.xpath('//article//text()'))

    # Python 2 looks up ``next``, Python 3 looks up ``__next__``; keep both so
    # existing callers of ``corpus.next()`` continue to work.
    next = __next__

In [3]:
# def write_sent_split_file(file_name, document, output_dir):
#     doc_id = file_name.split('.')[0]
#     with codecs.open(os.path.join(output_dir, doc_id+'.txt'), 'w', encoding='utf-8') as out_file:
#         for sent in segtok_sent_generator(document):
#             out_file.write(sent + '\n')

In [4]:
## Sentence tokenize text data in parallel

# from joblib import delayed, Parallel

# foo = Parallel(n_jobs=4)(delayed(write_sent_split_file)(fname, doc, '/tmp/ojc/') for fname, doc in OJCorpus(OJCORPUS_DIR))

## Word tokenize with sklearn

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Word tokenizer callable built from a default-configured CountVectorizer;
# sklearn_toksent_generator applies it to every sentence string.
tokenizer = CountVectorizer().build_tokenizer()


def sklearn_toksent_generator(corpus_path):
    """
    Lazily yield the word-tokenized sentences of all documents in a corpus.

    Parameters
    ----------
    corpus_path : str
        directory containing the HTML files; it is passed to ``OJCorpus``

    Yields
    ------
    tokenized_sentence
        the tokens of one sentence, as returned by the module-level
        ``tokenizer``; sentences with fewer than two tokens are skipped
    """
    # Read from the directory the caller asked for (previously the argument
    # was ignored and the global OJCORPUS_DIR was always used).
    for _, sentences in OJCorpus(corpus_path):
        for sentence in sentences:
            tokenized_sentence = tokenizer(sentence)
            # drop empty and single-token sentences
            if len(tokenized_sentence) > 1:
                yield tokenized_sentence

In [13]:
# iterable over all tokenized sentences in the corpus
# uses almost no RAM
# NOTE: this is a generator, so it can be consumed only once; re-run this
# cell to get a fresh one before starting another pass over the corpus.
tok_sentences = sklearn_toksent_generator(OJCORPUS_DIR)

## Sentence tokenization with jursegtok

In [2]:
from jursegtok import tools
import hickle

In [16]:
# Sentence source for the Phrases models below, produced by jursegtok for the
# whole corpus directory.
# NOTE(review): presumably a one-shot generator of tokenized sentences (like
# tok_sentences); if so, re-run this cell before every Phrases(...) training
# pass -- confirm in jursegtok.tools.
jur_sentok = tools.sklearn_tokjursent_generator(OJCORPUS_DIR)

In [4]:
from gensim.models.phrases import Phrases

```python
Phrases?

Init signature: Phrases(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter='_')
Docstring:
Detect phrases, based on collected collocation counts. Adjacent words that appear
together more frequently than expected are joined together with the `_` character.

It can be used to generate phrases on the fly, using the `phrases[sentence]`
and `phrases[corpus]` syntax.
Init docstring:
Initialize the model from an iterable of `sentences`. Each sentence must be
a list of words (unicode strings) that will be used for training.

The `sentences` iterable can be simply a list, but for larger corpora,
consider a generator that streams the sentences directly from disk/network,
without storing everything in RAM. See :class:`BrownCorpus`,
:class:`Text8Corpus` or :class:`LineSentence` in the :mod:`gensim.models.word2vec`
module for such examples.

`min_count` ignore all words and bigrams with total collected count lower
than this.

`threshold` represents a threshold for forming the phrases (higher means
fewer phrases). A phrase of words `a` and `b` is accepted if
`(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
total vocabulary size.

`max_vocab_size` is the maximum size of the vocabulary. Used to control
pruning of less common words, to keep memory under control. The default
of 40M needs about 3.6GB of RAM; increase/decrease `max_vocab_size` depending
on how much available memory you have.

`delimiter` is the glue character used to join collocation tokens.
File:           /usr/local/lib/python2.7/dist-packages/gensim/models/phrases.py
Type:           type
```

## Notes on RAM consumption

* `Phrases` accepts an iterable for the `sentences` parameter
* we can limit its memory consumption by reducing `max_vocab_size`
* we should probably serialize the ngram models
* try [hickle](https://github.com/telegraphic/hickle) for serialization  
  (faster, less RAM consumption, "Big Data" ready (HDF5))

In [15]:
import hickle
# Train a Phrases model with default settings on the sklearn-tokenized
# sentences and serialize it with hickle; %time reports the duration.
# The recorded run of this cell was aborted (KeyboardInterrupt).
%time hickle.dump(Phrases(tok_sentences), '/tmp/ojbigram.hkl', mode='w')

KeyboardInterrupt: 

In [6]:
# use jursegtok instead of naive sentence tokenizer
# max_vocab_size is lowered from the 40M default to 10M to limit RAM usage.
# NOTE(review): the output path has no '.hkl' extension, unlike the other
# model files written by this notebook.
%time hickle.dump(Phrases(sentences=jur_sentok, max_vocab_size=10000000), '/tmp/ojbigram340k_jurseg', mode='w')

dumping <class 'gensim.models.phrases.Phrases'> to file <HDF5 file "ojbigram340k_jurseg" (mode r+)>
CPU times: user 1h 42min 20s, sys: 55.3 s, total: 1h 43min 16s
Wall time: 2h 37min 49s


In [15]:
# Reload the serialized bigram model from disk.
# NOTE(review): safe=False presumably permits unpickling arbitrary Python
# objects -- only load files you created yourself; confirm in the hickle docs.
bigram = hickle.load('/tmp/ojbigram340k_jurseg', safe=False)

In [6]:
# bigram[jur_sentok] joins detected bigrams on the fly; a second Phrases model
# is trained on that transformed stream and saved as the "trigram" model.
%time hickle.dump(Phrases(sentences=bigram[jur_sentok], max_vocab_size=10000000), '/tmp/ojtrigram340k_jurseg.hkl', mode='w')

dumping <class 'gensim.models.phrases.Phrases'> to file <HDF5 file "ojtrigram340k_jurseg.hkl" (mode r+)>
CPU times: user 3h 52min 21s, sys: 1min 45s, total: 3h 54min 7s
Wall time: 4h 36min 53s


In [13]:
# Reload the serialized trigram model from disk.
# NOTE(review): safe=False presumably permits unpickling arbitrary Python
# objects -- only load files you created yourself; confirm in the hickle docs.
trigram = hickle.load('/tmp/ojtrigram340k_jurseg.hkl', safe=False)

In [10]:
# The sentence stream is passed through the bigram and then the trigram model;
# a third Phrases model is trained on the result and saved as the "quadgram" model.
%time hickle.dump(Phrases(sentences=trigram[bigram[jur_sentok]], max_vocab_size=10000000), '/tmp/ojquadgram340k_jurseg.hkl', mode='w')

dumping <class 'gensim.models.phrases.Phrases'> to file <HDF5 file "ojquadgram340k_jurseg.hkl" (mode r+)>
CPU times: user 4h 20min 12s, sys: 1min 46s, total: 4h 21min 58s
Wall time: 5h 17min 55s


In [12]:
# Reload the serialized quadgram model from disk.
# NOTE(review): `quadram` looks like a typo for `quadgram`; the name is kept
# because the following cell refers to it.
# NOTE(review): safe=False presumably permits unpickling arbitrary Python
# objects -- only load files you created yourself; confirm in the hickle docs.
quadram = hickle.load('/tmp/ojquadgram340k_jurseg.hkl', safe=False)

In [17]:
# The sentence stream is passed through the bigram, trigram and quadgram models
# in turn; a fourth Phrases model is trained on the result and saved as the
# "pentagram" model.
%time hickle.dump(Phrases(sentences=quadram[trigram[bigram[jur_sentok]]], max_vocab_size=10000000), '/tmp/ojpentagram340k_jurseg.hkl', mode='w')

dumping <class 'gensim.models.phrases.Phrases'> to file <HDF5 file "ojpentagram340k_jurseg.hkl" (mode r+)>
CPU times: user 7h 6min 23s, sys: 2min 34s, total: 7h 8min 57s
Wall time: 7h 57min 12s
