# Imports

In [1]:
import os
import codecs
import sqlite3
import pathlib

# Constants

In [2]:
# Everything the notebook reads lives under ~/project/data.
PROJECT_DIR = pathlib.Path.home().joinpath('project')
DATA_DIR = PROJECT_DIR.joinpath('data')

# What Is a Corpus?

## Domain-Specific Corpora

## The Baleen Ingestion Engine

# Corpus Data Management

## Corpus Disk Structure

### The Baleen disk structure

# Corpus Readers

## Streaming Data Access with NLTK

In [3]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Location of the galactic corpus, plus the regular expressions that select
# its documents and read each document's category from its folder name.
CORPUS_ROOT = DATA_DIR.joinpath('galactic')
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

# The root is handed over as a POSIX-style string; the category pattern is
# the only categorization argument supplied.
corpus = CategorizedPlaintextCorpusReader(
    CORPUS_ROOT.as_posix(),
    DOC_PATTERN,
    cat_pattern=CAT_PATTERN,
)

In [4]:
# Print the README text that accompanies the corpus.
print(corpus.readme())

# Galactic Classifier

Can you spot the difference in language between Star Wars and Star Trek?

Scripts obtained from:

- http://www.imsdb.com/alphabetical/S
- http://www.chakoteya.net/StarTrek/9.htm



In [5]:
# Print the license text that accompanies the corpus.
print(corpus.license())

Copyright (c) by Lucas Arts and Paramount Pictures.



In [6]:
# Print the citation that accompanies the corpus (recorded output is BibTeX).
print(corpus.citation())

@misc{ddl_galactic_2016,
  title = {Galactic {{Corpus}}},
  timestamp = {2016-04-19T17:16:23Z},
  publisher = {{District Data Labs}},
  author = {Voorhees, Will and Bengfort, Benjamin},
  month = apr,
  year = {2016}
}



In [7]:
# Category labels come from the folder name captured by CAT_PATTERN.
corpus.categories()

['Star Trek', 'Star Wars']

In [8]:
# File ids are the paths, relative to the corpus root, matched by DOC_PATTERN.
corpus.fileids()

['Star Trek/Star Trek - Balance of Terror.txt',
 'Star Trek/Star Trek - First Contact.txt',
 'Star Trek/Star Trek - Generations.txt',
 'Star Trek/Star Trek - Nemesis.txt',
 'Star Trek/Star Trek - The Motion Picture.txt',
 'Star Trek/Star Trek 2 - The Wrath of Khan.txt',
 'Star Wars/Star Wars Episode 1.txt',
 'Star Wars/Star Wars Episode 2.txt',
 'Star Wars/Star Wars Episode 3.txt',
 'Star Wars/Star Wars Episode 4.txt',
 'Star Wars/Star Wars Episode 5.txt',
 'Star Wars/Star Wars Episode 6.txt',
 'Star Wars/Star Wars Episode 7.txt']

## Reading an HTML Corpus

In [9]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

# Category = the leading folder name (lowercase letters, underscores, spaces).
CAT_PATTERN = r'([a-z_\s]+)/.*'
# Document = "<folder>/<hex digits>.json", skipping names that start with a dot.
# NOTE(review): the reader is described as an HTML reader but the default
# pattern matches .json files -- confirm this is the intended on-disk format.
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
# HTML tags (headings, paragraphs, list items) the reader is told to extract.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']

class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A corpus reader for raw HTML documents to enable preprocessing.

    Documents can be selected either by file id or by category, never both
    at once (see ``resolve``).
    """

    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                 tags=TAGS, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.

        :param root: root directory of the corpus
        :param fileids: list of file ids, or a regular expression matching them
        :param encoding: encoding used to decode the documents
        :param tags: HTML tag names to extract; stored on ``self.tags``
        """
        # Add the default category pattern if not passed into the class.
        # CAT_PATTERN is looked up when the constructor runs, so the fallback
        # is whatever the module-level name is bound to at that moment.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects.  The categorized reader
        # takes the keyword arguments as a single dictionary.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Save the tags that we specifically want to extract.  Store a copy so
        # that readers do not share (and cannot mutate) the caller's list or
        # the module-level default.
        self.tags = list(tags) if tags is not None else tags

    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. Implemented similarly to
        the NLTK ``CategorizedPlaintextCorpusReader``.

        :raises ValueError: if both ``fileids`` and ``categories`` are given
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Yields the complete text of each selected document, one at a time.
        Each file is closed before the next one is opened, so only a single
        document is held in memory at once.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def sizes(self, fileids=None, categories=None):
        """
        Yields the size on disk, in bytes, of each selected file, in the
        same order as the resolved file ids.  Only the size is yielded, not
        the file id.  This function is used to detect oddly large files in
        the corpus.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

In [10]:
# Point the HTML reader at the plain-text galactic corpus.
# NOTE: this rebinds DOC_PATTERN and CAT_PATTERN at module level.
# HTMLCorpusReader reads CAT_PATTERN when it is constructed, so readers
# created after this cell without a cat_* argument fall back to this pattern.
CORPUS_ROOT = DATA_DIR.joinpath('galactic')
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = HTMLCorpusReader(
    CORPUS_ROOT.as_posix(),
    DOC_PATTERN,
    cat_pattern=CAT_PATTERN,
)

In [11]:
# With only categories given, resolve returns the file ids in those categories.
corpus.resolve(None, categories=['Star Trek'])

['Star Trek/Star Trek - Balance of Terror.txt',
 'Star Trek/Star Trek - First Contact.txt',
 'Star Trek/Star Trek - Generations.txt',
 'Star Trek/Star Trek - Nemesis.txt',
 'Star Trek/Star Trek - The Motion Picture.txt',
 'Star Trek/Star Trek 2 - The Wrath of Khan.txt']

In [12]:
# docs() is a generator, so wrap it in list() to see every document at once.
# The recorded output is a list of empty strings, i.e. the files read as empty.
list(corpus.docs(categories=['Star Wars']))

['', '', '', '', '', '', '']

In [13]:
# sizes() yields one on-disk size per file; the recorded output is all zeros.
list(corpus.sizes(categories=['Star Wars']))

[0, 0, 0, 0, 0, 0, 0]

## Reading a Corpus from a Database

In [14]:
class SqliteCorpusReader(object):
    """
    Streams review data out of a SQLite database.

    The queries read a ``reviews`` table (``score`` column) and a
    ``content`` table (``reviewid`` and ``content`` columns).  Every method
    is lazy: the query only runs once the returned generator is iterated.
    Rows are yielded exactly as ``sqlite3`` returns them, i.e. as
    one-element tuples such as ``(9.5,)``.
    """

    def __init__(self, path):
        """
        Open a connection to the SQLite database stored at ``path``.
        """
        # Keep the connection rather than a single shared cursor.  With one
        # cursor, starting a second query discarded the pending rows of the
        # first, so e.g. zip(reader.ids(), reader.texts()) was corrupted.
        self._conn = sqlite3.connect(path)

    def _select(self, query):
        """
        Run ``query`` on its own cursor and yield the result rows one by one.
        """
        cursor = self._conn.cursor()
        try:
            cursor.execute(query)
            for row in cursor:
                yield row
        finally:
            # Runs when the rows are exhausted or the generator is discarded.
            cursor.close()

    def close(self):
        """
        Close the underlying database connection.
        """
        self._conn.close()

    def scores(self):
        """
        Returns a generator over the review scores, one ``(score,)`` row
        per review.
        """
        return self._select("SELECT score FROM reviews")

    def texts(self):
        """
        Returns a generator over the full review texts, one ``(content,)``
        row per review.
        """
        return self._select("SELECT content FROM content")

    def ids(self):
        """
        Returns a generator over the review ids, one ``(reviewid,)`` row
        per review.
        """
        return self._select("SELECT reviewid FROM content")