Merge prosedecomposer functionality (#26)

* added prosedecomposer functionality

* fixed textprocessingtestcase

* fixed unit tests

* added Sphinx docs
coreybobco committed Mar 20, 2020
1 parent 25cee0c commit 81a5323
Showing 14 changed files with 291 additions and 9 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -1,13 +1,21 @@
# Changelog
All notable changes to this project will be documented in this file.

## [0.3.0]
- Added random sampling from Project Gutenberg and Internet Archive
- Added ability to swap parts of speech between two strings (e.g. swap nouns)
- Added Markov chain and cut-up functionality to scramble texts

## [0.2.4]
- Menu tweak

## [0.2.3] 2020-02-12
- Fixed bug relating to package data path

## [0.2.2] 2020-02-12
- Added missing dependencies to setup.py

## [0.2.1] 2020-02-12
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
35 changes: 35 additions & 0 deletions docs/make.bat
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
10 changes: 10 additions & 0 deletions docs/source/Generative Lexicon (lexigen.py).rst
@@ -0,0 +1,10 @@
Generative Lexicon (lexigen.py)
===============================

|
These functions allow for controlled random sampling of words from the Datamuse API.

.. automodule:: generativepoetry.lexigen
   :members:
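
As a quick sketch of usage (the function names below are illustrative assumptions; consult the member listing above for the exact API, and note that every call queries Datamuse over the network and returns random results):

.. code-block::

   from generativepoetry.lexigen import rhymes, similar_sounding_words, similar_meaning_words

   # Each call samples from Datamuse results, so output varies per run.
   print(rhymes('poetry'))                  # words that rhyme with 'poetry'
   print(similar_sounding_words('poetry'))  # phonetically similar words
   print(similar_meaning_words('poetry'))   # semantically similar words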

21 changes: 21 additions & 0 deletions docs/source/Poem Text Generation (poemgen.py, jolastic.py).rst
@@ -0,0 +1,21 @@
Poem Text Generation (poemgen.py, jolastic.py)
==============================================

|
These functions generate the actual text of poems.

.. automodule:: generativepoetry.poemgen
   :members:

.. autoclass:: generativepoetry.poemgen.Poem
   :members:

.. autoclass:: generativepoetry.poemgen.PoemGenerator
   :members:

.. automodule:: generativepoetry.jolastic
   :members:

.. autoclass:: generativepoetry.jolastic.StochasticJolasticWordGenerator
   :members:
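
The snippet below is only a hypothetical sketch: it assumes PoemGenerator exposes a poem-building method such as poem_from_word_list (the method name is an illustrative assumption; the autodoc member listings above show the actual API):

.. code-block::

   from generativepoetry.poemgen import PoemGenerator

   # Hypothetical usage -- poem_from_word_list is an assumed name; see the
   # member listing above for the real methods.
   generator = PoemGenerator()
   poem = generator.poem_from_word_list(['moon', 'tide', 'static', 'violet'])
   print(poem)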
16 changes: 16 additions & 0 deletions docs/source/Sampling and Text Processing from Online Libraries (decomposer.py).rst
@@ -0,0 +1,16 @@
Sampling and Text Processing from Online Libraries (decomposer.py)
===================================================================

|
With the decomposer module, one can sample random documents (books, etc.) from Project Gutenberg and Archive.org and rearrange the texts using Markov chain algorithms or the cut-up technique, or by swapping instances of a part of speech between two texts.

.. automodule:: generativepoetry.decomposer
   :members:
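
For example, one might download a random public-domain text and scramble it. The sketch below is based on the function signatures in this module; it requires network access, and all output is random:

.. code-block::

   from generativepoetry.decomposer import (markov, cutup, swap_parts_of_speech,
                                            random_gutenberg_document)

   text = random_gutenberg_document(language_filter='en')  # a random book as a string

   # Markov chain remix using an order-2 n-gram model
   for sentence in markov(text, ngram_size=2, num_output_sentences=5):
       print(sentence)

   # Burroughs/Gysin-style cut-up into chunks of 3 to 7 words
   print(cutup(text, min_cutout_words=3, max_cutout_words=7))

   # Swap adjectives and nouns between two random texts
   other = random_gutenberg_document()
   text1, text2 = swap_parts_of_speech(text, other, parts_of_speech=['ADJ', 'NOUN'])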

|
The ParsedText class has several methods for random sampling.

.. autoclass:: generativepoetry.decomposer.ParsedText
   :members:
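
A minimal sampling sketch, assuming a document fetched with one of the functions above (Project Gutenberg ebook #11 is just an example URL):

.. code-block::

   from generativepoetry.decomposer import ParsedText, get_gutenberg_document

   parsed = ParsedText(get_gutenberg_document('https://www.gutenberg.org/ebooks/11'))
   print(parsed.random_sentence(minimum_tokens=6))          # one sentence with >= 6 tokens
   print(parsed.random_sentences(num=5, minimum_tokens=6))  # five distinct sentences
   print(parsed.random_paragraph(minimum_sentences=3))      # paragraph with >= 3 sentences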
9 changes: 9 additions & 0 deletions docs/source/Utilities.rst
@@ -0,0 +1,9 @@
Utilities
=========

|
These functions are common utilities used by other modules.

.. automodule:: generativepoetry.utils
   :members:
9 changes: 9 additions & 0 deletions docs/source/Visual Poem (PDF, PNG) Generation (pdf.py).rst
@@ -0,0 +1,9 @@
Visual Poem (PDF, PNG) Generation (pdf.py)
===========================================

|
These functions allow for the creation of all currently supported types of visual poems.

.. automodule:: generativepoetry.pdf
   :members:
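
A hypothetical sketch of generating one visual poem (the generator class name below is an assumption for illustration; the autodoc members above list the concrete generators):

.. code-block::

   from generativepoetry import pdf

   # Hypothetical usage -- the generator class name is assumed for illustration.
   generator = pdf.ChaoticConcretePoemPDFGenerator()
   generator.generate_pdf()  # writes a visual poem PDF to the current directory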
54 changes: 54 additions & 0 deletions docs/source/conf.py
@@ -0,0 +1,54 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))


# -- Project information -----------------------------------------------------

project = 'Generative Poetry'
copyright = '2020, Corey Bobco'
author = 'Corey Bobco'

# The full version, including alpha/beta/rc tags
release = '0.3.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.autodoc']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
64 changes: 64 additions & 0 deletions docs/source/index.rst
@@ -0,0 +1,64 @@
Welcome to Generative Poetry's documentation!
=============================================

Table of Contents
-----------------
.. toctree::
   :maxdepth: 2

   Generative Lexicon (lexigen.py)
   Visual Poem (PDF, PNG) Generation (pdf.py)
   Poem Text Generation (poemgen.py, jolastic.py)
   Sampling and Text Processing from Online Libraries (decomposer.py)
   Utilities


Try now
^^^^^^^

Colab (recommended for most)
""""""""""""""""""""""""""""
You can install this software and launch the interactive menu for creating visual poems from the following `link <https://colab.research.google.com/drive/1eNTBSLMPpemYQq4n3keKCjVpw_JqY6w->`_ to a Jupyter notebook hosted on Google Colab.

Note: because of screen-size issues, this effectively only works on desktop and tablet. Keep the `Github notes on usage <https://github.com/coreybobco/generativepoetry-py/>`_ open for reference.

Installing On Your Computer
"""""""""""""""""""""""""""

Alternatively, if you install this on your own machine (see below), you can initialize the interactive menu by running the following command from a terminal/shell: ``generative-poetry-cli``

Windows
"""""""

Because this library currently relies on the Python package hunspell, which does not support Windows, use Docker to launch a Linux-based container, then use pip to install, and enter the Python interactive shell within:

.. code-block::

   docker run -it python /bin/bash -c "python3 -m pip install generativepoetry && python3"

OSX
"""

OSX users must install hunspell beforehand:

.. code-block::

   brew install hunspell

Then download the en_US dictionary from http://wordlist.aspell.net/dicts/, unzip it to /Library/Spelling/, and install using pip:

.. code-block::

   python3 -m pip install generativepoetry

You will also need Microsoft's core font TTF files in /Library/Fonts/.

Linux
"""""

Ubuntu/Debian users should install hunspell-en-us, libhunspell-dev, and libdb++-dev beforehand and then install with pip:

.. code-block::

   sudo apt-get install hunspell-en-us libhunspell-dev libdb++-dev
   python3 -m pip install generativepoetry

41 changes: 38 additions & 3 deletions generativepoetry/decomposer.py
@@ -29,21 +29,37 @@ def __init__(self, text):
        self.paragraphs = self.raw_text.split("\n\n")

    def random_sentence(self, minimum_tokens=1) -> str:
        """Returns a random sentence from the text.

        Keyword arguments:
            minimum_tokens (int) -- minimum number of NLP tokens the sentence must contain
        """
        num_tokens = 0
        while num_tokens < minimum_tokens:
            sentence = random.choice(self.sentences)
            num_tokens = len([token.text for token in spacy_nlp(sentence)])
        return sentence

    def random_sentences(self, num=5, minimum_tokens=1) -> list:
        """Returns a list of random sentences from the text.

        Keyword arguments:
            num (int) -- number of sentences to return
            minimum_tokens (int) -- minimum number of NLP tokens per sentence
        """
        random_sentences = []
        while len(random_sentences) < num:
            random_sentence = self.random_sentence(minimum_tokens=minimum_tokens)
            if random_sentence not in random_sentences:
                random_sentences.append(random_sentence)
        return random_sentences

    def random_paragraph(self, minimum_sentences=3) -> str:
        """Returns a random paragraph from the text.

        Keyword arguments:
            minimum_sentences (int) -- minimum number of sentences the paragraph must contain
        """
        num_sentences = 0
        while num_sentences < minimum_sentences:
            paragraph = random.choice(self.paragraphs)
@@ -65,6 +81,7 @@ def validate_url(url, expected_netloc=''):
def get_internet_archive_document(url) -> str:
    """Downloads a document (book, etc.) from Internet Archive and returns it as a string. The linked document must
    have a text version; PDF text extraction is not supported at this time. The result can be wrapped in a ParsedText
    instance for random sampling.
    """
    validate_url(url, expected_netloc='archive.org')
    url_parts = urlsplit(url).path.split("/")
@@ -85,7 +102,9 @@ def get_internet_archive_document(url) -> str:


def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string. The result can be wrapped
    in a ParsedText instance for random sampling.
    """
    # Get the Project Gutenberg document ID from the URL string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search(r"(?:files|ebooks|epub)/(\d+)", urlsplit(url).path)
@@ -96,7 +115,7 @@ def get_gutenberg_document(url) -> str:


def random_gutenberg_document(language_filter='en') -> str:
    """Downloads a random document (book, etc.) from Project Gutenberg and returns it as a string.

    Keyword arguments:
        language_filter (str) -- restrict the random document to a particular language (default: English)
@@ -115,6 +134,12 @@ def random_gutenberg_document(language_filter='en') -> str:
def reconcile_replacement_word(original_word_with_ws, original_word_tag, replacement_word, replacement_word_tag) -> str:
    """Modify the replacement word if needed to fix subject/verb agreement and preserve the whitespace, or lack
    thereof, before and after the original word.

    Arguments:
        original_word_with_ws (str) -- original word with surrounding whitespace
        original_word_tag (str) -- part-of-speech tag of the original word
        replacement_word (str) -- word that is replacing the original word
        replacement_word_tag (str) -- part-of-speech tag of the replacement word
    """
    # Pluralize or singularize the replacement word if we're dealing with nouns and one's plural and one's singular.
    if original_word_tag == 'NNS' and replacement_word_tag == 'NN':
@@ -175,6 +200,12 @@ def swap_parts_of_speech(text1, text2, parts_of_speech=['ADJ', 'NOUN']) -> (str,


def markov(input: input_type, ngram_size=1, num_output_sentences=5) -> List[str]:
    """Markov chain text generation using the markovify library; supports a custom n-gram size.

    Keyword arguments:
        ngram_size (int) -- order of the n-gram model to use (e.g. 2 builds an order-2 n-gram model)
        num_output_sentences (int) -- number of sentences to output
    """
    if type(input) == list:
        list_of_texts = input
    elif type(input) == str:
@@ -183,7 +214,6 @@ def markov(input: input_type, ngram_size=1, num_output_sentences=5) -> List[str]
    for text in list_of_texts:
        markov_models.append(markovify.Text(text, state_size=ngram_size))
    textgen = markovify.combine(markov_models)
    output_sentences = []
    while len(output_sentences) < num_output_sentences:
        sentence = textgen.make_sentence()
@@ -196,6 +226,11 @@ def cutup(input, min_cutout_words=3, max_cutout_words=7) -> List[str]:
"""Simulates William S. Burroughs' and Brion Gysin's cut-up technique by separating an input text into
non-whitespace blocks of text and then randomly grouping those into cut-outs between the minimum and maximum
length of words.
Arguments:
input (str) -- input string to be cut up
min_cutout_words (int) -- minimum number of words in cut out chunk
max_cutout_words -- maximum number of words in cutout chunk
"""
if type(input) == list:
list_of_texts = input
File renamed without changes.
