Merge 8065d08 into 1178019

deanmalmgren · Aug 28, 2014 · 2dd5067 · 2dd5067
2 parents 1178019 + 8065d08
commit 2dd5067
Show file tree

Hide file tree

Showing 19 changed files with 96 additions and 5 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -11,7 +11,7 @@ latest changes in development for next release
 
 .. THANKS FOR CONTRIBUTING; MENTION WHAT YOU DID IN THIS SECTION HERE!
 
-* support for ``.wav`` files (`#56`_ by `@arvindch`_)
+* support for ``.wav``, ``.mp3``, and ``.ogg`` files (`#56`_ and `#62`_ by `@arvindch`_)
 
 
 1.0.0
@@ -160,4 +160,4 @@ latest changes in development for next release
 .. _#53: https://github.com/deanmalmgren/textract/issues/53
 .. _#55: https://github.com/deanmalmgren/textract/issues/55
 .. _#56: https://github.com/deanmalmgren/textract/issues/56
-
+.. _#62: https://github.com/deanmalmgren/textract/issues/62
diff --git a/docs/index.rst b/docs/index.rst
@@ -58,8 +58,12 @@ file types by either mentioning them on the `issue tracker
 
 * ``.html`` via `beautifulsoup4`_
 
+* ``.mp3`` via `SpeechRecognition`_ and `sox`_
+
 * ``.odt`` via python builtins
 
+* ``.ogg`` via `SpeechRecognition`_ and `sox`_
+
 * ``.pdf`` via `pdftotext`_ (default) or `pdfminer`_
 
 * ``.png`` via `tesseract-ocr`_
@@ -91,6 +95,7 @@ by :ref:`contributing <contributing>`
 .. _ps2text: http://pages.cs.wisc.edu/~ghost/doc/pstotext.htm
 .. _ebooklib: https://github.com/aerkalov/ebooklib
 .. _SpeechRecognition: https://pypi.python.org/pypi/SpeechRecognition/
+.. _sox: http://sox.sourceforge.net/
 
 .. _related-projects:
 

diff --git a/docs/installation.rst b/docs/installation.rst
@@ -22,7 +22,8 @@ package manager before installing textract from pypi.
 
 .. code-block:: bash
 
-    apt-get install python-dev libxml2-dev libxslt1-dev antiword poppler-utils pstotext tesseract-ocr flac
+    apt-get install python-dev libxml2-dev libxslt1-dev antiword poppler-utils pstotext tesseract-ocr \
+    flac ffmpeg lame libmad0 libsox-fmt-mp3 sox
     pip install textract
 
 .. note::
@@ -105,6 +106,11 @@ documentation about how to install the textract dependencies, please
     - `tesseract-ocr <https://code.google.com/p/tesseract-ocr/>`_ 
       is required by the ``.jpg``, ``.png`` and ``.gif`` parser.
 
+    - `sox <http://sox.sourceforge.net/>`_
+      is required by the ``.mp3`` and ``.ogg`` parser.
+      You need to install ffmpeg, lame, libmad0 and libsox-fmt-mp3
+      before building sox for these file types to work.
+
 2. Add a requirements file to the `requirements directory
    <https://github.com/deanmalmgren/textract/tree/master/requirements>`_
    of the project with the lower-cased name of your operating system

diff --git a/docs/python_package.rst b/docs/python_package.rst
@@ -81,6 +81,14 @@ textract.parsers.json_parser module
     :undoc-members:
     :show-inheritance:
 
+textract.parsers.mp3_parser module
+----------------------------------
+
+.. automodule:: textract.parsers.mp3_parser
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 textract.parsers.odt_parser module
 ----------------------------------
 
@@ -89,6 +97,14 @@ textract.parsers.odt_parser module
     :undoc-members:
     :show-inheritance:
 
+textract.parsers.ogg_parser module
+----------------------------------
+
+.. automodule:: textract.parsers.ogg_parser
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 textract.parsers.pdf_parser module
 ----------------------------------
 
@@ -121,6 +137,14 @@ textract.parsers.ps_parser module
     :undoc-members:
     :show-inheritance:
 
+textract.parsers.sox module
+---------------------------
+
+.. automodule:: textract.parsers.sox
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 textract.parsers.tesseract module
 ---------------------------------
 

diff --git a/requirements/debian b/requirements/debian
@@ -19,6 +19,16 @@ pstotext
 # parse audio files, with SpeechRecognition
 flac
 
+# filetype conversion libs
+ffmpeg
+lame
+libmad0
+# debian-specific lib
+libsox-fmt-mp3
+
+# convert audio files
+sox
+
 # ubuntu 14.04 requires this in addition to libxml2-dev and
 # libxslt1-dev for compiling lxml.
 # https://github.com/deanmalmgren/textract/issues/19

diff --git a/tests/mp3/raw_text.mp3 b/tests/mp3/raw_text.mp3
diff --git a/tests/mp3/raw_text.txt b/tests/mp3/raw_text.txt
@@ -0,0 +1 @@
+everything is awesome
diff --git a/tests/mp3/standardized_text.mp3 b/tests/mp3/standardized_text.mp3
diff --git a/tests/ogg/raw_text.ogg b/tests/ogg/raw_text.ogg
diff --git a/tests/ogg/raw_text.txt b/tests/ogg/raw_text.txt
@@ -0,0 +1 @@
+everything is awesome
diff --git a/tests/ogg/standardized_text.ogg b/tests/ogg/standardized_text.ogg
diff --git a/tests/run_functional_tests.sh b/tests/run_functional_tests.sh
diff --git a/tests/test_mp3.py b/tests/test_mp3.py
@@ -0,0 +1,7 @@
+import unittest
+
+import base
+
+
+class Mp3TestCase(base.BaseParserTestCase, unittest.TestCase):
+    extension = 'mp3'
diff --git a/tests/test_ogg.py b/tests/test_ogg.py
@@ -0,0 +1,7 @@
+import unittest
+
+import base
+
+
+class OggTestCase(base.BaseParserTestCase, unittest.TestCase):
+    extension = 'ogg'
diff --git a/tests/wav/raw_text.txt b/tests/wav/raw_text.txt
@@ -1 +1 @@
-everything is awesome
+everything is awesome
diff --git a/textract/parsers/mp3_parser.py b/textract/parsers/mp3_parser.py
@@ -0,0 +1 @@
+from .sox import Parser
diff --git a/textract/parsers/ogg_parser.py b/textract/parsers/ogg_parser.py
@@ -0,0 +1 @@
+from .sox import Parser
diff --git a/textract/parsers/sox.py b/textract/parsers/sox.py
@@ -0,0 +1,24 @@
+"""
+Convert an audio file to-and-from various formats, using sox.
+"""
+
+from .utils import ShellParser
+
+
+class Parser(ShellParser):
+    """
+    Convert file to .wav, for use with wav_parser
+    Note: for testing, use -
+    http://www.text2speech.org/,
+    with American Male 2 for best results
+    """
+
+    def extract(self, filename, **kwargs):
+        command = (
+            'sox -G -c 1 "{0}" {1}.wav && '
+            'textract {1}.wav && '
+            'rm -f {1}.wav'
+        )
+        temp_filename = self.temp_filename()
+        stdout, _ = self.run(command.format(filename, temp_filename))
+        return stdout
diff --git a/textract/parsers/wav_parser.py b/textract/parsers/wav_parser.py
@@ -6,7 +6,8 @@
 class Parser(BaseParser):
     """
     Extract text (i.e. speech) from an audio file, using SpeechRecognition.
-    Only works with .wav files, for now.
+    SpeechRecognition expects a .wav file, with one channel
+    So the audio file has to be converted, if not compliant
     Note: for testing, use -
     http://www2.research.att.com/~ttsweb/tts/demo.php,
     with Rich (US English) for best results
@@ -23,4 +24,7 @@ def extract(self, filename, **kwargs):
         except LookupError:  # audio is not understandable
             speech = ''
 
+        # add a newline, to make output cleaner
+        speech += '\n'
+
         return speech