Fix parsing issue #29

glut23 · Apr 9, 2020 · 8538e1a · 8538e1a · dilraj1983 · Jan 6, 2023
1 parent 62d864e
commit 8538e1a
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 22 deletions.
diff --git a/docs/history.rst b/docs/history.rst
@@ -1,6 +1,11 @@
 History
 =======
 
+0.4.5 (09-04-2020)
+------------------
+
+* Fix issue reading buffer
+
 0.4.4 (27-03-2020)
 ------------------
 
@@ -11,15 +16,15 @@ History
 * Added Python 3.8 support
 * Improve parsing empty lines
 
-0.4.3 (22-11-2019) Few improvements
------------------------------------
+0.4.3 (22-11-2019)
+------------------
 
 * Parsing improvements, thanks to `@sontek <https://github.com/sontek>`_ (#18)
 * Add support for reading content from a file-like object, thanks to `@omerholz <https://github.com/omerholz>`_ (#23)
 * Documentation fixes thanks to `@sontek <https://github.com/sontek>`_ (#22) and `@netcmcc <https://github.com/netcmcc>`_ (#24)
 
-0.4.2 (08-06-2018) Rename of modules and usability improvements
----------------------------------------------------------------
+0.4.2 (08-06-2018)
+------------------
 
 * Renamed and reorganized a few of the modules
 * Parsing methods are now class methods: read, from_srt and from_sbv
@@ -30,13 +35,13 @@ import webvtt
 webvtt.read('captions.vtt')  # this will return a WebVTT instance
 
 
-0.4.1 (24-12-2017) Hot fix on cue identifiers
----------------------------------------------
+0.4.1 (24-12-2017)
+------------------
 
 * Support for saving cue identifiers
 
-0.4.0 (18-09-2017) Refactor and parse compatibility
----------------------------------------------------
+0.4.0 (18-09-2017)
+------------------
 
 The main goal of this release is a refactor of the WebVTT parser to make parsing easier and to add support for
 new features of the format.
@@ -55,8 +60,8 @@ Other:
 
 * Refactored WebVTT parser
 
-0.3.3 (23-08-2017) Hot fix on cue tags
---------------------------------------
+0.3.3 (23-08-2017)
+------------------
 
 The text for the caption is now returned clean (tags removed). The cue text could contain tags like:
 * timestamp tags: *<00:19.000>*
@@ -66,20 +71,20 @@ The text for the caption is now returned clean (tags removed). The cue text coul
 
 Also a new attribute is available on captions to retrieve the text without cleaning tags: **raw_text**
 
-0.3.2 (11-08-2017) Hot fix for compatibility
---------------------------------------------
+0.3.2 (11-08-2017)
+------------------
 
 The goal of this release is to allow the WebVTT parser to read caption files that contain metadata headers
 that extend to more than one line.
 
-0.3.1 (08-08-2017) Compatibility updates
-----------------------------------------
+0.3.1 (08-08-2017)
+------------------
 
 * Made hours in WebVTT parser optional as per specs.
 * Added support to parse WebVTT files that contain metadata headers.
 
-0.3.0 (02-06-2016) YouTube SBV
-------------------------------
+0.3.0 (02-06-2016)
+------------------
 
 New features:
 
@@ -93,14 +98,14 @@ Other:
 * Added an exception for invalid timestamps in captions.
 * Added an exception when saving without a filename.
 
-0.2.0 (23-05-2016) Module refactor
-----------------------------------
+0.2.0 (23-05-2016)
+------------------
 
 * Refactor of the main module and parsers.
 
 
-0.1.0 (20-05-2016) First release
---------------------------------
+0.1.0 (20-05-2016)
+------------------
 
 This module is released with the following initial features:
 

diff --git a/tests/test_webvtt.py b/tests/test_webvtt.py
@@ -1,5 +1,6 @@
 import os
 import io
+import textwrap
 from shutil import rmtree, copy
 
 import webvtt
@@ -237,6 +238,23 @@ def test_read_memory_buffer(self):
         vtt = webvtt.read_buffer(buffer)
         self.assertIsInstance(vtt.captions, list)
 
+    def test_read_memory_buffer_carriage_return(self):
+        """https://github.com/glut23/webvtt-py/issues/29"""
+        buffer = io.StringIO(textwrap.dedent('''\
+            WEBVTT\r
+            \r
+            00:00:00.500 --> 00:00:07.000\r
+            Caption text #1\r
+            \r
+            00:00:07.000 --> 00:00:11.890\r
+            Caption text #2\r
+            \r
+            00:00:11.890 --> 00:00:16.320\r
+            Caption text #3\r
+        '''))
+        vtt = webvtt.read_buffer(buffer)
+        self.assertEqual(len(vtt.captions), 3)
+
     def test_read_malformed_buffer(self):
         malformed_payloads = ['', 'MOCK MELFORMED CONTENT']
         for payload in malformed_payloads:

diff --git a/webvtt/__init__.py b/webvtt/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.4.4'
+__version__ = '0.4.5'
 
 from .webvtt import *
 from .segmenter import *

diff --git a/webvtt/parsers.py b/webvtt/parsers.py
@@ -51,7 +51,7 @@ def _read_file_encoding(self, file_path):
 
     def _read_content_lines(self, file_obj):
 
-        lines = [line.rstrip('\n') for line in file_obj.readlines()]
+        lines = [line.rstrip('\n\r') for line in file_obj.readlines()]
 
         if not lines:
             raise MalformedFileError('The file is empty.')