Skip to content

Commit

Permalink
Strip separators from matches
Browse files Browse the repository at this point in the history
In advanced mode, this is useful for stripping separators off raw values such as title, episode_title and release_group.
  • Loading branch information
Toilal committed Dec 4, 2016
1 parent d434514 commit 605526e
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 2 deletions.
1 change: 1 addition & 0 deletions guessit/rules/common/__init__.py
Expand Up @@ -6,6 +6,7 @@
import re

seps = r' [](){}+*|=-_~#/\\.,;:' # list of tags/words separators
# Same separators minus the grouping characters ("[](){}"), for stripping
# match boundaries without removing explicit group markers.
seps_no_groups = seps.replace('[](){}', '')
# Same separators minus the filesystem path separators ("/" and "\").
seps_no_fs = seps.replace('/', '').replace('\\', '')

title_seps = r'-+/\|' # separators for title
Expand Down
24 changes: 23 additions & 1 deletion guessit/rules/processors.py
Expand Up @@ -9,6 +9,8 @@
import six

from rebulk import Rebulk, Rule, CustomRule, POST_PROCESS, PRE_PROCESS, AppendMatch, RemoveMatch

from guessit.rules.common import seps_no_groups
from guessit.rules.common.words import iter_words
from .common.formatters import cleanup
from .common.comparators import marker_sorted
Expand Down Expand Up @@ -205,11 +207,31 @@ def then(self, matches, when_response, context): # pragma: no cover
pass


class StripSeparators(CustomRule):
    """
    Strip separators from matches. Keep separators if they are from acronyms, like in ".S.H.I.E.L.D."

    A boundary separator is removed only when the character two positions
    further inside the match is not itself a separator (or the match is
    shorter than 3 characters), so alternating acronym dots are preserved.
    """
    priority = POST_PROCESS

    def when(self, matches, context):
        # Always applies: return the whole match collection so then() runs.
        return matches

    def then(self, matches, when_response, context):  # pragma: no cover
        for match in matches:
            # len(match.span) is the bound on how many boundary characters
            # can be stripped from each side.  match.raw is recomputed from
            # raw_start/raw_end, so each iteration sees the updated string.
            # The `match.raw` guard avoids an IndexError if stripping has
            # emptied the raw value (e.g. a short all-separator match).
            for _ in range(len(match.span)):
                if match.raw and match.raw[0] in seps_no_groups \
                        and (len(match.raw) < 3 or match.raw[2] not in seps_no_groups):
                    match.raw_start += 1

            for _ in range(len(match.span)):
                if match.raw and match.raw[-1] in seps_no_groups \
                        and (len(match.raw) < 3 or match.raw[-3] not in seps_no_groups):
                    match.raw_end -= 1


def processors():
    """
    Builder for rebulk object.

    :return: Created Rebulk object
    :rtype: Rebulk
    """
    # The scraped diff carried both the pre- and post-commit rule lists;
    # this is the post-commit list, with StripSeparators appended.
    return Rebulk().rules(EnlargeGroupMatches, EquivalentHoles, RemoveLessSpecificSeasonEpisode,
                          RemoveAmbiguous, SeasonYear, Processors, StripSeparators)
4 changes: 3 additions & 1 deletion guessit/rules/properties/container.py
Expand Up @@ -6,6 +6,8 @@
from rebulk.remodule import re

from rebulk import Rebulk

from guessit.rules.common import seps
from ..common.validators import seps_surround
from ...reutils import build_or_pattern

Expand All @@ -18,7 +20,7 @@ def container():
"""
rebulk = Rebulk().regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)
rebulk.defaults(name='container',
formatter=lambda value: value[1:],
formatter=lambda value: value.strip(seps),
tags=['extension'],
conflict_solver=lambda match, other: other
if other.name in ['format', 'video_codec'] or
Expand Down
46 changes: 46 additions & 0 deletions guessit/test/rules/processors_test.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=no-self-use, pointless-statement, missing-docstring, invalid-name, pointless-string-statement

from rebulk.match import Matches, Match

from guessit.rules.processors import StripSeparators


def test_strip_separators():
    """Separator characters on both boundaries of a match are stripped."""
    rule = StripSeparators()
    all_matches = Matches()

    match = Match(3, 11, input_string="pre.ABCDEF.post")
    assert match.raw == '.ABCDEF.'
    all_matches.append(match)

    response = rule.when(all_matches, None)
    assert response == all_matches

    rule.then(all_matches, response, None)
    assert match.raw == 'ABCDEF'


def test_strip_separators_keep_acronyms():
    """Acronym dots (alternating separators) survive the stripping pass."""
    rule = StripSeparators()
    all_matches = Matches()

    acronym = Match(0, 13, input_string=".S.H.I.E.L.D.")
    mixed = Match(0, 22, input_string=".Agent.Of.S.H.I.E.L.D.")
    assert acronym.raw == '.S.H.I.E.L.D.'
    for entry in (acronym, mixed):
        all_matches.append(entry)

    response = rule.when(all_matches, None)
    assert response == all_matches

    rule.then(all_matches, response, None)
    assert acronym.raw == '.S.H.I.E.L.D.'
    assert mixed.raw == 'Agent.Of.S.H.I.E.L.D.'

0 comments on commit 605526e

Please sign in to comment.