Skip to content

Commit

Permalink
Title tag normalization: performance optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
desbma committed Oct 19, 2018
1 parent 7246714 commit 7065d1d
Showing 1 changed file with 22 additions and 17 deletions.
39 changes: 22 additions & 17 deletions amg/tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@
import functools
import logging
import operator
import re
import string

import mutagen
import mutagen.easyid3
import mutagen.easymp4
try:
import re2 as re
except ImportError:
import re
import unidecode

from amg import sanitize
Expand All @@ -32,7 +35,7 @@ def __init__(self, artist, album):
self.registerCleaner(RegexSuffixCleaner("taken from .*, out ", execute_once=True))

# detect and remove 'album: xxx track yy'
self.registerCleaner(RegexCleaner("(album: .* )?track [0-9]+"))
self.registerCleaner(RegexCleaner("(album: .* )?track [0-9]+", execute_once=True))

# detect and remove 'from xxx LP' suffix
self.registerCleaner(RegexSuffixCleaner("from .* LP", execute_once=True))
Expand All @@ -55,33 +58,35 @@ def __init__(self, artist, album):
# detect and remove 'xxx entertainment' suffix
self.registerCleaner(RegexSuffixCleaner("[\[\( ][a-z]+ entertainment$", execute_once=True))

# detect and remove 'xxx productions' suffix
self.registerCleaner(RegexSuffixCleaner("[\[\( ][a-z ]+ productions$"))

# detect and remove 'record label xxx' suffix
self.registerCleaner(RegexSuffixCleaner("record label:? [a-z0-9 ]*$", execute_once=True))

# detect and remove 'next concert: xxx' suffix
self.registerCleaner(RegexSuffixCleaner("next concert: .*$", execute_once=True))

# detect and remove 'feat.xxx' suffix
self.registerCleaner(RegexSuffixCleaner("feat\..*$"))

# detect and remove track number prefix
self.registerCleaner(RegexPrefixCleaner("^[0-9]+ - "))

# detect and remove 'xxx records' suffix
self.registerCleaner(RecordsSuffixCleaner("recordings"))
self.registerCleaner(RecordsSuffixCleaner("records"))
self.registerCleaner(RegexSuffixCleaner("feat\..*$", execute_once=True))

# detect and remove '- xxx metal' suffix
for genre in ("metal", "crust", "grindcore", "grind"):
self.registerCleaner(RegexSuffixCleaner("[\-|\(\[/\]]+[ ]*(?:[0-9a-z/-]+[ ]*)+" + genre + "( song)?$",
suffixes=(genre, " ".join((genre, "song")))))
suffixes=(genre, " ".join((genre, "song"))),
execute_once=True))

# detect and remove 'xxx metal' prefix
for genre in ("death",):
self.registerCleaner(RegexPrefixCleaner("^" + genre + "[a-z- ]* metal "))
self.registerCleaner(RegexPrefixCleaner("^" + genre + "[a-z- ]* metal ",
execute_once=True))

# detect and remove 'xxx productions' suffix
self.registerCleaner(RegexSuffixCleaner("[\[\( ][a-z ]+ productions$"))

# detect and remove track number prefix
self.registerCleaner(RegexPrefixCleaner("^[0-9]+ - "))

# detect and remove 'xxx records' suffix
self.registerCleaner(RecordsSuffixCleaner("recordings"))
self.registerCleaner(RecordsSuffixCleaner("records"))

# build list of common useless expressions
expressions = []
Expand Down Expand Up @@ -124,10 +129,10 @@ def __init__(self, artist, album):
self.registerCleaner(ArtistCleaner(), (artist,))

# detect and remove starting parenthesis expression
self.registerCleaner(StartParenthesesCleaner())
self.registerCleaner(StartParenthesesCleaner(execute_once=True))

# detect and remove album prefix or suffix
self.registerCleaner(AlbumCleaner(), (album,))
self.registerCleaner(AlbumCleaner(execute_once=True), (album,))

# fix paired chars
self.registerCleaner(PairedCharCleaner(execute_once=True))
Expand Down

0 comments on commit 7065d1d

Please sign in to comment.