Skip to content

Commit

Permalink
Merge pull request #122 from ms-boom/fix_timelex
Browse files Browse the repository at this point in the history
Fix timelex
  • Loading branch information
pganssle committed Oct 31, 2015
2 parents 9645c3d + b0adc1e commit 9e54f8d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 18 deletions.
27 changes: 9 additions & 18 deletions dateutil/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,6 @@ def __init__(self, instream):
instream = StringIO(instream)

self.instream = instream
self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
'ABCDEFGHIJKLMNOPQRSTUVWXYZ_'
'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
self.numchars = '0123456789'
self.whitespace = ' \t\r\n'
self.charstack = []
self.tokenstack = []
self.eof = False
Expand All @@ -101,9 +95,6 @@ def get_token(self):
seenletters = False
token = None
state = None
wordchars = self.wordchars
numchars = self.numchars
whitespace = self.whitespace

while not self.eof:
# We only realize that we've reached the end of a token when we
Expand All @@ -124,11 +115,11 @@ def get_token(self):
# First character of the token - determines if we're starting
# to parse a word, a number or something else.
token = nextchar
if nextchar in wordchars:
if nextchar.isalpha():
state = 'a'
elif nextchar in numchars:
elif nextchar.isdigit():
state = '0'
elif nextchar in whitespace:
elif nextchar.isspace():
token = ' '
break # emit token
else:
Expand All @@ -137,7 +128,7 @@ def get_token(self):
# If we've already started reading a word, we keep reading
# letters until we find something that's not part of a word.
seenletters = True
if nextchar in wordchars:
if nextchar.isalpha():
token += nextchar
elif nextchar == '.':
token += nextchar
Expand All @@ -148,7 +139,7 @@ def get_token(self):
elif state == '0':
# If we've already started reading a number, we keep reading
# numbers until we find something that doesn't fit.
if nextchar in numchars:
if nextchar.isdigit():
token += nextchar
elif nextchar == '.' or (nextchar == ',' and len(token) >= 2):
token += nextchar
Expand All @@ -160,9 +151,9 @@ def get_token(self):
# If we've seen some letters and a dot separator, continue
# parsing, and the tokens will be broken up later.
seenletters = True
if nextchar == '.' or nextchar in wordchars:
if nextchar == '.' or nextchar.isalpha():
token += nextchar
elif nextchar in numchars and token[-1] == '.':
elif nextchar.isdigit() and token[-1] == '.':
token += nextchar
state = '0.'
else:
Expand All @@ -171,9 +162,9 @@ def get_token(self):
elif state == '0.':
# If we've seen at least one dot separator, keep going, we'll
# break up the tokens later.
if nextchar == '.' or nextchar in numchars:
if nextchar == '.' or nextchar.isdigit():
token += nextchar
elif nextchar in wordchars and token[-1] == '.':
elif nextchar.isalpha() and token[-1] == '.':
token += nextchar
state = 'a.'
else:
Expand Down
20 changes: 20 additions & 0 deletions dateutil/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5584,6 +5584,26 @@ def testParserParseStr(self):
self.assertEqual(parser().parse(self.str_str),
parser().parse(self.uni_str))

def testParseUnicodeWords(self):
    # Checks that month names written with non-ASCII (Cyrillic) letters are
    # recognised as words when a custom parserinfo supplies them.

    class rus_parserinfo(parserinfo):
        # (abbreviation, full name) pairs, one per month, in calendar order.
        MONTHS = [
            ("янв", "Январь"),
            ("фев", "Февраль"),
            ("мар", "Март"),
            ("апр", "Апрель"),
            ("май", "Май"),
            ("июн", "Июнь"),
            ("июл", "Июль"),
            ("авг", "Август"),
            ("сен", "Сентябрь"),
            ("окт", "Октябрь"),
            ("ноя", "Ноябрь"),
            ("дек", "Декабрь"),
        ]

    # The ninth entry ("Сентябрь") must resolve to month 9.
    parsed = parse('10 Сентябрь 2015 10:20', parserinfo=rus_parserinfo())
    expected = datetime(2015, 9, 10, 10, 20)
    self.assertEqual(parsed, expected)



class EasterTest(unittest.TestCase):
easterlist = [
Expand Down

0 comments on commit 9e54f8d

Please sign in to comment.