Merge pull request #122 from ms-boom/fix_timelex

Fix timelex
dateutil · Oct 31, 2015 · commit 9e54f8d
2 parents: 9645c3d + b0adc1e
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 18 deletions.
diff --git a/dateutil/parser.py b/dateutil/parser.py
@@ -71,12 +71,6 @@ def __init__(self, instream):
             instream = StringIO(instream)
 
         self.instream = instream
-        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
-                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ_'
-                          'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
-                          'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
-        self.numchars = '0123456789'
-        self.whitespace = ' \t\r\n'
         self.charstack = []
         self.tokenstack = []
         self.eof = False
@@ -101,9 +95,6 @@ def get_token(self):
         seenletters = False
         token = None
         state = None
-        wordchars = self.wordchars
-        numchars = self.numchars
-        whitespace = self.whitespace
 
         while not self.eof:
             # We only realize that we've reached the end of a token when we
@@ -124,11 +115,11 @@ def get_token(self):
                 # First character of the token - determines if we're starting
                 # to parse a word, a number or something else.
                 token = nextchar
-                if nextchar in wordchars:
+                if nextchar.isalpha():
                     state = 'a'
-                elif nextchar in numchars:
+                elif nextchar.isdigit():
                     state = '0'
-                elif nextchar in whitespace:
+                elif nextchar.isspace():
                     token = ' '
                     break  # emit token
                 else:
@@ -137,7 +128,7 @@ def get_token(self):
                 # If we've already started reading a word, we keep reading
                 # letters until we find something that's not part of a word.
                 seenletters = True
-                if nextchar in wordchars:
+                if nextchar.isalpha():
                     token += nextchar
                 elif nextchar == '.':
                     token += nextchar
@@ -148,7 +139,7 @@ def get_token(self):
             elif state == '0':
                 # If we've already started reading a number, we keep reading
                 # numbers until we find something that doesn't fit.
-                if nextchar in numchars:
+                if nextchar.isdigit():
                     token += nextchar
                 elif nextchar == '.' or (nextchar == ',' and len(token) >= 2):
                     token += nextchar
@@ -160,9 +151,9 @@ def get_token(self):
                 # If we've seen some letters and a dot separator, continue
                 # parsing, and the tokens will be broken up later.
                 seenletters = True
-                if nextchar == '.' or nextchar in wordchars:
+                if nextchar == '.' or nextchar.isalpha():
                     token += nextchar
-                elif nextchar in numchars and token[-1] == '.':
+                elif nextchar.isdigit() and token[-1] == '.':
                     token += nextchar
                     state = '0.'
                 else:
@@ -171,9 +162,9 @@ def get_token(self):
             elif state == '0.':
                 # If we've seen at least one dot separator, keep going, we'll
                 # break up the tokens later.
-                if nextchar == '.' or nextchar in numchars:
+                if nextchar == '.' or nextchar.isdigit():
                     token += nextchar
-                elif nextchar in wordchars and token[-1] == '.':
+                elif nextchar.isalpha() and token[-1] == '.':
                     token += nextchar
                     state = 'a.'
                 else:

diff --git a/dateutil/test/test.py b/dateutil/test/test.py
@@ -5584,6 +5584,26 @@ def testParserParseStr(self):
         self.assertEqual(parser().parse(self.str_str),
                          parser().parse(self.uni_str))
 
+    def testParseUnicodeWords(self):
+
+        class rus_parserinfo(parserinfo):
+            MONTHS = [("янв", "Январь"),
+                      ("фев", "Февраль"),
+                      ("мар", "Март"),
+                      ("апр", "Апрель"),
+                      ("май", "Май"),
+                      ("июн", "Июнь"),
+                      ("июл", "Июль"),
+                      ("авг", "Август"),
+                      ("сен", "Сентябрь"),
+                      ("окт", "Октябрь"),
+                      ("ноя", "Ноябрь"),
+                      ("дек", "Декабрь")]
+
+        self.assertEqual(parse('10 Сентябрь 2015 10:20',
+                               parserinfo=rus_parserinfo()),
+                         datetime(2015, 9, 10, 10, 20))
+
 
 class EasterTest(unittest.TestCase):
     easterlist = [