@@ -141,30 +141,37 @@ def _split(self, data):
141141
142142 return data [:len1 ], binary , data [idx :]
143143
# Regular expressions used by the PostScript tokenizer.  They are bytes
# patterns and must be applied to bytes input.

# A run of PostScript white-space characters: NUL, tab, carriage return,
# form feed (octal 014), line feed and space.
_whitespace_re = re.compile(br'[\0\t\r\014\n ]+')
# A name or number: up to two leading slashes followed by one or more
# regular characters, i.e. anything that is neither white space nor one of
# the delimiters ( ) < > [ ] { } / %.  The excluded white space must agree
# with _whitespace_re, hence form feed (\014) rather than vertical tab.
_token_re = re.compile(br'/{0,2}[^]\0\t\r\014\n ()<>{}/%[]+')
# A comment: from '%' up to, but not including, the next carriage return,
# line feed or form feed.
_comment_re = re.compile(br'%[^\r\n\014]*')
# The characters that need special handling inside a parenthesised string:
# nested parentheses and the backslash escape.
_instring_re = re.compile(br'[()\\]')

# token types
# Unique sentinel objects; they are meant to be compared by identity.
_whitespace = object()
_name = object()
_string = object()
_delimiter = object()
_number = object()
148155
@classmethod
def _tokens(cls, text):
    """
    A PostScript tokenizer.

    Scan the byte string *text* from left to right and yield
    ``(token, value)`` pairs such as ``(cls._whitespace, b' ')`` or
    ``(cls._name, b'/Foobar')``.  *token* is one of the token-type
    sentinels ``cls._whitespace``, ``cls._name``, ``cls._string``,
    ``cls._delimiter`` or ``cls._number``; *value* is the matching slice
    of *text* (always bytes).

    Comments are reported as whitespace tokens.  A parenthesised string
    is yielded with its enclosing parentheses; if it is never closed,
    the generator simply stops.  A ``<...>`` hex string is yielded
    *without* its closing ``>``, which then comes out as a separate
    delimiter token; an unterminated hex string raises ValueError.
    """
    pos = 0
    end = len(text)
    while pos < end:
        # Match in place with an explicit start offset rather than
        # slicing text[pos:], which would copy the remainder of the
        # input for every token.
        match = (cls._comment_re.match(text, pos) or
                 cls._whitespace_re.match(text, pos))
        if match:
            yield (cls._whitespace, match.group())
            pos = match.end()
        # Compare one-byte slices with bytes literals: indexing bytes
        # gives an int on Python 3, which never equals a str such as '('.
        elif text[pos:pos + 1] == b'(':
            start = pos
            pos += 1
            depth = 1
            while depth:
                match = cls._instring_re.search(text, pos)
                if match is None:
                    # Unterminated string: stop tokenizing.
                    return
                pos = match.end()
                if match.group() == b'(':
                    depth += 1
                elif match.group() == b')':
                    depth -= 1
                else:  # a backslash - skip the next character
                    pos += 1
            yield (cls._string, text[start:pos])
        elif text[pos:pos + 2] in (b'<<', b'>>'):
            yield (cls._delimiter, text[pos:pos + 2])
            pos += 2
        elif text[pos:pos + 1] == b'<':
            start = pos
            # NOTE(review): pos is left on the closing '>', so the hex
            # string excludes it and '>' is emitted as a delimiter on the
            # next iteration.  Kept as-is for existing consumers.
            pos = text.index(b'>', pos)
            yield (cls._string, text[start:pos])
        else:
            match = cls._token_re.match(text, pos)
            if match:
                value = match.group()
                # Anything float() accepts is a number, everything else a
                # name.  The try block covers only the conversion so that
                # it cannot intercept exceptions raised at the yield.
                try:
                    float(value)
                except ValueError:
                    kind = cls._name
                else:
                    kind = cls._number
                yield (kind, value)
                pos = match.end()
            else:
                # A single delimiter character such as [ ] { } or a
                # stray ) or >.
                yield (cls._delimiter, text[pos:pos + 1])
                pos += 1
197204
198205 def _parse (self ):
@@ -205,26 +212,30 @@ def _parse(self):
205212 prop = {'weight' : 'Regular' , 'ItalicAngle' : 0.0 , 'isFixedPitch' : False ,
206213 'UnderlinePosition' : - 100 , 'UnderlineThickness' : 50 }
207214 tokenizer = self ._tokens (self .parts [0 ])
208- filtered = filter (lambda x : x [0 ] != 'whitespace' , tokenizer )
215+ filtered = filter (lambda x : x [0 ] != self ._whitespace , tokenizer )
216+ # The spec calls this an ASCII format; in Python 2.x we could
217+ # just treat the strings and names as opaque bytes but let's
218+ # turn them into proper Unicode, and be lenient in case of high bytes.
219+ convert = lambda x : x .decode ('ascii' , errors = 'replace' )
209220 for token , value in filtered :
210- if token == b'name' and value .startswith (b'/' ):
211- key = value [1 :]
221+ if token is self . _name and value .startswith (b'/' ):
222+ key = convert ( value [1 :])
212223 token , value = next (filtered )
213- if token == b'name' :
224+ if token is self . _name :
214225 if value in (b'true' , b'false' ):
215226 value = value == b'true'
216227 else :
217- value = value .lstrip (b'/' )
218- elif token == b'string' :
219- value = value .lstrip (b'(' ).rstrip (b')' )
220- elif token == b'number' :
228+ value = convert ( value .lstrip (b'/' ) )
229+ elif token is self . _string :
230+ value = convert ( value .lstrip (b'(' ).rstrip (b')' ) )
231+ elif token is self . _number :
221232 if b'.' in value :
222233 value = float (value )
223234 else :
224235 value = int (value )
225236 else : # more complicated value such as an array
226237 value = None
227- if key != b 'FontInfo' and value is not None :
238+ if key != 'FontInfo' and value is not None :
228239 prop [key ] = value
229240
230241 # Fill in the various *Name properties
0 commit comments