-
Notifications
You must be signed in to change notification settings - Fork 3k
/
full_fidelity_lexer.ml
1681 lines (1487 loc) · 59.1 KB
/
full_fidelity_lexer.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(**
* Copyright (c) 2016, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the "hack" directory of this source tree.
*
*)
module TriviaKind = Full_fidelity_trivia_kind
module TokenKind = Full_fidelity_token_kind
module SourceText = Full_fidelity_source_text
module SyntaxError = Full_fidelity_syntax_error
module Lexer : sig
  (* Immutable lexer state. Every operation below returns a fresh record and
   * leaves its argument untouched. *)
  type t = {
    text : SourceText.t;
    start : int; (* Both start and offset are absolute offsets in the text. *)
    offset : int;
    errors : SyntaxError.t list
  }
  val make : SourceText.t -> t
  val start : t -> int
  val source : t -> SourceText.t
  val errors : t -> SyntaxError.t list
  val offset : t -> int
  val with_error : t -> string -> t
  val with_offset : t -> int -> t
  val with_offset_errors : t -> int -> SyntaxError.t list -> t
  val start_new_lexeme : t -> t
  val advance : t -> int -> t
  val with_start_offset : t -> int -> int -> t
end = struct
  (* A run of 100 null bytes that [make] appends to every source text.
   * NOTE(review): presumably this lets lookahead past the end of the input
   * read null bytes instead of failing; confirm against
   * SourceText.append_padding. *)
  let padding = String.make 100 '\x00'

  type t = {
    text : SourceText.t;
    start : int; (* Both start and offset are absolute offsets in the text. *)
    offset : int;
    errors : SyntaxError.t list
  }

  (* A lexer positioned at the beginning of the padded text, with no errors. *)
  let make text =
    { text = SourceText.append_padding text padding;
      start = 0;
      offset = 0;
      errors = [];
    }

  (* Field accessors. *)
  let start { start; _ } = start
  let source { text; _ } = text
  let errors { errors; _ } = errors
  let offset { offset; _ } = offset

  (* Record an error spanning the current lexeme, that is start .. offset. *)
  let with_error lexer message =
    let { start; offset; errors; _ } = lexer in
    let error = SyntaxError.make start offset message in
    { lexer with errors = error :: errors }

  (* Functional updates of individual fields. *)
  let with_offset lexer offset = { lexer with offset }
  let with_start_offset lexer start offset = { lexer with start; offset }
  let with_offset_errors lexer offset errors = { lexer with offset; errors }

  (* Begin a new lexeme at the current offset. *)
  let start_new_lexeme lexer = { lexer with start = lexer.offset }

  (* Move the offset by [index] characters relative to its current value. *)
  let advance lexer index = { lexer with offset = lexer.offset + index }
end
module WithToken(Token: Lexable_token_sig.LexableToken_S) = struct
(* Trivia module of the token implementation this lexer is instantiated with. *)
module Trivia = Token.Trivia
(* The lexer state is the record defined in Lexer; t is the conventional
 * alias for it. *)
type lexer = Lexer.t
type t = lexer
(* Unqualified aliases for the Lexer constructor, accessors and functional
 * updaters, so the rest of this functor can call them directly. *)
let make = Lexer.make
let start = Lexer.start
let source = Lexer.source
let errors = Lexer.errors
let offset = Lexer.offset
let with_error = Lexer.with_error
let with_offset = Lexer.with_offset
let start_new_lexeme = Lexer.start_new_lexeme
let advance = Lexer.advance
let with_offset_errors = Lexer.with_offset_errors
let with_start_offset = Lexer.with_start_offset
(* Alternative names for the two positions of the lexeme being scanned. *)
let start_offset = start
let end_offset = offset
(* Sentinel character (the null byte); local lookahead helpers below return it
 * when asked for a position past the end of the text. *)
let invalid = '\000'
(* A lexer over the empty source text. *)
let empty = make SourceText.empty
(* The result of SourceText.text applied to the source of the lexer. *)
let source_text_string (l : lexer) = SourceText.text (source l)
(* Which kind of interpolated string literal is currently being scanned. *)
type string_literal_kind =
(* Execution string, delimited by backticks. *)
| Literal_execution_string
(* Double-quoted string. *)
| Literal_double_quoted
(* Heredoc; the payload is the heredoc name, which is compared against the
 * closing identifier when looking for the tail. *)
| Literal_heredoc of string
(* Housekeeping *)
(* [peek_char lexer index] is the character [index] positions away from the
 * current offset. [index] may be negative: is_heredoc_tail looks one
 * character behind. Indexing is on the raw string, so an out-of-range
 * position raises. *)
let peek_char lexer index =
  let position = offset lexer + index in
  lexer.Lexer.text.SourceText.text.[position]
(* [peek_string lexer size] is the [size]-character substring of the raw text
 * starting at the current offset. *)
let peek_string lexer size =
  let contents = lexer.Lexer.text.SourceText.text in
  String.sub contents (offset lexer) size
(* True when the text at the current offset begins with [s]. *)
let match_string lexer s =
  let candidate = peek_string lexer (String.length s) in
  s = candidate
(* Build a syntax error carrying [msg] and spanning the current lexeme
 * (start .. offset) of [l]. The lexer itself is not modified. *)
let make_error_with_location (l : lexer) (msg : string) =
  let first = start l in
  let last = offset l in
  SyntaxError.make first last msg
(* Number of characters between the start of the lexeme and the offset. *)
let width lexer =
  let lexeme_start = start lexer in
  offset lexer - lexeme_start
(* The text of the lexeme scanned so far. *)
let current_text lexer =
  let text = source lexer in
  SourceText.sub text (start lexer) (width lexer)
(* [length] characters of source text beginning [relative_start] characters
 * after the start of the current lexeme. *)
let current_text_at lexer length relative_start =
  let absolute_start = start lexer + relative_start in
  SourceText.sub (source lexer) absolute_start length
(* True once the offset has reached (or passed) the length reported by
 * SourceText.length. *)
let at_end lexer =
  let total = SourceText.length (source lexer) in
  offset lexer >= total
(* Same test as at_end, but for an arbitrary absolute [index]. *)
let at_end_index lexer index =
  let total = SourceText.length (source lexer) in
  not (index < total)
(* Number of characters left between the offset and the end of the text;
 * never negative. *)
let remaining lexer =
  max 0 (SourceText.length (source lexer) - offset lexer)
(* Length of the source text of [l], as reported by SourceText.length. *)
let text_len (l : lexer) =
  l |> source |> SourceText.length
(* Character at ABSOLUTE position [i] of the source text (contrast with
 * peek_char, which is relative to the offset). *)
let peek (l : lexer) i =
  let text = source l in
  SourceText.get text i
(* Like peek, but yields [def] when the absolute position [i] is at or beyond
 * the end of the text. *)
let peek_def (l: lexer) i ~def =
  let text = source l in
  if i < SourceText.length text then SourceText.get text i else def
(* Character classification *)
(* Horizontal whitespace only: space or tab. *)
let is_whitespace_no_newline : char -> bool =
  fun ch -> ch = ' ' || ch = '\t'
(* Carriage return or line feed. *)
let is_newline ch =
  ch = '\r' || ch = '\n'
(* The digits 0 and 1. *)
let is_binary_digit ch =
  ch = '0' || ch = '1'
(* The digits 0 through 7. *)
let is_octal_digit ch =
  '0' <= ch && ch <= '7'
(* The digits 0 through 9. *)
let is_decimal_digit ch =
  '0' <= ch && ch <= '9'
(* Decimal digits plus the letters a-f in either case. *)
let is_hexadecimal_digit ch =
  let within lo hi = lo <= ch && ch <= hi in
  within '0' '9' || within 'a' 'f' || within 'A' 'F'
(* Characters that may begin a name: underscore, ASCII letters, and every
 * byte from 0x7f up to 0xff. *)
let is_name_nondigit ch =
  ch = '_'
  || ('a' <= ch && ch <= 'z')
  || ('A' <= ch && ch <= 'Z')
  || ch >= '\x7f'
(* Characters that may continue a name: everything that may begin one, plus
 * the decimal digits. *)
let is_name_letter ch =
  ch = '_'
  || ('0' <= ch && ch <= '9')
  || ('a' <= ch && ch <= 'z')
  || ('A' <= ch && ch <= 'Z')
  || ch >= '\x7f'
(* Lexing *)
(* Starting at the current offset of [l], find the first absolute position
 * whose character does not satisfy [p], or the end of the text if every
 * remaining character does. Only the position is returned. *)
let skip_while_to_offset l p =
  let limit = SourceText.length (source l) in
  let position = ref (offset l) in
  while !position < limit && p (peek l !position) do
    incr position
  done;
  !position
(* Return a lexer whose offset has been moved forward past every consecutive
 * character satisfying the predicate [p]. *)
let skip_while (l : lexer) (p : char -> bool) =
  let stop = skip_while_to_offset l p in
  with_offset l stop
(* String counterpart of skip_while_to_offset: starting at index [i] of [str],
 * return the first index whose character fails [p], or the length of [str].
 * An [i] already at or past the end is returned unchanged. *)
let str_skip_while ~str ~i ~p =
  let len = String.length str in
  let pos = ref i in
  while !pos < len && p str.[!pos] do
    incr pos
  done;
  !pos
(* Advance past spaces and tabs; newlines are not skipped. *)
let skip_whitespace (l : lexer) =
  let is_blank = is_whitespace_no_newline in
  skip_while l is_blank
(* Index of the first character of [str] at or after [i] that is neither a
 * space nor a tab. *)
let str_skip_whitespace ~str ~i =
  let p = is_whitespace_no_newline in
  str_skip_while ~str ~i ~p
(* Negation of is_newline, usable directly as a skip_while predicate. *)
let not_newline ch =
  if is_newline ch then false else true
(* Advance up to, but not past, the next newline character (or to the end of
 * the text when there is none). *)
let skip_to_end_of_line (l : lexer) =
  let stays_on_line = not_newline in
  skip_while l stays_on_line
(* Advance until the offset sits on a newline character, on the question mark
 * of a question-mark/greater-than pair, or at the end of the text, whichever
 * comes first. *)
let skip_to_end_of_line_or_end_tag (l : lexer) =
  let limit = text_len l in
  (* Bounds-checked lookahead; past the end it yields the null sentinel. *)
  let char_at i = if i < limit then peek l i else invalid in
  let rec scan i =
    if i >= limit then i
    else
      let ch = peek l i in
      if is_newline ch || (ch = '?' && char_at (succ i) = '>') then i
      else scan (succ i) in
  with_offset l (scan (offset l))
(* Advance past every consecutive name-continuation character. *)
let skip_name_end (l : lexer) =
  let continues_name = is_name_letter in
  skip_while l continues_name
(* Consume exactly one line terminator at the offset, if there is one: a line
 * feed, a carriage return, or a carriage return followed by a line feed.
 * Otherwise the lexer is returned unchanged. *)
let skip_end_of_line lexer =
  let ch = peek_char lexer 0 in
  if ch = '\n' then advance lexer 1
  else if ch = '\r' then
    advance lexer (if peek_char lexer 1 = '\n' then 2 else 1)
  else lexer
(* Scan a name starting at the offset. The caller must guarantee that the
 * first character may begin a name; this is asserted. *)
let scan_name_impl lexer =
  let first = peek_char lexer 0 in
  assert (is_name_nondigit first);
  let after_first = advance lexer 1 in
  skip_name_end after_first
(* Scan a name and tag it as a Name token. *)
let scan_name lexer =
  (scan_name_impl lexer, TokenKind.Name)
(* Scan a variable: a dollar sign (asserted) immediately followed by a
 * name. *)
let scan_variable lexer =
  let sigil = peek_char lexer 0 in
  assert ('$' = sigil);
  let after_sigil = advance lexer 1 in
  (scan_name_impl after_sigil, TokenKind.Variable)
(* Advance past a run of digits that may contain single underscore
 * separators.
 *
 * [accepted_char] decides which characters count as digits. An underscore is
 * consumed only when the character right after it is an accepted digit, so a
 * trailing underscore and doubled underscores end the run.
 *
 * Fix: the separator test previously compared against a space character,
 * which contradicted the name of this function and let two numbers separated
 * by a blank lex as a single literal while rejecting 1_000. *)
let scan_with_underscores (l : lexer) accepted_char =
  let n = text_len l in
  (* Bounds-checked lookahead; past the end it yields the null sentinel. *)
  let peek_def i = if i < n then peek l i else invalid in
  let rec aux i =
    if i >= n then i
    else
      let ch = peek l i in
      if accepted_char ch then aux (succ i)
      else if ch = '_' && accepted_char (peek_def (succ i)) then
        (* Skip the separator together with the digit that follows it. *)
        aux (2 + i)
      else i in
  with_offset l (aux (offset l))
(* Digit-run scanners. The plain variants consume consecutive digits of the
 * given base; the _with_underscores variants delegate to
 * scan_with_underscores with the same digit predicate. *)
let scan_decimal_digits (l : lexer) =
  let accepts = is_decimal_digit in
  skip_while l accepts

let scan_decimal_digits_with_underscores lexer =
  let accepts = is_decimal_digit in
  scan_with_underscores lexer accepts

let scan_octal_digits (l : lexer) =
  let accepts = is_octal_digit in
  skip_while l accepts

let scan_octal_digits_with_underscores (l : lexer) =
  let accepts = is_octal_digit in
  scan_with_underscores l accepts

let scan_binary_digits_with_underscores (l : lexer) =
  let accepts = is_binary_digit in
  scan_with_underscores l accepts

let scan_hexadecimal_digits (l : lexer) =
  let accepts = is_hexadecimal_digit in
  skip_while l accepts

let scan_hexadecimal_digits_with_underscores (l : lexer) =
  let accepts = is_hexadecimal_digit in
  scan_with_underscores l accepts
(* Scan the digits of a hexadecimal literal; the lexer is expected to sit
 * just after the 0x prefix. If no hex digit follows, error0001 is recorded
 * and nothing is consumed. The token kind is HexadecimalLiteral either
 * way. *)
let scan_hex_literal lexer =
  let scanned =
    if is_hexadecimal_digit (peek_char lexer 0) then
      scan_hexadecimal_digits_with_underscores lexer
    else
      with_error lexer SyntaxError.error0001 in
  (scanned, TokenKind.HexadecimalLiteral)
(* Scan the digits of a binary literal; the lexer is expected to sit just
 * after the 0b prefix. If no binary digit follows, error0002 is recorded and
 * nothing is consumed. The token kind is BinaryLiteral either way. *)
let scan_binary_literal lexer =
  let scanned =
    if is_binary_digit (peek_char lexer 0) then
      scan_binary_digits_with_underscores lexer
    else
      with_error lexer SyntaxError.error0002 in
  (scanned, TokenKind.BinaryLiteral)
(* Scan the exponent of a floating literal. The lexer sits on the exponent
 * marker; an optional sign may follow it. At least one decimal digit is
 * required afterwards, otherwise error0003 is recorded. *)
let scan_exponent lexer =
  (* Length of the marker plus the optional sign. *)
  let marker_length =
    match peek_char lexer 1 with
    | '+' | '-' -> 2
    | _ -> 1 in
  let lexer = advance lexer marker_length in
  let lexer =
    if is_decimal_digit (peek_char lexer 0) then scan_decimal_digits lexer
    else with_error lexer SyntaxError.error0003 in
  (lexer, TokenKind.FloatingLiteral)
(* Scan the fractional part of a floating literal. The lexer sits on the
 * decimal point; the digits after it (possibly none) are consumed, followed
 * by an exponent if one is present. *)
let scan_after_decimal_point lexer =
  let lexer = scan_decimal_digits (advance lexer 1) in
  match peek_char lexer 0 with
  | 'e' | 'E' -> scan_exponent lexer
  | _ -> (lexer, TokenKind.FloatingLiteral)
(* Scan a numeric literal that begins with a zero.
 *
 * We have an irritating ambiguity here. 09 is not a legal octal or floating
 * literal, but 09e1 and 09.1 are. So the digits after the zero are scanned
 * twice, once as octal and once as decimal, and the two results compared. *)
let scan_octal_or_float lexer =
  (* Step over the leading zero. *)
  let lexer = advance lexer 1 in
  (* Decide what follows a run of digits that ended at [digits_lexer]: an
     exponent or a decimal point makes the literal a float. Otherwise it is
     an integer, and only then do we rescan from just after the leading zero
     with [scan_integer], which allows underscores. Deferring the underscore
     scan this way keeps underscores out of floats. *)
  let finish digits_lexer scan_integer =
    match peek_char digits_lexer 0 with
    | 'e' | 'E' -> scan_exponent digits_lexer
    | '.' -> scan_after_decimal_point digits_lexer
    | _ -> (scan_integer lexer, TokenKind.OctalLiteral) in
  match peek_char lexer 0 with
  | '.' -> (* 0. *) scan_after_decimal_point lexer
  | 'e' | 'E' -> (* 0e *) scan_exponent lexer
  | '0' .. '9' ->
    (* 05 *)
    let lexer_oct = scan_octal_digits lexer in
    let lexer_dec = scan_decimal_digits lexer in
    if width lexer_oct = width lexer_dec then
      (* Only octal digits. Could be an octal literal, or could be a
         float. *)
      finish lexer_oct scan_octal_digits_with_underscores
    else
      (* We had decimal digits following a leading zero; this is either a
         float literal or an octal to be truncated at the first non-octal
         digit. *)
      finish lexer_dec scan_decimal_digits_with_underscores
  | _ -> (* 0 *) (lexer, TokenKind.OctalLiteral)
(* Scan a numeric literal that begins with a non-zero digit. The plain digit
 * run decides whether this is a float (decimal point or exponent follows);
 * only an integer literal is rescanned with underscores allowed. *)
let scan_decimal_or_float lexer =
  let plain = scan_decimal_digits lexer in
  match peek_char plain 0 with
  | '.' -> (* 123. *) scan_after_decimal_point plain
  | 'e' | 'E' -> (* 123e *) scan_exponent plain
  | _ ->
    (* 123 *)
    (scan_decimal_digits_with_underscores lexer, TokenKind.DecimalLiteral)
(* Scan a single-quoted string literal; the lexer sits on the opening quote.
 *
 * Reports error0006 if a null character was seen inside the literal and
 * error0012 if the text ended before the closing quote. *)
let scan_single_quote_string_literal (l : lexer) =
  (* TODO: What about newlines embedded? *)
  (* SPEC:
    single-quoted-string-literal::
      b-opt  ' sq-char-sequence-opt  '

    TODO: What is this b-opt?  We don't lex an optional 'b' before a literal.

    sq-char-sequence::
      sq-char
      sq-char-sequence   sq-char

    sq-char::
      sq-escape-sequence
      \opt   any character except single-quote (') or backslash (\)

    sq-escape-sequence:: one of
      \'  \\
  *)
  let text = source l in
  let n = SourceText.length text in
  (* Walk forward from position [i]. The result is the new offset together
     with two flags: whether a null character was seen, and whether the end
     of the text was reached without a closing quote.
     NOTE(review): on running off the end the new offset is n - 1 rather
     than n; confirm that this is intended. *)
  let rec stepper i saw_null =
    if i >= n then (n - 1, saw_null, true)
    else
      match SourceText.get text i with
      | '\000' -> stepper (1 + i) true
      | '\\' -> stepper (2 + i) saw_null (* skip the escaped character too *)
      | '\'' -> (1 + i, saw_null, false)
      | _ -> stepper (1 + i) saw_null in
  let (new_offset, saw_null, unterminated) = stepper (1 + offset l) false in
  let err msg = make_error_with_location l msg in
  (* error0006, when present, goes in front of error0012. *)
  let new_errors = errors l in
  let new_errors =
    if unterminated then err SyntaxError.error0012 :: new_errors
    else new_errors in
  let new_errors =
    if saw_null then err SyntaxError.error0006 :: new_errors
    else new_errors in
  let res = with_offset_errors l new_offset new_errors in
  (res, TokenKind.SingleQuotedStringLiteral)
(* Skip a hexadecimal escape inside a string. The lexer sits on the
 * backslash; the backslash, the x, and up to two hex digits are consumed. *)
let scan_hexadecimal_escape lexer =
  let first_is_hex = is_hexadecimal_digit (peek_char lexer 2) in
  let second_is_hex = is_hexadecimal_digit (peek_char lexer 3) in
  match first_is_hex, second_is_hex with
  | false, _ ->
    (* TODO: Consider producing an error for a malformed hex escape *)
    (* let lexer = with_error lexer SyntaxError.error0005 in *)
    advance lexer 2
  | true, false ->
    (* let lexer = with_error lexer SyntaxError.error0005 in *)
    advance lexer 3
  | true, true ->
    advance lexer 4
(* Skip a unicode escape inside a string. At present the lexer is pointing
 * at \u.
 *
 * Returns the lexer advanced past as much of the escape as could be
 * recognised; no errors are recorded for malformed escapes.
 *
 * Change: the closing-brace test used the physical inequality operator on
 * characters; it now uses structural comparison, which is the idiomatic
 * operator for values. *)
let scan_unicode_escape lexer =
  if peek_char lexer 2 <> '{' then
    (* We have a malformed unicode escape sequence.  Bail out. *)
    (* TODO: Consider producing a warning for a malformed unicode escape. *)
    advance lexer 2
  else if peek_char lexer 3 = '$' then
    (* We have a malformed unicode escape that contains a possible embedded
       expression. Eat the \u and keep on processing the embedded
       expression. *)
    (* TODO: Consider producing a warning for a malformed unicode escape. *)
    advance lexer 2
  else
    (* We have a possibly well-formed escape sequence, and at least we know
       that it is not an embedded expression. *)
    (* TODO: Consider producing an error if the digits are out of range
       of legal Unicode characters. *)
    (* TODO: Consider producing an error if there are no digits. *)
    (* Skip over the slash, u and brace, and start lexing the number. *)
    let lexer = scan_hexadecimal_digits (advance lexer 3) in
    if peek_char lexer 0 = '}' then
      advance lexer 1
    else
      (* TODO: Consider producing a warning for a malformed unicode escape. *)
      lexer
(* Inside a double-quote-like string, advance past every character that needs
 * no special handling. Scanning stops at a null, a backslash, a dollar, an
 * opening brace, a bracket, a minus, a decimal digit, the delimiter
 * [start_char], or any character that may begin a name. *)
let skip_uninteresting_double_quote_like_string_characters (l : lexer) start_char =
  let is_interesting ch =
    match ch with
    | '\000' | '\\' | '$' | '{' | '[' | ']' | '-'
    | '0' .. '9' -> true
    | _ -> ch = start_char || is_name_nondigit ch in
  skip_while l (fun ch -> not (is_interesting ch))
(* Scan an integer literal that appears inside an interpolated string. A zero
 * followed by x or b starts a hexadecimal or binary literal; everything else
 * is scanned as a decimal literal. *)
let scan_integer_literal_in_string lexer =
  let decimal () =
    (scan_decimal_digits_with_underscores lexer, TokenKind.DecimalLiteral) in
  match peek_char lexer 0 with
  | '0' ->
    begin match peek_char lexer 1 with
    | 'x' | 'X' -> scan_hex_literal (advance lexer 2)
    | 'b' | 'B' -> scan_binary_literal (advance lexer 2)
    | _ ->
      (* An integer literal starting with 0 in a string will actually
         always be treated as a string index in HHVM, and not as an octal.
         In such a case, HHVM actually scans all decimal digits to create the
         token. TODO: we may want to change this behavior to something more
         sensible *)
      decimal ()
    end
  | _ -> decimal ()
(* Scans double quoted or execution string literals from their opening
 * delimiter. The two kinds share the same rules for interpreting content;
 * [start_char] (a backtick for execution strings) selects which token kinds
 * are produced.
 *
 * If the whole literal contains nothing that needs further lexing, the
 * complete literal token is returned. Otherwise scanning stops at the first
 * interesting character and a Head token is returned. *)
let scan_double_quote_like_string_literal_from_start lexer start_char =
  let (literal_token_kind, head_token_kind) =
    if start_char = '`' then
      (TokenKind.ExecutionStringLiteral,
       TokenKind.ExecutionStringLiteralHead)
    else
      (TokenKind.DoubleQuotedStringLiteral,
       TokenKind.DoubleQuotedStringLiteralHead) in
  let rec aux lexer =
    (* If there's nothing interesting in this double-quoted string then
       we can just hand it back as-is. *)
    let lexer =
      skip_uninteresting_double_quote_like_string_characters lexer start_char in
    match peek_char lexer 0 with
    | '\000' when at_end lexer ->
      (* The string is unterminated: give an error. *)
      (with_error lexer SyntaxError.error0012, literal_token_kind)
    | '\000' ->
      (* An embedded zero character: give an error and recurse; we might
         be able to make more progress. *)
      aux (advance (with_error lexer SyntaxError.error0006) 1)
    | '`' | '"' ->
      (* We made it to the end without finding a special character. *)
      (advance lexer 1, literal_token_kind)
    | _ ->
      (* We've found a backslash, dollar or brace. *)
      (lexer, head_token_kind) in
  (* Step over the opening delimiter first. *)
  aux (advance lexer 1)
(* Does the text at the offset form the closing identifier of the heredoc
 * called [name]? *)
let is_heredoc_tail lexer name =
  (* A heredoc tail is the identifier immediately preceded by a newline
     and immediately followed by an optional semi and then a newline.

     Note that the newline and optional semi are not part of the literal;
     the literal's lexeme ends at the end of the name. Either there is
     no trivia and the next token is a semi-with-trailing-newline, or
     the trailing trivia is a newline.

     This odd rule is to ensure that both
     $x = <<<HERE
     something
     HERE;

     and

     $x = <<<HERE
     something
     HERE
     . "something else";

     are legal.
  *)
  is_newline (peek_char lexer (-1)) &&
  begin
    let len = String.length name in
    let ch0 = peek_char lexer len in
    let ch1 = peek_char lexer (len + 1) in
    let followed_by_line_end =
      is_newline ch0 || (ch0 = ';' && is_newline ch1) in
    followed_by_line_end && peek_string lexer len = name
  end
(* The Tail token kind that closes a literal of the given kind. *)
let get_tail_token_kind = function
  | Literal_execution_string -> TokenKind.ExecutionStringLiteralTail
  | Literal_double_quoted -> TokenKind.DoubleQuotedStringLiteralTail
  | Literal_heredoc _ -> TokenKind.HeredocStringLiteralTail
(* Token kind for a double-quote character met inside a literal: it closes a
 * double-quoted string, and is ordinary body text in every other kind. *)
let get_string_literal_body_or_double_quoted_tail literal_kind =
  match literal_kind with
  | Literal_double_quoted -> TokenKind.DoubleQuotedStringLiteralTail
  | Literal_execution_string | Literal_heredoc _ -> TokenKind.StringLiteralBody
(* Scan the next token inside an interpolated string literal of kind
 * [literal_kind], after its Head token has already been produced. Returns
 * the advanced lexer and the kind of the token found: a piece of body text,
 * one of the tokens that make up an embedded expression, or the Tail. *)
let scan_string_literal_in_progress lexer literal_kind =
  let start_char =
    match literal_kind with
    | Literal_execution_string -> '`'
    | Literal_double_quoted | Literal_heredoc _ -> '"' in
  (* Swallow the run of uninteresting characters at the offset of [lexer] and
     report everything scanned so far as body text. *)
  let skip_body lexer =
    let lexer =
      skip_uninteresting_double_quote_like_string_characters
        lexer start_char in
    (lexer, TokenKind.StringLiteralBody) in
  (* Same, after first consuming [count] characters. *)
  let body_after lexer count = skip_body (advance lexer count) in
  let ch0 = peek_char lexer 0 in
  if is_name_nondigit ch0 then
    (* A name: either the closing identifier of a heredoc or a plain name. *)
    let kind =
      match literal_kind with
      | Literal_heredoc name when is_heredoc_tail lexer name ->
        TokenKind.HeredocStringLiteralTail
      | _ -> TokenKind.Name in
    (scan_name_impl lexer, kind)
  else
    match ch0 with
    | '\000' ->
      if at_end lexer then
        (* Unterminated literal. *)
        (with_error lexer SyntaxError.error0012,
         get_tail_token_kind literal_kind)
      else
        (* Embedded null: report it and carry on with the body. *)
        body_after (with_error lexer SyntaxError.error0006) 1
    | '`' when literal_kind = Literal_execution_string ->
      (* A backtick terminates an execution string. *)
      (advance lexer 1, TokenKind.ExecutionStringLiteralTail)
    | '"' ->
      (advance lexer 1,
       get_string_literal_body_or_double_quoted_tail literal_kind)
    | '$' ->
      if is_name_nondigit (peek_char lexer 1) then scan_variable lexer
      else (advance lexer 1, TokenKind.Dollar)
    | '{' -> (advance lexer 1, TokenKind.LeftBrace)
    | '\\' ->
      begin match peek_char lexer 1 with
      (* In these cases we just skip the escape sequence and
         keep on scanning for special characters. *)
      | '\\' | '"' | '$' | 'e' | 'f' | 'n' | 'r' | 't' | 'v' | '`'
      (* Same in these cases; there might be more octal characters following
         but if there are, we'll just eat them as normal characters. *)
      | '0' .. '7' ->
        body_after lexer 2
      | 'x' -> skip_body (scan_hexadecimal_escape lexer)
      | 'u' -> skip_body (scan_unicode_escape lexer)
      | '{' ->
        (* The rules for escaping open braces in Hack are bizarre. Suppose we
           have
           $x = 123;
           $y = 456;
           $z = "\{$x,$y\}";
           What is the value of $z?  Naively you would think that the backslash
           escapes the braces, and the variables are embedded, so {123,456}. But
           that's not what happens. Yes, the backslash makes the brace no longer
           the opening brace of an expression. But the backslash is still part
           of the string!  This is the string \{123,456\}.
           TODO: We might want to fix this because this is very strange. *)
        (* Eat the backslash and the brace. *)
        (advance lexer 2, TokenKind.StringLiteralBody)
      | _ ->
        (* TODO: A backslash followed by something other than an escape sequence
           is legal in hack, and treated as though it was just the backslash
           and the character. However we might consider making this a warning.
           It is particularly egregious when we have something like:
           $x = "abcdef \
                 ghi";
           The author of the code likely means the backslash to mean line
           continuation but in fact it just means to put a backslash and newline
           in the string.
        *)
        body_after lexer 1
      end
    | '[' -> (advance lexer 1, TokenKind.LeftBracket)
    | ']' -> (advance lexer 1, TokenKind.RightBracket)
    | '-' ->
      if peek_char lexer 1 = '>' then
        (advance lexer 2, TokenKind.MinusGreaterThan)
      else
        (* Nothing interesting here.  Skip it and find the next
           interesting character. *)
        body_after lexer 1
    | '0' .. '9' ->
      let (lexer1, _) as literal = scan_integer_literal_in_string lexer in
      (* Physical equality on purpose: with_error always allocates a new list
         cell, so the error list is the very same value exactly when the scan
         recorded no error. *)
      if errors lexer == errors lexer1 then literal else
        (* If we failed to scan a literal, do not interpret the literal *)
        (with_offset lexer (offset lexer1), TokenKind.StringLiteralBody)
    | _ ->
      (* Nothing interesting here.  Skip it and find the next
         interesting character. *)
      body_after lexer 1
(* A heredoc string literal has the form
header
optional body
trailer
The header is:
<<< (optional whitespace) name (no whitespace) (newline)
The optional body is:
any characters whatsoever including newlines (newline)
The trailer is:
(no whitespace) name (no whitespace) (optional semi) (no whitespace) (newline)
The names must be identical. The trailing newline must be present; the semi is optional.
The body is any and all characters, up to the first line that exactly matches
the trailer.
The body may contain embedded expressions.
A nowdoc string literal has the same form except that the first name is
enclosed in single quotes, and it may not contain embedded expressions.
*)
(* Scan the bare identifier of a heredoc or nowdoc header. Returns the lexer
 * positioned after the identifier, together with its text. When the offset
 * is not on a name start character, error0008 is recorded and the name is
 * the empty string. *)
let scan_docstring_name_actual lexer =
  if is_name_nondigit (peek_char lexer 0) then
    let end_lexer = skip_name_end (advance lexer 1) in
    let name_start = offset lexer in
    let name_length = offset end_lexer - name_start in
    (end_lexer, SourceText.sub (source lexer) name_start name_length)
  else
    (with_error lexer SyntaxError.error0008, "")
(* Scan the name part of a heredoc or nowdoc header, after skipping leading
 * spaces and tabs. Returns the lexer, the name, and the literal kind: a name
 * in single quotes denotes a nowdoc; a bare name or a name in double quotes
 * denotes a heredoc. A missing closing quote is reported as an error. *)
let scan_docstring_name lexer =
  let lexer = skip_whitespace lexer in
  (* Consume the closing [quote], or record [missing_quote_error]. *)
  let expect_closing lexer quote missing_quote_error =
    if peek_char lexer 0 = quote then advance lexer 1
    else with_error lexer missing_quote_error in
  match peek_char lexer 0 with
  | '\'' ->
    let (lexer, name) = scan_docstring_name_actual (advance lexer 1) in
    let lexer = expect_closing lexer '\'' SyntaxError.error0010 in
    (lexer, name, TokenKind.NowdocStringLiteral)
  | '"' ->
    (* Starting with PHP 5.3.0, the opening Heredoc identifier
       may optionally be enclosed in double quotes. *)
    let (lexer, name) = scan_docstring_name_actual (advance lexer 1) in
    let lexer = expect_closing lexer '"' SyntaxError.missing_double_quote in
    (lexer, name, TokenKind.HeredocStringLiteral)
  | _ ->
    let (lexer, name) = scan_docstring_name_actual lexer in
    (lexer, name, TokenKind.HeredocStringLiteral)
(* Scan the header line of a heredoc or nowdoc: the opening marker, the name,
 * and the rest of the line including its terminator. error0011 is recorded
 * when the name is not immediately followed by a newline. Returns the lexer
 * positioned at the start of the next line, the name, and the literal
 * kind. *)
let scan_docstring_header lexer =
  (* Skip 3 for <<< or 4 for b<<< *)
  let prefix_length =
    match peek_char lexer 0 with
    | 'b' -> 4
    | _ -> 3 in
  let (lexer, name, kind) = scan_docstring_name (advance lexer prefix_length) in
  let lexer =
    if is_newline (peek_char lexer 0) then lexer
    else with_error lexer SyntaxError.error0011 in
  (lexer |> skip_to_end_of_line |> skip_end_of_line, name, kind)
(* Scan the body of a heredoc or nowdoc line by line until a line begins with
 * [name] followed by a newline, or by a semi and a newline. The returned
 * lexer sits just after the closing name. If the text ends first, error0011
 * is recorded. *)
let scan_docstring_remainder name lexer =
  let len = String.length name in
  (* Is the lexer at the start of the trailer line? *)
  let at_trailer lexer =
    let ch0 = peek_char lexer len in
    let ch1 = peek_char lexer (len + 1) in
    (is_newline ch0 || (ch0 = ';' && is_newline ch1))
    && peek_string lexer len = name in
  let rec aux lexer =
    if at_trailer lexer then
      advance lexer len
    else
      let lexer = skip_to_end_of_line lexer in
      if is_newline (peek_char lexer 0) then
        aux (skip_end_of_line lexer)
      else
        (* If we got here then we ran off the end of the file without
           finding a newline. Just bail. *)
        with_error lexer SyntaxError.error0011 in
  aux lexer
(* Scan a complete heredoc or nowdoc literal: header first, then everything
 * up to and including the closing name. *)
let scan_docstring_literal lexer =
  let (after_header, name, kind) = scan_docstring_header lexer in
  (scan_docstring_remainder name after_header, kind)
(* An XHP label has the same grammar as a Hack name; only the advanced lexer
 * is returned. *)
let scan_xhp_label lexer =
  fst (scan_name lexer)
(* Scan an XHP element name.
 *
 * An XHP element name is a sequence of one or more XHP labels each separated
 * by a single : or -. Note that it is possible for an XHP element name to be
 * followed immediately by a : or - that is the next token, so if we find
 * a : or - not followed by a label, we need to terminate the token.
 *
 * When [attribute] is true, a colon is not accepted as a separator.
 * NOTE(review): the recursive call does not pass [attribute] on, so the
 * restriction only applies to the first separator; confirm whether that is
 * intended. *)
let rec scan_xhp_element_name ?(attribute=false) lexer =
  let lexer = scan_xhp_label lexer in
  let ch0 = peek_char lexer 0 in
  let ch1 = peek_char lexer 1 in
  let is_separator =
    match ch0 with
    | '-' -> true
    | ':' -> not attribute
    | _ -> false in
  if is_separator && is_name_nondigit ch1 then
    scan_xhp_element_name (advance lexer 1)
  else
    (lexer, TokenKind.XHPElementName)
(* Is the next token we are going to lex a possible xhp class name, that is,
 * a colon immediately followed by a character that may begin a name? *)
let is_xhp_class_name lexer =
  match peek_char lexer 0 with
  | ':' -> is_name_nondigit (peek_char lexer 1)
  | _ -> false
(* An XHP class name is a colon followed by an xhp name. If the text at the
 * offset does not have that shape, error0008 is recorded and a one-character
 * error token is produced. *)
let scan_xhp_class_name lexer =
  if not (is_xhp_class_name lexer) then
    (advance (with_error lexer SyntaxError.error0008) 1, TokenKind.ErrorToken)
  else
    let (lexer, _) = scan_xhp_element_name (advance lexer 1) in
    (lexer, TokenKind.XHPClassName)
(* XHP string literals are scanned by simply looking for the closing double
 * quote; there are no escapes, and embedded newlines are legal. The lexer
 * sits on the opening quote. An embedded null is reported as error0006 and
 * an unterminated literal as error0012. *)
let scan_xhp_string_literal lexer =
  (* [index] is relative to the offset of [lexer]. *)
  let rec aux lexer index =
    match peek_char lexer index with
    | '"' -> (advance lexer (index + 1), TokenKind.XHPStringLiteral)
    | '\000' ->
      let lexer = advance lexer index in
      if at_end lexer then
        (with_error lexer SyntaxError.error0012, TokenKind.XHPStringLiteral)
      else
        aux (with_error lexer SyntaxError.error0006) 1
    | _ -> aux lexer (index + 1) in
  aux lexer 1
(* Scan one token inside an XHP tag. Note that this does not scan an XHP
 * body. Any character that cannot start a tag token yields error0006 and a
 * one-character error token. *)
let scan_xhp_token lexer =
  (* TODO: HHVM requires that there be no trivia between < and name in an
     opening tag, but does allow trivia between </ and name in a closing tag.
     Consider allowing trivia in an opening tag. *)
  let single kind = (advance lexer 1, kind) in
  let error_token () =
    (advance (with_error lexer SyntaxError.error0006) 1,
     TokenKind.ErrorToken) in
  match peek_char lexer 0 with
  | ch0 when ch0 = invalid && at_end lexer -> (lexer, TokenKind.EndOfFile)
  | ch0 when is_name_nondigit ch0 -> scan_xhp_element_name lexer
  | '{' -> single TokenKind.LeftBrace
  | '}' -> single TokenKind.RightBrace
  | '=' -> single TokenKind.Equal
  | '<' ->
    if peek_char lexer 1 = '/' then
      (advance lexer 2, TokenKind.LessThanSlash)
    else
      single TokenKind.LessThan
  | '"' -> scan_xhp_string_literal lexer
  | '/' ->
    if peek_char lexer 1 = '>' then
      (advance lexer 2, TokenKind.SlashGreaterThan)
    else
      error_token ()
  | '>' -> single TokenKind.GreaterThan
  | _ -> error_token ()
(* Scan an XHP comment. Scanning begins four characters past the offset
 * (the caller has already matched the opening marker) and continues through
 * the closing dash-dash-greater-than. A null character met before the
 * closing marker stops the scan with error0014. *)
let scan_xhp_comment lexer =
  (* [index] is relative to the offset of [lexer]. *)
  let rec aux lexer index =
    let ch0 = peek_char lexer index in
    let ch1 = peek_char lexer (index + 1) in
    let ch2 = peek_char lexer (index + 2) in
    if ch0 = '\000' then
      with_error (advance lexer index) SyntaxError.error0014
    else if ch0 = '-' && ch1 = '-' && ch2 = '>' then
      advance lexer (index + 3)
    else
      aux lexer (index + 1) in
  aux lexer 4
(* Scan one token of an XHP body: a brace, the start of a tag, a comment, or
 * a run of body text. *)
let scan_xhp_body lexer =
  (* Naively you might think that an XHP body is just a bunch of characters,
     terminated by an embedded { } expression or a tag.  However, whitespace
     and newlines are relevant in XHP bodies because they are "soft".
     That is, any section of contiguous trivia has the same semantics as a
     single space or newline -- just as in HTML.

     Obviously this is of relevance to code formatters.

     Therefore we detect whitespace and newlines within XHP bodies and treat
     it as trivia surrounding the tokens within the body.

     TODO: Is this also true of whitespace within XHP comments? If so then
     we need to make XHP comments a sequence of tokens, rather than a
     single token as they are now.
  *)
  (* Consume body text up to the next whitespace, brace or tag start.
     [index] is relative to the offset of [lexer]. *)
  let rec aux lexer index =
    match peek_char lexer index with
    | '\t' | ' ' | '\r' | '\n' | '{' | '}' | '<' -> advance lexer index
    | '\000' ->
      let lexer = advance lexer index in
      if at_end lexer then
        with_error lexer SyntaxError.error0013
      else
        aux (with_error lexer SyntaxError.error0006) 1
    | _ -> aux lexer (index + 1) in
  match peek_char lexer 0 with
  | '\000' when at_end lexer -> (lexer, TokenKind.EndOfFile)
  | '{' -> (advance lexer 1, TokenKind.LeftBrace)
  | '}' -> (advance lexer 1, TokenKind.RightBrace)
  | '<' ->
    let ch1 = peek_char lexer 1 in
    let ch2 = peek_char lexer 2 in
    let ch3 = peek_char lexer 3 in
    if ch1 = '/' then
      (advance lexer 2, TokenKind.LessThanSlash)
    else if ch1 = '!' && ch2 = '-' && ch3 = '-' then
      (scan_xhp_comment lexer, TokenKind.XHPComment)
    else
      (advance lexer 1, TokenKind.LessThan)
  | _ -> (aux lexer 0, TokenKind.XHPBody)
(* Scan a token that begins with a dollar sign: a variable, a lone dollar, or
 * the double-dollar token. *)
let scan_dollar_token lexer =
  (*
    We have a problem here.  We wish to be able to lexically analyze both
    PHP and Hack, but the introduction of $$ to Hack makes them incompatible.
    "$$x" and "$$ $x" are legal in PHP, but illegal in Hack.
    The rule in PHP seems to be that $ is a prefix operator, it is a token,
    it can be followed by trivia, but the next token has to be another $
    operator, a variable $x, or a {.

    Here's a reasonable compromise.  (TODO: Review this decision.)

    $$x lexes as $ $x
    $$$x lexes as $ $ $x
    and so on.

    $$ followed by anything other than a name or a $ lexes as $$.

    This means that a PHP program which contains "$$ $x" lexes differently
    and will fail at parse time, but I'm willing to live with that.

    This means that lexing a Hack program which contains
    "$x |> $$instanceof Foo" produces an error as well.

    If these decisions are unacceptable then we will need to make the lexer
    be aware of whether it is lexing PHP or Hack; thus far we have not had
    to make this distinction.
  *)
  (* We are already at $. *)
  let ch1 = peek_char lexer 1 in
  if ch1 = '$' then
    let ch2 = peek_char lexer 2 in
    let starts_nested = ch2 = '$' || ch2 = '{' || is_name_nondigit ch2 in
    if starts_nested then
      (advance lexer 1, TokenKind.Dollar) (* $$x or $$$*)
    else
      (advance lexer 2, TokenKind.DollarDollar) (* $$ *)
  else if is_name_nondigit ch1 then
    scan_variable lexer (* $x *)
  else
    (advance lexer 1, TokenKind.Dollar) (* $ *)
let rec scan_token_impl : bool -> lexer -> (lexer * TokenKind.t) =
fun in_type lexer ->
let ch0 = peek_char lexer 0 in
match ch0 with
| '[' -> (advance lexer 1, TokenKind.LeftBracket)
| ']' -> (advance lexer 1, TokenKind.RightBracket)
| '(' -> (advance lexer 1, TokenKind.LeftParen)
| ')' -> (advance lexer 1, TokenKind.RightParen)
| '{' -> (advance lexer 1, TokenKind.LeftBrace)
| '}' -> (advance lexer 1, TokenKind.RightBrace)
| '.' -> begin