This repository has been archived by the owner on Jun 4, 2019. It is now read-only.
/
lexer_php.mll.nw
2070 lines (1723 loc) · 62.5 KB
/
lexer_php.mll.nw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
% also contains part of parse_php.ml, the parts related to lexing
% and parts of token_helpers.mli
\section{Overview}
The code in [[lexer_php.mll]] is mostly a copy paste of the Flex
scanner in the PHP Zend source code (included in
[[pfff/docs/official-grammar/5.2.11/zend_language_scanner.l]]) adapted
for [[ocamllex]]:
%TODO:I kept some of the original C comments.
<<lexer_php.mll>>=
{
<<Facebook copyright>>
open Common
<<basic pfff module open and aliases>>
open Parser_php
(*****************************************************************************)
(* Prelude *)
(*****************************************************************************)
(* The PHP lexer.
*
* There are a few tricks to go around ocamllex restrictions
* because PHP has different lexing rules depending on some "contexts"
* (similar to Perl, e.g. the <<<END context).
*)
(*****************************************************************************)
(* Wrappers *)
(*****************************************************************************)
(* verbose-printing wrappers; silent unless Flag.verbose_lexing is set *)
let pr2, pr2_once = Common.mk_pr2_wrappers Flag.verbose_lexing
(*****************************************************************************)
(* Helpers *)
(*****************************************************************************)
(* raised on lexing errors; the caller (Parse_php.tokens2) catches it and
* reports the error together with position information *)
exception Lexical of string
<<lexer helpers>>
(* [xhp_or_t_ident ii fii] returns the XHP-specific token built by [fii]
* when the XHP extension is enabled, and a plain T_IDENT otherwise. *)
let xhp_or_t_ident ii fii =
if !Flag.xhp_builtin
then fii ii
else T_IDENT(Ast.str_of_info ii, ii)
(* same trick for the Facebook language extensions *)
let lang_ext_or_t_ident ii fii =
if !Flag.facebook_lang_extensions
then fii ii
else T_IDENT(Ast.str_of_info ii, ii)
(* ---------------------------------------------------------------------- *)
(* Keywords *)
(* ---------------------------------------------------------------------- *)
<<keywords_table hash>>
(* ---------------------------------------------------------------------- *)
(* Lexer State *)
(* ---------------------------------------------------------------------- *)
<<type state_mode>>
<<lexer state trick helpers>>
(* xhp: the function below is used to disambiguate the use
* of ":" and "%" as either a naming (unary) operator or binary operator.
*
* How to know the following lists of tokens is correct ?
* We should compute FOLLOW(tok) for all tokens and check
* if "%" or ":" can be in it ?
*)
let is_in_binary_operator_position last_tok =
match last_tok with
| Some (
(* if we are after a number or any kind of scalar, then it's ok to
* have a binary operator *)
T_LNUMBER _ | T_DNUMBER _
| T_CONSTANT_ENCAPSED_STRING _ | TGUIL _ | TBACKQUOTE _
(* same for ']' or ')'; anything that "terminates" an expression *)
| TCBRA _ | TCPAR _
| T_IDENT _ | T_VARIABLE _
)
-> true
| _ -> false
}
(*****************************************************************************)
(* Regexps aliases *)
(*****************************************************************************)
<<regexp aliases>>
(*****************************************************************************)
(* Rule in script *)
(*****************************************************************************)
<<rule st_in_scripting>>
(*****************************************************************************)
(* Rule initial (html) *)
(*****************************************************************************)
<<rule initial>>
(*****************************************************************************)
(* Rule looking_for_xxx *)
(*****************************************************************************)
<<rule st_looking_for_property>>
(*****************************************************************************)
<<rule st_looking_for_varname>>
(*****************************************************************************)
<<rule st_var_offset>>
(*****************************************************************************)
(* Rule strings *)
(*****************************************************************************)
<<rule st_double_quotes>>
(* ----------------------------------------------------------------------- *)
<<rule st_backquote>>
(* ----------------------------------------------------------------------- *)
<<rule st_start_heredoc>>
(*****************************************************************************)
(* Rules for XHP *)
(*****************************************************************************)
(* XHP lexing states and rules *)
(* Lexes the inside of an XHP opening tag, i.e. after '<foo' and until the
* closing '>' or '/>'. [current_tag] is the tag name split on ':'
* (e.g. ["x";"frag"]), as pushed by the rules that enter this mode. *)
and st_in_xhp_tag current_tag = parse
(* The original XHP parser has some special handling of
* whitespace and enforces the use of certain whitespace at
* certain places. Not sure I need to enforce this too.
* Simpler to ignore whitespaces.
*
* todo? allow comments too there ?
*)
| [' ' '\t']+ { TSpaces(tokinfo lexbuf) }
| ['\n' '\r'] { TNewline(tokinfo lexbuf) }
(* attribute management *)
| XHPATTR { T_XHP_ATTR(tok lexbuf, tokinfo lexbuf) }
| "=" { TEQ(tokinfo lexbuf) }
(* not sure if XHP strings needs the interpolation support *)
| ['"'] {
push_mode ST_DOUBLE_QUOTES;
TGUIL(tokinfo lexbuf)
}
(* attribute value that is a PHP expression, e.g. attr={f(1)} *)
| "{" {
push_mode ST_IN_SCRIPTING;
TOBRACE(tokinfo lexbuf)
}
(* a singleton tag *)
| "/>" {
pop_mode ();
T_XHP_SLASH_GT (tokinfo lexbuf)
}
(* When we see a ">", it means it's just the end of
* the opening tag. Transit to IN_XHP_TEXT.
*)
| ">" {
set_mode (ST_IN_XHP_TEXT current_tag);
T_XHP_GT (tokinfo lexbuf)
}
| eof { EOF (tokinfo lexbuf +> Ast.rewrap_str "") }
| _ {
if !Flag.verbose_lexing
then pr2_once ("LEXER:unrecognised symbol, in XHP tag:"^tok lexbuf);
TUnknown (tokinfo lexbuf)
}
(* ----------------------------------------------------------------------- *)
(* Lexes XHP text content, between the '>' of an opening tag and the
* corresponding closing tag. *)
and st_in_xhp_text current_tag = parse
(* a nested xhp construct *)
| "<" (XHPTAG as tag) {
let xs = Common.split ":" tag in
push_mode (ST_IN_XHP_TAG xs);
T_XHP_OPEN_TAG(xs, tokinfo lexbuf)
}
| "<" "/" (XHPTAG as tag) ">" {
let xs = Common.split ":" tag in
(* a mismatched closing tag is only warned about, not rejected *)
if (xs <> current_tag)
then begin
pr2 (spf "XHP: wrong closing tag for, %s != %s"
(Common.join ":" xs)
(Common.join ":" current_tag));
end;
pop_mode ();
T_XHP_CLOSE_TAG(Some xs, tokinfo lexbuf)
}
(* shortcut for closing tag ? *)
| "<" "/" ">" {
(* no check :( *)
pop_mode ();
T_XHP_CLOSE_TAG(None, tokinfo lexbuf)
}
(* PHP interpolation. How the user can produce a { ? &;something ? *)
| "{" {
push_mode ST_IN_SCRIPTING;
TOBRACE(tokinfo lexbuf)
}
(* opti: *)
| [^'<' '{']+ { T_XHP_TEXT (tok lexbuf, tokinfo lexbuf) }
| eof { EOF (tokinfo lexbuf +> Ast.rewrap_str "") }
| _ {
if !Flag.verbose_lexing
then pr2_once ("LEXER:unrecognised symbol, in XHP text:"^tok lexbuf);
TUnknown (tokinfo lexbuf)
}
(*****************************************************************************)
(* Rule comment *)
(*****************************************************************************)
<<rule st_comment>>
<<rule st_one_line_comment>>
@
The file defines mainly the functions [[Lexer_php.st_initial]]
and [[Lexer_php.st_scripting]], auto generated by [[ocamllex]],
to respectively lex a file in HTML mode (the default initial mode) and PHP
mode (aka scripting mode). As usual with Lex and Yacc
the tokens are actually specified in the Yacc file
(see Section~\ref{sec:token-spec}), hence
the [[open Parser_php]] at the beginning of the file.
<<regexp aliases>>=
(* any character; '\n' is added explicitly, presumably to mirror flex
* where '.' excludes newline -- NOTE(review): confirm the motivation *)
let ANY_CHAR = (_ | ['\n'] )
@
%let TOKENS = [';'':'',''.''['']''('')''|''^''&''+''-''/''*''=''%''!''~''$''<''>''?''@']
\section{Lex states and other [[ocamllex]] hacks}
\subsection{Contextual lexing}
The lexer needs a contextual capability. This is because PHP allows
to embed HTML snippets directly into the code, where tokens have a
different meaning. This is also because some tokens like [[if]] mean
something in one context (a statement keyword) and something else in another
(they are allowed as name of properties for instance).
Also, like in Perl, PHP allows HereDoc, and a few other tricks that
makes the job of the lexer slightly more complicated than in other
programming languages.
Contextual lexing is available in Flex but not really in [[ocamllex]].
So the lexing logic is split between this file and a small
function in [[parse_php.ml]] that handles the state machine. See also
the [[state_mode]] type below.
<<type state_mode>>=
(* In most languages the lexer has no state and all strings are always
* encoded in the same way, in the same token, wherever the string is
* located in the file (except for strings inside comments). In PHP
* some characters, e.g. "'", as in "I don't like you" or "'foo'" can
* mean different things. Indeed the PHP language in fact supports
* multiple languages or "modes" inside the same script (which also
* make emacs mode for such language harder to define).
*
* Inside the PHP script code part, the quote is the start of a string
* and there must be a corresponding quote ending the string. Inside
* the HTML part of a PHP file it's just a character like any other
* character. Inside heredocs (the '<<<XXX' construct) it is again
* considered as any other character. In the same way some strings such
* as 'if' can again mean different things; when they are preceded by a
* '->' they correspond to the possible name of a field, otherwise
* they are special PHP keywords.
*
* Because all of this, the lexer has multiple states which are
* represented below and adjusted via some push/pop_mode function
* below. Depending on the state the lexer behaves differently.
*)
type state_mode =
(* aka HTML mode *)
| INITIAL
(* started with <?php or <?, finished by ?> *)
| ST_IN_SCRIPTING
(* started with <?=, finished by ?> *)
| ST_IN_SCRIPTING2
(* handled by using ocamllex ability to define multiple lexers
* | ST_COMMENT
* | ST_DOC_COMMENT
* | ST_ONE_LINE_COMMENT
*)
(* started with ", finished with ". In most languages strings
* are a single tokens but PHP allow interpolation which means
* a string can contain nested PHP variables or expressions.
*)
| ST_DOUBLE_QUOTES
(* started with "`", finished with "`" *)
| ST_BACKQUOTE
(* started with ->, finished after reading one fieldname *)
| ST_LOOKING_FOR_PROPERTY
(* started with ${ *)
| ST_LOOKING_FOR_VARNAME
(* started with $xxx[ *)
| ST_VAR_OFFSET
(* started with <<<XXX, finished by XXX; the string is the heredoc label *)
| ST_START_HEREDOC of string
(* started with <<<'XXX', finished by XXX; the string is the nowdoc label *)
| ST_START_NOWDOC of string
(* started with <xx when preceded by a certain token (e.g. 'return' '<xx'),
* finished by '>' by transiting to ST_IN_XHP_TEXT, or really finished
* by '/>'.
*)
| ST_IN_XHP_TAG of Ast_php.xhp_tag (* the current tag, e,g, ["x";"frag"] *)
(* started with the '>' of an opening tag, finished when '</x>' *)
| ST_IN_XHP_TEXT of Ast_php.xhp_tag (* the current tag *)
@
<<lexer state trick helpers>>=
<<lexer state global variables>>
<<lexer state global reinitializer>>
<<lexer state function hepers>>
@
<<lexer state global variables>>=
(* the lexer starts in HTML mode *)
let default_state = INITIAL
(* stack of lexing modes; the top is the mode the next token is lexed in.
* A stack is needed because modes nest, e.g. PHP inside XHP inside PHP. *)
let _mode_stack =
ref [default_state]
@
<<lexer state global reinitializer>>=
(* must be called before lexing a new file; clears all global lexer state *)
let reset () =
_mode_stack := [default_state];
<<auxillary reset lexing actions>>
_last_non_whitespace_like_token := None;
()
@
<<lexer state function hepers>>=
(* Returns the mode on top of the stack. If the stack is ever empty
* (a lexer bug), recover by resetting to the default state rather
* than crashing; Common.top raises Failure "hd" on an empty list. *)
let rec current_mode () =
try
Common.top !_mode_stack
with Failure("hd") ->
pr2("LEXER: mode_stack is empty, defaulting to INITIAL");
reset();
current_mode ()
@
<<function tokens>>=
(* Tokenize [file] entirely, returning the tokens in source order.
* Comments and whitespace are kept as tokens (see the section on
* filtering comments); [init_state] allows starting directly in
* scripting mode instead of HTML mode. *)
let tokens2 ?(init_state=Lexer_php.INITIAL) file =
let table = Parse_info.full_charpos_to_pos_large file in
Common.with_open_infile file (fun chan ->
let lexbuf = Lexing.from_channel chan in
(* clear the lexer's global mode stack and pending tokens *)
Lexer_php.reset();
Lexer_php._mode_stack := [init_state];
try
<<function phptoken>>
let rec tokens_aux acc =
let tok = phptoken lexbuf in
if !Flag.debug_lexer then Common.pr2_gen tok;
(* remember the last "real" token; used by the XHP disambiguation *)
if not (TH.is_comment tok)
then Lexer_php._last_non_whitespace_like_token := Some tok;
<<fill in the line and col information for tok>>
if TH.is_eof tok
then List.rev (tok::acc)
else tokens_aux (tok::acc)
in
tokens_aux []
with
| Lexer_php.Lexical s ->
failwith ("lexical error " ^ s ^ "\n =" ^
(Parse_info.error_message file (lexbuf_to_strpos lexbuf)))
| e -> raise e
)
@
<<function tokens>>=
(* profiling wrapper around tokens2 *)
let tokens ?init_state a =
Common.profile_code "Parse_php.tokens" (fun () -> tokens2 ?init_state a)
@
<<function phptoken>>=
(* Dispatches to the ocamllex rule corresponding to the current lexing
* mode. This is a noweb chunk spliced inside tokens2, hence the
* trailing 'in'. Pending pushed-back tokens are served first (see the
* yyless trick chunk). *)
let phptoken lexbuf =
<<yyless trick in phptoken>>
(match Lexer_php.current_mode () with
| Lexer_php.INITIAL ->
Lexer_php.initial lexbuf
| Lexer_php.ST_IN_SCRIPTING ->
Lexer_php.st_in_scripting lexbuf
| Lexer_php.ST_IN_SCRIPTING2 ->
Lexer_php.st_in_scripting lexbuf
| Lexer_php.ST_DOUBLE_QUOTES ->
Lexer_php.st_double_quotes lexbuf
| Lexer_php.ST_BACKQUOTE ->
Lexer_php.st_backquote lexbuf
| Lexer_php.ST_LOOKING_FOR_PROPERTY ->
Lexer_php.st_looking_for_property lexbuf
| Lexer_php.ST_LOOKING_FOR_VARNAME ->
Lexer_php.st_looking_for_varname lexbuf
| Lexer_php.ST_VAR_OFFSET ->
Lexer_php.st_var_offset lexbuf
| Lexer_php.ST_START_HEREDOC s ->
Lexer_php.st_start_heredoc s lexbuf
| Lexer_php.ST_START_NOWDOC s ->
Lexer_php.st_start_nowdoc s lexbuf
(* xhp: these modes are only ever pushed when xhp_builtin is set *)
| Lexer_php.ST_IN_XHP_TAG current_tag ->
if not !Flag.xhp_builtin
then raise Impossible;
Lexer_php.st_in_xhp_tag current_tag lexbuf
| Lexer_php.ST_IN_XHP_TEXT current_tag ->
if not !Flag.xhp_builtin
then raise Impossible;
Lexer_php.st_in_xhp_text current_tag lexbuf
)
in
@
<<lexer state function hepers>>=
let push_mode mode = Common.push2 mode _mode_stack
let pop_mode () = ignore(Common.pop2 _mode_stack)
(* What is the semantic of BEGIN() in flex ? start from scratch with empty
* stack ?
*)
(* replace the top of the mode stack (a flex BEGIN-like transition) *)
let set_mode mode =
pop_mode();
push_mode mode;
()
(* Here is an example of state transition. Given a php file like:
*
* <?php return <x>foo<y>bar</y></x>; ?>
*
* we start with the stack in [INITIAL]. The transitions are then:
*
* '<?php' -> [IN_SCRIPTING], via set_mode()
* ' ' -> [IN_SCRIPTING]
* 'return' -> [IN_SCRIPTING]
* '<x' -> [IN_XHP_TAG "x"; IN_SCRIPTING], via push_mode()
* '>' -> [IN_XHP_TEXT "x"; IN_SCRIPTING], via set_mode()
* 'foo' -> [IN_XHP_TEXT "x"; IN_SCRIPTING]
* '<y' -> [IN_XHP_TAG "y";IN_XHP_TEXT "x"; IN_SCRIPTING], via push_mode()
* '>' -> [IN_XHP_TEXT "y"; IN_XHP_TEXT "x";IN_SCRIPTING], via set_mode()
* 'bar' -> [IN_XHP_TEXT "y"; IN_XHP_TEXT "x"; IN_SCRIPTING]
* '</y>' -> [IN_XHP_TEXT "x"; IN_SCRIPTING], via pop_mode()
* '</x>' -> [IN_SCRIPTING], via pop_mode()
* ';' -> [IN_SCRIPTING]
* ' ' -> [IN_SCRIPTING]
* '?> -> [INITIAL], via set_mode()
*
*)
@
\subsection{Position information}
<<fill in the line and col information for tok>>=
(* Expand the token's position info (line/column) from its absolute
* charpos using [table], precomputed for the whole file. This is a
* noweb chunk spliced inside tokens2, hence the trailing 'in'. *)
let tok = tok +> TH.visitor_info_of_tok (fun ii ->
{ ii with Parse_info.token=
(* could assert pinfo.filename = file ? *)
match Ast.pinfo_of_info ii with
| Parse_info.OriginTok pi ->
Parse_info.OriginTok
(Parse_info.complete_parse_info_large file table pi)
(* tokens coming straight from the lexer are always OriginTok *)
| Parse_info.FakeTokStr _
| Parse_info.Ab
| Parse_info.ExpandedTok _
-> raise Impossible
})
in
@
\subsection{Filtering comments}
\label{sec:filter-comments-in-lexer}
Below you will see that we use a special lexing scheme.
Why use this lexing scheme? Why not classically hand a regular lexer
function to the parser? Because this lexer also returns comment tokens.
One could write a simple wrapper that, upon seeing a comment, asks the
lexer again for a token, but it is simpler to use the [[cur_tok]] technique.
%The use of local refs ([[remaining_tokens]], [[passed_tokens]], ...) makes
%also possible error recovery. Indeed, they allow to skip some tokens and
%still be able to call again the ocamlyacc parser. It is ugly code
%because we cant modify ocamllex and ocamlyacc. As we want some
%extended lexing tricks, we have to use such refs.
%
%Those refs can also used for my lalr(k) technique. Indeed They
%store the futur and previous tokens that were parsed, and so
%provide enough context information for powerful lex trick.
%
%\begin{itemize}
%- [[passed_tokens_last_ckp]] stores the passed tokens since last
% checkpoint. Used for [[NotParsedCorrectly]] and also to build the
% [[info_item]] attached to each [[program_element]].
%- [[passed_tokens_clean]] is used for lookahead, in fact for lookback.
%- [[remaining_tokens_clean]] is used for lookahead. Now [[remaining_tokens]]
% contain some comments and so would make pattern matching difficult
% in lookahead. Hence this variable. We would like also to get rid
% of cpp instruction because sometimes a cpp instruction is between
% two tokens and makes a pattern matching fail. But lookahead also
% transform some cpp instruction (in comment) so can't remove them.
%\end{itemize}
%
%So [[remaining_tokens]], [[passed_tokens_last_ckp]] contain
%comment-tokens,
%whereas [[passed_tokens_clean]] and [[remaining_tokens_clean]] does not contain
%comment-tokens.
%Normally we have:
%toks = (reverse passed_tok) ++ cur_tok ++ remaining_tokens
% after the call to pop2.
%toks = (reverse passed_tok) ++ remaining_tokens
% at the and of the lexer_function call.
%At the very beginning, cur_tok and remaining_tokens overlap, but not after.
%At the end of lexer_function call, cur_tok overlap with passed_tok.
%
%convention: I use "tr" for "tokens refs"
%
%I now also need this lexing trick because the lexer return comment
%tokens.
<<parse tokens_state helper>>=
@
<<parse tokens_state helper>>=
@
<<parse tokens_state helper>>=
(* Hacked lex. This function use refs passed by parse.
* 'tr' means 'token refs'.
*)
(* The lexer function handed to the ocamlyacc parser: pops the next token
* from tr.rest, records it in tr.current and tr.passed, and transparently
* skips comment-like tokens (plus open/close tags) by recursing. *)
let rec lexer_function tr = fun lexbuf ->
match tr.PI.rest with
| [] -> (pr2 "LEXER: ALREADY AT END"; tr.PI.current)
| v::xs ->
tr.PI.rest <- xs;
tr.PI.current <- v;
tr.PI.passed <- v::tr.PI.passed;
if TH.is_comment v ||
(* TODO a little bit specific to FB ? *)
(match v with
| Parser_php.T_OPEN_TAG _ -> true
| Parser_php.T_CLOSE_TAG _ -> true
| _ -> false
)
then lexer_function (*~pass*) tr lexbuf
else v
@
\l (*~pass*)
\subsection{Other hacks}
<<lexer state global variables>>=
(* because ocamllex does not have the yyless feature, have to cheat.
* update: in fact can hack my own yyless so maybe should revisit
* this code.
*)
(* tokens pushed back by the lexer itself (see push_token below);
* phptoken returns these before asking ocamllex for a new token *)
let _pending_tokens =
ref ([]: Parser_php.token list)
(* The logic to modify _last_non_whitespace_like_token is in the
* caller of the lexer, that is in Parse_php.tokens.
* Used for XHP.
*)
let _last_non_whitespace_like_token =
ref (None: Parser_php.token option)
@
<<auxillary reset lexing actions>>=
_pending_tokens := [];
@
<<lexer state function hepers>>=
(* schedule [tok] to be returned (LIFO order) before lexing resumes *)
let push_token tok =
_pending_tokens := tok::!_pending_tokens
@
<<yyless trick in phptoken>>=
(* for yyless emulation *)
(* serve a pending pushed-back token first, if any; otherwise fall
* through to the mode dispatch (this chunk deliberately ends inside an
* open match arm, completed by the chunk spliced after it) *)
match !Lexer_php._pending_tokens with
| x::xs ->
Lexer_php._pending_tokens := xs;
x
| [] ->
@
<<lexer helpers>>=
(* Emulate flex's yyless: push the last [n] consumed characters back
* into [lexbuf] so they are lexed again. Both the buffer cursor
* (lex_curr_pos) and the absolute character count of the current
* position (pos_cnum) are rewound by [n]. *)
let yyless n lexbuf =
  let cursor = lexbuf.Lexing.lex_curr_pos in
  lexbuf.Lexing.lex_curr_pos <- cursor - n;
  let p = lexbuf.Lexing.lex_curr_p in
  lexbuf.Lexing.lex_curr_p <- { p with Lexing.pos_cnum = p.Lexing.pos_cnum - n }
@
\section{Initial state (HTML mode)}
% #ifdef ZEND_MULTIBYTE
% if (SCNG(output_filter)) {
% int readsize;
% readsize = SCNG(output_filter)(&(zendlval->value.str.val), &(zendlval->value.str.len), yytext, yyleng TSRMLS_CC);
% if (readsize < yyleng) {
% yyless(readsize);
% }
% } else {
% zendlval->value.str.val = (char * ) estrndup(yytext, yyleng);
% zendlval->value.str.len = yyleng;
% }
% #else /* !ZEND_MULTIBYTE */
% zendlval->value.str.val = (char * ) estrndup(yytext, yyleng);
% zendlval->value.str.len = yyleng;
% #endif /* ZEND_MULTIBYTE */
% zendlval->type = IS_STRING;
% HANDLE_NEWLINES(yytext, yyleng);
% return T_INLINE_HTML;
%}
<<rule initial>>=
(* HTML mode: everything is inline HTML until an opening construct such
* as '<?php', '<?', '<?=' or '<script language=php>' switches the lexer
* to scripting mode. *)
and initial = parse
| "<?php" ([' ''\t']|NEWLINE)
{
(* I now do a yyless to not eat the newline which is more
* consistent with how I treat newlines elsewhere
*)
yyless 1 lexbuf;
set_mode ST_IN_SCRIPTING;
T_OPEN_TAG(tokinfo lexbuf)
}
| "<?PHP"([' ''\t']|NEWLINE)
| "<?Php"([' ''\t']|NEWLINE)
{
(* "BAD USE OF <PHP at initial state, replace by <?php"; *)
set_mode ST_IN_SCRIPTING;
T_OPEN_TAG(tokinfo lexbuf)
}
| (([^'<']|"<"[^'?''%''s''<'])+(*{1,400}*))|"<s"|"<" {
(* more? cf original lexer *)
T_INLINE_HTML(tok lexbuf, tokinfo lexbuf)
}
| "<?=" {
(* less: if short_tags normally, otherwise T_INLINE_HTML *)
set_mode ST_IN_SCRIPTING2;
(* todo? ugly, may be better to generate a real T_ECHO token
* with maybe a FakeTok or ExpandedTok.
*)
T_OPEN_TAG_WITH_ECHO(tokinfo lexbuf);
}
| "<?" | "<script" WHITESPACE+ "language" WHITESPACE* "=" WHITESPACE *
("php"|"\"php\""|"\'php\'") WHITESPACE*">"
{
(* XXX if short_tags normally otherwise T_INLINE_HTML *)
(* pr2 "BAD USE OF <? at initial state, replace by <?php"; *)
set_mode ST_IN_SCRIPTING;
T_OPEN_TAG(tokinfo lexbuf);
}
(*------------------------------------------------------------------------ *)
| eof { EOF (tokinfo lexbuf +> Ast.rewrap_str "") }
| _ (* ANY_CHAR *) {
if !Flag.verbose_lexing
then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
TUnknown (tokinfo lexbuf)
}
@
\section{Script state (PHP mode)}
<<rule st_in_scripting>>=
(* PHP (scripting) mode: the main lexer rule. The actual rules are
* spliced in from the noweb chunks below, grouped by category; rule
* order matters in ocamllex, so keep the categories in this order. *)
rule st_in_scripting = parse
(* ----------------------------------------------------------------------- *)
(* spacing/comments *)
(* ----------------------------------------------------------------------- *)
<<comments rules>>
(* ----------------------------------------------------------------------- *)
(* Symbols *)
(* ----------------------------------------------------------------------- *)
<<symbol rules>>
(* ----------------------------------------------------------------------- *)
(* Keywords and ident *)
(* ----------------------------------------------------------------------- *)
(* ugly: 'self' and 'parent' should be keywords forbidden to be used
* as regular identifiers. But PHP is case insensitive and does not
* consider self/parent or SELF/PARENT as keywords. I think it's
* bad so I now consider self/parent as keywords, but still allow
* at least the uppercase form to be used as identifier, hence those
* two rules below.
*)
| "SELF" { T_IDENT (tok lexbuf, tokinfo lexbuf) }
| "PARENT" { T_IDENT (tok lexbuf, tokinfo lexbuf) }
<<keyword and ident rules>>
(* ----------------------------------------------------------------------- *)
(* Constant *)
(* ----------------------------------------------------------------------- *)
<<constant rules>>
(* ----------------------------------------------------------------------- *)
(* Strings *)
(* ----------------------------------------------------------------------- *)
<<strings rules>>
(* ----------------------------------------------------------------------- *)
(* Misc *)
(* ----------------------------------------------------------------------- *)
<<misc rules>>
(* ----------------------------------------------------------------------- *)
<<semi repetitive st_in_scripting rules for eof and error handling>>
@
<<semi repetitive st_in_scripting rules for eof and error handling>>=
| eof { EOF (tokinfo lexbuf +> Ast.rewrap_str "") }
| _ {
if !Flag.verbose_lexing
then pr2_once ("LEXER:unrecognised symbol, in token rule:"^tok lexbuf);
TUnknown (tokinfo lexbuf)
}
@
\subsection{Comments}
This lexer generates tokens for comments, which is very unusual
for a compiler. Usually a compiler frontend simply drops
everything that is not relevant to code generation. But in some
contexts (refactoring, source code visualization) it is useful
to keep those comments somehow in the AST.
So one can not give
this lexer as-is to the parsing function. The caller must preprocess
it, e.g. by using techniques like [[cur_tok]] ref in [[parse_php.ml]]
as described in Section~\ref{sec:filter-comments-in-lexer}.
%Still? We also generate a separate token for newlines, so now
%the caller may also have to reagglomerate all those commentspace
%tokens if he was assuming that spaces were agglomerate in a single
%token.
<<comments rules>>=
(* comment tokens are kept, not dropped; the caller filters them,
* see the section on filtering comments *)
| "/*" {
let info = tokinfo lexbuf in
(* st_comment consumes up to and including the closing '*/' *)
let com = st_comment lexbuf in
T_COMMENT(info +> tok_add_s com)
}
(* must come before the "/**" rule so an empty doc comment stays T_COMMENT *)
| "/**/" { T_COMMENT(tokinfo lexbuf) }
| "/**" { (* RESET_DOC_COMMENT(); *)
let info = tokinfo lexbuf in
let com = st_comment lexbuf in
T_DOC_COMMENT(info +> tok_add_s com)
}
| "#"|"//" {
let info = tokinfo lexbuf in
let com = st_one_line_comment lexbuf in
T_COMMENT(info +> tok_add_s com)
}
(* old: | WHITESPACE { T_WHITESPACE(tokinfo lexbuf) } *)
| [' ' '\t']+ { TSpaces(tokinfo lexbuf) }
| ['\n' '\r'] { TNewline(tokinfo lexbuf) }
@
<<regexp aliases>>=
(* \x7f-\xff ???*)
let WHITESPACE = [' ' '\n' '\r' '\t']+
let TABS_AND_SPACES = [' ''\t']*
let NEWLINE = ("\r"|"\n"|"\r\n")
let WHITESPACEOPT = [' ' '\n' '\r' '\t']*
@
<<rule st_comment>>=
(* Consumes a C-style comment body up to and including the closing '*/',
* returning the consumed text as a string (the caller glues it onto the
* token that opened the comment). *)
and st_comment = parse
| "*/" { tok lexbuf }
(* noteopti: *)
| [^'*']+ { let s = tok lexbuf in s ^ st_comment lexbuf }
| "*" { let s = tok lexbuf in s ^ st_comment lexbuf }
<<repetitive st_comment rules for error handling>>
@
\ifwantrepetitivecode
<<repetitive st_comment rules for error handling>>=
(* unterminated comment: pretend it was closed so parsing can continue *)
| eof { pr2 "LEXER: end of file in comment"; "*/"}
| _ {
let s = tok lexbuf in
pr2 ("LEXER: unrecognised symbol in comment:"^s);
s ^ st_comment lexbuf
}
@
\fi
% (*
% switch (yytext[yyleng-1]) {
% case '?': case '%': case '>':
% yyless(yyleng-1);
% yymore();
% break;
% case '\n':
% CG(zend_lineno)++;
% /* intentional fall through */
% default:
% zendlval->value.str.val = yytext; /* no copying - intentional */
% zendlval->value.str.len = yyleng;
% zendlval->type = IS_STRING;
% BEGIN(ST_IN_SCRIPTING);
% return T_COMMENT;
% }
% *)
%at the raise Todo below:
% (*
% if (CG(asp_tags) || yytext[yyleng-2] != '%') { /* asp comment? */
% zendlval->value.str.val = yytext; /* no copying - intentional */
% zendlval->value.str.len = yyleng-2;
% zendlval->type = IS_STRING;
% yyless(yyleng-2);
% BEGIN(ST_IN_SCRIPTING);
% return T_COMMENT;
% } else {
% yymore();
% }
% *)
<<rule st_one_line_comment>>=
(* Consumes a '//' or '#' comment up to (but not including) the end of
* line, or up to a closing '?>' which ends the comment too; returns the
* consumed text. yyless is used so the newline / '?>' is lexed again. *)
and st_one_line_comment = parse
| "?"|"%"|">" { let s = tok lexbuf in s ^ st_one_line_comment lexbuf }
| ([^'\n' '\r' '?''%''>']* as start) (ANY_CHAR as x)
{
(match x with
(* could be the start of '?>' etc.; re-lex the character alone so the
* rules above (and "?>" below) get a chance to match *)
| '?' | '%' | '>' ->
yyless 1 lexbuf;
start ^ st_one_line_comment lexbuf
(* end of recursion when new line or other character *)
| '\n' ->
(* don't want the newline to be part of the comment *)
yyless 1 lexbuf;
start
| c -> start ^ String.make 1 c
)
}
| NEWLINE {
(* don't want the newline to be part of the comment *)
yyless 1 lexbuf;
""
}
| "?>" {
(* "%>" is only when use asp_tags *)
yyless 2 lexbuf;
""
}
<<repetitive st_one_line_comment rules for error handling>>
@
\ifwantrepetitivecode
<<repetitive st_one_line_comment rules for error handling>>=
| eof { pr2 "LEXER: end of file in comment"; "*/" }
| _ {
if !Flag.verbose_lexing
then pr2_once ("LEXER:unrecognised symbol, in st_one_line_comment rule:"^tok lexbuf);
tok lexbuf
}
@
\fi
\subsection{Symbols}
<<symbol rules>>=
(* arithmetic and assignment operators *)
| '+' { TPLUS(tokinfo lexbuf) } | '-' { TMINUS(tokinfo lexbuf) }
| '*' { TMUL(tokinfo lexbuf) } | '/' { TDIV(tokinfo lexbuf) }
| '%' { TMOD(tokinfo lexbuf) }
| "++" { T_INC(tokinfo lexbuf) } | "--" { T_DEC(tokinfo lexbuf) }
| "=" { TEQ(tokinfo lexbuf) }
<<repetitive symbol rules>>
@
<<symbol rules>>=
(* Flex/Bison allow to use single characters directly as-is in the grammar
* by adding this in the lexer:
*
* <ST_IN_SCRIPTING>{TOKENS} { return yytext[0];}
*
* We don't, so we have transformed all those tokens in proper tokens with
* a name in the parser, and return them in the lexer.
*)
| '.' { TDOT(tokinfo lexbuf) }
| ',' { TCOMMA(tokinfo lexbuf) }
| '@' { T__AT(tokinfo lexbuf) }
| "=>" { T_DOUBLE_ARROW(tokinfo lexbuf) }
| "~" { TTILDE(tokinfo lexbuf) }
| ";" { TSEMICOLON(tokinfo lexbuf) }
| "!" { TBANG(tokinfo lexbuf) }
| "::" { TCOLCOL (tokinfo lexbuf) } (* was called T_PAAMAYIM_NEKUDOTAYIM *)
| '(' { TOPAR(tokinfo lexbuf) } | ')' { TCPAR(tokinfo lexbuf) }
| '[' { TOBRA(tokinfo lexbuf) } | ']' { TCBRA(tokinfo lexbuf) }
| ":" { TCOLON(tokinfo lexbuf) }
| "?" { TQUESTION(tokinfo lexbuf) }
(* semantic grep *)
| "..." { TDOTS(tokinfo lexbuf) }
@
<<symbol rules>>=
(* we may come from a st_looking_for_xxx context, like in string
* interpolation, so seeing a } we pop_mode!
*)
| '}' {
pop_mode ();
(* RESET_DOC_COMMENT(); ??? *)
TCBRACE(tokinfo lexbuf)
}
| '{' {
push_mode ST_IN_SCRIPTING;
TOBRACE(tokinfo lexbuf)
}
@
<<symbol rules>>=
| ("->" as sym) (WHITESPACEOPT as white) (LABEL as label) {
(* TODO: The ST_LOOKING_FOR_PROPERTY state does not work for now because
* it requires a yyless(1) which is not available in ocamllex (or is it ?)
* So have to cheat and use instead the pending_token with push_token.
*
* buggy: push_mode ST_LOOKING_FOR_PROPERTY;
*)
let info = tokinfo lexbuf in
let syminfo = rewrap_str sym info in
let parse_info = Ast.parse_info_of_info info in
let pos_after_sym =
parse_info.Parse_info.charpos + String.length sym in
let pos_after_white = pos_after_sym + String.length white in
let whiteinfo = Parse_info.tokinfo_str_pos white pos_after_sym in
let lblinfo = Parse_info.tokinfo_str_pos label pos_after_white in
push_token (T_IDENT (label, lblinfo));
(* todo: could be newline ... *)
push_token (TSpaces (whiteinfo));
T_OBJECT_OPERATOR(syminfo)