This repository has been archived by the owner on Jun 4, 2019. It is now read-only.
/
parsing_hacks_typedef.ml
375 lines (334 loc) · 13 KB
/
parsing_hacks_typedef.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
(* Yoann Padioleau
*
* Copyright (C) 2011,2014 Facebook
* Copyright (C) 2002-2008 Yoann Padioleau
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License (GPL)
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* file license.txt for more details.
*)
open Common
module TV = Token_views_cpp
module TH = Token_helpers_cpp
module Ast = Ast_cpp
open Parser_cpp
open Token_views_cpp
open Parsing_hacks_lib
(*****************************************************************************)
(* Prelude *)
(*****************************************************************************)
(*
* This file gathers parsing heuristics related to the typedefs.
* C does not have a context-free grammar; C requires the parser to know
* whether an identifier names a typedef or an ordinary identifier. This normally means that
* we must call cpp on the file and have the lexer and parser cooperate
* to remember what is what. In lang_cpp/ we want to parse as-is,
* which means we need to infer back whether an identifier is
* a typedef or not.
*
* In this module we use a view that is more convenient for
* typedefs detection. We got rid of:
* - template arguments (see find_template_commentize())
* - qualifiers (see find_qualifier_commentize)
* - differences between & and * (filter_for_typedef() below)
* - differences between TIdent and TOperator,
* - const, volatile, restrict keywords
* - TODO merge multiple ** or *& or whatever
*
* history:
* - We used to make the lexer and parser cooperate in a lexerParser.ml file
* - this was not enough because of declarations such as 'acpi acpi;'
* and so we had to enable/disable the ident->typedef mechanism
* which requires even more lexer/parser cooperation
* - this was ugly too so now we use a typedef "inference" mechanism
* - we refined the typedef inference to sometimes use InParameter hint
* and more contextual information from token_views_context.ml
*)
(*****************************************************************************)
(* Helpers *)
(*****************************************************************************)
(* Tells whether [tok_before] is a token after which 'xx * yy' should be
 * read as a multiplication rather than as a pointer declaration:
 * an assignment, '?', 'return', a member-access operator, '[', or any
 * token accepted by TH.is_binary_operator_except_star.
 *)
let look_like_multiplication_context tok_before =
  match tok_before with
  | TEq _ | TAssign _ | TWhy _ | Treturn _ | TOCro _
  | TDot _ | TPtrOp _ | TPtrOpStar _ | TDotStar _ ->
      true
  (* everything else is decided by the binary-operator predicate *)
  | _ -> TH.is_binary_operator_except_star tok_before
(* Tells whether [tok_before] is a token after which a declaration can
 * start: '{', ';', the end-of-macro marker, an include directive, or
 * any token accepted by TH.is_privacy_keyword.
 *
 * TCBrace is deliberately not in the list (no!!), I think because of
 * nested struct so can have
 *    struct { ... } v;
 *)
let look_like_declaration_context tok_before =
  match tok_before with
  | TOBrace _ | TPtVirg _ | TCommentNewline_DefineEndOfMacro _ | TInclude _ ->
      true
  (* everything else is decided by the privacy-keyword predicate *)
  | _ -> TH.is_privacy_keyword tok_before
(* A placeholder Parse_info record: a fake token carrying the empty
 * string and no transformation. It is used below to build the sentinel
 * token that filter_for_typedef prepends to its input.
 *)
let fakeInfo =
  let token = Parse_info.FakeTokStr ("", None) in
  let transfo = Parse_info.NoTransfo in
  { Parse_info.token = token; transfo = transfo }
(*****************************************************************************)
(* Better View *)
(*****************************************************************************)
(* Builds the simplified token view used by the typedef heuristics.
 *
 * Starting from [multi_groups] (a list of Tok/Braces/Parens/Angle
 * groups) it:
 *  - prepends a fake ';' token as a sentinel,
 *  - drops every Angle group,
 *  - recurses inside Braces and Parens groups,
 *  - drops cv-qualifiers, storage-class and function specifiers,
 *  - rewrites '&' into '*' and the operator keyword into a TIdent.
 * The result is a one-element list holding the flattened tokens
 * returned by TV.tokens_of_multi_grouped.
 *)
let filter_for_typedef multi_groups =
  (* a sentinel, which helps a few typedef heuristics which look
   * for a token before which would not work for the first toplevel
   * declaration.
   *)
  let sentinel = Tok (mk_token_fake (TPtVirg fakeInfo)) in

  let _template_args = ref [] in

  (* remove template and other things
   * less: right now this is less useful because we actually
   * comment template args in a previous pass, but at some point this
   * will be useful.
   *)
  let rec simplify groups =
    Common.map_filter simplify_one groups

  and simplify_one group =
    match group with
    | TV.Angle (_, _, _) ->
        (* todo: analyze the content!! add in _template_args
         * todo: add the delimiters around the content to have
         * some sentinel for the typedef heuristics patterns
         * who often look for the token just before the typedef.
         *)
        None
    | TV.Braces (lbrace, body, rbrace) ->
        Some (TV.Braces (lbrace, simplify body, rbrace))
    | TV.Parens (lparen, body, rparen) ->
        Some (TV.Parens (lparen, simplify body, rparen))
    (* remove other noise for the typedef inference *)
    | TV.Tok tok ->
        (match tok.TV.t with
         (* const is a strong signal for having a typedef, so why skip it?
          * because it forces to duplicate rules. We need to infer
          * the type anyway even when there is no const around.
          * todo? maybe could do a special pass first that infer typedef
          * using only const rules, and then remove those const so
          * have best of both worlds.
          *)
         | Tconst _ | Tvolatile _ | Trestrict _
         (* storage classes and the typedef keyword *)
         | Tregister _ | Tstatic _ | Tauto _ | Textern _ | Ttypedef _
         (* function and member specifiers *)
         | Tvirtual _ | Tfriend _ | Tinline _ | Tmutable _ ->
             None
         (* let's transform all '&' into '*'
          * todo: need propagate also the where?
          *)
         | TAnd ii ->
             Some (TV.Tok (mk_token_extended (TMul ii)))
         (* and operator into TIdent
          * TODO: skip the token just after the operator keyword?
          * could help some heuristics too
          *)
         | Toperator ii ->
             Some (TV.Tok (mk_token_extended (TIdent ("operator", ii))))
         | _ ->
             Some (TV.Tok tok))
  in
  let filtered = simplify (sentinel :: multi_groups) in
  (* todo: look also for _template_args *)
  [TV.tokens_of_multi_grouped filtered]
(*****************************************************************************)
(* Main heuristics *)
(*****************************************************************************)
(*
* Below we assume a view without:
* - comments and cpp-directives
* - template stuff and qualifiers (but not TIdent_ClassnameAsQualifier)
* - const/volatile/restrict
* - & => *
*
* With such a view we can write fewer patterns.
*
* Note that qualifiers are slightly less important to filter because
* most of the heuristics below look for tokens after the ident
* and qualifiers are usually before.
*
* todo: do it on multi view? all those rules with TComma and TOPar
* are ugly.
*)
(* Typedef inference pass.
 *
 * [xxs] is a list of token lists; each list is scanned independently
 * by the local function aux. At every position aux tries the rules
 * below in order on the next few tokens and the first rule that
 * matches wins, so the order of the rules matters. When a rule decides
 * that an identifier is a type name it calls change_tok to retag the
 * TIdent as a TIdent_Typedef carrying the same string and info.
 * The function returns unit; its only effect goes through change_tok.
 * NOTE(review): change_tok comes from an opened module not shown here;
 * presumably it updates the token in place - TODO confirm.
 *
 * After a match, aux resumes on the tokens that follow the matched
 * window. A few rules push their last matched token back in front of
 * the rest (tok2, tok5 or x below) so that it can also serve as the
 * first token of the next match. When no rule applies, a single token
 * is skipped.
 *)
let find_typedefs xxs =
let rec aux xs =
match xs with
| [] -> ()
(* struct x ...
* those identifiers (called tags) must not be transformed in typedefs *)
| {t=(Tstruct _ | Tunion _ | Tenum _ | Tclass _)}::{t=TIdent _}::xs ->
aux xs
(* xx yy
 * two identifiers in a row: the first one is taken to be a type *)
| ({t=TIdent (s,i1)} as tok1)::{t=TIdent _}::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* xx ( *yy )(
 * function pointer declaration; the last '(' is reinjected *)
| ({t=TIdent (s,i1)} as tok1)::{t=TOPar _}::{t=TMul _}
::{t=TIdent _}::{t=TCPar _}::({t=TOPar _} as tok2)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (tok2::xs)
(* xx* ( *yy )( *)
| ({t=TIdent (s,i1)} as tok1)::{t=TMul _}::{t=TOPar _}::{t=TMul _}
::{t=TIdent _}::{t=TCPar _}::({t=TOPar _} as tok2)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (tok2::xs)
(* xx ( *yy[x] )(
 * same as above with exactly one token, of any kind, between the brackets *)
| ({t=TIdent (s,i1)} as tok1)::{t=TOPar _}::{t=TMul _}
::{t=TIdent _}::{t=TOCro _}::_::{t=TCCro _}::{t=TCPar _}::({t=TOPar _} as tok2)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (tok2::xs)
(* xx* ( *yy[x] )( *)
| ({t=TIdent (s,i1)} as tok1)::{t=TMul _}::{t=TOPar _}::{t=TMul _}
::{t=TIdent _}::{t=TOCro _}::_::{t=TCCro _}::{t=TCPar _}::({t=TOPar _} as tok2)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (tok2::xs)
(* xx ( *yy[]) *)
| ({t=TIdent (s,i1)} as tok1)::{t=TOPar _}::{t=TMul _}
::{t=TIdent _}::{t=TOCro _}::{t=TCCro _}::{t=TCPar _}::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* + xx * yy
 * the token before says we are inside an expression, so this is a
 * multiplication: skip the four tokens without retagging anything.
 * This rule must stay before the 'xx * yy' rules below.
 *)
| {t=tok_before}::{t=TIdent (_s,_)}::{t=TMul _}::{t=TIdent _}::xs
when look_like_multiplication_context tok_before ->
aux xs
(* { xx * yy
 * the token before says a declaration can start here *)
| {t=tok_before}::({t=TIdent (s,i1)} as tok1)::{t=TMul _}::{t=TIdent _}::xs
when look_like_declaration_context tok_before ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* } xx * yy *)
(* because TCBrace is not anymore in look_like_declaration_context *)
| {t=TCBrace _}::({t=TIdent (s,i1)} as tok1)::{t=TMul _}::{t=TIdent _}::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* xx * yy
* could be a multiplication too, so need InParameter guard.
* less: the InParameter has some FPs, so maybe better to
* rely on the spacing hint, see the rule below.
*)
| ({t=TIdent (s,i1);where=InParameter::_} as tok1)::{t=TMul _}
::{t=TIdent _}::xs
->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* xx *yy
 * spacing hint: yy starts right after the star (c2 = c1 + 1) and there
 * is at least one column between the end of xx and the star.
 * NOTE(review): assumes col is the starting column of the token and
 * that the three tokens are on the same line - TODO confirm.
 *)
| ({t=TIdent (s,i1);col=c0} as tok1)::{t=TMul _;col=c1}::{t=TIdent _;col=c2}::xs
when c2 = c1 + 1 && c1 >= c0 + String.length s + 1
->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* xx* yy
 * spacing hint, the other way around: the star is glued to the end of
 * xx and there is at least one column between the star and yy.
 *)
| ({t=TIdent(s,i1);col=c0}as tok1)::{t=TMul _;col=c1}::{t=TIdent _;col=c2}::xs
when c1 = c0 + String.length s && c2 >= c1 + 2
->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* xx ** yy
* less: could be a multiplication too, but with less probability
*)
| ({t=TIdent (s,i1)} as tok1)::{t=TMul _}::{t=TMul _}::{t=TIdent _}::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* (xx) yy and not a if/while before '(' (and yy can also be a constant)
 * i.e. a cast; the guard rejects the case where the token before the
 * '(' is accepted by TH.is_stuff_taking_parenthized.
 *)
| {t=tok1}::{t=TOPar _}::({t=TIdent(s, i1)} as tok3)::{t=TCPar _}
::{t = (TIdent _|TInt _|TString _|TFloat _|TTilde _|TOPar _) }::xs
when not (TH.is_stuff_taking_parenthized tok1) (* && line are the same?*)->
change_tok tok3 (TIdent_Typedef (s, i1));
(* todo? recurse on bigger ? *)
aux xs
(* todo: = (xx) ..., |= (xx) ..., (xx)~, ... *)
(* (xx){ gccext: kenccext:
 * the '{' is reinjected *)
| {t=tok1}::{t=TOPar _}::({t=TIdent(s, i1)} as tok3)::{t=TCPar _}
::({t=TOBrace _} as tok5)::xs
when not (TH.is_stuff_taking_parenthized tok1) ->
change_tok tok3 (TIdent_Typedef (s, i1));
aux (tok5::xs)
(* (xx * ), note that pointer function are ( *xx ), so star before.
* TODO: does not really need the closing paren?
* TODO: check that not InParameter or InArgument?
*)
| {t=TOPar _}::({t=TIdent(s, i1)} as tok3)::{t=TMul _}::{t=TCPar _}::xs ->
change_tok tok3 (TIdent_Typedef (s, i1));
aux xs
(* (xx ** ) *)
| {t=TOPar _}::({t=TIdent(s, i1)} as tok3)
::{t=TMul _}::{t=TMul _}::{t=TCPar _}::xs ->
change_tok tok3 (TIdent_Typedef (s, i1));
aux xs
(* xx* [,)]
* don't forget to recurse by reinjecting the comma or closing paren
*)
| ({t=TIdent(s, i1)} as tok1)::{t=TMul _}
::({t=(TComma _| TCPar _)} as x)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (x::xs)
(* xx** [,)] *)
| ({t=TIdent(s, i1)} as tok1)::{t=TMul _}::{t=TMul _}
::({t=(TComma _| TCPar _)} as x)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (x::xs)
(* xx*** [,)] *)
| ({t=TIdent(s, i1)} as tok1)::{t=TMul _}::{t=TMul _}::{t=TMul _}
::({t=(TComma _| TCPar _)} as x)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (x::xs)
(* xx*[] [,)] *)
| ({t=TIdent(s, i1)} as tok1)::{t=TMul _}::{t=TOCro _}::{t=TCCro _}
::({t=(TComma _| TCPar _)} as x)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (x::xs)
(* [(,] xx [),] where InParameter *)
(* hmmm: todo: some false positives on InParameter, see mini/constants.c,
* so now simpler to add a TIdent in the parameter_decl rule
*)
| {t=(TOPar _ | TComma _)}::({t=TIdent (s, i1); where=InParameter::_} as tok1)
::({t=(TCPar _ | TComma _)} as tok2)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (tok2::xs)
(* [(,] xx[X] [),] where InParameter
 * the closing ')' or ',' is reinjected *)
| {t=(TOPar _ | TComma _)}
::({t=TIdent (s, i1); where=InParameter::_} as tok1)
::{t=TOCro _}::_::{t=TCCro _}
::({t=(TCPar _ | TComma _)} as tok2)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux (tok2::xs)
(* [(,] xx[...] could be a array access, so need InParameter guard.
 * Only reached when the token after ']' is neither ')' nor ',',
 * otherwise the previous rule has already matched.
 *)
| {t=(TOPar _ | TComma _)}::({t=TIdent (s,i1);where=InParameter::_} as tok1)
::{t=TOCro _}::_tok::{t=TCCro _}::xs
->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* kencc-ext: xx; where InStruct
 * the identifier is retagged only when the head of its where context
 * is InClassStruct; in both cases the ';' is reinjected.
 *)
| {t=tok_before}::({t=TIdent (s, i1)} as tok1)::({t=TPtVirg _} as tok2)::xs
when look_like_declaration_context tok_before ->
(match tok1.where with
| (InClassStruct _)::_ ->
change_tok tok1 (TIdent_Typedef (s, i1));
| _ -> ()
);
aux (tok2::xs)
(* sizeof(xx) sizeof expr does not require extra parenthesis, but
* in practice people do, so guard it with what looks_like_typedef
* (or with a name starting with an uppercase ASCII letter)
*)
| {t=Tsizeof _}::{t=TOPar _}::({t=TIdent (s, i1)} as tok1)::{t=TCPar _}::xs
when Token_views_context.look_like_typedef s || s =~ "^[A-Z].*" ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* new Xxx *)
| {t=Tnew _}::({t=TIdent (s, i1)} as tok1)::xs ->
change_tok tok1 (TIdent_Typedef (s, i1));
aux xs
(* recurse: no rule applies here, skip one token and try again *)
| _::xs -> aux xs
in
xxs +> List.iter aux