Permalink
Browse files

ISO C++ grammar added

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@794 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information...
grammarware committed May 7, 2010
1 parent 6ffba02 commit 6cdf9913da0bc67e8cf16cac719d895043f57fa3
@@ -15,6 +15,7 @@
('**','DOUBLESTAR'),
('*=','MULTIPLICATIONASSIGNMENT'),
('*','STAR'),
('.*','DOTSTAR'),
('++','DOUBLEPLUS'),
('+=','ADDITIONASSIGNMENT'),
('+','PLUS'),
@@ -23,7 +24,8 @@
('(','LEFTPARENTHESIS'),
(')','RIGHTPARENTHESIS'),
('{','LEFTCURLYBRACKET'),
('}','RIGHTCURLYBRACKET')
('}','RIGHTCURLYBRACKET'),
('->*','ARROWSTAR')
)
# these special symbols get transformed into HTML entities
@@ -9,18 +9,30 @@
double = {}
current = ''
keys=[]
reported = ['identifier','keyword','literal']
ignored = ['identifier','keyword','literal','string-literal']
reported = []
punctuators = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
#bannedLines = ('44','45','46',"Annex A","SPECIFICATION","A.2.")
bannedLines = []
knownTerminals = []
def quote(x):
    """Return x enclosed in a pair of double-quote characters."""
    mark = '"'
    return mark + x + mark
def unquote(x):
    """Strip one pair of enclosing double quotes from x, if present.

    Returns x without its first and last character when both are '"';
    otherwise returns x unchanged.
    """
    # The length guard keeps an empty string from raising IndexError and
    # keeps a lone '"' (whose first and last character are the same one)
    # from being collapsed into an empty string.
    if len(x) >= 2 and x[0] == '"' and x[-1] == '"':
        return x[1:-1]
    return x
def assignNewCurrent(c):
global current
if c not in keys:
keys.append(c)
current = c
d = performReplacements(c)
if d not in keys:
keys.append(d)
current = d
def readBannedLinesList(f):
lst = open(f,'r')
@@ -34,56 +46,91 @@ def readTerminalsList(f):
for kw in ' '.join(lst.readlines()).split():
knownTerminals.append(kw)
lst.close()
print knownTerminals
#print knownTerminals
for kw in knownTerminals:
if not kw.isalpha():
try:
punctuators[len(kw)-1].append(kw)
except IndexError,e:
print 'index error with',kw,len(kw)
punctuators.reverse()
print 'Punctuators:',punctuators
knownPostfixes = ('+','*','?')
knownReplacements = \
(
('opt',' OPTIONALITYMETASYMBOL'),
('','"-"')
('','"-"'),
('˜','"~"'),
('','fi'),
('[',' ['),
('(',' ('),
(']',' ]'),
(')',' )'),
)
oneof = False
def processline(line):
global oneof
global current
global oneof
rline = line.strip()
if rline == '':
return ''
if rline[-1]==':' and rline[-2].isalpha():
oneof = False
assignNewCurrent(rline[:-1])
# getting rid of leading stuff (perhaps labels)
assignNewCurrent(rline[:-1].split()[-1])
if current in grammar.keys():
#print 'Warning: double declaration of',current
double[current] = grammar[current][:]
grammar[current]=[]
grammar[current] = []
return
if rline.find('one of')>0:
oneof = True
assignNewCurrent(rline.replace('one of','').strip()[:-1])
assignNewCurrent(rline.replace('one of','').strip()[:-1].split()[-1])
if current in grammar.keys():
#print 'Warning: double declaration of',current,': the first one',grammar[current],'discarded'
double[current] = grammar[current][:]
grammar[current]=[]
grammar[current] = []
return
if oneof:
for t in processLineTokens(rline):
grammar[current].append(t)
for t in rline.split():
grammar[current].append(' '.join(processLineTokens(t)))
#for t in processLineTokens(rline):
# grammar[current].append(t)
else:
grammar[current].append(' '.join(processLineTokens(rline)))
return
def processLineTokens(rline):
iline = rline[:]
def performReplacements(line):
for x,y in knownReplacements:
iline = iline.replace(x,y)
line = line.replace(x,y)
return line
def processLineTokens(rline):
iline = performReplacements(rline)
tokens = iline.split()
for i in range(0,len(tokens)):
if tokens[i] in knownTerminals:
tokens[i] = '"'+tokens[i]+'"'
tokens[i] = quote(tokens[i])
tokens[i] = splitLeading(tokens[i],punctuators)
return tokens
def splitLeading(t,arrays):
    """Split a leading known symbol off the token t.

    arrays is a sequence of candidate lists, scanned in the given order.
    Whenever t starts with a candidate, t is rewritten as the quoted
    candidate, a space, and the space-joined result of running
    processLineTokens on the remainder. Scanning then continues with the
    rewritten t. Returns the (possibly rewritten) t.
    """
    for candidates in arrays:
        for prefix in candidates:
            if not t.startswith(prefix):
                continue
            head = quote(prefix)
            rest = processLineTokens(t[len(prefix):])
            t = head + ' ' + ' '.join(rest)
    return t
def splitTrailing(t,array):
    """Split a trailing known symbol off the token t.

    array is a sequence of candidate suffixes. Whenever t ends with a
    candidate, t is rewritten as the part before it, a space, and the
    quoted candidate. Scanning then continues with the rewritten t.
    Returns the (possibly rewritten) t.
    """
    for p in array:
        # endswith() tests the actual tail of the token. Comparing the
        # position of the first occurrence (str.find) with the expected
        # suffix position misses a suffix that also occurs earlier in the
        # token, e.g. '*' in 'a*b*'.
        # An empty candidate is skipped: it would match every token.
        if p and t.endswith(p):
            t = t[:-len(p)]+' '+quote(p)
    return t
def readLines(f):
print 'Reading the PDF lines...'
@@ -126,7 +173,7 @@ def massageGrammarRule(context,nt):
tokens = context[nt][i].split()
# special case: a postfix metasymbol (e.g., *) occurs at the beginning of the line
if tokens[0] in knownPostfixes:
tokens[0] = '"'+tokens[0]+'"'
tokens[0] = quote(tokens[0])
# special case: arithmetic operations versus context metasymbols
if len(tokens) == 3 and tokens[1] == '*' and tokens[0]+' "/" '+tokens[2] in context[nt]:
print 'A suspicious metasymbol * converted to an arithmetic operator'
@@ -143,9 +190,26 @@ def massageGrammarRule(context,nt):
# REPORTING undefined nonterminals
if tokens[j][0] != '"'\
and tokens[j] not in grammar.keys()\
and tokens[j] not in ignored\
and tokens[j] not in reported:
print 'Warning: nonterminal',tokens[j],'undefined!'
reported.append(tokens[j])
ts = splitLeading(tokens[j],[knownTerminals])
if unquote(ts.split()[-1]) in grammar.keys():
# false positive in nonterminal -> terminal conversion
tss = ts.split()
tss[-1] = unquote(tss[-1])
ts = ' '.join(tss)
if ts.find(' ')>-1 and (ts.split()[-1] in grammar.keys() or ts.split()[-1] in ignored):
print 'L-Splitting',tokens[j],'into',ts
tokens[j] = ts
else:
print 'NOT L-splitting',tokens[j],'into',ts
ts = splitTrailing(tokens[j],knownTerminals)
if ts.find(' ')>-1 and (ts.split()[0] in grammar.keys() or ts.split()[-1] in ignored):
print 'T-Splitting',tokens[j],'into',ts
tokens[j] = ts
else:
print 'Warning: nonterminal',tokens[j],'undefined, but used in',nt
reported.append(tokens[j])
#if tokens[j] not in knownNonterminals:
# tokens[j]='"'+tokens[j]+'"'
# nt2t += 1
@@ -0,0 +1,13 @@
# None of these targets names a file that the recipe produces, so they are
# declared phony to make them run regardless of what exists on disk.
.PHONY: build clean test

# Extract the grammar from the raw annex text, validate and normalise it,
# then pretty-print it as BNF; the intermediate extractor output is removed.
build:
	../../../../shared/tools/pdf2bgf iso-is-annex.txt iso-output.bgf banned-lines.lst keywords.lst
	../../../../shared/tools/checkxml bgf iso-output.bgf
	../../../../shared/tools/normbgf iso-output.bgf iso-is-grammar.bgf
	../../../../shared/tools/bgf2bnf iso-is-grammar.bgf iso-is.bnf
	rm -f iso-output.bgf

# Remove everything the build produces.
clean:
	rm -f iso-output.bgf iso-is-grammar.bgf iso-is.bnf

# Rebuild, then compare the result with extracted-grammar.bgf.
# The rebuild is expressed as a prerequisite instead of a nested bare
# `make build`, so command-line flags and variables carry over.
test: build
	../../../../shared/tools/gdts iso-is-grammar.bgf extracted-grammar.bgf
@@ -0,0 +1,32 @@
INTERNATIONAL STANDARD ISO/IEC 14882
First edition
1998-09-01
Programming languages — C++
Reference number
ISO/IEC 14882:1998(E)
© ISO/IEC 1999
All rights reserved. Unless otherwise specified, no part of this publication may be reproduced or utilized in any form or by any means, electronic
or mechanical, including photocopying and microfilm, without permission in writing from either ISO at the address below or ISO's member body
in the country of the requester.
ISO copyright office
Case postale 56 • CH-1211 Geneva 20
Tel. + 41 22 749 01 11
Fax + 41 22 734 10 79
E-mail copyright@iso.ch
Web www.iso.ch
Printed in Switzerland
Fixes that were necessary for extraction:
namespaceidentifier{namespace-body }
was changed to
namespaceidentifier {namespace-body }
(the extractor cannot be expected to split this into a sequence
of a terminal, a reserved nonterminal, a non-alphanum terminal and a defined nonterminal)
There are various layout problems (say, ";" formatted in italics),
but we don't even care because we copy-paste the raw text anyway.
@@ -0,0 +1,5 @@
14882:1998
A.
[gram.
67
68
Oops, something went wrong.

0 comments on commit 6cdf991

Please sign in to comment.