Skip to content

Commit

Permalink
extracting grammars from language documents and collecting them
Browse files Browse the repository at this point in the history
git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@789 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed May 6, 2010
1 parent 99013f6 commit dda6992
Show file tree
Hide file tree
Showing 20 changed files with 3,031 additions and 86 deletions.
12 changes: 9 additions & 3 deletions shared/tools/pdf2bgf
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ cd ../..
SLPS=${PWD}
cd ${LOCAL1}

# Validate the four mandatory arguments:
#   $1 = input txt (copy-pasted from a PDF)   $2 = output BGF
#   $3 = banned-lines list                    $4 = keywords list
if [ $# -ne 4 ]; then
	echo "This tool produces a BGF grammar from a grammar copy-pasted from a PDF"
	echo "Usage:"
	echo "	pdf2bgf <input-txt> <output-bgf> <banned-lines-list> <keywords-list>"
	exit 1
elif [ ! -r $1 ]; then
	echo "Oops: $1 not found or not readable."
	exit 1
elif [ ! -r $3 ]; then
	# was "$1" — copy-paste bug: this branch tests $3
	echo "Oops: $3 not found or not readable."
	exit 1
elif [ ! -r $4 ]; then
	# was "$1" — copy-paste bug: this branch tests $4
	echo "Oops: $4 not found or not readable."
	exit 1
else
	# txt -> LLL (with banned lines filtered and keywords quoted), then LLL -> BGF
	python ${SLPS}/topics/extraction/bnf2bgf/pdf2lll.py $1 intermediate.lll $3 $4
	python ${SLPS}/topics/extraction/bnf2bgf/lll2bgf.py intermediate.lll $2
	rm -f intermediate.lll
fi
3 changes: 2 additions & 1 deletion topics/extraction/bnf2bgf/lll2bgf.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def serialiseExpression(ts,debug):
return '\n\t'.join(s)

def serialiseFormula(name,tokens):
    """Serialise one production: nonterminal `name` defined by token list `tokens`.

    Returns a BGF XML <bgf:production> fragment; the expression part is
    delegated to serialiseExpression.
    """
    # Useful yet annoying
    #print 'Processing',name,'...'
    if name=='BREAKPOINT':
        # debugging aid: dump the raw token stream for this one nonterminal
        print(tokens)
    return '<bgf:production><nonterminal>'+name+'</nonterminal>'+serialiseExpression(tokens,True)+'</bgf:production>'
Expand Down
157 changes: 75 additions & 82 deletions topics/extraction/bnf2bgf/pdf2lll.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,75 +2,48 @@
# -*- coding: utf-8 -*-
import sys

# Counter of undefined nonterminals converted to terminals (reporting only).
nt2t = 0

lines = []      # input lines that survived the banned-lines filter
grammar = {}    # nonterminal -> list of alternative right-hand sides
double = {}     # repeated definitions of already-seen nonterminals (matched later)
current = ''    # nonterminal currently being defined
keys=[]         # nonterminals in original definition order
reported = ['identifier','keyword','literal']  # undefined nonterminals already warned about


# Formerly hard-coded; now loaded from a file given on the command line:
#bannedLines = ('44','45','46',"Annex A","SPECIFICATION","A.2.")
bannedLines = []       # fragments: any input line containing one is dropped
knownTerminals = []    # terminal symbols loaded from the keywords list file

def assignNewCurrent(c):
    """Make c the nonterminal currently being defined.

    A nonterminal seen for the first time is also recorded in the
    module-level `keys` list, preserving original definition order.
    """
    global current
    current = c
    if c in keys:
        return
    keys.append(c)

forbiddedLines = ('44','45','46',"Annex A","SPECIFICATION","A.2.")

knownNonterminals = ('identifier','literal','right-shift-assignment','right-shift','keyword')

knownTerminalsBefore = \
(
'.',',','++','--','-','!','~','/','%','??','?','<<',':','::',
'[',']','(',')','{','}','<','>',
'&','^','|','&&','||',
'+=','-=','*=','/=','%=','&=','|=','^=','<<=','<=','>=','==','!=','='
)
def readBannedLinesList(f, target=None):
    """Load banned-line fragments from file f, one per line; blank lines skipped.

    Fragments are appended to `target`, which defaults to the module-level
    `bannedLines` list (backward-compatible with the original one-argument call).
    """
    if target is None:
        target = bannedLines
    # `with` guarantees the file is closed even on error (original leaked on exception)
    with open(f, 'r') as lst:
        for line in lst:
            stripped = line.strip()
            if stripped:
                target.append(stripped)

def readTerminalsList(f, target=None):
    """Load whitespace-separated terminal symbols (keywords) from file f.

    Symbols are appended to `target`, which defaults to the module-level
    `knownTerminals` list (backward-compatible with the original call);
    the resulting list is echoed, as before.
    """
    if target is None:
        target = knownTerminals
    # `with` guarantees the file is closed even on error;
    # read().split() == ' '.join(readlines()).split() (split handles all whitespace)
    with open(f, 'r') as lst:
        for kw in lst.read().split():
            target.append(kw)
    print(target)

# Metasymbols that may follow an expression (e.g. x* , x+ , x?).
knownPostfixes = ('+','*','?')

# NOTE(review): the parentheses make this a plain string, not a 1-tuple
# (missing trailing comma); membership tests happen to work for single
# characters only — confirm whether a tuple was intended.
knownTerminalsAfter = \
(
';'
)
# * +

# Textual replacements applied to every line before tokenisation.
# NOTE(review): two entries rewrite 'opt'; the first match consumes all
# occurrences, leaving nothing for the second — confirm which is intended.
knownReplacements = \
(
('opt',' ?'),
('opt',' OPTIONALITYMETASYMBOL'),
('–','"-"')
)

# Terminal -> uppercase placeholder name.
# NOTE(review): no use of this table is visible in this chunk — verify callers.
screenedTerminals = \
(
(';','SEMICOLON'),
(':','COLON'),
('**','DOUBLESTAR'),
('*=','MULTIPLICATIONASSIGNMENT'),
('*','STAR'),
('++','DOUBLEPLUS'),
('+=','ADDITIONASSIGNMENT'),
('+','PLUS'),
('?','QUESTION'),
('(','LEFTPARENTHESIS'),
(')','RIGHTPARENTHESIS'),
('{','LEFTCURLYBRACKET'),
('}','RIGHTCURLYBRACKET')
)

# these special symbols get transformed into HTML entities
# NOTE(review): no use of this table is visible in this chunk — verify callers.
htmlEntities = \
(
('&','amp'),
('<','lt'),
('>','gt')
)

# Line-reader state machine:
fresh = 0
# 0 - the first production
# 1 - the first line of a production
# 2 - more lines in a production
# 3 - the first line in a one-of production
# 4 - more lines in a one-of production
oneof = False

def processline(line):
Expand Down Expand Up @@ -103,16 +76,12 @@ def processline(line):
return

def processLineTokens(rline, replacements=None, terminals=None):
    """Turn one raw grammar line into a token list, quoting known terminals.

    First applies the (from, to) pairs in `replacements` to the whole line,
    then splits on whitespace and wraps every token found in `terminals`
    in double quotes.

    `replacements` and `terminals` default to the module-level tables
    knownReplacements and knownTerminals, so the original one-argument
    call is unchanged.
    """
    if replacements is None:
        replacements = knownReplacements
    if terminals is None:
        terminals = knownTerminals
    iline = rline[:]
    for x, y in replacements:
        iline = iline.replace(x, y)
    tokens = iline.split()
    for i, tok in enumerate(tokens):
        if tok in terminals:
            tokens[i] = '"' + tok + '"'
    return tokens

Expand All @@ -123,7 +92,7 @@ def readLines(f):
for line in pdf.readlines():
cx += 1
include = True
for x in forbiddedLines:
for x in bannedLines:
if line.find(x)>-1:
include = False
if include:
Expand Down Expand Up @@ -151,34 +120,49 @@ def writeGrammar(f):
lll.write(';\n\n')
lll.close()

def massageGrammarRule(context,nt):
    """Post-process every alternative of nonterminal `nt` in `context`.

    `context` is a dict mapping nonterminals to lists of alternative
    right-hand sides (the module-level `grammar` or `double`).
    Rewrites context[nt] in place; warns once per undefined nonterminal.
    """
    # NOTE(review): nt2t is declared global but never modified here —
    # the conversion that incremented it is commented out below.
    global nt2t
    for i in range(0,len(context[nt])):
        tokens = context[nt][i].split()
        # special case: a postfix metasymbol (e.g., *) occurs in the beginning of the line
        if tokens[0] in knownPostfixes:
            tokens[0] = '"'+tokens[0]+'"'
        # special case: arithmetic operations versus context metasymbols —
        # "a * b" is an operator if the sibling alternative "a / b" exists
        if len(tokens) == 3 and tokens[1] == '*' and tokens[0]+' "/" '+tokens[2] in context[nt]:
            print 'A suspicious metasymbol * converted to an arithmetic operator'
            tokens[1] = '"*"'
        if len(tokens) == 3 and tokens[1] == '+' and tokens[0]+' "-" '+tokens[2] in context[nt]:
            print 'A suspicious metasymbol + converted to an arithmetic operator'
            tokens[1] = '"+"'
        for j in range(0,len(tokens)):
            # putting back the optionality metasymbol
            if tokens[j] == 'OPTIONALITYMETASYMBOL':
                tokens[j] = '?'
                continue
            # NOT converting undefined nonterminals to terminals
            # REPORTING undefined nonterminals (once each, tracked in `reported`)
            if tokens[j][0] != '"'\
            and tokens[j] not in grammar.keys()\
            and tokens[j] not in reported:
                print 'Warning: nonterminal',tokens[j],'undefined!'
                reported.append(tokens[j])
            #if tokens[j] not in knownNonterminals:
            # tokens[j]='"'+tokens[j]+'"'
            # nt2t += 1
        context[nt][i] = ' '.join(tokens)
    return

def massageGrammar():
#print len(keys),'vs',len(grammar.keys())
nt2t = 0
global nt2t
# massaging the main grammar
for nt in grammar.keys():
for i in range(0,len(grammar[nt])):
tokens = grammar[nt][i].split()
# special case: a postfix metasymbol (e.g., *) occurs in the beggining of the line
if tokens[0] in knownPostfixes:
tokens[0] = '"'+tokens[0]+'"'
# special case: arithmetic operations versus grammar metasymbols
if len(tokens) == 3 and tokens[1] == '*' and tokens[0]+' "/" '+tokens[2] in grammar[nt]:
print 'A suspicious metasymbol * converted to an arithmetic operator'
tokens[1] = '"*"'
if len(tokens) == 3 and tokens[1] == '+' and tokens[0]+' "-" '+tokens[2] in grammar[nt]:
print 'A suspicious metasymbol + converted to an arithmetic operator'
tokens[1] = '"+"'
# converting undefined nonterminals to terminals
for j in range(0,len(tokens)):
if tokens[j][0] != '"'\
and tokens[j] not in grammar.keys()\
and tokens[j] not in knownPostfixes:
#print 'Warning: nonterminal',tokens[j],'undefined!'
if tokens[j] not in knownNonterminals:
tokens[j]='"'+tokens[j]+'"'
nt2t += 1
grammar[nt][i] = ' '.join(tokens)
massageGrammarRule(grammar,nt)
# massaging the double rules (for matching purposes)
for nt in double.keys():
massageGrammarRule(double,nt)
if nt2t:
print 'Warning:',nt2t,'undefined nonterminals were converted to terminals.'
# matching double rules
for nt in double.keys():
if double[nt]!=grammar[nt]:
print 'Warning: double definition of',nt
Expand All @@ -188,17 +172,26 @@ def massageGrammar():
if s not in grammar[nt]:
grammar[nt].append(s)
print 'Opted for the union of them:',grammar[nt]
# add keywords!!!
if 'keyword' not in grammar.keys():
keys.append('keyword')
grammar['keyword'] = []
for kw in knownTerminals:
if kw.isalpha():
grammar['keyword'].append(kw)
return

if __name__ == "__main__":
print 'PDF (rather txt copy-pasted from a PDF) pre-processor: produces an LLL grammar suitable to be fed into an LLL2BGF extractor.'
if len(sys.argv) == 3:
if len(sys.argv) == 5:
readBannedLinesList(sys.argv[3])
readTerminalsList(sys.argv[4])
readLines(sys.argv[1])
readGrammar(lines)
massageGrammar()
writeGrammar(sys.argv[2])
sys.exit(0)
else:
print 'Usage:'
print ' ',sys.argv[0],'''<input-txt> <output-lll>'''
print ' ',sys.argv[0],'''<input-txt> <output-lll> <list-of-banned-lines> <list-of-known-keywords>'''
sys.exit(1)
6 changes: 6 additions & 0 deletions topics/grammars/csharp/ecma-334-2005/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
all:
../../../../shared/tools/pdf2bgf ecma-334-annex.txt ecma-output.bgf banned-lines.lst keywords.lst
../../../../shared/tools/checkxml bgf ecma-output.bgf
../../../../shared/tools/normbgf ecma-output.bgf ecma-334-grammar.bgf
../../../../shared/tools/bgf2bnf ecma-334-grammar.bgf ecma-334.bnf
rm -f ecma-output.bgf
9 changes: 9 additions & 0 deletions topics/grammars/csharp/ecma-334-2005/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Errors spotted right away:

page 467 of C# pdf
additive-expression – multiplicative-expressionshift-expression:
(proper newline added manually to ecma-334-annex.txt)

page 458 of C# pdf
the definition of keyword lacks: get set yield add remove alias partial where
(added manually to keywords.lst)
6 changes: 6 additions & 0 deletions topics/grammars/csharp/ecma-334-2005/banned-lines.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
44
45
46
Annex A
SPECIFICATION
A.2.
Loading

0 comments on commit dda6992

Please sign in to comment.