-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.py
140 lines (127 loc) · 5.41 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
#from parser import *
from read_table import *
''' The LEXER always takes a STRING as INPUT.
When the PARSER communicates with the LEXER, it must first transform TOKENS (LISTS) into STRINGS.
Conversely, the PARSER only takes TOKENS (LISTS) as INPUT.
TOKENS have no TRUTH VALUE; it is the job of the PARSER to assess it.
'''
class Token:
    """A single lexical unit produced by the lexer.

    Attributes set by the constructor:
        category: category tag of the token. This file compares it against
            'SUR', 'OP', 'META' and 'STR'; the default is the empty string.
        symbol: the source text of the token.
        length_symbol: ``len(symbol)``, computed once at construction time.
        prop: list of raw properties attached to the symbol.
        index: integer index of the token; ``TOKENIZE`` overwrites it with
            the token's position in the returned chain.

    Further attributes (``name``, ``action``, ``operands_place``,
    ``precedence``, ``operands``) are added afterwards by
    ``attribute_properties`` and its helpers.
    """

    def __init__(self, category: str = "", symbol: str = "",
                 prop: "list | None" = None, index: int = 0):
        """Build a token.

        Args:
            category: category tag of the token.
            symbol: source text of the token.
            prop: raw property list; when omitted (or ``None``) a fresh
                ``[""]`` list is created for this instance.
            index: initial index of the token.
        """
        self.category = category
        self.symbol = symbol
        self.length_symbol = len(symbol)
        # A literal ``[""]`` default is evaluated once and would be shared by
        # every Token created without ``prop``, so mutating one token's list
        # would silently change all the others. Build one list per instance.
        self.prop = [""] if prop is None else prop
        self.index = index
def get_category(token):
    """Return the ``category`` attribute of ``token``."""
    category = token.category
    return category
def get_symbol(token):
    """Return the ``symbol`` attribute (the source text) of ``token``."""
    symbol = token.symbol
    return symbol
def get_index(token):
    """Return the ``index`` attribute of ``token``."""
    position = token.index
    return position
def get_name(token):
    """Return the ``name`` attribute of ``token``.

    ``name`` is not set by ``Token.__init__``; in this file it is assigned by
    ``attribute_properties``. Reading it on a token that has no ``name``
    raises ``AttributeError``.
    """
    name = token.name
    return name
def is_surrounder(token):
    """Return True when ``token`` belongs to the 'SUR' category."""
    category = token.category
    return category == 'SUR'
def is_surrounder_open(token):
    """Return True when ``token`` is a surrounder whose action is 'open'.

    Non-surrounder tokens yield False without their ``action`` attribute
    being read.
    """
    if not is_surrounder(token):
        return False
    return token.action == 'open'
def is_surrounder_close(token):
    """Return True when ``token`` is a surrounder whose action is 'close'.

    Non-surrounder tokens yield False without their ``action`` attribute
    being read.
    """
    if not is_surrounder(token):
        return False
    return token.action == 'close'
def is_surrounder_wrap(token):
    """Return True when ``token`` is a surrounder whose action is 'wrap'.

    Non-surrounder tokens yield False without their ``action`` attribute
    being read.
    """
    if not is_surrounder(token):
        return False
    return token.action == 'wrap'
def is_operator(token):
    """Return True when ``token`` belongs to the 'OP' category."""
    category = token.category
    return category == 'OP'
def is_operator_unary(token):
    """Return True when ``token.operands_place`` has exactly one entry.

    ``operands_place`` is read unconditionally, so a token that never
    received it raises ``AttributeError``.
    """
    operand_count = len(token.operands_place)
    return operand_count == 1
def is_operator_binary(token):
    """Return True when ``token.operands_place`` has exactly two entries.

    ``operands_place`` is read unconditionally, so a token that never
    received it raises ``AttributeError``.
    """
    operand_count = len(token.operands_place)
    return operand_count == 2
def is_operator_terary(token):
    """Return True when ``token.operands_place`` has exactly three entries.

    The function name is spelled 'terary' (sic); it is the existing public
    name and is kept as-is. ``operands_place`` is read unconditionally, so a
    token that never received it raises ``AttributeError``.
    """
    operand_count = len(token.operands_place)
    return operand_count == 3
def is_operator_prop_id(token):
    """Return True when the token's ``name`` is exactly 'PROP_ID'.

    Only ``name`` is inspected; the token's category is not checked.
    """
    name = token.name
    return name == 'PROP_ID'
def get_operator_operands_position(token):
    """Return the ``operands_place`` attribute of ``token`` unchanged.

    No category check is performed; a token without ``operands_place``
    raises ``AttributeError``.
    """
    placement = token.operands_place
    return placement
def get_operator_precedence(token):
    """Return the precedence of an operator token, or False otherwise.

    For tokens of the operator category the ``precedence`` attribute is
    returned as stored; every other token yields the literal ``False``.
    """
    if not is_operator(token):
        return False
    return token.precedence
def get_operands(token):
    """Return the operands list of an operator token, or False otherwise.

    For tokens of the operator category the ``operands`` attribute is
    returned as stored (the same object, not a copy); every other token
    yields the literal ``False``.
    """
    if not is_operator(token):
        return False
    return token.operands
def is_meta(token):
    """Return True when ``token`` belongs to the 'META' category."""
    category = token.category
    return category == 'META'
def is_string(token):
    """Return True when ``token`` belongs to the 'STR' category."""
    category = token.category
    return category == 'STR'
def attribute_properties(token: Token):
    """Expand the raw ``prop`` list of ``token`` into named attributes.

    ``token.name`` is always taken from ``token.prop[0]``. Surrounder and
    operator tokens are then passed to their category-specific helper, which
    adds the remaining attributes in place. META and STR tokens receive no
    extra attributes: no helper is wired in for them.
    """
    token.name = token.prop[0]

    # (category predicate, helper that fills in that category's attributes)
    handlers = (
        (is_surrounder, attribute_surrounders_properties),
        (is_operator, attribute_operators_properties),
    )
    for applies_to, fill_in in handlers:
        if applies_to(token):
            fill_in(token)
def attribute_surrounders_properties(token: Token):
    """Set ``token.action`` from the second entry of ``token.prop``.

    The surrounder predicates in this file compare ``action`` against
    'open', 'close' and 'wrap'.
    """
    raw_properties = token.prop
    token.action = raw_properties[1]
def attribute_operators_properties(token: Token):
    """Set the operator-specific attributes of ``token`` in place.

    * ``operands_place`` is ``token.prop[1]``, stored as-is.
    * ``precedence`` is ``token.prop[2]`` converted with ``int``.
    * ``operands`` starts out as a new empty list.
    """
    raw_properties = token.prop
    token.operands_place = raw_properties[1]
    token.precedence = int(raw_properties[2])
    token.operands = []
def TOKENIZE(INPUT: str) -> list:
    """Split ``INPUT`` into a list of ``Token`` objects.

    The input is stripped of surrounding whitespace and then consumed from
    the left:

    * a leading ``NULL`` separator is dropped;
    * otherwise the longest symbol from the ``SYMBOLS`` table that prefixes
      the remaining input becomes a token (on equal length, the category
      listed first in ``SYMBOLS['CATEGORIES']`` wins);
    * otherwise ``tokenize_string`` builds a 'STR' token from the leading
      characters.

    Every token is passed through ``attribute_properties`` before being
    appended, and once the input is exhausted each token's ``index`` is set
    to its position in the returned list.

    Args:
        INPUT: the text to tokenize.

    Returns:
        The list of tokens. An input that is empty after stripping yields a
        one-element list holding a default ``Token()``.
    """
    INPUT = INPUT.strip()
    if INPUT == '':
        return [Token()]

    CHAIN = []
    while INPUT:
        if INPUT.startswith(NULL):
            INPUT = INPUT[len(NULL):]
            continue

        # Longest-match search. Only a strictly longer match replaces the
        # current best, so the result does not depend on the order in which
        # a category's symbols are visited -- no per-position sorting is
        # needed. Zero-length symbols can never be selected (best starts at 0).
        chosen_symbol = None
        category = None
        max_len = 0
        for candidate_category in SYMBOLS['CATEGORIES']:
            for potential_symbol in SYMBOLS[candidate_category]:
                if (len(potential_symbol) > max_len
                        and INPUT.startswith(potential_symbol)):
                    max_len = len(potential_symbol)
                    chosen_symbol = potential_symbol
                    category = candidate_category

        if chosen_symbol is not None:
            cur_token = tokenize_general(chosen_symbol, category)
            consumed = max_len
        else:
            # No table symbol starts here: read a free-form string token.
            cur_token = tokenize_string(INPUT, CHAIN)
            consumed = cur_token.length_symbol

        attribute_properties(cur_token)
        CHAIN.append(cur_token)
        INPUT = INPUT[consumed:]

    # Tokens are created with a placeholder index; replace it with the
    # token's actual position in the chain.
    for position, token in enumerate(CHAIN):
        token.index = position
    return CHAIN
def tokenize_general(SYMBOL, CATEGORY):
    """Build the ``Token`` for a symbol that is listed in the ``SYMBOLS`` table.

    The symbol's position inside ``SYMBOLS[CATEGORY]`` selects one entry from
    each list named in ``SYMBOLS['SUBCATEGORIES']`` for that category; those
    entries, in subcategory order, become the token's ``prop`` list.

    Args:
        SYMBOL: the matched symbol; must be present in ``SYMBOLS[CATEGORY]``.
        CATEGORY: the category name; must be present in
            ``SYMBOLS['CATEGORIES']``.

    Returns:
        A ``Token`` with the given category and symbol, the collected
        properties, and index 0.

    Raises:
        ValueError: from ``list.index`` when the category or the symbol is
            not found in the table.
    """
    category_position = SYMBOLS['CATEGORIES'].index(CATEGORY)
    symbol_position = SYMBOLS[CATEGORY].index(SYMBOL)
    properties = [
        SYMBOLS[subcategory][symbol_position]
        for subcategory in SYMBOLS['SUBCATEGORIES'][category_position]
    ]
    # The index is a placeholder here.
    return Token(CATEGORY, SYMBOL, properties, 0)
def tokenize_string(INPUT, CHAIN):
    """Read a free-form 'STR' token from the start of ``INPUT``.

    Two modes, chosen from the last token already in ``CHAIN``:

    * Quoting -- the last token is a 'SUR' token whose ``prop[1]`` is
      'wrap'. Characters are taken until that same surrounder symbol shows
      up again (provided it is listed in the ``SYMBOLS`` table); ``NULL``
      separators and every other symbol are kept as part of the string.
    * Plain -- characters are taken until a ``NULL`` separator or any symbol
      of the ``SYMBOLS`` table starts at the current position.

    In both modes reading also ends when ``INPUT`` is exhausted.

    Args:
        INPUT: remaining text to read from (assumed to be a ``str``).
        CHAIN: tokens produced so far; only its last element is inspected.

    Returns:
        A ``Token`` of category 'STR' whose symbol is the text read, whose
        ``prop`` is ``['variable', len(text)]`` and whose index is 0.
    """
    quoting = False
    quoting_symbol = None
    if CHAIN:
        previous = CHAIN[-1]
        if previous.category == 'SUR' and previous.prop[1] == 'wrap':
            quoting = True
            quoting_symbol = previous.symbol

    # Symbols that end the string. They are collected once instead of
    # re-sorting every symbol list for every character: the order in which
    # symbols are tried cannot change whether one of them matches.
    # Table symbols are assumed to be strings.
    stop_symbols = tuple(
        symbol
        for category in SYMBOLS['CATEGORIES']
        for symbol in SYMBOLS[category]
        if not quoting or symbol == quoting_symbol
    )

    # Scan with an index and slice once at the end, rather than growing the
    # result and re-slicing INPUT one character at a time.
    end = 0
    total = len(INPUT)
    while end < total:
        if not quoting and INPUT.startswith(NULL, end):
            break
        if INPUT.startswith(stop_symbols, end):
            break
        end += 1

    string = INPUT[:end]
    return Token("STR", string, ['variable', len(string)], 0)