Allow returning errors from lexer
exellentcoin26 committed Sep 25, 2023
1 parent 041dcdf commit 088c57e
Showing 4 changed files with 94 additions and 24 deletions.
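
In short, the lexer's iterator now yields LexResult<Token<TokenKind>> instead of bare Token<TokenKind> values, so input that the state machine cannot identify is reported as a LexError carrying the offending span rather than silently ending iteration. Below is a minimal sketch of how a caller might consume the new interface, mirroring the setup used in the tests further down; the pango_lexer::Lexer import path and the Kind enum are assumptions for illustration, not part of this diff.

use pango_lexer::Lexer; // assumed re-export path; the diff only shows the crate-internal modules

// Hypothetical token kind, mirroring the `Foo` enum used in the crate's own tests.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
enum Kind {
    #[default]
    A,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same setup as the new `token_error` test: a single "aa" token over the input
    // "aaa", which leaves a trailing "a" the lexer cannot identify.
    let tokens = Lexer::builder()
        .with_token_unit("aa", Kind::A)?
        .tokenize("aaa");

    let mut matched = 0usize;
    for result in tokens {
        match result {
            // Each item is now a `LexResult<Token<Kind>>` rather than a bare token.
            Ok(_token) => matched += 1,
            // `LexError` implements `Display` and `std::error::Error`; it prints the
            // (start, end) span that could not be identified.
            Err(err) => eprintln!("{err}"),
        }
    }
    println!("matched {matched} token(s)");
    Ok(())
}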
17 changes: 17 additions & 0 deletions pango-lexer/src/lexer/error.rs
@@ -0,0 +1,17 @@
pub type LexResult<T> = Result<T, LexError>;

#[derive(Debug)]
#[cfg_attr(test, derive(PartialEq, Eq))]
pub struct LexError(pub(super) usize, pub(super) usize);

impl std::fmt::Display for LexError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
writeln!(
f,
"[ERROR] ({}, {}): Lexer could not identify token",
self.0, self.1
)
}
}

impl std::error::Error for LexError {}
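
For reference, this is what the Display implementation above produces, written as a hypothetical unit test placed alongside the type (the span fields are pub(super), so a LexError can only be constructed inside the lexer module). The test is a sketch, not part of the commit.

#[cfg(test)]
mod display_tests {
    use super::LexError;

    #[test]
    fn lex_error_display() {
        let err = LexError(2, 3);
        // `writeln!` is used in the impl, so the rendered message ends with a newline.
        assert_eq!(
            err.to_string(),
            "[ERROR] (2, 3): Lexer could not identify token\n"
        );
    }
}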
8 changes: 6 additions & 2 deletions pango-lexer/src/lexer/input.rs
@@ -31,17 +31,21 @@ impl Default for InputIterToken

impl InputIterToken {
/// Creates a new [`InputIterToken`].
pub(self) fn new(token_start: usize) -> Self {
fn new(token_start: usize) -> Self {
Self {
source: String::new(),
pos: (token_start, token_start),
}
}

/// Updates the end position of the token.
pub(self) fn update_pos_end(&mut self, new_end: usize) {
fn update_pos_end(&mut self, new_end: usize) {
self.pos.1 = new_end;
}

pub(super) fn is_empty(&self) -> bool {
self.source.is_empty()
}
}

impl Iterator for InputIter<'_> {
86 changes: 65 additions & 21 deletions pango-lexer/src/lexer/mod.rs
@@ -1,11 +1,15 @@
use self::input::{InputIter, InputIterToken};
use self::{
error::{LexError, LexResult},
input::{InputIter, InputIterToken},
};
use crate::{
fsm::{NDSimulate, Nfa, NfaCompiler, Simulatable, Simulate, StateId},
regex::{self, parser::error::ParseResult},
};

use std::collections::BTreeMap;
use std::{collections::BTreeMap, iter::FusedIterator};

mod error;
mod input;

/// Finite-state machine based lexer.
@@ -63,7 +67,7 @@ where
for<'a> Fsm: Simulatable + 'a,
for<'a> Fsm::Simulator<'a>: NDSimulate,
{
type Item = Token<TokenKind>;
type Item = LexResult<Token<TokenKind>>;

fn next(&mut self) -> Option<Self::Item> {
let mut sim = self.fsm.to_simulator();
@@ -73,8 +77,16 @@ where
if !sim.can_feed(ch) {
// the current token is the longest token we can 'munch'

return token_kind.map(|token_kind| {
Token::from_input_iter_token(self.iter.consume_token(), token_kind)
return Some(match token_kind {
Some(token_kind) => Ok(Token::from_input_iter_token(
self.iter.consume_token(),
token_kind,
)),
None => {
// tokenizer could not identify the token
let token = self.iter.get_token();
Err(LexError(token.pos.0, token.pos.1))
}
});
}

@@ -85,7 +97,7 @@ where

// Get the first final state the Simulator is in.
//
// Note: this assumes that the lower final state ids, have the highest
// NOTE: this assumes that the lower final state ids, have the highest
// precedence. This is enforced by the order of expression
// compilation.
let final_state = sim
@@ -112,14 +124,33 @@ where

if sim.is_accepting() {
token_kind.map(|token_kind| {
Token::from_input_iter_token(self.iter.consume_token(), token_kind)
Ok(Token::from_input_iter_token(
self.iter.consume_token(),
token_kind,
))
})
} else {
None
// check if characters are remaining
self.iter.accept_suffix();
let token = self.iter.get_token();

if token.is_empty() {
None
} else {
let token = self.iter.consume_token();
Some(Err(LexError(token.pos.0, token.pos.1)))
}
}
}
}

impl<TokenKind, Fsm> FusedIterator for Lexer<'_, TokenKind, Fsm>
where
for<'a> Fsm: Simulatable + 'a,
for<'a> Fsm::Simulator<'a>: NDSimulate,
{
}

impl<TokenKind> Lexer<'_, TokenKind, Nfa> {
/// Creates a `LexerGenerator`.
pub fn builder() -> LexerGenerator<TokenKind> {
@@ -247,7 +278,7 @@ mod tests
macro_rules! assert_eq_tokens {
($lhs:expr, $rhs:expr) => {
for (expected, actual) in $lhs.into_iter().zip($rhs) {
assert_eq!(expected, actual);
assert_eq!(Ok(expected), actual);
}
};
}
@@ -266,22 +297,24 @@
};
}

use crate::lexer::error::LexError;

use super::{Lexer, Token};

#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
enum Foo {
#[default]
A,
B,
C,
D {
len: usize,
},
E,
}

#[test]
fn lexer() -> Result<(), Box<dyn std::error::Error>> {
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
enum Foo {
#[default]
A,
B,
C,
D {
len: usize,
},
E,
}

let tokens = Lexer::builder()
.with_token_unit("aaaa", Foo::A)?
.with_token_unit("b{4,}", Foo::B)?
@@ -308,4 +341,15 @@

Ok(())
}

#[test]
fn token_error() -> Result<(), Box<dyn std::error::Error>> {
let tokens = Lexer::builder()
.with_token_unit("aa", Foo::A)?
.tokenize("aaa");

assert_eq!(tokens.last(), Some(Err(LexError(2, 3))));

Ok(())
}
}
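
Taken together, the changes above mean that leftover input the state machine cannot match is drained via accept_suffix and surfaced as a single LexError spanning the unidentified suffix, after which the (now FusedIterator) lexer keeps returning None. A sketch of the full item sequence for the token_error scenario, written as an extra test that could live in the same tests module; it is illustrative only and not part of the commit.

#[test]
fn token_error_sequence() -> Result<(), Box<dyn std::error::Error>> {
    // Relies on the `Lexer`, `LexError`, and `Foo` items already in scope in this module.
    let mut tokens = Lexer::builder()
        .with_token_unit("aa", Foo::A)?
        .tokenize("aaa");

    // "aa" is the longest munch, so the first item is an ordinary token...
    assert!(matches!(tokens.next(), Some(Ok(_))));
    // ...and the trailing "a" cannot be identified, yielding an error for positions 2..3.
    assert_eq!(tokens.next(), Some(Err(LexError(2, 3))));
    // With the `FusedIterator` impl, an exhausted lexer keeps returning `None`.
    assert_eq!(tokens.next(), None);
    assert_eq!(tokens.next(), None);
    Ok(())
}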
7 changes: 6 additions & 1 deletion pango-parser/src/parser/slr.rs
@@ -55,7 +55,12 @@ where
state: current_state,
}]);

while let Some(token) = next_token.or_else(|| input.next()) {
while let Some(token) = next_token.map(Ok).or_else(|| input.next()) {
let token = match token {
Ok(token) => token,
Err(_) => return Err(()),
};

next_token = Some(token);
let (parse_node, accept) =
self.handle_token(&mut next_token, &mut stack, current_state)?;
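
On the parser side, the new lexer error is simply collapsed into the parser's existing Err(()); the positional information in the LexError is discarded. Assuming the surrounding function keeps returning that unit error type, the same conversion could be written more tersely, for example:

// Equivalent to the `match` above: drop the `LexError` details and
// propagate the parser's unit error instead.
let token = token.map_err(|_| ())?;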
