Merge pull request #4 from exellentcoin26/slr-parser-generator
Implement SLR parser generator
exellentcoin26 committed Sep 23, 2023
2 parents b995e4f + 858203f commit 0fec398
Showing 18 changed files with 1,407 additions and 253 deletions.
2 changes: 1 addition & 1 deletion .github/codecov.yml
@@ -1,6 +1,6 @@
# ref: https://docs.codecov.com/docs/codecovyml-reference
coverage:
range: 75..100
range: 75..90
round: down
precision: 1
status:
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -11,3 +11,5 @@ members = [
]

[dependencies]
dyn-clone = "1"
pango-lexer = {path = "./pango-lexer"}
93 changes: 58 additions & 35 deletions pango-lexer/src/lexer/mod.rs
@@ -9,7 +9,7 @@ use std::collections::BTreeMap;
mod input;

/// Finite-state machine based lexer.
pub struct Lexer<'input, TokenKind, Fsm: Simulatable> {
pub struct Lexer<'input, TokenKind, Fsm: Simulatable = Nfa> {
/// Input to be lexed.
pub input: &'input str,
/// Iterator over the input.
@@ -22,7 +22,8 @@ pub struct Lexer<'input, TokenKind, Fsm: Simulatable> {
tokens: BTreeMap<StateId, TokenKindGenerator<TokenKind>>,
}
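
The `= Nfa` default above means downstream code can name a `Lexer` without spelling out the FSM type. A self-contained sketch of the language feature this relies on; the names here are illustrative, not the crate's:

```rust
/// Minimal analogue of a defaulted type parameter: callers may write
/// `Holder<u8>` instead of `Holder<u8, DefaultEngine>`.
struct DefaultEngine;

struct Holder<T, E = DefaultEngine> {
    value: T,
    engine: E,
}

fn main() {
    // The second type parameter defaults to `DefaultEngine`.
    let h: Holder<u8> = Holder { value: 1, engine: DefaultEngine };
    println!("{}", h.value);
}
```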

/// `TokenKind`s can be generated based on the token or can just be cloned.
/// `TokenKind`s can be generated based on the token or can just be created
/// (e.g., unit structs).
///
/// This is useful when some parsing of the token's internal structure is
/// needed, for example, converting to [SI units](https://en.wikipedia.org/wiki/International_System_of_Units). In Rust,
@@ -31,29 +32,23 @@ pub struct Lexer<'input, TokenKind, Fsm: Simulatable> {
enum TokenKindGenerator<TokenKind> {
/// Generate a `TokenKind` based on the token source.
Map(Box<dyn FnMut(&str) -> TokenKind>),
/// Clone the `TokenKind`.
Unit(TokenKind),
/// Create the `TokenKind`.
Unit(Box<dyn FnMut() -> TokenKind>),
}
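
A minimal sketch of the two generator flavors described in the doc comment above, outside the crate's API; the `Kind` enum and the `"12m"` token format are hypothetical:

```rust
enum Kind {
    Plus,
    Length { meters: f64 },
}

fn main() {
    // Unit-style: the kind needs nothing from the matched text.
    let mut unit_gen: Box<dyn FnMut() -> Kind> = Box::new(|| Kind::Plus);

    // Map-style: parse the matched source, e.g. "12m" -> 12.0 meters.
    let mut map_gen: Box<dyn FnMut(&str) -> Kind> = Box::new(|src| Kind::Length {
        meters: src.trim_end_matches('m').parse().unwrap_or(0.0),
    });

    let _plus = unit_gen();
    let _length = map_gen("12m");
}
```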

/// [`Token`] returned by the [`Lexer`].
#[derive(Debug, PartialEq, Eq)]
pub struct Token<TokenKind>
where
TokenKind: Clone,
{
pub struct Token<TokenKind> {
/// Kind of token configured by the user.
kind: TokenKind,
pub kind: TokenKind,
/// Source string representation of the token.
source: String,
pub source: String,
/// Position of the token in the input. The end points one position beyond
/// the end of the token.
pos: (usize, usize),
pub pos: (usize, usize),
}
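
The `pos` field thus behaves like a half-open range, so `&input[start..end]` recovers the token's source. A tiny sketch under that assumption:

```rust
fn main() {
    let input = "bbbbaaaa";
    // A token covering the leading run of 'b's: start 0, end one past it.
    let (start, end) = (0usize, 4usize);
    assert_eq!(&input[start..end], "bbbb");
}
```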

impl<TokenKind> Token<TokenKind>
where
TokenKind: Clone,
{
impl<TokenKind> Token<TokenKind> {
/// Creates a new [`Token`] from the [`InputIterToken`].
fn from_input_iter_token(
InputIterToken { source, pos }: InputIterToken,
@@ -67,7 +62,6 @@ impl<TokenKind, Fsm> Iterator for Lexer<'_, TokenKind, Fsm>
where
for<'a> Fsm: Simulatable + 'a,
for<'a> Fsm::Simulator<'a>: NDSimulate,
TokenKind: Clone,
{
type Item = Token<TokenKind>;

@@ -110,7 +104,7 @@ where
token_kind_gen(&self.iter.get_token().source)
}

TokenKindGenerator::Unit(token_kind) => token_kind.clone(),
TokenKindGenerator::Unit(token_kind_gen) => token_kind_gen(),
},
);
}
@@ -154,14 +148,38 @@ impl<TokenKind> LexerGenerator<TokenKind> {
Self::default()
}

/// Adds a single token to the [`Lexer`]. The given function is called to
/// produce the token kind; it does not receive the token's source
/// representation. If you need that, use [`with_token_map`] instead.
///
/// # Fails
///
/// When the provided `token` is invalid regex.
///
/// [`with_token_map`]: Self::with_token_map
#[inline]
pub fn with_token(mut self, token: &str, token_kind: fn() -> TokenKind) -> ParseResult<Self>
where
TokenKind: 'static,
{
self.add_token(token, TokenKindGenerator::Unit(Box::new(token_kind)))?;
Ok(self)
}

/// Adds a single token to the [`Lexer`], yielding the given `token_kind`
/// value for every match.
///
/// # Fails
///
/// When the provided `token` is invalid regex.
#[inline]
pub fn with_token(mut self, token: &str, token_kind: TokenKind) -> ParseResult<Self> {
self.add_token(token, TokenKindGenerator::Unit(token_kind))?;
pub fn with_token_unit(mut self, token: &str, token_kind: TokenKind) -> ParseResult<Self>
where
TokenKind: Copy + 'static,
{
self.add_token(
token,
TokenKindGenerator::Unit(Box::new(move || token_kind)),
)?;
Ok(self)
}
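
A hedged usage sketch of the two builder methods above, mirroring the call style shown in the tests further down; the `Tok` enum is hypothetical and the import assumes the `pango-lexer` package name from Cargo.toml:

```rust
use pango_lexer::Lexer;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Tok {
    A,
    B,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tokens = Lexer::builder()
        // `with_token` takes a plain `fn() -> TokenKind`.
        .with_token("a{4}", || Tok::A)?
        // `with_token_unit` stores the value itself, hence the `Copy` bound:
        // the captured value is returned by value on every match.
        .with_token_unit("b{4}", Tok::B)?
        .tokenize("aaaabbbb");
    for token in tokens {
        println!("{:?}", token);
    }
    Ok(())
}
```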

@@ -176,9 +194,13 @@
pub fn with_token_map(
mut self,
token: &str,
token_kind_map: Box<dyn FnMut(&str) -> TokenKind>,
) -> ParseResult<Self> {
self.add_token(token, TokenKindGenerator::Map(token_kind_map))?;
// token_kind_map: Box<dyn FnMut(&str) -> TokenKind>,
token_kind_map: fn(&str) -> TokenKind,
) -> ParseResult<Self>
where
TokenKind: 'static,
{
self.add_token(token, TokenKindGenerator::Map(Box::new(token_kind_map)))?;
Ok(self)
}
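
The switch from `Box<dyn FnMut(&str) -> TokenKind>` to a plain `fn(&str) -> TokenKind` works because non-capturing closures coerce to function pointers; capturing closures no longer fit. A self-contained illustration:

```rust
fn takes_fn_ptr(f: fn(&str) -> usize) -> usize {
    f("dddd")
}

fn main() {
    // OK: a non-capturing closure coerces to `fn(&str) -> usize`.
    assert_eq!(takes_fn_ptr(|s| s.len()), 4);

    // A capturing closure would not coerce and would fail to compile:
    // let base = 10;
    // takes_fn_ptr(|s| s.len() + base);
}
```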

@@ -248,27 +270,28 @@ mod tests {

#[test]
fn lexer() -> Result<(), Box<dyn std::error::Error>> {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
enum Foo {
#[default]
A,
B,
C,
D { len: usize },
D {
len: usize,
},
E,
}

let tokens = Lexer::builder()
.with_token("aaaa", Foo::A)?
.with_token("b{4,}", Foo::B)?
.with_token("b{4}", Foo::C)?
.with_token_map(
"d*",
Box::new(|token_source| Foo::D {
len: token_source.len(),
}),
)?
.with_token(r"/\* .*", Foo::A)?
.with_token(r"/\* .* \*/", Foo::E)?
.with_token_unit("aaaa", Foo::A)?
.with_token_unit("b{4,}", Foo::B)?
.with_token_unit("b{4}", Foo::C)?
.with_token_map("d*", |token_source| Foo::D {
len: token_source.len(),
})?
.with_token_unit(r"/\* .*", Foo::A)?
.with_token_unit(r"/\* .* \*/", Foo::E)?
.with_token(r"a", Default::default)?
.tokenize(
"bbbbbbbaaaabbbbddddddddddddddddd/* foo bar baz **** * /* foo bar baz ***** */",
);
2 changes: 1 addition & 1 deletion pango-lexer/src/lib.rs
@@ -1,4 +1,4 @@
pub use lexer::Lexer;
pub use lexer::{Lexer, Token};

mod fsm;
mod iter;
19 changes: 8 additions & 11 deletions pango-lexer/src/main.rs
@@ -11,17 +11,14 @@ enum Foo {

fn main() -> Result<(), Box<dyn std::error::Error>> {
let lexer = Lexer::builder()
.with_token("aaaa", Foo::A)?
.with_token("b{4,}", Foo::B)?
.with_token("b{4}", Foo::C)?
.with_token_map(
"d*",
Box::new(|token_source| Foo::D {
len: token_source.len(),
}),
)?
.with_token(r"/\* .*", Foo::A)?
.with_token(r"/\* .* \*/", Foo::E)?;
.with_token_unit("aaaa", Foo::A)?
.with_token_unit("b{4,}", Foo::B)?
.with_token_unit("b{4}", Foo::C)?
.with_token_map("d*", |token_source| Foo::D {
len: token_source.len(),
})?
.with_token_unit(r"/\* .*", Foo::A)?
.with_token_unit(r"/\* .* \*/", Foo::E)?;

for token in lexer.tokenize("bbbbbbbaaaabbbbddddddddddddddddd/* foo bar baz **** *") {
println!("{:?}", token);
7 changes: 4 additions & 3 deletions src/cfsm/dot.rs
@@ -1,13 +1,12 @@
use super::{item::ItemBody, state::State, Cfsm, Grammar};
use super::{item::ItemBody, state::State, Cfsm};
use crate::Symbol;

use std::fmt::{Debug, Display};

impl<V, T> Cfsm<'_, V, T>
impl<V, T> Cfsm<V, T>
where
V: Copy + Debug,
T: Debug,
Grammar<V, T>: Clone,
{
/// Converts the CFSM to the [graphviz](https://graphviz.org/docs/layouts/dot/)
/// dot language format.
@@ -49,6 +48,8 @@ where
id, transitions, ..
}| {
transitions.iter().map(|(symbol, state)| {
let symbol = symbol.as_ref();

format!(
"{} -> {} [label = \"{}\"]",
*id,
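The edge formatting above produces Graphviz dot lines such as `0 -> 1 [label = "S"]`. A standalone sketch of the assembly; the state ids and labels are made up, and this is not the crate's actual output:

```rust
fn main() {
    let transitions = [(0usize, 1usize, "S"), (0, 2, "a")];
    let edges = transitions
        .iter()
        .map(|(from, to, label)| format!("{} -> {} [label = \"{}\"]", from, to, label))
        .collect::<Vec<_>>()
        .join("\n");
    // Wrapping the edges in a digraph yields valid dot input.
    println!("digraph cfsm {{\n{}\n}}", edges);
}
```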
24 changes: 12 additions & 12 deletions src/cfsm/item.rs
@@ -1,17 +1,17 @@
use crate::{Body, Grammar, Symbol};

use std::{
collections::{HashMap, HashSet, VecDeque},
fmt::Debug,
hash::Hash,
ptr::NonNull,
};

use crate::{Body, Grammar, Symbol};

/// Set of [`ItemBody`] structs.
pub(super) type ItemBodies<V, T> = HashSet<ItemBody<V, T>>;

/// Wrapper around [`Body`] containing a bullet/cursor for reading symbols.
pub(super) struct ItemBody<V, T> {
pub(crate) struct ItemBody<V, T> {
body: NonNull<Body<V, T>>,
pub(super) cursor: usize,
}
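
An `ItemBody` is the classic LR item: a production body plus a cursor marking how much of it has been recognized. A toy illustration with string symbols, not the crate's types:

```rust
fn main() {
    // Production S -> a S b; cursor 1 corresponds to the item S -> a . S b.
    let body = ["a", "S", "b"];
    for cursor in 0..=body.len() {
        let (read, rest) = body.split_at(cursor);
        println!("S -> {} . {}", read.join(" "), rest.join(" "));
    }
}
```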
@@ -21,26 +21,26 @@ pub(super) struct ItemBody<V, T> {
///
/// [`Cfsm`]: super::Cfsm
#[derive(Debug)]
pub(super) struct ItemSet<V, T> {
pub(crate) struct ItemSet<V, T> {
items: HashMap<V, ItemBodies<V, T>>,
}
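
An `ItemSet` groups item bodies under their head variable; completing such a set is the LR(0) closure: whenever an item's cursor reads a variable, all of that variable's productions join the set with cursor 0, until a fixed point. A toy sketch of that fixed-point loop, not the crate's algorithm:

```rust
use std::collections::{HashMap, HashSet, VecDeque};

fn main() {
    // Grammar: S -> a A, A -> b. Items are (head, body index, cursor).
    let grammar: HashMap<&str, Vec<Vec<&str>>> = HashMap::from([
        ("S", vec![vec!["a", "A"]]),
        ("A", vec![vec!["b"]]),
    ]);
    let mut items: HashSet<(&str, usize, usize)> = HashSet::new();
    // Seed with S -> a . A, whose cursor reads the variable A.
    let mut queue = VecDeque::from([("S", 0usize, 1usize)]);
    while let Some(item) = queue.pop_front() {
        if !items.insert(item) {
            continue;
        }
        let (head, body, cursor) = item;
        // If the cursor reads a variable, add its bodies with cursor 0.
        if let Some(symbol) = grammar[head][body].get(cursor) {
            if let Some(bodies) = grammar.get(symbol) {
                for b in 0..bodies.len() {
                    queue.push_back((*symbol, b, 0));
                }
            }
        }
    }
    println!("{:?}", items); // two items, in arbitrary order
}
```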

impl<V, T> ItemBody<V, T> {
/// Returns the [`Body`] the [`ItemBody`] references.
pub(super) fn get_body(&self) -> &Body<V, T> {
pub(crate) fn get_body(&self) -> &Body<V, T> {
// SAFETY: The struct that owns the grammar this body comes from is
// pinned and upholds the invariant of never being moved.
unsafe { self.body.as_ref() }
}
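
The SAFETY comment leans on the grammar being pinned: a raw pointer into owned data stays valid only while that data's address is stable. A standalone illustration of the hazard the invariant rules out, not the crate's code:

```rust
use std::ptr::NonNull;

fn main() {
    let bodies = vec![String::from("a S b")];
    let body_ptr = NonNull::from(&bodies[0]);
    // Fine while `bodies` is left untouched: the String has not moved.
    println!("{}", unsafe { body_ptr.as_ref() });
    // If `bodies` were mutable, a push could reallocate its buffer and
    // move the Strings, after which dereferencing `body_ptr` would be
    // undefined behavior. Pinning the owner rules this out.
}
```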

/// Returns the [`Symbol`] the bullet/cursor is currently reading.
pub(super) fn get_cursor_symbol(&self) -> Option<&Symbol<V, T>> {
pub(crate) fn get_cursor_symbol(&self) -> Option<&Symbol<V, T>> {
self.get_body().get(self.cursor)
}

/// Returns the [`Variable`](Symbol::Variable) the bullet/cursor is
/// currently reading.
pub(super) fn get_cursor_variable(&self) -> Option<&V> {
pub(crate) fn get_cursor_variable(&self) -> Option<&V> {
self.get_cursor_symbol().and_then(|s| match s {
Symbol::Variable(v) => Some(v),
Symbol::Terminal(_) => None,
@@ -50,7 +50,7 @@ impl<V, T> ItemBody<V, T> {

/// Returns the [`Terminal`](Symbol::Terminal) the bullet/cursor is
/// currently reading.
pub(super) fn get_cursor_terminal(&self) -> Option<&T> {
pub(crate) fn get_cursor_terminal(&self) -> Option<&T> {

(GitHub Actions / clippy annotation at src/cfsm/item.rs:53: warning: method `get_cursor_terminal` is never used; note: `#[warn(dead_code)]` on by default.)
self.get_cursor_symbol().and_then(|s| match s {
Symbol::Terminal(t) => Some(t),
Symbol::Variable(_) => None,
@@ -116,7 +116,7 @@ where
{
/// Returns an iterator over the [`Variable`](Symbol::Variable)-[`ItemBody`]
/// pairs in the [`ItemSet`].
pub(super) fn iter(&self) -> impl Iterator<Item = (V, ItemBody<V, T>)> + '_ {
pub(crate) fn iter(&self) -> impl Iterator<Item = (V, ItemBody<V, T>)> + '_ {
self.items
.iter()
.flat_map(|(head, bodies)| std::iter::repeat(*head).zip(bodies.iter().copied()))
@@ -237,13 +237,13 @@ where
}
}

impl<V, T> From<((&V, &HashSet<Body<V, T>>), &Grammar<V, T>)> for ItemSet<V, T>
impl<V, T> From<((V, &HashSet<Body<V, T>>), &Grammar<V, T>)> for ItemSet<V, T>
where
V: Copy + Eq + Hash,
ItemBody<V, T>: Eq + Hash,
{
fn from(((head, bodies), grammar): ((&V, &HashSet<Body<V, T>>), &Grammar<V, T>)) -> Self {
let items = HashMap::from([(*head, bodies.iter().map(ItemBody::from).collect())]);
fn from(((head, bodies), grammar): ((V, &HashSet<Body<V, T>>), &Grammar<V, T>)) -> Self {
let items = HashMap::from([(head, bodies.iter().map(ItemBody::from).collect())]);
Self::from_incomplete_map(items, grammar)
}
}