Merge pull request #4 from exellentcoin26/slr-parser-generator
Implement SLR parser generator
exellentcoin26 committed Sep 23, 2023
2 parents b995e4f + 858203f commit 0fec398
Showing 18 changed files with 1,407 additions and 253 deletions.
2 changes: 1 addition & 1 deletion .github/codecov.yml
@@ -1,6 +1,6 @@
# ref: https://docs.codecov.com/docs/codecovyml-reference
coverage:
range: 75..100
range: 75..90
round: down
precision: 1
status:
10 changes: 10 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions Cargo.toml
@@ -11,3 +11,5 @@ members = [
]

[dependencies]
dyn-clone = "1"
pango-lexer = {path = "./pango-lexer"}
93 changes: 58 additions & 35 deletions pango-lexer/src/lexer/mod.rs
@@ -9,7 +9,7 @@ use std::collections::BTreeMap;
mod input;

/// Finite-state machine based lexer.
pub struct Lexer<'input, TokenKind, Fsm: Simulatable> {
pub struct Lexer<'input, TokenKind, Fsm: Simulatable = Nfa> {
/// Input to be lexed.
pub input: &'input str,
/// Iterator over the input.
@@ -22,7 +22,8 @@ pub struct Lexer<'input, TokenKind, Fsm: Simulatable> {
tokens: BTreeMap<StateId, TokenKindGenerator<TokenKind>>,
}
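
The `= Nfa` default above means downstream code can name a `Lexer` without spelling out the FSM type. A self-contained sketch of the language feature this relies on; the names here are illustrative, not the crate's:

```rust
/// Minimal analogue of a defaulted type parameter: callers may write
/// `Holder<u8>` instead of `Holder<u8, DefaultEngine>`.
struct DefaultEngine;

struct Holder<T, E = DefaultEngine> {
    value: T,
    engine: E,
}

fn main() {
    // The second type parameter defaults to `DefaultEngine`.
    let h: Holder<u8> = Holder { value: 1, engine: DefaultEngine };
    println!("{}", h.value);
}
```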

/// `TokenKind`s can be generated based on the token or can just be cloned.
/// `TokenKind`s can be generated based on the token or can just be created
/// (e.g., unit structs).
///
/// This is useful when some parsing of the token's internal structure is
/// needed, for example, converting to [SI units](https://en.wikipedia.org/wiki/International_System_of_Units). In Rust,
@@ -31,29 +32,23 @@ pub struct Lexer<'input, TokenKind, Fsm: Simulatable> {
enum TokenKindGenerator<TokenKind> {
/// Generate a `TokenKind` based on the token source.
Map(Box<dyn FnMut(&str) -> TokenKind>),
/// Clone the `TokenKind`.
Unit(TokenKind),
/// Create the `TokenKind`.
Unit(Box<dyn FnMut() -> TokenKind>),
}
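
A minimal sketch of the two generator flavors described in the doc comment above, outside the crate's API; the `Kind` enum and the `"12m"` token format are hypothetical:

```rust
enum Kind {
    Plus,
    Length { meters: f64 },
}

fn main() {
    // Unit-style: the kind needs nothing from the matched text.
    let mut unit_gen: Box<dyn FnMut() -> Kind> = Box::new(|| Kind::Plus);

    // Map-style: parse the matched source, e.g. "12m" -> 12.0 meters.
    let mut map_gen: Box<dyn FnMut(&str) -> Kind> = Box::new(|src| Kind::Length {
        meters: src.trim_end_matches('m').parse().unwrap_or(0.0),
    });

    let _plus = unit_gen();
    let _length = map_gen("12m");
}
```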

/// [`Token`] returned by the [`Lexer`].
#[derive(Debug, PartialEq, Eq)]
pub struct Token<TokenKind>
where
TokenKind: Clone,
{
pub struct Token<TokenKind> {
/// Kind of token configured by the user.
kind: TokenKind,
pub kind: TokenKind,
/// Source string representation of the token.
source: String,
pub source: String,
/// Position of the token in the input. The end points one position beyond
/// the end of the token.
pos: (usize, usize),
pub pos: (usize, usize),
}
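
The `pos` field thus behaves like a half-open range, so `&input[start..end]` recovers the token's source. A tiny sketch under that assumption:

```rust
fn main() {
    let input = "bbbbaaaa";
    // A token covering the leading run of 'b's: start 0, end one past it.
    let (start, end) = (0usize, 4usize);
    assert_eq!(&input[start..end], "bbbb");
}
```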

impl<TokenKind> Token<TokenKind>
where
TokenKind: Clone,
{
impl<TokenKind> Token<TokenKind> {
/// Creates a new [`Token`] from the [`InputIterToken`].
fn from_input_iter_token(
InputIterToken { source, pos }: InputIterToken,
@@ -67,7 +62,6 @@ impl<TokenKind, Fsm> Iterator for Lexer<'_, TokenKind, Fsm>
where
for<'a> Fsm: Simulatable + 'a,
for<'a> Fsm::Simulator<'a>: NDSimulate,
TokenKind: Clone,
{
type Item = Token<TokenKind>;

@@ -110,7 +104,7 @@ where
token_kind_gen(&self.iter.get_token().source)
}

TokenKindGenerator::Unit(token_kind) => token_kind.clone(),
TokenKindGenerator::Unit(token_kind_gen) => token_kind_gen(),
},
);
}
@@ -154,14 +148,38 @@ impl<TokenKind> LexerGenerator<TokenKind> {
Self::default()
}

/// Adds a single token to the [`Lexer`]. The given function is called to
/// produce the token kind; it does not receive the token's source
/// representation. If you need that, use [`with_token_map`] instead.
///
/// # Fails
///
/// When the provided `token` is invalid regex.
///
/// [`with_token_map`]: Self::with_token_map
#[inline]
pub fn with_token(mut self, token: &str, token_kind: fn() -> TokenKind) -> ParseResult<Self>
where
TokenKind: 'static,
{
self.add_token(token, TokenKindGenerator::Unit(Box::new(token_kind)))?;
Ok(self)
}

/// Adds a single token to the [`Lexer`], yielding the given `token_kind`
/// value for every match.
///
/// # Fails
///
/// When the provided `token` is invalid regex.
#[inline]
pub fn with_token(mut self, token: &str, token_kind: TokenKind) -> ParseResult<Self> {
self.add_token(token, TokenKindGenerator::Unit(token_kind))?;
pub fn with_token_unit(mut self, token: &str, token_kind: TokenKind) -> ParseResult<Self>
where
TokenKind: Copy + 'static,
{
self.add_token(
token,
TokenKindGenerator::Unit(Box::new(move || token_kind)),
)?;
Ok(self)
}
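
A hedged usage sketch of the two builder methods above, mirroring the call style shown in the tests further down; the `Tok` enum is hypothetical and the import assumes the `pango-lexer` package name from Cargo.toml:

```rust
use pango_lexer::Lexer;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Tok {
    A,
    B,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tokens = Lexer::builder()
        // `with_token` takes a plain `fn() -> TokenKind`.
        .with_token("a{4}", || Tok::A)?
        // `with_token_unit` stores the value itself, hence the `Copy` bound:
        // the captured value is returned by value on every match.
        .with_token_unit("b{4}", Tok::B)?
        .tokenize("aaaabbbb");
    for token in tokens {
        println!("{:?}", token);
    }
    Ok(())
}
```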

@@ -176,9 +194,13 @@
pub fn with_token_map(
mut self,
token: &str,
token_kind_map: Box<dyn FnMut(&str) -> TokenKind>,
) -> ParseResult<Self> {
self.add_token(token, TokenKindGenerator::Map(token_kind_map))?;
// token_kind_map: Box<dyn FnMut(&str) -> TokenKind>,
token_kind_map: fn(&str) -> TokenKind,
) -> ParseResult<Self>
where
TokenKind: 'static,
{
self.add_token(token, TokenKindGenerator::Map(Box::new(token_kind_map)))?;
Ok(self)
}
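
The switch from `Box<dyn FnMut(&str) -> TokenKind>` to a plain `fn(&str) -> TokenKind` works because non-capturing closures coerce to function pointers; capturing closures no longer fit. A self-contained illustration:

```rust
fn takes_fn_ptr(f: fn(&str) -> usize) -> usize {
    f("dddd")
}

fn main() {
    // OK: a non-capturing closure coerces to `fn(&str) -> usize`.
    assert_eq!(takes_fn_ptr(|s| s.len()), 4);

    // A capturing closure would not coerce and would fail to compile:
    // let base = 10;
    // takes_fn_ptr(|s| s.len() + base);
}
```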

@@ -248,27 +270,28 @@ mod tests {

#[test]
fn lexer() -> Result<(), Box<dyn std::error::Error>> {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
enum Foo {
#[default]
A,
B,
C,
D { len: usize },
D {
len: usize,
},
E,
}

let tokens = Lexer::builder()
.with_token("aaaa", Foo::A)?
.with_token("b{4,}", Foo::B)?
.with_token("b{4}", Foo::C)?
.with_token_map(
"d*",
Box::new(|token_source| Foo::D {
len: token_source.len(),
}),
)?
.with_token(r"/\* .*", Foo::A)?
.with_token(r"/\* .* \*/", Foo::E)?
.with_token_unit("aaaa", Foo::A)?
.with_token_unit("b{4,}", Foo::B)?
.with_token_unit("b{4}", Foo::C)?
.with_token_map("d*", |token_source| Foo::D {
len: token_source.len(),
})?
.with_token_unit(r"/\* .*", Foo::A)?
.with_token_unit(r"/\* .* \*/", Foo::E)?
.with_token(r"a", Default::default)?
.tokenize(
"bbbbbbbaaaabbbbddddddddddddddddd/* foo bar baz **** * /* foo bar baz ***** */",
);
2 changes: 1 addition & 1 deletion pango-lexer/src/lib.rs
@@ -1,4 +1,4 @@
pub use lexer::Lexer;
pub use lexer::{Lexer, Token};

mod fsm;
mod iter;
19 changes: 8 additions & 11 deletions pango-lexer/src/main.rs
@@ -11,17 +11,14 @@ enum Foo {

fn main() -> Result<(), Box<dyn std::error::Error>> {
let lexer = Lexer::builder()
.with_token("aaaa", Foo::A)?
.with_token("b{4,}", Foo::B)?
.with_token("b{4}", Foo::C)?
.with_token_map(
"d*",
Box::new(|token_source| Foo::D {
len: token_source.len(),
}),
)?
.with_token(r"/\* .*", Foo::A)?
.with_token(r"/\* .* \*/", Foo::E)?;
.with_token_unit("aaaa", Foo::A)?
.with_token_unit("b{4,}", Foo::B)?
.with_token_unit("b{4}", Foo::C)?
.with_token_map("d*", |token_source| Foo::D {
len: token_source.len(),
})?
.with_token_unit(r"/\* .*", Foo::A)?
.with_token_unit(r"/\* .* \*/", Foo::E)?;

for token in lexer.tokenize("bbbbbbbaaaabbbbddddddddddddddddd/* foo bar baz **** *") {
println!("{:?}", token);
7 changes: 4 additions & 3 deletions src/cfsm/dot.rs
@@ -1,13 +1,12 @@
use super::{item::ItemBody, state::State, Cfsm, Grammar};
use super::{item::ItemBody, state::State, Cfsm};
use crate::Symbol;

use std::fmt::{Debug, Display};

impl<V, T> Cfsm<'_, V, T>
impl<V, T> Cfsm<V, T>
where
V: Copy + Debug,
T: Debug,
Grammar<V, T>: Clone,
{
/// Converts the CFSM to the [graphviz](https://graphviz.org/docs/layouts/dot/)
/// dot language format.
@@ -49,6 +48,8 @@ where
id, transitions, ..
}| {
transitions.iter().map(|(symbol, state)| {
let symbol = symbol.as_ref();

format!(
"{} -> {} [label = \"{}\"]",
*id,
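The edge formatting above produces Graphviz dot lines such as `0 -> 1 [label = "S"]`. A standalone sketch of the assembly; the state ids and labels are made up, and this is not the crate's actual output:

```rust
fn main() {
    let transitions = [(0usize, 1usize, "S"), (0, 2, "a")];
    let edges = transitions
        .iter()
        .map(|(from, to, label)| format!("{} -> {} [label = \"{}\"]", from, to, label))
        .collect::<Vec<_>>()
        .join("\n");
    // Wrapping the edges in a digraph yields valid dot input.
    println!("digraph cfsm {{\n{}\n}}", edges);
}
```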
24 changes: 12 additions & 12 deletions src/cfsm/item.rs
@@ -1,17 +1,17 @@
use crate::{Body, Grammar, Symbol};

use std::{
collections::{HashMap, HashSet, VecDeque},
fmt::Debug,
hash::Hash,
ptr::NonNull,
};

use crate::{Body, Grammar, Symbol};

/// Set of [`ItemBody`] structs.
pub(super) type ItemBodies<V, T> = HashSet<ItemBody<V, T>>;

/// Wrapper around [`Body`] containing a bullet/cursor for reading symbols.
pub(super) struct ItemBody<V, T> {
pub(crate) struct ItemBody<V, T> {
body: NonNull<Body<V, T>>,
pub(super) cursor: usize,
}
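
An `ItemBody` is the classic LR item: a production body plus a cursor marking how much of it has been recognized. A toy illustration with string symbols, not the crate's types:

```rust
fn main() {
    // Production S -> a S b; cursor 1 corresponds to the item S -> a . S b.
    let body = ["a", "S", "b"];
    for cursor in 0..=body.len() {
        let (read, rest) = body.split_at(cursor);
        println!("S -> {} . {}", read.join(" "), rest.join(" "));
    }
}
```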
@@ -21,26 +21,26 @@ pub(super) struct ItemBody<V, T> {
///
/// [`Cfsm`]: super::Cfsm
#[derive(Debug)]
pub(super) struct ItemSet<V, T> {
pub(crate) struct ItemSet<V, T> {
items: HashMap<V, ItemBodies<V, T>>,
}
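
An `ItemSet` groups item bodies under their head variable; completing such a set is the LR(0) closure: whenever an item's cursor reads a variable, all of that variable's productions join the set with cursor 0, until a fixed point. A toy sketch of that fixed-point loop, not the crate's algorithm:

```rust
use std::collections::{HashMap, HashSet, VecDeque};

fn main() {
    // Grammar: S -> a A, A -> b. Items are (head, body index, cursor).
    let grammar: HashMap<&str, Vec<Vec<&str>>> = HashMap::from([
        ("S", vec![vec!["a", "A"]]),
        ("A", vec![vec!["b"]]),
    ]);
    let mut items: HashSet<(&str, usize, usize)> = HashSet::new();
    // Seed with S -> a . A, whose cursor reads the variable A.
    let mut queue = VecDeque::from([("S", 0usize, 1usize)]);
    while let Some(item) = queue.pop_front() {
        if !items.insert(item) {
            continue;
        }
        let (head, body, cursor) = item;
        // If the cursor reads a variable, add its bodies with cursor 0.
        if let Some(symbol) = grammar[head][body].get(cursor) {
            if let Some(bodies) = grammar.get(symbol) {
                for b in 0..bodies.len() {
                    queue.push_back((*symbol, b, 0));
                }
            }
        }
    }
    println!("{:?}", items); // two items, in arbitrary order
}
```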

impl<V, T> ItemBody<V, T> {
/// Returns the [`Body`] the [`ItemBody`] references.
pub(super) fn get_body(&self) -> &Body<V, T> {
pub(crate) fn get_body(&self) -> &Body<V, T> {
// SAFETY: The struct that owns the grammar this body comes from is
// pinned and upholds the invariant of never being moved.
unsafe { self.body.as_ref() }
}
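
The SAFETY comment leans on the grammar being pinned: a raw pointer into owned data stays valid only while that data's address is stable. A standalone illustration of the hazard the invariant rules out, not the crate's code:

```rust
use std::ptr::NonNull;

fn main() {
    let bodies = vec![String::from("a S b")];
    let body_ptr = NonNull::from(&bodies[0]);
    // Fine while `bodies` is left untouched: the String has not moved.
    println!("{}", unsafe { body_ptr.as_ref() });
    // If `bodies` were mutable, a push could reallocate its buffer and
    // move the Strings, after which dereferencing `body_ptr` would be
    // undefined behavior. Pinning the owner rules this out.
}
```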

/// Returns the [`Symbol`] the bullet/cursor is currently reading.
pub(super) fn get_cursor_symbol(&self) -> Option<&Symbol<V, T>> {
pub(crate) fn get_cursor_symbol(&self) -> Option<&Symbol<V, T>> {
self.get_body().get(self.cursor)
}

/// Returns the [`Variable`](Symbol::Variable) the bullet/cursor is
/// currently reading.
pub(super) fn get_cursor_variable(&self) -> Option<&V> {
pub(crate) fn get_cursor_variable(&self) -> Option<&V> {
self.get_cursor_symbol().and_then(|s| match s {
Symbol::Variable(v) => Some(v),
Symbol::Terminal(_) => None,
@@ -50,7 +50,7 @@ impl<V, T> ItemBody<V, T> {

/// Returns the [`Terminal`](Symbol::Terminal) the bullet/cursor is
/// currently reading.
pub(super) fn get_cursor_terminal(&self) -> Option<&T> {
pub(crate) fn get_cursor_terminal(&self) -> Option<&T> {

(GitHub Actions / clippy annotation at src/cfsm/item.rs:53: warning: method `get_cursor_terminal` is never used; note: `#[warn(dead_code)]` on by default.)
self.get_cursor_symbol().and_then(|s| match s {
Symbol::Terminal(t) => Some(t),
Symbol::Variable(_) => None,
@@ -116,7 +116,7 @@ where
{
/// Returns an iterator over the [`Variable`](Symbol::Variable)-[`ItemBody`]
/// pairs in the [`ItemSet`].
pub(super) fn iter(&self) -> impl Iterator<Item = (V, ItemBody<V, T>)> + '_ {
pub(crate) fn iter(&self) -> impl Iterator<Item = (V, ItemBody<V, T>)> + '_ {
self.items
.iter()
.flat_map(|(head, bodies)| std::iter::repeat(*head).zip(bodies.iter().copied()))
@@ -237,13 +237,13 @@ where
}
}

impl<V, T> From<((&V, &HashSet<Body<V, T>>), &Grammar<V, T>)> for ItemSet<V, T>
impl<V, T> From<((V, &HashSet<Body<V, T>>), &Grammar<V, T>)> for ItemSet<V, T>
where
V: Copy + Eq + Hash,
ItemBody<V, T>: Eq + Hash,
{
fn from(((head, bodies), grammar): ((&V, &HashSet<Body<V, T>>), &Grammar<V, T>)) -> Self {
let items = HashMap::from([(*head, bodies.iter().map(ItemBody::from).collect())]);
fn from(((head, bodies), grammar): ((V, &HashSet<Body<V, T>>), &Grammar<V, T>)) -> Self {
let items = HashMap::from([(head, bodies.iter().map(ItemBody::from).collect())]);
Self::from_incomplete_map(items, grammar)
}
}