Commit
Merge pull request #58 from darakian/enable-commonmark-atx_headings-tests

Enable commonmark atx headings tests
darakian committed Sep 16, 2022
2 parents 4a0e0f4 + cae67c7 commit 9f22dc1
Showing 6 changed files with 111 additions and 489 deletions.
106 changes: 66 additions & 40 deletions src/lexer.rs
@@ -1,12 +1,12 @@
use crate::MiniIter;

/// Tokens are the intermediate representation format in the markdown to html conversion
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
/// String: Body of unstructured text
Plaintext(String),
/// u8: Header level (1..=6). str: Header text. Option<str>: html label
Header(u8, String, Option<String>),
Header(usize, String, Option<String>),
/// str: Text for list entry
UnorderedListEntry(String),
/// str: Text for list entry
@@ -50,7 +50,7 @@ pub enum Token {
}

/// Holds the possible states of a taskbox in a task list
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Eq)]
pub enum TaskBox {
Checked,
Unchecked,
@@ -71,7 +71,7 @@ impl Token{
}

/// Holds the alignment states for the table token
#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Alignment {
Left,
Right,
@@ -106,20 +106,43 @@ pub(crate) fn push_str<'a>(t: &mut Vec<Token>, s: &'a str) {

pub(crate) fn lex_heading<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
let hashes = char_iter.consume_while_case_holds(&|c| c == "#").unwrap_or("");
if char_iter.next_if_eq(&" ").is_none(){
if hashes.len() > 6 {
return Err(ParseError{content: hashes});
}
if char_iter.next_if_eq(&" ").is_none() && char_iter.next_if_eq(&"\t").is_none() && char_iter.peek() != Some(&"\n"){
return Err(ParseError{content: hashes});
}
let level = std::cmp::min(6, hashes.len() as u8);
let line = char_iter.consume_while_case_holds(&|c| c != "\n").unwrap_or("");
let mut heading = "";
if line.contains("{#") &&
line.contains('}') {
let (heading, _title) = line.split_once("{").unwrap_or(("",""));
let line = line.strip_prefix(&heading).unwrap()
.strip_prefix("{#").unwrap()
.strip_suffix("}").unwrap();
return Ok(Token::Header(level, heading.trim().to_string(), Some(line.to_string())));
.strip_prefix("{#").unwrap()
.strip_suffix("}").unwrap();
}
return Ok(Token::Header(level, line.to_string(), None));
let line_without_optional_trailing_hash_sequence = match line.trim_end().rsplit_once(' ') {
Some((left, right)) => {
match right.chars().all(|c| c == '#') {
true => left,
false => line,
}
},
None => line,
};
if line.chars().all(|c| c == '#') {
return Ok(Token::Header(hashes.len(), "".to_string(), None));
}
let parsed_line = crate::render_ignore(line_without_optional_trailing_hash_sequence.trim_end_matches(&[' ', '\t']), &['#'])
.strip_prefix("<p>").unwrap_or("")
.strip_suffix("</p>\n").unwrap_or("").trim().to_string();
println!("line: {:?}", line);
println!("parsed_line: {:?}", parsed_line);
println!("line_without_optional_trailing_hash_sequence: {:?}", line_without_optional_trailing_hash_sequence);
if heading != "" {
return Ok(Token::Header(hashes.len(), heading.trim().to_string(), Some(parsed_line)));
}
return Ok(Token::Header(hashes.len(), parsed_line, None));
}
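A minimal sketch (not from this commit) of the ATX cases the rewritten `lex_heading` targets, unit-test style with the same `crate::` paths this diff already uses. It assumes `parse` wraps bare text in `<p>…</p>\n`, as the `strip_prefix("<p>")`/`strip_suffix("</p>\n")` calls above imply:

```rust
#[cfg(test)]
mod atx_heading_sketch {
    use crate::{lex, Token};

    #[test]
    fn optional_closing_sequence_is_stripped() {
        // "## Subhead ##" -> level-2 heading, trailing hashes dropped
        let tokens = lex("## Subhead ##", &[]);
        assert!(matches!(&tokens[0], Token::Header(2, text, None) if text == "Subhead"));
    }

    #[test]
    fn seven_hashes_is_not_a_heading() {
        // hashes.len() > 6 now errors out of lex_heading, so the
        // run of hashes falls back to plaintext
        let tokens = lex("####### too deep", &[]);
        assert!(matches!(&tokens[0], Token::Plaintext(_)));
    }

    #[test]
    fn hash_without_following_whitespace_is_plaintext() {
        // A space, tab, or line end must follow the opening hashes
        let tokens = lex("#hashtag", &[]);
        assert!(matches!(&tokens[0], Token::Plaintext(_)));
    }
}
```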

pub(crate) fn lex_asterisk_underscore<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
@@ -171,23 +194,34 @@ pub(crate) fn lex_asterisk_underscore<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
}
}

pub(crate) fn lex_spaces<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>>{
let spaces = char_iter.consume_while_case_holds(&|c| c == " ").unwrap_or("");
// Case 1: space in text => return char
if spaces.len() == 1 {
return Err(ParseError{content: spaces})
}
// Case 2: two or more spaces followed by \n => line break
if char_iter.next_if_eq("\n").is_some() {
return Ok(Token::LineBreak);
}
// Case 3: Tokenize for parser
match spaces.len(){
4 => return Ok(Token::Tab),
8 => return Ok(Token::DoubleTab),
_ => {}
}
Err(ParseError{content: spaces})
pub(crate) fn lex_tabs_spaces<'a>(char_iter: &mut MiniIter<'a>, tokens: &Vec<Token>) -> Result<Token, ParseError<'a>> {
let start_index = char_iter.get_index();
let whitespace = char_iter.consume_while_case_holds(&|c| c == "\t" || c == " ");
match whitespace {
None => return Err(ParseError{content: ""}),
Some(s) if (1..=3).contains(&s.len()) && !s.contains("\t") => return Err(ParseError{content: s}),
Some(s) if s.len() >= 2 &&
!s.contains("\t") &&
char_iter.peek() == Some("\n") => return Ok(Token::LineBreak),
Some(_s) => {},
}
let whitespace = whitespace.unwrap_or("");
let line = char_iter.consume_until_tail_is("\n").unwrap_or("");
match whitespace {
" " if (matches!(tokens.last(), Some(Token::Plaintext(_))) && line.contains('#')) => return Err(ParseError{content: line}),
" " if (matches!(tokens.last(), Some(Token::Newline)) && line.contains('#')) => return Err(ParseError{content: line}),
"\t" | " " => return Ok(Token::Code(line.to_string())),
_ => {},
}
if char_iter.peek() == Some("\t") || char_iter.peek() == Some(" ") {
match lex_tabs_spaces(char_iter, tokens) {
Ok(Token::CodeBlock(_content, _lang)) => {
return Ok(Token::CodeBlock(char_iter.get_substring_from(start_index).unwrap_or("").to_string(),"".to_string()))},
Err(e) => return Err(e),
Ok(_) => return Err(ParseError{content: ""}),
}
}
return Err(ParseError{content: char_iter.get_substring_from(start_index).unwrap_or("")})
}
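A sketch of the new whitespace lexing (not from this commit). It assumes the string patterns in the `match` above are a single tab and four spaces; the page copy collapses runs of spaces inside string literals:

```rust
#[cfg(test)]
mod indent_sketch {
    use crate::{lex, Token};

    #[test]
    fn tab_indent_lexes_as_a_code_line() {
        let tokens = lex("\tlet x = 1;\n", &[]);
        assert!(matches!(&tokens[0], Token::Code(_)));
    }

    #[test]
    fn one_to_three_spaces_stay_plaintext() {
        // Lengths 1..=3 without a tab hit the early ParseError arm
        // and are re-emitted as ordinary text
        let tokens = lex("  two-space indent\n", &[]);
        assert!(matches!(&tokens[0], Token::Plaintext(_)));
    }
}
```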

pub(crate) fn lex_backticks<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
@@ -225,18 +259,10 @@ pub(crate) fn lex_backticks<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {

}

pub(crate) fn lex_newlines<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
pub(crate) fn lex_newlines<'a>(char_iter: &mut MiniIter<'a>, tokens: &Vec<Token>) -> Result<Token, ParseError<'a>> {
match char_iter.consume_while_case_holds(&|c| c == "\n") {
Some(s) if s.len() >= 1 => return Ok(Token::Newline),
Some(s) if s.len() < 1 => return Err(ParseError{content: s}),
_ => return Err(ParseError{content: ""}),
}
}

pub(crate) fn lex_tabs<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
match char_iter.consume_while_case_holds(&|c| c == "\t") {
Some(s) if s.len() > 1 => return Ok(Token::DoubleTab),
Some(s) if s.len() == 1 => return Ok(Token::Tab),
Some(s) if s.len() >= 2 => return Ok(Token::Newline),
Some(s) if s.len() < 2 => return Err(ParseError{content: s}),
_ => return Err(ParseError{content: ""}),
}
}
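The folded `lex_newlines` also changes soft breaks: a single `\n` now fails with a `ParseError` and is merged back into the surrounding plaintext, while a blank line (two or more newlines) still produces `Token::Newline`. A sketch (not from this commit):

```rust
#[cfg(test)]
mod newline_sketch {
    use crate::{lex, Token};

    #[test]
    fn single_newline_is_a_soft_break() {
        let tokens = lex("soft\nwrap", &[]);
        assert!(!tokens.contains(&Token::Newline));
    }

    #[test]
    fn blank_line_emits_a_newline_token() {
        let tokens = lex("para one\n\npara two", &[]);
        assert!(tokens.contains(&Token::Newline));
    }
}
```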
@@ -411,7 +437,7 @@ fn parse_details<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
closes = remaining_text.matches("</details>").count();
}
}
let inner_tokens = crate::lex(remaining_text.strip_suffix("</details>").unwrap_or(""));
let inner_tokens = crate::lex(remaining_text.strip_suffix("</details>").unwrap_or(""), &[]);
Ok(Token::Detail(summary_line.to_string(), inner_tokens))
}

@@ -452,7 +478,7 @@ pub(crate) fn lex_pipes<'a>(char_iter: &mut MiniIter<'a>) -> Result<Token, ParseError<'a>> {
.collect();
let mut r = Vec::new();
for e in elements.into_iter() {
let mut inner_tokens = crate::lex(&e);
let mut inner_tokens = crate::lex(&e, &[]);
inner_tokens.retain(|token| token.is_usable_in_table());
r.push(inner_tokens);
}
68 changes: 40 additions & 28 deletions src/lib.rs
@@ -9,12 +9,12 @@ pub(crate) struct SanitizationError{
}

/// Convert source markdown to an ordered vector of tokens
pub fn lex(source: &str) -> Vec<Token>{
pub fn lex(source: &str, ignore: &[char]) -> Vec<Token>{
let mut char_iter = MiniIter::new(source);
let mut tokens = Vec::new();
while char_iter.peek().is_some(){
match char_iter.peek().unwrap(){
"#" => {
"#" if !ignore.contains(&'#') => {
match lex_heading(&mut char_iter) {
Ok(t) => tokens.push(t),
Err(e) => push_str(&mut tokens, e.content),
@@ -38,8 +38,8 @@ pub fn lex(source: &str) -> Vec<Token>{
Err(e) => push_str(&mut tokens, e.content),
}
},
" " => {
match lex_spaces(&mut char_iter) {
" " | "\t" => {
match lex_tabs_spaces(&mut char_iter, &tokens) {
Ok(t) => tokens.push(t),
Err(e) => push_str(&mut tokens, e.content),
}
@@ -51,13 +51,7 @@ pub fn lex(source: &str) -> Vec<Token>{
}
},
"\n" => {
match lex_newlines(&mut char_iter) {
Ok(t) => tokens.push(t),
Err(e) => push_str(&mut tokens, e.content),
}
},
"\t" => {
match lex_tabs(&mut char_iter) {
match lex_newlines(&mut char_iter, &tokens) {
Ok(t) => tokens.push(t),
Err(e) => push_str(&mut tokens, e.content),
}
@@ -103,7 +97,10 @@ pub fn lex(source: &str) -> Vec<Token>{
// Parse "\" to escape a markdown control character
"\\" => {
char_iter.next();
if char_iter.peek().is_some(){
if char_iter.peek() == Some(&"#"){
let hashes = char_iter.consume_while_case_holds(&|c| c == "#").unwrap_or("");
push_str(&mut tokens, hashes);
} else if char_iter.peek().is_some(){
push_str(&mut tokens, char_iter.next().unwrap());
}
}
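Both changes in this hunk route hashes away from `lex_heading`: the new `ignore` list (which `render_ignore` sets to `&['#']` when re-lexing heading bodies) and the widened `\#` escape. A sketch of the two paths (not from this commit):

```rust
#[cfg(test)]
mod ignore_and_escape_sketch {
    use crate::{lex, Token};

    fn has_header(tokens: &[Token]) -> bool {
        tokens.iter().any(|t| matches!(t, Token::Header(_, _, _)))
    }

    #[test]
    fn ignore_list_suppresses_heading_lexing() {
        // Normal path: '#' starts a heading
        assert!(has_header(&lex("# heading", &[])));
        // Same input with '#' ignored: must stay plain text
        assert!(!has_header(&lex("# heading", &['#'])));
    }

    #[test]
    fn escaped_hashes_stay_literal() {
        // "\#" now swallows the whole run of hashes as plaintext
        assert!(!has_header(&lex("\\## not a heading", &[])));
    }
}
```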
@@ -158,9 +155,9 @@ pub fn parse(tokens: &[Token]) -> String {
Token::BlockQuote(_, _) | Token::Newline if quote_level > 0 => {},
Token::CodeBlock(_, _) | Token::Newline | Token::Header(_, _, _) if in_paragraph => {
in_paragraph = false;
html.push_str("</p>")
html.push_str("</p>\n")
},
Token::Plaintext(_) | Token::Italic(_) | Token::Bold(_) | Token::BoldItalic(_) | Token::Strikethrough(_) | Token::Code(_) if !in_paragraph => {
Token::Plaintext(_) | Token::Italic(_) | Token::Bold(_) | Token::BoldItalic(_) | Token::Strikethrough(_) if !in_paragraph => {
for _i in 0..quote_level {
html.push_str("</blockquote>");
quote_level-=1;
@@ -191,21 +188,23 @@ pub fn parse(tokens: &[Token]) -> String {
count+=1;
} else {s.push_str(tok)}
}
html.push_str(&s);
html.push_str(&s.trim_end_matches('\n'));
} else {
html.push_str(sanitize_display_text(t.trim_start_matches('\n')).as_str())
html.push_str(sanitize_display_text(t.trim_start_matches('\n')).trim_end_matches('\n'))
}
},
Token::Header(l, t, lbl) => {
let id = match lbl {
Some(text) => text.to_ascii_lowercase(),
None => t.to_ascii_lowercase(),
match lbl {
Some(lbl_text) => html.push_str(format!("<h{level} id=\"{id}\">{text}</h{level}>\n",
level=l,
text=t,
id=sanitize_display_text(&lbl_text.replace(" ", "-")))
.as_str()),
None => html.push_str(format!("<h{level}>{text}</h{level}>\n",
level=l,
text=t)
.as_str()),
};
html.push_str(format!("<h{level} id=\"{id}\">{text}</h{level}>\n",
level=l,
text=sanitize_display_text(t),
id=sanitize_display_text(&id.replace(" ", "-")))
.as_str())
},
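The Header arm now emits the pre-rendered heading text verbatim and only sanitizes the label. A sketch of both output shapes (not from this commit; assumes `sanitize_display_text` passes plain ASCII through unchanged):

```rust
#[cfg(test)]
mod header_render_sketch {
    use crate::{parse, Token};

    #[test]
    fn header_with_and_without_label() {
        // Labelled: the label becomes the id, the text is emitted as-is
        let html = parse(&[Token::Header(2, "Usage notes".to_string(), Some("usage".to_string()))]);
        assert_eq!(html, "<h2 id=\"usage\">Usage notes</h2>\n");

        // Unlabelled: no id attribute at all
        let html = parse(&[Token::Header(3, "Plain".to_string(), None)]);
        assert_eq!(html, "<h3>Plain</h3>\n");
    }
}
```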
Token::TaskListItem(c,t) => {
if in_task_list == false {
@@ -236,17 +235,22 @@
}
html.push_str(format!("<li>{}</li>", sanitize_display_text(t)).as_str())
},
Token::Newline => {html.push('\n')},
Token::Newline => {
match html.chars().last() {
Some('\n') => {}
_ => html.push('\n'),
}
},
Token::Tab => {html.push('\t')},
Token::DoubleTab => {html.push_str("\t\t")},
Token::Italic(t) => {html.push_str(format!("<em>{}</em>", sanitize_display_text(t)).as_str())},
Token::Bold(t) => {html.push_str(format!("<strong>{}</strong>", sanitize_display_text(t)).as_str())},
Token::BoldItalic(t) => {html.push_str(format!("<strong><em>{}</em></strong>", sanitize_display_text(t)).as_str())},
Token::LineBreak => {html.push_str("<br>")},
Token::HorizontalRule => {html.push_str("<hr />")},
Token::HorizontalRule => {html.push_str("<hr />\n")},
Token::Strikethrough(t) => {html.push_str(format!("<strike>{}</strike>", sanitize_display_text(t)).as_str())},
Token::Code(t) => {
html.push_str(format!("<code>{}</code>", sanitize_display_text(t)).as_str())},
html.push_str(format!("<pre><code>{}</code></pre>", sanitize_display_text(t)).as_str())},
Token::CodeBlock(t, lang) => {
html.push_str("<pre>");
match lang.as_str() {
@@ -370,6 +374,7 @@ pub fn parse(tokens: &[Token]) -> String {
}
}


// Add references
if references.len() > 0{
html.push_str("<div class=\"footnotes\" role=\"doc-endnotes\">\n");
@@ -384,13 +389,20 @@
html.push_str("\t</ol>\n");
html.push_str("</div>\n");
}
if html.chars().last().unwrap_or(' ') != '\n' {
html.push('\n');
}
html
}
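Two observable consequences of the `parse` changes above, sketched as assertions (not from this commit): `Token::Code` is now a block-level element, and the returned HTML is always newline-terminated:

```rust
#[cfg(test)]
mod parse_output_sketch {
    use crate::{parse, Token};

    #[test]
    fn code_token_renders_as_a_pre_block() {
        // Was an inline <code> span; now a <pre><code> block
        let html = parse(&[Token::Code("let x = 1;".to_string())]);
        assert_eq!(html, "<pre><code>let x = 1;</code></pre>\n");
    }

    #[test]
    fn output_is_newline_terminated() {
        // The final-newline check at the end of parse guarantees this
        assert!(parse(&[Token::HorizontalRule]).ends_with('\n'));
    }
}
```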

/// Render HTML from a source markdown string
/// Output is sanitized to prevent script injection
pub fn render(source: &str) -> String {
parse(&lex(source))
parse(&lex(source, &[]))
}

pub(crate) fn render_ignore(source: &str, ignore: &[char]) -> String {
parse(&lex(source, ignore))
}
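End-to-end, the extra `ignore` parameter is invisible to `render` callers. A round-trip sketch against the CommonMark ATX-heading behavior this PR enables (not from this commit; the `<p>` wrapping in the second assertion follows the strip calls in `lex_heading`):

```rust
#[cfg(test)]
mod render_sketch {
    use crate::render;

    #[test]
    fn atx_heading_round_trip() {
        // The optional closing sequence of #s is dropped
        assert_eq!(render("## foo ##"), "<h2>foo</h2>\n");
        // Escaped hashes never open a heading
        assert_eq!(render("\\## foo"), "<p>## foo</p>\n");
    }
}
```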

/// Replace potentially unsafe characters with html entities