Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional html support #102

Merged
merged 4 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
39 changes: 33 additions & 6 deletions lib/discourse_activity_pub/content_parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class DiscourseActivityPub::ContentParser < Nokogiri::XML::SAX::Document
autolink
list
backticks
newline
code
fence
image
Expand All @@ -36,6 +35,8 @@ class DiscourseActivityPub::ContentParser < Nokogiri::XML::SAX::Document
emphasis
]

PERMITTED_TAGS = %w[p a h1 h2 h3 h4 h5 ul ol li code blockquote em strong]

MAX_TITLE_LENGTH = 60

attr_reader :content
Expand All @@ -49,29 +50,39 @@ def initialize(length, opts = {})

def start_element(name, attributes = [])
case name
when "p"
start_tag(name, attributes)
@in_p = true
when "a"
start_tag(name, attributes)
@in_a = true
when "h1", "h2", "h3", "h4", "h5"
when *PERMITTED_TAGS
start_tag(name, attributes)
when "div"
if attributes.include?(%w[class note])
@content = +""
@current_length = 0
@start_content = true
start_tag("p", []) unless @in_p
end
end
end

def end_element(name)
case name
when "p"
end_tag(name)
@in_p = false
when "a"
end_tag(name)
@in_a = false
when "h1", "h2", "h3", "h4", "h5"
when *PERMITTED_TAGS
end_tag(name)
when "div"
throw :done if @start_content
if @start_content
end_tag("p")
throw :done
end
end
end

Expand Down Expand Up @@ -103,6 +114,7 @@ def characters(string)
@content << string[0..length]
@content << "&hellip;"
@content << "</a>" if @in_a
@content << "</p>" if @in_p
throw :done
end
@content << string
Expand All @@ -115,7 +127,7 @@ def self.cook(text, opts = {})
text,
opts.merge(features_override: MARKDOWN_FEATURES, markdown_it_rules: MARKDOWN_IT_RULES),
)
scrubbed_html(html)
scrubbed_html(remove_newlines(html))
end

def self.scrubbed_html(html)
Expand Down Expand Up @@ -164,6 +176,21 @@ def self.parse(html, length, opts = {})
content_parser = self.new(length, opts)
sax_parser = Nokogiri::HTML::SAX::Parser.new(content_parser)
catch(:done) { sax_parser.parse(html) }
content_parser.content.strip
final_clean(content_parser.content.strip)
end

def self.remove_newlines(html)
html.gsub("\n", "").squeeze(" ")
end

def self.final_clean(html)
fragment = Nokogiri::HTML5.fragment(html)
fragment.traverse do |node|
if node.content.blank?
node.remove
next
end
end
fragment.to_html
end
end
36 changes: 18 additions & 18 deletions spec/lib/discourse_activity_pub/bulk/publish_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ def build_warning_log(key)

it "creates the right post objects" do
described_class.perform(actor_id: actor.id)
expect(post1.activity_pub_object.content).to eq(post1.raw)
expect(post2.activity_pub_object.content).to eq(post2.raw)
expect(post3.activity_pub_object.content).to eq(post3.raw)
expect(post4.activity_pub_object.content).to eq(post4.raw)
expect(post1.activity_pub_object.content).to eq(post1.cooked)
expect(post2.activity_pub_object.content).to eq(post2.cooked)
expect(post3.activity_pub_object.content).to eq(post3.cooked)
expect(post4.activity_pub_object.content).to eq(post4.cooked)
expect(post2.activity_pub_object.reply_to_id).to eq(post1.activity_pub_object.ap_id)
expect(post4.activity_pub_object.reply_to_id).to eq(post3.activity_pub_object.ap_id)
expect(post1.activity_pub_object.collection_id).to eq(post1.topic.activity_pub_object.id)
Expand All @@ -95,10 +95,10 @@ def build_warning_log(key)
expect(post2.activity_pub_object.published_at).to be_within_one_second_of(Time.now)
expect(post3.activity_pub_object.published_at).to be_within_one_second_of(Time.now)
expect(post4.activity_pub_object.published_at).to be_within_one_second_of(Time.now)
expect(post1.activity_pub_content).to eq(post1.raw)
expect(post2.activity_pub_content).to eq(post2.raw)
expect(post3.activity_pub_content).to eq(post3.raw)
expect(post4.activity_pub_content).to eq(post3.raw)
expect(post1.activity_pub_content).to eq(post1.cooked)
expect(post2.activity_pub_content).to eq(post2.cooked)
expect(post3.activity_pub_content).to eq(post3.cooked)
expect(post4.activity_pub_content).to eq(post3.cooked)
expect(post1.activity_pub_visibility).to eq("public")
expect(post2.activity_pub_visibility).to eq("public")
expect(post3.activity_pub_visibility).to eq("public")
Expand Down Expand Up @@ -297,8 +297,8 @@ def build_warning_log(key)

it "creates the right post objects" do
described_class.perform(actor_id: actor.id)
expect(post2.activity_pub_object.content).to eq(post2.raw)
expect(post3.activity_pub_object.content).to eq(post3.raw)
expect(post2.activity_pub_object.content).to eq(post2.cooked)
expect(post3.activity_pub_object.content).to eq(post3.cooked)
expect(post2.activity_pub_object.reply_to_id).to eq(post1.activity_pub_object.ap_id)
expect(post2.activity_pub_object.collection_id).to eq(post2.topic.activity_pub_object.id)
expect(post3.activity_pub_object.collection_id).to eq(post3.topic.activity_pub_object.id)
Expand All @@ -310,8 +310,8 @@ def build_warning_log(key)
)
expect(post2.activity_pub_object.published_at).to be_within_one_second_of(Time.now)
expect(post3.activity_pub_object.published_at).to be_within_one_second_of(Time.now)
expect(post2.activity_pub_content).to eq(post2.raw)
expect(post3.activity_pub_content).to eq(post3.raw)
expect(post2.activity_pub_content).to eq(post2.cooked)
expect(post3.activity_pub_content).to eq(post3.cooked)
expect(post2.activity_pub_visibility).to eq("public")
expect(post3.activity_pub_visibility).to eq("public")
expect(
Expand Down Expand Up @@ -435,7 +435,7 @@ def build_warning_log(key)

it "creates and publishes the right post objects" do
described_class.perform(actor_id: actor.id)
expect(post3.activity_pub_object.content).to eq(post3.raw)
expect(post3.activity_pub_object.content).to eq(post3.cooked)
expect(post3.activity_pub_object.collection_id).to eq(
post3.topic.activity_pub_object.id,
)
Expand All @@ -446,7 +446,7 @@ def build_warning_log(key)
Time.now,
)
expect(post3.activity_pub_object.published_at).to be_within_one_second_of(Time.now)
expect(post3.activity_pub_content).to eq(post3.raw)
expect(post3.activity_pub_content).to eq(post3.cooked)
expect(post3.activity_pub_visibility).to eq("public")
expect(
post2.custom_fields["activity_pub_published_at"].to_time,
Expand Down Expand Up @@ -527,10 +527,10 @@ def build_warning_log(key)

it "creates the right post objects" do
described_class.perform(actor_id: actor.id)
expect(post1.activity_pub_object.content).to eq(post1.raw)
expect(post3.activity_pub_object.content).to eq(post3.raw)
expect(post1.activity_pub_content).to eq(post1.raw)
expect(post3.activity_pub_content).to eq(post3.raw)
expect(post1.activity_pub_object.content).to eq(post1.cooked)
expect(post3.activity_pub_object.content).to eq(post3.cooked)
expect(post1.activity_pub_content).to eq(post1.cooked)
expect(post3.activity_pub_content).to eq(post3.cooked)
expect(post1.activity_pub_visibility).to eq("public")
expect(post3.activity_pub_visibility).to eq("public")
expect(
Expand Down
77 changes: 54 additions & 23 deletions spec/lib/discourse_activity_pub/content_parser_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,40 +3,40 @@
RSpec.describe DiscourseActivityPub::ContentParser do
describe "#get_note" do
it "handles div note in short post" do
expect(described_class.get_note("<div class='note'>hi</div> test")).to eq("hi")
expect(described_class.get_note("<div class='note'>hi</div> test")).to eq("<p>hi</p>")
end

it "handles div note in long post" do
html = <<~HTML
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis quam nulla, feugiat venenatis elementum ut, imperdiet eu nisl. Vestibulum dictum luctus tortor, vel consequat lectus tristique non. Sed aliquam at eros et lacinia. Nam viverra libero at tortor semper fringilla non ut velit. Mauris dignissim sapien sed felis consequat, quis ullamcorper augue viverra. Donec elementum nisl ut leo viverra, vel consequat diam facilisis. Donec leo arcu, dictum vel vestibulum sit amet, maximus sed neque. Vestibulum blandit metus ante, sit amet porta lorem maximus id. Suspendisse sed lacus sapien. Nulla dui dui, dapibus vitae quam ut, elementum ultrices ipsum. In congue laoreet eleifend. Sed tincidunt consequat dolor, volutpat posuere arcu molestie a. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos.</p>
<p>Pellentesque feugiat, elit ut aliquam fringilla, lectus eros rhoncus arcu, eget posuere ex velit ac purus. Morbi nec enim iaculis, lobortis lacus id, laoreet neque. Pellentesque et turpis a sapien tincidunt consequat quis a urna. Curabitur id ipsum vitae nisi dapibus tincidunt. Cras hendrerit nunc eget consectetur dapibus. Donec lacinia in sapien ac pellentesque. Phasellus at risus et lorem luctus pretium a eget leo. Nulla pellentesque metus libero, sit amet efficitur diam vehicula ac. Mauris ultrices erat non nulla volutpat tristique et sed arcu. </p><div class="note">hi</div><p></p>
HTML
expect(described_class.get_note(html)).to eq("hi")
expect(described_class.get_note(html)).to eq("<p>hi</p>")
end
end

describe "#get_content" do
it "respects the maxlength site setting for note excerpts" do
SiteSetting.activity_pub_note_excerpt_maxlength = 10
expected_excerpt = "This is a &hellip;"
expected_excerpt = "This is&hellip;"
post = Fabricate(:post_with_long_raw_content)
post.rebake!
expect(described_class.get_content(post)).to eq(expected_excerpt)
expect(described_class.get_content(post)).to eq("<p>This is…</p>")
end

it "does not apply a maxlength if the site setting is 0" do
SiteSetting.activity_pub_note_excerpt_maxlength = 0
post = Fabricate(:post_with_long_raw_content)
post.rebake!
expect(described_class.get_content(post)).to eq(
"This is a sample post with semi-long raw content. The raw content is also more than\ntwo hundred characters to satisfy any test conditions that require content longer\nthan the typical test post raw content. It really is some long content, folks.",
"<p>This is a sample post with semi-long raw content. The raw content is also more than two hundred characters to satisfy any test conditions that require content longer than the typical test post raw content. It really is some long content, folks.</p>",
)
end

it "respects [note] tags" do
content = "[note]This plugin is being developed[/note] by #pavilion for #Discourse"
post = Fabricate(:post, raw: content)
expect(described_class.get_content(post)).to eq("This plugin is being developed")
expect(described_class.get_content(post)).to eq("<p>This plugin is being developed</p>")
end

it "handles line breaks" do
Expand All @@ -50,7 +50,7 @@
Third line
STRING
post = Fabricate(:post, raw: content)
expect(described_class.get_content(post)).to eq("First line\nSecond line")
expect(described_class.get_content(post)).to eq("<p>First line</p><p>Second line</p>")
end

it "handles invalid line breaks" do
Expand All @@ -63,22 +63,24 @@
Third line
STRING
post = Fabricate(:post, raw: content)
expect(described_class.get_content(post)).to eq("First line\nSecond line\nThird line")
expect(described_class.get_content(post)).to eq(
"<p>First line</p><p>Second line</p><p>Third line</p>",
)
end

it "does not convert local hashtags" do
Fabricate(:category, name: "pavilion")
Fabricate(:tag, name: "Discourse")
content = "This plugin is being developed by #pavilion for #Discourse"
post = Fabricate(:post, raw: content)
expect(described_class.get_content(post)).to eq(content)
expect(described_class.get_content(post)).to eq("<p>#{content}</p>")
end

it "does not convert local mentions" do
Fabricate(:user, username: "angus")
content = "This plugin is being developed by @angus@mastodon.pavilion.tech"
post = Fabricate(:post, raw: content)
expect(described_class.get_content(post)).to eq(content)
expect(described_class.get_content(post)).to eq("<p>#{content}</p>")
end

context "with Article" do
Expand All @@ -94,32 +96,61 @@

context "with markdown" do
let(:raw_markdown) { <<~STRING }
# First Header
# First Header

## Second Header

### Third Header

#### Fourth Header

Paragraph

[Link](https://discourse.org)

> This is a quote

- This is an unordered list item

## Second Header
1. This is an ordered list item

### Third Header
```
This is a code block
```

#### Fourth Header
**This is strong text**

Paragraph
*This is emphasised text*

[Link](https://discourse.org)
STRING
let(:cooked_markdown) { <<~HTML }
<h1>First Header</h1>
<h2>Second Header</h2>
<h3>Third Header</h3>
<h4>Fourth Header</h4>
Paragraph
<a href="https://discourse.org">Link</a>
<h1>First Header</h1>
<h2>Second Header</h2>
<h3>Third Header</h3>
<h4>Fourth Header</h4>
<p>Paragraph</p>
<p><a href="https://discourse.org">Link</a></p>
<blockquote>
<p>This is a quote</p>
</blockquote>
<ul>
<li>This is an unordered list item</li>
</ul>
<ol>
<li>This is an ordered list item</li>
</ol>
<code class="lang-auto">This is a code block</code>
<p><strong>This is strong text</strong></p>
<p><em>This is emphasised text</em></p>
HTML
let!(:post) { Fabricate(:post, raw: raw_markdown) }

before { SiteSetting.activity_pub_note_excerpt_maxlength = 1000 }

it "returns html" do
expect(described_class.get_content(post)).to eq(cooked_markdown.strip)
expect(described_class.get_content(post)).to eq(
described_class.remove_newlines(cooked_markdown),
)
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion spec/lib/post_revisor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
updated_raw = "#{post.raw} revision inside note"
post_revisor.revise!(user, raw: updated_raw)
expect(post.reload.raw).to eq(updated_raw)
expect(post.activity_pub_content).to eq(updated_raw)
expect(post.activity_pub_content).to eq("<p>#{updated_raw}</p>")
end
end

Expand Down
8 changes: 4 additions & 4 deletions spec/models/post_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@

it "sets the post content" do
post.activity_pub_publish!
expect(post.reload.activity_pub_content).to eq(post.raw)
expect(post.reload.activity_pub_content).to eq(post.cooked)
end

it "sets the post visibility" do
Expand Down Expand Up @@ -188,7 +188,7 @@

it "sets the post content" do
post.activity_pub_publish!
expect(post.reload.activity_pub_content).to eq(post.raw)
expect(post.reload.activity_pub_content).to eq(post.cooked)
end

it "sets the post visibility" do
Expand Down Expand Up @@ -217,7 +217,7 @@

it "sets the post content" do
post.activity_pub_publish!
expect(post.reload.activity_pub_content).to eq(post.raw)
expect(post.reload.activity_pub_content).to eq(post.cooked)
end

it "sets the post visibility" do
Expand Down Expand Up @@ -249,7 +249,7 @@

it "sets the post content" do
post.activity_pub_publish!
expect(post.reload.activity_pub_content).to eq(post.raw)
expect(post.reload.activity_pub_content).to eq(post.cooked)
end

it "sets the post visibility" do
Expand Down