From 5f0e390cac5199ef4789d0923d4305a7f664ce2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Wed, 15 May 2024 16:57:19 +0200 Subject: [PATCH] Revert "Sanitize doc generation" (#14595) --- lib/.shards.info | 3 - lib/sanitize/.circleci/config.yml | 94 ---- lib/sanitize/.editorconfig | 9 - lib/sanitize/.gitignore | 9 - lib/sanitize/LICENSE | 202 -------- lib/sanitize/Makefile | 54 -- lib/sanitize/README.md | 128 ----- lib/sanitize/lib | 1 - lib/sanitize/scripts/generate-docs.sh | 18 - lib/sanitize/shard.yml | 15 - lib/sanitize/spec/html_sanitizer/basic.hrx | 70 --- lib/sanitize/spec/html_sanitizer/class.hrx | 34 -- .../spec/html_sanitizer/combined_policies.hrx | 42 -- .../html_sanitizer/combined_policies_spec.cr | 11 - lib/sanitize/spec/html_sanitizer/default.hrx | 138 ----- .../html_sanitizer/html_sanitizer_spec.cr | 102 ---- lib/sanitize/spec/html_sanitizer/img.hrx | 46 -- lib/sanitize/spec/html_sanitizer/links.hrx | 89 ---- .../protocol-based-javascript.hrx | 160 ------ .../html_sanitizer/protocol_javascript.hrx | 67 --- lib/sanitize/spec/html_sanitizer/url_spec.cr | 8 - lib/sanitize/spec/html_sanitizer/xss.hrx | 476 ------------------ lib/sanitize/spec/spec_helper.cr | 1 - lib/sanitize/spec/support/hrx.cr | 83 --- lib/sanitize/spec/text_policy.hrx | 67 --- lib/sanitize/spec/text_policy_spec.cr | 17 - lib/sanitize/spec/uri_sanitizer_spec.cr | 130 ----- lib/sanitize/src/adapter/libxml2.cr | 137 ----- lib/sanitize/src/policy.cr | 45 -- lib/sanitize/src/policy/html_sanitizer.cr | 350 ------------- .../src/policy/html_sanitizer/safelist.cr | 70 --- lib/sanitize/src/policy/text.cr | 23 - lib/sanitize/src/policy/whitelist.cr | 57 --- lib/sanitize/src/processor.cr | 110 ---- lib/sanitize/src/sanitize.cr | 5 - lib/sanitize/src/uri_sanitizer.cr | 107 ---- shard.lock | 4 - shard.yml | 5 +- .../crystal/tools/doc/doc_renderer_spec.cr | 30 +- .../crystal/tools/doc/markd_doc_renderer.cr | 23 - 40 files changed, 3 insertions(+), 3037 deletions(-) delete mode 100644 lib/sanitize/.circleci/config.yml delete mode 100644 lib/sanitize/.editorconfig delete mode 100644 lib/sanitize/.gitignore delete mode 100644 lib/sanitize/LICENSE delete mode 100644 lib/sanitize/Makefile delete mode 100644 lib/sanitize/README.md delete mode 120000 lib/sanitize/lib delete mode 100755 lib/sanitize/scripts/generate-docs.sh delete mode 100644 lib/sanitize/shard.yml delete mode 100644 lib/sanitize/spec/html_sanitizer/basic.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/class.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/combined_policies.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/combined_policies_spec.cr delete mode 100644 lib/sanitize/spec/html_sanitizer/default.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/html_sanitizer_spec.cr delete mode 100644 lib/sanitize/spec/html_sanitizer/img.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/links.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/protocol-based-javascript.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/protocol_javascript.hrx delete mode 100644 lib/sanitize/spec/html_sanitizer/url_spec.cr delete mode 100644 lib/sanitize/spec/html_sanitizer/xss.hrx delete mode 100644 lib/sanitize/spec/spec_helper.cr delete mode 100644 lib/sanitize/spec/support/hrx.cr delete mode 100644 lib/sanitize/spec/text_policy.hrx delete mode 100644 lib/sanitize/spec/text_policy_spec.cr delete mode 100644 lib/sanitize/spec/uri_sanitizer_spec.cr delete mode 100644 lib/sanitize/src/adapter/libxml2.cr delete mode 100644 lib/sanitize/src/policy.cr delete mode 100644 lib/sanitize/src/policy/html_sanitizer.cr delete mode 100644 lib/sanitize/src/policy/html_sanitizer/safelist.cr delete mode 100644 lib/sanitize/src/policy/text.cr delete mode 100644 lib/sanitize/src/policy/whitelist.cr delete mode 100644 lib/sanitize/src/processor.cr delete mode 100644 lib/sanitize/src/sanitize.cr delete mode 100644 lib/sanitize/src/uri_sanitizer.cr diff --git a/lib/.shards.info b/lib/.shards.info index b15cf54122a0..7f03bb906410 100644 --- a/lib/.shards.info +++ b/lib/.shards.info @@ -7,6 +7,3 @@ shards: reply: git: https://github.com/i3oris/reply.git version: 0.3.1+git.commit.90a7eb5a76048884d5d56bf6b9369f1e67fdbcd7 - sanitize: - git: https://github.com/straight-shoota/sanitize.git - version: 0.1.0+git.commit.75c141b619c77956e88f557149566cd28876398b diff --git a/lib/sanitize/.circleci/config.yml b/lib/sanitize/.circleci/config.yml deleted file mode 100644 index df9b752af31d..000000000000 --- a/lib/sanitize/.circleci/config.yml +++ /dev/null @@ -1,94 +0,0 @@ -version: 2 - -dry: - restore_shards_cache: &restore_shards_cache - keys: - - shards-cache-v1-{{ .Branch }}-{{ checksum "shard.yml" }} - - shards-cache-v1-{{ .Branch }} - - shards-cache-v1 - - save_shards_cache: &save_shards_cache - key: shards-cache-v1-{{ .Branch }}-{{ checksum "shard.yml" }} - paths: - - ./shards-cache - -jobs: - test: - docker: - - image: crystallang/crystal:latest - environment: - SHARDS_CACHE_PATH: ./shards-cache - steps: - - run: crystal --version - - - checkout - - - restore_cache: *restore_shards_cache - - run: shards - - save_cache: *save_shards_cache - - - run: make test - - - run: crystal tool format --check spec src - - deploy-docs: - docker: - - image: crystallang/crystal:latest - environment: - SHARDS_CACHE_PATH: ./shards-cache - steps: - - run: crystal --version - - - checkout - - - run: scripts/generate-docs.sh - - - run: apt update && apt install -y curl rsync - - run: - command: curl https://raw.githubusercontent.com/straight-shoota/autodeploy-docs/master/autodeploy-docs.sh | bash - environment: - GIT_COMMITTER_NAME: cirlceci - GIT_COMMITTER_EMAIL: circle@circleci.com - - test-on-nightly: - docker: - - image: crystallang/crystal:nightly - environment: - SHARDS_CACHE_PATH: ./shards-cache - steps: - - run: crystal --version - - - checkout - - - restore_cache: *restore_shards_cache - - run: shards - - - run: make test - - - run: crystal tool format --check spec src - -workflows: - version: 2 - # Run tests on every single commit - ci: - jobs: - - test - # Build and depoy docs only on master branch - - deploy-docs: - requires: - - test - filters: &master-only - branches: - only: - - master - # Run tests every night using crystal nightly - nightly: - triggers: - - schedule: - cron: "0 4 * * *" - filters: - branches: - only: - - master - jobs: - - test-on-nightly diff --git a/lib/sanitize/.editorconfig b/lib/sanitize/.editorconfig deleted file mode 100644 index 163eb75c8525..000000000000 --- a/lib/sanitize/.editorconfig +++ /dev/null @@ -1,9 +0,0 @@ -root = true - -[*.cr] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -indent_style = space -indent_size = 2 -trim_trailing_whitespace = true diff --git a/lib/sanitize/.gitignore b/lib/sanitize/.gitignore deleted file mode 100644 index 0bbd4a9f41e1..000000000000 --- a/lib/sanitize/.gitignore +++ /dev/null @@ -1,9 +0,0 @@ -/docs/ -/lib/ -/bin/ -/.shards/ -*.dwarf - -# Libraries don't need dependency lock -# Dependencies will be locked in applications that use them -/shard.lock diff --git a/lib/sanitize/LICENSE b/lib/sanitize/LICENSE deleted file mode 100644 index d64569567334..000000000000 --- a/lib/sanitize/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/lib/sanitize/Makefile b/lib/sanitize/Makefile deleted file mode 100644 index 980fc7a52014..000000000000 --- a/lib/sanitize/Makefile +++ /dev/null @@ -1,54 +0,0 @@ --include Makefile.local # for optional local options - -BUILD_TARGET ::= bin/app - -# The shards command to use -SHARDS ?= shards -# The crystal command to use -CRYSTAL ?= crystal - -SRC_SOURCES ::= $(shell find src -name '*.cr' 2>/dev/null) -LIB_SOURCES ::= $(shell find lib -name '*.cr' 2>/dev/null) -SPEC_SOURCES ::= $(shell find spec -name '*.cr' 2>/dev/null) - -.PHONY: test -test: ## Run the test suite -test: lib - $(CRYSTAL) spec - -.PHONY: format -format: ## Apply source code formatting -format: $(SRC_SOURCES) $(SPEC_SOURCES) - $(CRYSTAL) tool format src spec - -docs: ## Generate API docs -docs: $(SRC_SOURCES) lib - $(CRYSTAL) docs -o docs - -lib: shard.lock - $(SHARDS) install - -shard.lock: shard.yml - $(SHARDS) update - -.PHONY: clean -clean: ## Remove application binary -clean: - @rm -f $(BUILD_TARGET) - -.PHONY: help -help: ## Show this help - @echo - @printf '\033[34mtargets:\033[0m\n' - @grep -hE '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) |\ - sort |\ - awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' - @echo - @printf '\033[34moptional variables:\033[0m\n' - @grep -hE '^[a-zA-Z_-]+ \?=.*?## .*$$' $(MAKEFILE_LIST) |\ - sort |\ - awk 'BEGIN {FS = " \\?=.*?## "}; {printf " \033[36m%-15s\033[0m %s\n", $$1, $$2}' - @echo - @printf '\033[34mrecipes:\033[0m\n' - @grep -hE '^##.*$$' $(MAKEFILE_LIST) |\ - awk 'BEGIN {FS = "## "}; /^## [a-zA-Z_-]/ {printf " \033[36m%s\033[0m\n", $$2}; /^## / {printf " %s\n", $$2}' diff --git a/lib/sanitize/README.md b/lib/sanitize/README.md deleted file mode 100644 index fdca90db33a1..000000000000 --- a/lib/sanitize/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# sanitize - -`sanitize` is a Crystal library for transforming HTML/XML trees. It's primarily -used to sanitize HTML from untrusted sources in order to prevent -[XSS attacks](http://en.wikipedia.org/wiki/Cross-site_scripting) and other -adversities. - -It builds on stdlib's [`XML`](https://crystal-lang.org/api/XML.html) module to -parse HTML/XML. Based on [libxml2](http://xmlsoft.org/) it's a solid parser and -turns malformed and malicious input into valid and safe markup. - -* Code: [https://github.com/straight-shoota/sanitize](https://github.com/straight-shoota/sanitize) -* API docs: [https://straight-shoota.github.io/sanitize/api/latest/](https://straight-shoota.github.io/sanitize/api/latest/) -* Issue tracker: [https://github.com/straight-shoota/sanitize/issues](https://github.com/straight-shoota/sanitize/issues) -* Shardbox: [https://shardbox.org/shards/sanitize](https://shardbox.org/shards/sanitize) - -## Installation - -1. Add the dependency to your `shard.yml`: - - ```yaml - dependencies: - sanitize: - github: straight-shoota/sanitize - ``` - -2. Run `shards install` - -## Sanitization Features - -The `Sanitize::Policy::HTMLSanitizer` policy applies the following sanitization steps. Except -for the first one (which is essential to the entire process), all can be disabled -or configured. - -* Turns malformed and malicious HTML into valid and safe markup. -* Strips HTML elements and attributes not included in the safe list. -* Sanitizes URL attributes (like `href` or `src`) with customizable sanitization - policy. -* Adds `rel="nofollow"` to all links and `rel="noopener"` to links with `target`. -* Validates values of accepted attributes `align`, `width` and `height`. -* Filters `class` attributes based on a whitelist (by default all classes are - rejected). - -## Usage - -Transformation is based on rules defined by `Sanitize::Policy` implementations. - -The recommended standard policy for HTML sanitization is `Sanitize::Policy::HTMLSanitizer.common` -which represents good defaults for most use cases. -It sanitizes user input against a known safe list of accepted elements and their -attributes. - -```crystal -require "sanitize" - -sanitizer = Sanitize::Policy::HTMLSanitizer.common -sanitizer.process(%(foo)) # => %(foo) -sanitizer.process(%(

foo

)) # => %(

foo

) -sanitizer.process(%()) # => %() -sanitizer.process(%(
foobar
)) # => %(
foobar
) -``` - -Sanitization should always run after any other processing (for example rendering -Markdown) and is a must when including HTML from untrusted sources into a web -page. - -### With Markd - -A typical format for user generated content is `Markdown`. Even though it has -only a very limited feature set compared to HTML, it can still produce -potentially harmful HTML and is is usually possible to embed raw HTML directly. -So Sanitization is necessary. - -The most common Markdown renderer is [markd](https://shardbox.org/shards/markd), -so here is a sample how to use it with `sanitize`: - -````crystal -sanitizer = Sanitize::Policy::HTMLSanitizer.common -# Allow classes with `language-` prefix which are used for syntax highlighting. -sanitizer.valid_classes << /language-.+/ - -markdown = <<-MD - Sanitization with [https://shardbox.org/shards/sanitize](sanitize) is not that - **difficult**. - ```cr - puts "Hello World!" - ``` -

Hello world!

- MD - -html = Markd.to_html(markdown) -sanitized = sanitizer.process(html) -puts sanitized -```` - -The result: - -```html -

Sanitization with https://shardbox.org/shards/sanitize is not that -difficult.

-
puts "Hello World!"
-
-

Hello world!

-``` - -## Limitations - -Sanitizing CSS is not supported. Thus `style` attributes can't be accepted in a -safe way. -CSS sanitization features may be added when a CSS parsing library is available. - -## Security - -If you want to privately disclose security-issues, please contact -[straightshoota](https://keybase.io/straightshoota) on Keybase or -[straightshoota@gmail.com](mailto:straightshoota@gmail.com) (PGP: `DF2D C9E9 FFB9 6AE0 2070 D5BC F0F3 4963 7AC5 087A`). - -## Contributing - -1. Fork it ([https://github.com/straight-shoota/sanitize/fork](https://github.com/straight-shoota/sanitize/fork)) -2. Create your feature branch (`git checkout -b my-new-feature`) -3. Commit your changes (`git commit -am 'Add some feature'`) -4. Push to the branch (`git push origin my-new-feature`) -5. Create a new Pull Request - -## Contributors - -- [Johannes Müller](https://github.com/straight-shoota) - creator and maintainer diff --git a/lib/sanitize/lib b/lib/sanitize/lib deleted file mode 120000 index a96aa0ea9d8c..000000000000 --- a/lib/sanitize/lib +++ /dev/null @@ -1 +0,0 @@ -.. \ No newline at end of file diff --git a/lib/sanitize/scripts/generate-docs.sh b/lib/sanitize/scripts/generate-docs.sh deleted file mode 100755 index 5dbaf344c48d..000000000000 --- a/lib/sanitize/scripts/generate-docs.sh +++ /dev/null @@ -1,18 +0,0 @@ -#! /usr/bin/env bash - -set -e - -GENERATED_DOCS_DIR="./docs" - -echo -e "Building docs into ${GENERATED_DOCS_DIR}" -echo -e "Clearing ${GENERATED_DOCS_DIR} directory" -rm -rf "${GENERATED_DOCS_DIR}" - -echo -e "Running \`make docs\`..." -make docs - -echo -e "Copying README.md" - -# "{{" and "{%"" need to be escaped, otherwise Jekyll might interpret the expressions (on Github Pages) -ESCAPE_TEMPLATE='s/{{/{{"{{"}}/g; s/{\%/{{"{%"}}/g;' -sed "${ESCAPE_TEMPLATE}" README.md > "${GENERATED_DOCS_DIR}/README.md" diff --git a/lib/sanitize/shard.yml b/lib/sanitize/shard.yml deleted file mode 100644 index eb9158fc58e4..000000000000 --- a/lib/sanitize/shard.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: sanitize -version: 0.1.0 - -authors: - - Johannes Müller - -crystal: 0.35.0 - -license: Apache-2.0 - -documentation: https://straight-shoota.github.io/sanitize/api/latest/ - -development_dependencies: - hrx: - github: straight-shoota/hrx diff --git a/lib/sanitize/spec/html_sanitizer/basic.hrx b/lib/sanitize/spec/html_sanitizer/basic.hrx deleted file mode 100644 index fe291053b1dc..000000000000 --- a/lib/sanitize/spec/html_sanitizer/basic.hrx +++ /dev/null @@ -1,70 +0,0 @@ -<===> empty/document.html -<===> - - -<===> pending:skeleton/document.html - - - - - -<===> - - -<===> invalid/fragment.html -foo

bar

bazz
quux
-<===> invalid/common.html -foo

bar

bazz
quux
-<===> - - - -<===> invalid-div/fragment.html -foo

bar

bazz
quux
-<===> invalid-div/common.html -foo

bar

bazz quux -<===> - - -<===> basic/fragment.html -Lorem ipsum dolor sit
amet -<===> basic/common.html -Lorem ipsum dolor sit
amet -<===> - - -<===> malformed/fragment.html -Lorem dolor sit
amet -<===> malicious/common.html -Lorem ipsum dolor sit
amet <script>alert("hello world"); -<===> - - -<===> target="_blank"/fragment.html -foo -<===> target="_blank"/common.html -foo -<===> - - -<===> percent encoded URL/fragment.html -CI Status -<===> percent encoded URL/common.html -CI Status -<===> diff --git a/lib/sanitize/spec/html_sanitizer/class.hrx b/lib/sanitize/spec/html_sanitizer/class.hrx deleted file mode 100644 index 897c1c2f4606..000000000000 --- a/lib/sanitize/spec/html_sanitizer/class.hrx +++ /dev/null @@ -1,34 +0,0 @@ -<===> reject/fragment.html -
-<===> reject/common.html -
-<===> reject/allow-prefix.html -
-<===> - - -<===> allow-with-prefix/fragment.html -
-<===> allow-with-prefix/common.html -
-<===> allow-with-prefix/allow-prefix.html -
-<===> - - -<===> reject-non-prefix/fragment.html -
-<===> reject-non-prefix/common.html -
-<===> reject-non-prefix/allow-prefix.html -
-<===> - - -<===> allow-explicit/fragment.html -
-<===> allow-explicit/common.html -
-<===> allow-explicit/allow-prefix.html -
-<===> diff --git a/lib/sanitize/spec/html_sanitizer/combined_policies.hrx b/lib/sanitize/spec/html_sanitizer/combined_policies.hrx deleted file mode 100644 index 01de9ae3f93f..000000000000 --- a/lib/sanitize/spec/html_sanitizer/combined_policies.hrx +++ /dev/null @@ -1,42 +0,0 @@ -<===> basic/fragment.html -Lorem ipsum dolor sit
amet -<===> basic/text.html -Lorem ipsum dolor sit amet -<===> basic/inline.html -Lorem ipsum dolor sit amet -<===> basic/common.html -Lorem ipsum dolor sit
amet -<===> - - -<===> malformed/fragment.html -Lorem dolor sit
amet -<===> malicious/text.html -Lorem ipsum dolor sit amet <script>alert("hello world"); -<===> malicious/inline.html -Lorem ipsum dolor sit amet <script>alert("hello world"); -<===> malicious/common.html -Lorem ipsum dolor sit
amet <script>alert("hello world"); -<===> diff --git a/lib/sanitize/spec/html_sanitizer/combined_policies_spec.cr b/lib/sanitize/spec/html_sanitizer/combined_policies_spec.cr deleted file mode 100644 index 5751fba7b459..000000000000 --- a/lib/sanitize/spec/html_sanitizer/combined_policies_spec.cr +++ /dev/null @@ -1,11 +0,0 @@ -require "../support/hrx" -require "../../src/processor" -require "../../src/policy/html_sanitizer" -require "../../src/policy/text" - -run_hrx_samples Path["./combined_policies.hrx"], { - "text" => Sanitize::Policy::Text.new, - "inline" => Sanitize::Policy::HTMLSanitizer.inline.no_links, - "basic" => Sanitize::Policy::HTMLSanitizer.basic, - "common" => Sanitize::Policy::HTMLSanitizer.common, -} diff --git a/lib/sanitize/spec/html_sanitizer/default.hrx b/lib/sanitize/spec/html_sanitizer/default.hrx deleted file mode 100644 index 627adb8fc7fd..000000000000 --- a/lib/sanitize/spec/html_sanitizer/default.hrx +++ /dev/null @@ -1,138 +0,0 @@ -<===> invalid/fragment.html -foo

bar

bazz
quux
-<===> invalid/stripped.html -foo

bar

bazz
quux
-<===> invalid/escaped.html -<invalid>foo<p>bar</p>bazz</invalid>
quux
-<===> invalid/pruned.html -
quux
-<===> - - -<===> bad_argument/fragment.html -
foo
-<===> bad_argument/stripped.html -
foo
-<===> - -<==> whitewash/fragment.html -no
foo
bar -<==> whitewash/pruned.html -
foo
-<==> - - -<===> nofollow/fragment.html -Click here -<===> nofollow/stripped.html -Click here -<===> - - -<===> nofollow-rel/fragment.html -Click here -<===> nofollow-rel/stripped.html -Click here -<===> - - -<===> unprintable/fragment.html -Lo\u2029ofah ro\u2028cks! -<===> unprintable/stripped.html -Loofah rocks! -<===> - - -<===> msword/fragment.html - - -

Foo BOLD

-<===> msword/stripped.html - - -

Foo BOLD

-<===> - - -<===> entities/fragment.html -

foo bar

-<===> - - -<===> align/fragment.html -

foo

-<===> - - -<===> align-empty/fragment.html -

foo

-<===> align-empty/common.html -

foo

-<===> - - -<===> align-invalid/fragment.html -

foo

-<===> align-invalid/common.html -

foo

-<===> diff --git a/lib/sanitize/spec/html_sanitizer/html_sanitizer_spec.cr b/lib/sanitize/spec/html_sanitizer/html_sanitizer_spec.cr deleted file mode 100644 index f70a965345fa..000000000000 --- a/lib/sanitize/spec/html_sanitizer/html_sanitizer_spec.cr +++ /dev/null @@ -1,102 +0,0 @@ -require "../support/hrx" -require "../../src/policy/html_sanitizer" - -describe Sanitize::Policy::HTMLSanitizer do - it "removes invalid element" do - Sanitize::Policy::HTMLSanitizer.common.process("

foobar

").should eq "

foobar

" - end - - it "inserts whitespace for removed block tag" do - Sanitize::Policy::HTMLSanitizer.common.process("

foo

bar
baz

").should eq "

foo bar baz

" - end - - it "strips tag with invalid URL attribute" do - Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() - Sanitize::Policy::HTMLSanitizer.common.process(%(foo)).should eq "foo" - end - - it "escapes URL attribute" do - Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() - end - - it %(adds rel="noopener" on target="_blank") do - policy = Sanitize::Policy::HTMLSanitizer.common - policy.process(%(foo)).should eq(%(foo)) - policy.accepted_attributes["a"] << "target" - policy.process(%(foo)).should eq(%(foo)) - end - - it "doesn't leak configuration" do - policy = Sanitize::Policy::HTMLSanitizer.common - policy.accepted_attributes["p"] << "invalid" - policy.process(%(

bar

)).should eq(%(

bar

)) - Sanitize::Policy::HTMLSanitizer.common.process(%(

bar

)).should eq(%(

bar

)) - end - - describe "html scaffold" do - it "fragment" do - Sanitize::Policy::HTMLSanitizer.common.process("FOO

BAR

").should eq "FOO

BAR

" - end - - it "document" do - sanitizer = Sanitize::Policy::HTMLSanitizer.common - sanitizer.accept_tag("html") - sanitizer.accept_tag("head") - sanitizer.accept_tag("body") - sanitizer.process_document("FOO

BAR

").should eq "FOO

BAR

\n" - end - end - - describe "#transform_classes" do - it "strips classes by default" do - policy = Sanitize::Policy::HTMLSanitizer.inline - orig_attributes = {"class" => "foo bar baz"} - attributes = orig_attributes.clone - policy.transform_classes("div", attributes) - attributes.should eq Hash(String, String).new - end - - it "accepts classes" do - policy = Sanitize::Policy::HTMLSanitizer.inline - orig_attributes = {"class" => "foo bar baz"} - attributes = orig_attributes.clone - - policy.valid_classes << /fo*/ - policy.valid_classes << "bar" - policy.transform_classes("div", attributes) - attributes.should eq({"class" => "foo bar"}) - end - - it "only matches full class name" do - policy = Sanitize::Policy::HTMLSanitizer.inline - orig_attributes = {"class" => "foobar barfoo barfoobaz foo fom"} - attributes = orig_attributes.clone - - policy.valid_classes << /fo./ - policy.transform_classes("div", attributes) - attributes.should eq({"class" => "foo fom"}) - end - end - - run_hrx_samples Path["basic.hrx"], { - "common" => Sanitize::Policy::HTMLSanitizer.common, - } - run_hrx_samples Path["protocol_javascript.hrx"], { - "common" => Sanitize::Policy::HTMLSanitizer.common, - } - run_hrx_samples Path["links.hrx"], { - "common" => Sanitize::Policy::HTMLSanitizer.common, - } - run_hrx_samples Path["xss.hrx"], { - "common" => Sanitize::Policy::HTMLSanitizer.common, - } - run_hrx_samples Path["img.hrx"], { - "common" => Sanitize::Policy::HTMLSanitizer.common, - } - run_hrx_samples Path["class.hrx"], { - "common" => Sanitize::Policy::HTMLSanitizer.common, - "allow-prefix" => Sanitize::Policy::HTMLSanitizer.common.tap { |sanitizer| - sanitizer.valid_classes = Set{/allowed-.+/, "explicitly-allowed"} - }, - } -end diff --git a/lib/sanitize/spec/html_sanitizer/img.hrx b/lib/sanitize/spec/html_sanitizer/img.hrx deleted file mode 100644 index 1fd81d00d687..000000000000 --- a/lib/sanitize/spec/html_sanitizer/img.hrx +++ /dev/null @@ -1,46 +0,0 @@ -<===> img/fragment.html - -<===> - - -<===> img with width/fragment.html - -<===> - - -<===> img with height/fragment.html - -<===> - - -<===> img with width and height/fragment.html - -<===> - - -<===> img invalid height/fragment.html - -<===> img invalid height/common.html - -<===> - - -<===> img invalid width/fragment.html - -<===> img invalid width/common.html - -<===> - - - -<===> img invalid width and height/fragment.html - -<===> img invalid width and height/common.html - -<===> - - - -<===> img percent width and height/fragment.html - -<===> diff --git a/lib/sanitize/spec/html_sanitizer/links.hrx b/lib/sanitize/spec/html_sanitizer/links.hrx deleted file mode 100644 index 104740825fab..000000000000 --- a/lib/sanitize/spec/html_sanitizer/links.hrx +++ /dev/null @@ -1,89 +0,0 @@ -<===> links/1/fragment.html - -<===> links/1/common.html - -<===> - - -<===> links/2/fragment.html - -<===> links/2/common.html - -<===> - - -<===> links/3/fragment.html - -<===> links/3/common.html - -<===> - - -<===> links/4/fragment.html - -<===> links/4/common.html - -<===> - - -<===> links/5/fragment.html - -<===> links/5/common.html - -<===> - - -<===> links/6/fragment.html - -<===> links/6/common.html - -<===> - - -<===> links/7/fragment.html - -<===> links/7/common.html - -<===> - - -<===> links/8/fragment.html - -<===> links/8/common.html - -<===> - - -<===> links/9/fragment.html - -<===> links/9/common.html - -<===> - - -<===> links/10/fragment.html - -<===> links/10/common.html - -<===> - - -<===> links/11/fragment.html -Red dot -<===> links/11/common.html -Red dot -<===> - - -<===> links/12/fragment.html - -<===> links/12/common.html - -<===> - - -<===> links/13/fragment.html - -<===> links/13/common.html - -<===> diff --git a/lib/sanitize/spec/html_sanitizer/protocol-based-javascript.hrx b/lib/sanitize/spec/html_sanitizer/protocol-based-javascript.hrx deleted file mode 100644 index 16576ea78f80..000000000000 --- a/lib/sanitize/spec/html_sanitizer/protocol-based-javascript.hrx +++ /dev/null @@ -1,160 +0,0 @@ - -<===> simple, no spaces/fragment.html -foo -<===> simple, no spaces/common.html -foo -<===> simple, no spaces/restricted.html -foo -<===> simple, no spaces/basic.html -foo -<===> simple, no spaces/relaxed.html -foo - -<===> simple, spaces before/fragment.html -foo -<===> simple, spaces before/common.html -foo -<===> simple, spaces before/restricted.html -foo -<===> simple, spaces before/basic.html -foo -<===> simple, spaces before/relaxed.html -foo - -<===> simple, spaces after/fragment.html -foo -<===> simple, spaces after/common.html -foo -<===> simple, spaces after/restricted.html -foo -<===> simple, spaces after/basic.html -foo -<===> simple, spaces after/relaxed.html -foo - -<===> simple, spaces before and after/fragment.html -foo -<===> simple, spaces before and after/common.html -foo -<===> simple, spaces before and after/restricted.html -foo -<===> simple, spaces before and after/basic.html -foo -<===> simple, spaces before and after/relaxed.html -foo - -<===> preceding colon/fragment.html -foo -<===> preceding colon/common.html -foo -<===> preceding colon/restricted.html -foo -<===> preceding colon/basic.html -foo -<===> preceding colon/relaxed.html -foo - -<===> UTF-8 encoding/fragment.html -foo -<===> UTF-8 encoding/common.html -foo -<===> UTF-8 encoding/restricted.html -foo -<===> UTF-8 encoding/basic.html -foo -<===> UTF-8 encoding/relaxed.html -foo - -<===> long UTF-8 encoding/fragment.html -foo -<===> long UTF-8 encoding/common.html -foo -<===> long UTF-8 encoding/restricted.html -foo -<===> long UTF-8 encoding/basic.html -foo -<===> long UTF-8 encoding/relaxed.html -foo - -<===> long UTF-8 encoding without semicolons/fragment.html -foo -<===> long UTF-8 encoding without semicolons/common.html -foo -<===> long UTF-8 encoding without semicolons/restricted.html -foo -<===> long UTF-8 encoding without semicolons/basic.html -foo -<===> long UTF-8 encoding without semicolons/relaxed.html -foo - -<===> hex encoding/fragment.html -foo -<===> hex encoding/common.html -foo -<===> hex encoding/restricted.html -foo -<===> hex encoding/basic.html -foo -<===> hex encoding/relaxed.html -foo - -<===> long hex encoding/fragment.html -foo -<===> long hex encoding/common.html -foo -<===> long hex encoding/restricted.html -foo -<===> long hex encoding/basic.html -foo -<===> long hex encoding/relaxed.html -foo - -<===> hex encoding without semicolons/fragment.html -foo -<===> hex encoding without semicolons/common.html -foo -<===> hex encoding without semicolons/restricted.html -foo -<===> hex encoding without semicolons/basic.html -foo -<===> hex encoding without semicolons/relaxed.html -foo - -<===> null char/fragment.html - -<===> null char/common.html -<===> null char/restricted.html -<===> null char/basic.html -<===> null char/relaxed.html -<===> invalid URL char/fragment.html - -<===> invalid URL char/common.html - -<===> invalid URL char/restricted.html - -<===> invalid URL char/basic.html - -<===> invalid URL char/relaxed.html - - -<===> spaces and entities/fragment.html - -<===> spaces and entities/common.html - -<===> spaces and entities/restricted.html - -<===> spaces and entities/basic.html - -<===> spaces and entities/relaxed.html - - -<===> protocol whitespace/fragment.html - -<===> protocol whitespace/common.html - -<===> protocol whitespace/restricted.html - -<===> protocol whitespace/basic.html - -<===> protocol whitespace/relaxed.html - diff --git a/lib/sanitize/spec/html_sanitizer/protocol_javascript.hrx b/lib/sanitize/spec/html_sanitizer/protocol_javascript.hrx deleted file mode 100644 index fc4b86c50d29..000000000000 --- a/lib/sanitize/spec/html_sanitizer/protocol_javascript.hrx +++ /dev/null @@ -1,67 +0,0 @@ -<===> simple, no spaces/fragment.html -foo -<===> simple, no spaces/common.html -foo -<===> simple, spaces before/fragment.html -foo -<===> -# TODO: Maybe this should strip the a tag -<===> simple, spaces before/common.html -foo -<===> simple, spaces after/fragment.html -foo -<===> simple, spaces after/common.html -foo -<===> simple, spaces before and after/fragment.html -foo -<===> -# TODO: Maybe this should strip the a tag -<===> simple, spaces before and after/common.html -foo -<===> preceding colon/fragment.html -foo -<===> -# TODO: Maybe this should strip the a tag -<===> preceding colon/common.html -foo -<===> UTF-8 encoding/fragment.html -foo -<===> UTF-8 encoding/common.html -foo -<===> long UTF-8 encoding/fragment.html -foo -<===> long UTF-8 encoding/common.html -foo -<===> long UTF-8 encoding without semicolons/fragment.html -foo -<===> long UTF-8 encoding without semicolons/common.html -foo -<===> hex encoding/fragment.html -foo -<===> hex encoding/common.html -foo -<===> long hex encoding/fragment.html -foo -<===> long hex encoding/common.html -foo -<===> hex encoding without semicolons/fragment.html -foo -<===> hex encoding without semicolons/common.html -foo -<===> null char/fragment.html - -<===> -# TODO: Maybe this should strip the a tag -<===> null char/common.html - -<===> invalid URL char/fragment.html - -<===> -# TODO: Maybe this should strip the a tag -<===> invalid URL char/common.html - -<===> spaces and entities/fragment.html - -<===> spaces and entities/common.html - -<===> diff --git a/lib/sanitize/spec/html_sanitizer/url_spec.cr b/lib/sanitize/spec/html_sanitizer/url_spec.cr deleted file mode 100644 index 5e1aade7ae90..000000000000 --- a/lib/sanitize/spec/html_sanitizer/url_spec.cr +++ /dev/null @@ -1,8 +0,0 @@ -require "../support/hrx" -require "../../src/policy/html_sanitizer" - -describe "Sanitize::Policy::HTMLSanitizer" do - it "escapes URL attribute" do - Sanitize::Policy::HTMLSanitizer.common.process(%()).should eq %() - end -end diff --git a/lib/sanitize/spec/html_sanitizer/xss.hrx b/lib/sanitize/spec/html_sanitizer/xss.hrx deleted file mode 100644 index 4f2e238944c7..000000000000 --- a/lib/sanitize/spec/html_sanitizer/xss.hrx +++ /dev/null @@ -1,476 +0,0 @@ -<===> # Basic XSS -<===> fragment.html -test -<===> common.html -test -<===> - - -# Pending because libxml2 behaviour changed in 2.9.13 (https://gitlab.gnome.org/GNOME/libxml2/-/issues/339) -<===> pending:fragment.html -<<<>< -<===> common.html - -<===> - - -<===> fragment.html - -<===> -` -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html -
-<===> - - -<===> fragment.html -
-<===> common.html -
-<===> - - -<===> fragment.html -
-<===> common.html -
-<===> - - -<===> fragment.html -
-<===> common.html -
-<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html - -<===> - -<===> common.html - -<===> - - -<===> fragment.html - -<===> common.html - -<===> - - -<===> fragment.html -PT SRC="http://ha.ckers.org/xss.js"> -<===> common.html -PT SRC="http://ha.ckers.org/xss.js"> -<===> - - -<===> fragment.html - -<==> complex/text.html -Lorem ipsum dolor sit amet -<==> - - -# Pending because libxml2 behaviour changed in 2.9.13 (https://gitlab.gnome.org/GNOME/libxml2/-/issues/339) -<==> pending:html-special-chars/fragment.html -<script> -<==> pending:html-special-chars/text.html -<script> -<==> - - -<==> prune script/fragment.html - -<==> prune script/text.html -<==> - - -<==> prune style/fragment.html - -<==> prune script/text.html -<==> diff --git a/lib/sanitize/spec/text_policy_spec.cr b/lib/sanitize/spec/text_policy_spec.cr deleted file mode 100644 index 8b02a154f330..000000000000 --- a/lib/sanitize/spec/text_policy_spec.cr +++ /dev/null @@ -1,17 +0,0 @@ -require "./support/hrx" -require "../src/policy/text" -require "../src/processor" - -describe Sanitize::Policy::Text do - it "continues on tag" do - Sanitize::Policy::Text.new.transform_tag("foo", {} of String => String).should eq Sanitize::Policy::CONTINUE - end - - it "adds whitespace" do - Sanitize::Policy::Text.new.process("foo
bar").should eq "foo bar" - end - - run_hrx_samples Path["./text_policy.hrx"], { - "text" => Sanitize::Policy::Text.new, - } -end diff --git a/lib/sanitize/spec/uri_sanitizer_spec.cr b/lib/sanitize/spec/uri_sanitizer_spec.cr deleted file mode 100644 index a3aa25b092ad..000000000000 --- a/lib/sanitize/spec/uri_sanitizer_spec.cr +++ /dev/null @@ -1,130 +0,0 @@ -require "../src/uri_sanitizer" -require "spec" -require "uri" - -private def assert_sanitize(source : String, expected : String? = source, sanitizer = Sanitize::URISanitizer.new, *, file = __FILE__, line = __LINE__) - if expected - expected = URI.parse(expected) - end - sanitizer.sanitize(URI.parse(source)).should eq(expected), file: file, line: line -end - -describe Sanitize::URISanitizer do - describe "#accepted_schemes" do - it "has default value" do - Sanitize::URISanitizer.new.accepted_schemes.should eq Set{"http", "https", "mailto", "tel"} - end - - it "accepts minimal schemes" do - assert_sanitize("http://example.com") - assert_sanitize("https://example.com") - assert_sanitize("mailto:mail@example.com") - assert_sanitize("tel:example.com") - end - - it "refutes unsafe schemes" do - assert_sanitize("javascript:alert();", nil) - assert_sanitize("ssh:git@github.com", nil) - end - - it "custom schemes" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.accept_scheme "javascript" - assert_sanitize("javascript:alert();", sanitizer: sanitizer) - end - - it "can be disabled" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.accepted_schemes = nil - assert_sanitize("javascript:alert();", sanitizer: sanitizer) - assert_sanitize("foo:bar", sanitizer: sanitizer) - end - end - - describe "#base_url" do - it "disabled by default" do - Sanitize::URISanitizer.new.base_url.should be_nil - assert_sanitize("foo") - end - - it "set to absolute URL" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.base_url = URI.parse("https://example.com/base/") - - assert_sanitize("foo", "https://example.com/base/foo", sanitizer: sanitizer) - assert_sanitize("/foo", "https://example.com/foo", sanitizer: sanitizer) - end - - it "doesn't base fragment-only URLs" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.base_url = URI.parse("https://example.com/base/") - - assert_sanitize("#foo", sanitizer: sanitizer) - assert_sanitize("#", sanitizer: sanitizer) - assert_sanitize("https:#", sanitizer: sanitizer) - assert_sanitize("?#foo", "https://example.com/base/?#foo", sanitizer: sanitizer) - assert_sanitize("/#", "https://example.com/#", sanitizer: sanitizer) - assert_sanitize("https://#", "https://#", sanitizer: sanitizer) - - sanitizer.resolve_fragment_urls = true - assert_sanitize("#foo", "https://example.com/base/#foo", sanitizer: sanitizer) - assert_sanitize("#", "https://example.com/base/#", sanitizer: sanitizer) - assert_sanitize("https:#", "https:#", sanitizer: sanitizer) - end - end - - describe "#accepted_hosts" do - it "disabled by default" do - Sanitize::URISanitizer.new.accepted_hosts.should be_nil - end - - it "restricts hosts" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.accepted_hosts = Set{"foo.example.com"} - assert_sanitize("http://foo.example.com", sanitizer: sanitizer) - assert_sanitize("http://bar.example.com", nil, sanitizer: sanitizer) - assert_sanitize("http://example.com", nil, sanitizer: sanitizer) - assert_sanitize("http://foo.foo.example.com", nil, sanitizer: sanitizer) - assert_sanitize("foo", sanitizer: sanitizer) - end - - it "works with base_url" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.accepted_hosts = Set{"foo.example.com"} - sanitizer.base_url = URI.parse("http://bar.example.com/") - assert_sanitize("foo", "http://bar.example.com/foo", sanitizer: sanitizer) - assert_sanitize("http://bar.example.com/foo", nil, sanitizer: sanitizer) - end - end - - describe "#rejected_hosts" do - it "disabled by default" do - Sanitize::URISanitizer.new.rejected_hosts.should be_a(Set(String)) - end - - it "restricts hosts" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.rejected_hosts = Set{"bar.example.com"} - assert_sanitize("http://foo.example.com", sanitizer: sanitizer) - assert_sanitize("http://bar.example.com", nil, sanitizer: sanitizer) - assert_sanitize("http://example.com", sanitizer: sanitizer) - assert_sanitize("http://bar.bar.example.com", sanitizer: sanitizer) - assert_sanitize("foo", sanitizer: sanitizer) - end - - it "works with base_url" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.rejected_hosts = Set{"foo.example.com"} - sanitizer.base_url = URI.parse("http://foo.example.com/") - assert_sanitize("foo", "http://foo.example.com/foo", sanitizer: sanitizer) - assert_sanitize("http://foo.example.com/foo", nil, sanitizer: sanitizer) - end - - it "overrides accepted_hosts" do - sanitizer = Sanitize::URISanitizer.new - sanitizer.rejected_hosts = Set{"foo.example.com"} - sanitizer.accepted_hosts = Set{"foo.example.com"} - assert_sanitize("http://foo.example.com/foo", nil, sanitizer: sanitizer) - end - end -end diff --git a/lib/sanitize/src/adapter/libxml2.cr b/lib/sanitize/src/adapter/libxml2.cr deleted file mode 100644 index 51d899454c66..000000000000 --- a/lib/sanitize/src/adapter/libxml2.cr +++ /dev/null @@ -1,137 +0,0 @@ -struct Sanitize::Adapter::LibXML2 - include Adapter - - def self.process(policy : Policy, html : String, fragment : Bool = false) - return "" if html.empty? - - node = parse(html, fragment) - process(policy, node, fragment) - end - - def self.process(policy : Policy, node : XML::Node, fragment : Bool = false) - build(fragment) do |builder| - process(policy, node, builder, fragment) - end - end - - def self.process(policy : Policy, node : XML::Node, builder : XML::Builder, fragment : Bool = false) - processor = Processor.new(policy, new(builder)) - visit(processor, node, fragment) - builder.end_document - builder.flush - end - - def self.parse(html : String, fragment : Bool) - if fragment - html = "#{html}" - end - - node = XML.parse_html(html, XML::HTMLParserOptions.default | XML::HTMLParserOptions::NOIMPLIED | XML::HTMLParserOptions::NODEFDTD) - end - - def self.build(fragment : Bool) - result = String.build do |io| - builder = XML::Builder.new(io) - - if fragment - builder.start_element("fragment") - end - - yield(builder) - end - - if fragment - result = "" if result == "\n" - result = result.lchop("").rchop("\n") - end - # strip trailing non-linebreak whitespace - if result.ends_with?("\n") - result - else - result.rstrip - end - end - - def self.visit(processor : Processor, node : XML::Node, fragment : Bool) - visitor = Visitor.new(processor, fragment) - visitor.visit(node) - end - - # :nodoc: - struct Visitor - @attributes = Hash(String, String).new - - def initialize(@processor : Processor, @fragment : Bool) - end - - # :nodoc: - def visit(node : XML::Node) - case node.type - when .html_document_node? - visit_children(node) - when .dtd_node? - # skip DTD - when .text_node? - visit_text(node) - when .element_node? - visit_element(node) - when .comment_node? - # skip comments - when .cdata_section_node? - # skip CDATA - else - raise "Not implemented for: #{node.type}:#{node.name}:#{node.content}" - end - end - - def visit_children(node) - node.children.each do |child| - visit(child) - end - end - - def visit_text(node) - @processor.process_text(node.content) - end - - def visit_element(node) - if @fragment && node.name.in?({"html", "body"}) - @attributes.clear - @processor.process_element(node.name, @attributes, Processor::CONTINUE) do - visit_children(node) - end - return - end - - @attributes.clear - node.attributes.each do |attribute| - @attributes[attribute.name] = attribute.content - end - - name = node.name - if namespace = node.namespace - name = "#{namespace}:#{name}" - end - - @processor.process_element(name, @attributes) do - visit_children(node) - end - end - end - - def initialize(@builder : XML::Builder) - end - - def start_tag(name : String, attributes : Hash(String, String)) : Nil - @builder.start_element(name) - @builder.attributes(attributes) - end - - def end_tag(name : String, attributes : Hash(String, String)) : Nil - @builder.end_element - end - - def write_text(text : String) : Nil - @builder.text(text) - end -end diff --git a/lib/sanitize/src/policy.cr b/lib/sanitize/src/policy.cr deleted file mode 100644 index d1ce31ccf63f..000000000000 --- a/lib/sanitize/src/policy.cr +++ /dev/null @@ -1,45 +0,0 @@ -# A policy defines the rules for transforming an HTML/XML tree. -# -# * `HTMLSanitizer` is a policy for HTML sanitization. -# * `Whitelist` is a whitelist-based transformer that's useful either for -# simple stripping applications or as a building block for more advanced -# sanitization policies. -# * `Text` is a policy that turns HTML into plain text. -abstract class Sanitize::Policy - # :nodoc: - alias CONTINUE = Processor::CONTINUE - # :nodoc: - alias STOP = Processor::STOP - - # Defines the string that is added when whitespace is needed when a block tag - # is stripped. - property block_whitespace = " " - - # Receives the content of a text node and returns the transformed content. - # - # If the return value is `nil`, the content is skipped. - abstract def transform_text(text : String) : String? - - # Receives the element name and attributes of an opening tag and returns the - # transformed element name (usually the same as the input name). - # - # *attributes* are transformed directly in place. - # - # Special return values: - # * `Processor::CONTINUE`: Tells the processor to strip the current tag but - # continue traversing its children. - # * `Processor::CONTINUE`: Tells the processor to skip the current tag and its - # children completely and move to the next sibling. - abstract def transform_tag(name : String, attributes : Hash(String, String)) : String | Processor::CONTINUE | Processor::STOP - - HTML_BLOCK_ELEMENTS = Set{ - "address", "article", "aside", "audio", "video", "blockquote", "br", - "canvas", "dd", "div", "dl", "fieldset", "figcaption", "figure", "footer", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", - "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", - } - - def block_tag?(name) - HTML_BLOCK_ELEMENTS.includes?(name) - end -end diff --git a/lib/sanitize/src/policy/html_sanitizer.cr b/lib/sanitize/src/policy/html_sanitizer.cr deleted file mode 100644 index dbcc71ce131a..000000000000 --- a/lib/sanitize/src/policy/html_sanitizer.cr +++ /dev/null @@ -1,350 +0,0 @@ -require "./whitelist" -require "../uri_sanitizer" - -# This policy serves as a good default configuration that should fit most -# typical use cases for HTML sanitization. -# -# ## Configurations -# It comes in three different configurations with different sets of supported -# HTML tags. -# -# They only differ in the default configuration of allowed tags and attributes. -# The transformation behaviour is otherwise the same. -# -# ### Common Configuration -# `.common`: Accepts most standard tags and thus allows using a good -# amount of HTML features (see `COMMON_SAFELIST`). -# -# This is the recommended default configuration and should work for typical use -# cases unless strong restrictions on allowed content is required. -# -# ``` -# sanitizer = Sanitize::Policy::HTMLSanitizer.common -# sanitizer.process(%(foo)) # => %(foo) -# sanitizer.process(%(

foo

)) # => %(

foo

) -# sanitizer.process(%()) # => %() -# sanitizer.process(%(
foobar
)) # => %(
foobar
) -# ``` -# -# NOTE: This configuration (nor any other) does not accept `<html>`, -# `<head>`, or # `<body>` tags by default. In order to use -# `#sanitized_document` they need to be added explicitly to `accepted_arguments`. -# -# ### Basic Configuration -# -# `.basic`: This set accepts some basic tags including paragraphs, headlines, -# lists, and images (see `BASIC_SAFELIST`). -# -# ``` -# sanitizer = Sanitize::Policy::HTMLSanitizer.basic -# sanitizer.process(%(foo)) # => %(foo) -# sanitizer.process(%(

foo

)) # => %(

foo

) -# sanitizer.process(%()) # => %() -# sanitizer.process(%(
foobar
)) # => %(foo bar) -# ``` -# -# ### Inline Configuration -# -# `.inline`: Accepts only a limited set of inline tags (see `INLINE_SAFELIST`). -# -# ``` -# sanitizer = Sanitize::Policy::HTMLSanitizer.inline -# sanitizer.process(%(foo)) # => %(foo) -# sanitizer.process(%(

foo

)) # => %(foo) -# sanitizer.process(%()) # => %() -# sanitizer.process(%(
foobar
)) # => %(foo bar) -# ``` -# -# ## Attribute Transformations -# -# Attribute transformations are identical in all three configurations. But more -# advanced transforms won't apply if the respective attribute is not allowed in -# `accepted_tags`. -# So you can easily add additional elements and attributes to lower-tier sets -# and get the same attribute validation. For example: `.inline` doesn't include -# `<img>` tags, but when `img` is added to `accepted_attributes`, -# the policy validates img tags the same way as in `.common`. -# -# ### URL Sanitization -# -# This transformation applies to attributes that contain a URL (configurable -# through (`url_attributes`). -# -# * Makes sure the value is a valid URI (via `URI.parse`). If it does not parse, -# the attribute value is set to empty string. -# * Sanitizes the URI via `URISanitizer (configurable trough `uri_sanitizer`). -# If the sanitizer returns `nil`, the attribute value is set to empty string. -# -# The same `URISanitizer` is used for any URL attributes. -# -# ### Anchor Tags -# -# For `<a>` tags with a `href` attribute, there are two transforms: -# -# * `rel="nofollow"` is added (can be disabled with `add_rel_nofollow`). -# * `rel="noopener"` is added to links with `target` attribute (can be disabled -# with `add_rel_noopener`). -# -# Anchor tags the have neither a `href`, `name` or `id` attribute are stripped. -# -# NOTE: `name` and `id` attributes are not in any of the default sets of -# accepted attributes, so they can only be used when explicitly enabled. -# -# ### Image Tags -# -# `<img>` tags are stripped if they don't have a `src` attribute. -# -# ### Size Attributes -# -# If a tag has `width` or `height` attributes, the values are validated to be -# numerical or percent values. -# By default, these attributes are only accepted for <img> tags. -# -# ### Alignment Attribute -# -# The `align` attribute is validated against allowed values for this attribute: -# `center, left, right, justify, char`. -# If the value is invalid, the attribute is stripped. -# -# ### Classes -# -# `class` attributes are filtered to accept only classes described by -# `valid_classes`. String values need to match the class name exactly, regex -# values need to match the entire class name. -# -# `class` is accepted as a global attribute in the default configuration, but no -# values are allowed in `valid_classes`. -# -# All classes can be accepted by adding the match-all regular expression `/.*/` -# to `valid_classes`. -class Sanitize::Policy::HTMLSanitizer < Sanitize::Policy::Whitelist - # Add `rel="nofollow"` to every `<a>` tag with `href` attribute. - property add_rel_nofollow = true - - # Add `rel="noopener"` to every `<a>` tag with `href` and `target` attribute. - property add_rel_noopener = true - - # Configures the `URISanitizer` to use for sanitizing URL attributes. - property uri_sanitizer = URISanitizer.new - - # Configures which attributes are considered to contain URLs. If empty, URL - # sanitization is disabled. - # - # Default value: `Set{"src", "href", "action", "cite", "longdesc"}`. - property url_attributes : Set(String) = Set{"src", "href", "action", "cite", "longdesc"} - - # Configures which classes are valid for `class` attributes. - # - # String values need to match the class name exactly, regex - # values need to match the entire class name. - # - # Default value: empty - property valid_classes : Set(String | Regex) = Set(String | Regex).new - - def valid_classes=(classes) - valid_classes = classes.map(&.as(String | Regex)).to_set - end - - # Creates an instance which accepts a limited set of inline tags (see - # `INLINE_SAFELIST`). - def self.inline : HTMLSanitizer - new( - accepted_attributes: INLINE_SAFELIST.clone - ) - end - - # Creates an instance which accepts more basic tags including paragraphs, - # headlines, lists, and images (see `BASIC_SAFELIST`). - def self.basic : HTMLSanitizer - new( - accepted_attributes: BASIC_SAFELIST.clone - ) - end - - # Creates an instance which accepts even more standard tags and thus allows - # using a good amount of HTML features (see `COMMON_SAFELIST`). - # - # Unless you need tight restrictions on allowed content, this is the - # recommended default. - def self.common : HTMLSanitizer - new( - accepted_attributes: COMMON_SAFELIST.clone - ) - end - - # Removes anchor tag (`<a>` from the list of accepted tags). - # - # NOTE: This doesn't reject attributes with URL values for other tags. - def no_links - accepted_attributes.delete("a") - - self - end - - def accept_tag(tag : String, attributes : Set(String) = Set(String).new) - accepted_attributes[tag] = attributes - end - - def transform_attributes(tag : String, attributes : Hash(String, String)) : String | CONTINUE | STOP - transform_url_attributes(tag, attributes) - - tag_result = case tag - when "a" - transform_tag_a(attributes) - when "img" - transform_tag_img(attributes) - end - - if tag_result - return tag_result - end - - limit_numeric_or_percent(attributes, "width") - limit_numeric_or_percent(attributes, "height") - limit_enum(attributes, "align", ["center", "left", "right", "justify", "char"]) - - transform_classes(tag, attributes) - - tag - end - - def transform_tag_img(attributes) - unless attributes.has_key?("src") - return CONTINUE - end - end - - def transform_tag_a(attributes) - if href = attributes["href"]? - if add_rel_nofollow - append_attribute(attributes, "rel", "nofollow") - end - if add_rel_noopener && attributes.has_key?("target") - append_attribute(attributes, "rel", "noopener") - end - end - if !(((href = attributes["href"]?) && !href.empty?) || attributes.has_key?("id") || attributes.has_key?("tag")) - return CONTINUE - end - end - - def transform_url_attributes(tag, attributes) - all_ok = true - url_attributes.each do |key| - if value = attributes[key]? - all_ok &&= transform_url_attribute(tag, attributes, key, value) - end - end - all_ok - end - - def transform_url_attribute(tag, attributes, attribute, value) - begin - uri = URI.parse(value.strip) - rescue URI::Error - attributes[attribute] = "" - return false - end - - uri = transform_uri(tag, attributes, attribute, uri) - if uri.nil? || (uri.blank? || uri == "#") - attributes[attribute] = "" - return false - end - - attributes[attribute] = uri - true - end - - def transform_uri(tag, attributes, attribute, uri : URI) : String? - if uri_sanitizer = self.uri_sanitizer - uri = uri_sanitizer.sanitize(uri) - - return unless uri - end - - # Make sure special characters are properly encoded to avoid interpretation - # of tweaked relative paths as "javascript:" URI (for example) - if path = uri.path - uri.path = String.build do |io| - URI.encode(URI.decode(path), io) { |byte| URI.reserved?(byte) || URI.unreserved?(byte) } - end - end - - uri.to_s - end - - def transform_classes(tag, attributes) - attribute = attributes["class"]? - return unless attribute - - classes = attribute.split - classes = classes.select { |klass| valid_class?(tag, klass, valid_classes) } - if classes.empty? - attributes.delete("class") - else - attributes["class"] = classes.join(" ") - end - end - - private def limit_numeric_or_percent(attributes, attribute) - if value = attributes[attribute]? - value = value.strip - if value.ends_with?("%") - value = value.byte_slice(0, value.size - 1) - end - value.each_char do |char| - unless char.ascii_number? - attributes.delete(attribute) - break - end - end - end - end - - private def limit_enum(attributes, attribute, list) - if value = attributes[attribute]? - value = value.strip - if valid_with_list?(value, list) - attributes[attribute] = value - else - attributes.delete(attribute) - end - end - end - - def valid_class?(tag, klass, valid_classes) - valid_with_list?(klass, valid_classes) - end - - private def valid_with_list?(value, list) - list.any? { |validator| - case validator - when String - validator == value - when Regex - data = validator.match(value) - next unless data - data.byte_begin == 0 && data.byte_end == value.bytesize - end - } - end - - def append_attribute(attributes, attribute, value) - if curr_value = attributes[attribute]? - values = curr_value.split - if values.includes?(value) - return false - else - values << value - attributes[attribute] = values.join(" ") - end - else - attributes[attribute] = value - end - - true - end -end - -require "./html_sanitizer/safelist" diff --git a/lib/sanitize/src/policy/html_sanitizer/safelist.cr b/lib/sanitize/src/policy/html_sanitizer/safelist.cr deleted file mode 100644 index 2d5a7edc08cd..000000000000 --- a/lib/sanitize/src/policy/html_sanitizer/safelist.cr +++ /dev/null @@ -1,70 +0,0 @@ -class Sanitize::Policy::HTMLSanitizer < Sanitize::Policy::Whitelist - # Only limited elements for inline text markup. - INLINE_SAFELIST = { - "a" => Set{"href", "hreflang"}, - "abbr" => Set(String).new, - "acronym" => Set(String).new, - "b" => Set(String).new, - "code" => Set(String).new, - "em" => Set(String).new, - "i" => Set(String).new, - "strong" => Set(String).new, - "*" => Set{ - "dir", - "lang", - "title", - "class", - }, - } - - # Compatible with basic Markdown features. - BASIC_SAFELIST = INLINE_SAFELIST.merge({ - "blockquote" => Set{"cite"}, - "br" => Set(String).new, - "h1" => Set(String).new, - "h2" => Set(String).new, - "h3" => Set(String).new, - "h4" => Set(String).new, - "h5" => Set(String).new, - "h6" => Set(String).new, - "hr" => Set(String).new, - "img" => Set{"alt", "src", "longdesc", "width", "height", "align"}, - "li" => Set(String).new, - "ol" => Set{"start"}, - "p" => Set{"align"}, - "pre" => Set(String).new, - "ul" => Set(String).new, - }) - - # Accepts most standard tags and thus allows using a good amount of HTML features. - COMMON_SAFELIST = BASIC_SAFELIST.merge({ - "dd" => Set(String).new, - "del" => Set{"cite"}, - "details" => Set(String).new, - "dl" => Set(String).new, - "dt" => Set(String).new, - "div" => Set(String).new, - "ins" => Set{"cite"}, - "kbd" => Set(String).new, - "q" => Set{"cite"}, - "ruby" => Set(String).new, - "rp" => Set(String).new, - "rt" => Set(String).new, - "s" => Set(String).new, - "samp" => Set(String).new, - "strike" => Set(String).new, - "sub" => Set(String).new, - "summary" => Set(String).new, - "sup" => Set(String).new, - "table" => Set(String).new, - "time" => Set{"datetime"}, - "tbody" => Set(String).new, - "td" => Set(String).new, - "tfoot" => Set(String).new, - "th" => Set(String).new, - "thead" => Set(String).new, - "tr" => Set(String).new, - "tt" => Set(String).new, - "var" => Set(String).new, - }) -end diff --git a/lib/sanitize/src/policy/text.cr b/lib/sanitize/src/policy/text.cr deleted file mode 100644 index 82a2e6775de2..000000000000 --- a/lib/sanitize/src/policy/text.cr +++ /dev/null @@ -1,23 +0,0 @@ -require "../policy" - -# Reduces an HTML tree to the content of its text nodes. -# It renders a plain text result, similar to copying HTML content rendered by -# a browser to a text editor. -# HTML special characters are escaped. -# -# ``` -# policy = Sanitize::Policy::Text.new -# policy.process(%(foo bar!)) # => "foo bar!" -# policy.process(%(

foo

bar

)) # => "foo bar" -# policy.block_whitespace = "\n" -# policy.process(%(

foo

bar

)) # => "foo\nbar" -# ``` -class Sanitize::Policy::Text < Sanitize::Policy - def transform_text(text : String) : String? - text - end - - def transform_tag(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP - CONTINUE - end -end diff --git a/lib/sanitize/src/policy/whitelist.cr b/lib/sanitize/src/policy/whitelist.cr deleted file mode 100644 index 5f96a71dac9e..000000000000 --- a/lib/sanitize/src/policy/whitelist.cr +++ /dev/null @@ -1,57 +0,0 @@ -require "../policy" - -# This is a simple policy based on a tag and attribute whitelist. -# -# This policy accepts only `<div>` and `<p>` tags with optional `title` attributes: -# ``` -# policy = Sanitize::Policy::Whitelist.new({ -# "div" => Set{"title"}, -# "p" => Set{"title"}, -# }) -# ``` -# -# The special `*` key applies to *all* tag names and can be used to allow global -# attributes: -# -# This example is equivalent to the above. If more tag names were added, they -# would also accept `title` attributes. -# ``` -# policy = Sanitize::Policy::Whitelist.new({ -# "div" => Set(String).new, -# "p" => Set(String).new, -# "*" => Set{"title"}, -# }) -# ``` -# -# Attributes are always optional, so this policy won't enforce the presence of -# an attribute. -# -# If a tag's attribute list is empty, no attributes are allowed for this tag. -# -# Attribute values are not changed by this policy. -class Sanitize::Policy::Whitelist < Sanitize::Policy - # Mapping of accepted tag names and attributes. - property accepted_attributes : Hash(String, Set(String)) - - # Short cut to `accepted_attributes["*"]`. - getter global_attributes : Set(String) { accepted_attributes.fetch("*") { Set(String).new } } - - def initialize(@accepted_attributes : Hash(String, Set(String))) - end - - def transform_text(text : String) : String? - text - end - - def transform_tag(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP - acceptable_attributes = accepted_attributes.fetch(name) { return CONTINUE } - - attributes.reject! { |attr, _| !acceptable_attributes.includes?(attr) && !global_attributes.includes?(attr) } - - transform_attributes(name, attributes) - end - - def transform_attributes(name : String, attributes : Hash(String, String)) : String | CONTINUE | STOP - name - end -end diff --git a/lib/sanitize/src/processor.cr b/lib/sanitize/src/processor.cr deleted file mode 100644 index 6d4e4ac82766..000000000000 --- a/lib/sanitize/src/processor.cr +++ /dev/null @@ -1,110 +0,0 @@ -require "xml" -require "log" -require "./adapter/libxml2" - -module Sanitize - abstract class Policy - # Processes the HTML fragment *html* with this policy using the default - # adapter (`Adapter::LibXML2`). - def process(html : String | XML::Node) : String - Adapter::LibXML2.process(self, html, fragment: true) - end - - # Processes the HTML document *html* with this policy using the default - # adapter (`Adapter::LibXML2`). - def process_document(html : String | XML::Node) : String - Adapter::LibXML2.process(self, html, fragment: false) - end - end - - module Adapter - abstract def write_text(text : String) : Nil - abstract def start_tag(name : String, attributes : Hash(String, String)) : Nil - abstract def end_tag(name : String, attributes : Hash(String, String)) : Nil - end - - # A processor traverses the HTML/XML tree, applies transformations through the - # policy and passes the result to the adapter which then builds the result. - class Processor - Log = ::Log.for(self) - - # This module serves as a singleton constant that signals the processor to - # skip the current tag but continue to traverse its children. - module CONTINUE - extend self - end - - # This module serves as a singleton constant that signals the processor to - # skip the current tag and its children. - module STOP - extend self - end - - @last_text_ended_with_whitespace = true - @stripped_block_tag = false - - def initialize(@policy : Policy, @adapter : Adapter) - end - - def process_text(text : String) - text = @policy.transform_text(text) - - if @stripped_block_tag && !@last_text_ended_with_whitespace && !text.try(&.[0].whitespace?) - @adapter.write_text(@policy.block_whitespace) - end - - @stripped_block_tag = false - - if text - @adapter.write_text(text) - @last_text_ended_with_whitespace = text.[-1].whitespace? - else - @last_text_ended_with_whitespace = false - end - end - - def process_element(name : String, attributes : Hash(String, String), &) - process_element(name, attributes, @policy.transform_tag(name, attributes)) do - yield - end - end - - def process_element(orig_name : String, attributes : Hash(String, String), name, &) - case name - when STOP - Log.debug { "#{@policy.class} stopping at tag #{orig_name} #{attributes}" } - if @policy.block_tag?(orig_name) - @stripped_block_tag = true - end - return - when CONTINUE - Log.debug { "#{@policy.class} stripping tag #{orig_name} #{attributes}" } - if @policy.block_tag?(orig_name) - @stripped_block_tag = true - end - when String - @stripped_block_tag = false - @adapter.start_tag(name, attributes) - end - - yield - - case name - when CONTINUE - if @policy.block_tag?(orig_name) - @stripped_block_tag = true - end - when String - @stripped_block_tag = false - @adapter.end_tag(name, attributes) - end - end - - def reset - @last_text_ended_with_whitespace = true - @stripped_block_tag = false - end - end -end - -require "./adapter/libxml2" diff --git a/lib/sanitize/src/sanitize.cr b/lib/sanitize/src/sanitize.cr deleted file mode 100644 index a94e7c660323..000000000000 --- a/lib/sanitize/src/sanitize.cr +++ /dev/null @@ -1,5 +0,0 @@ -require "./policy/*" -require "./processor" - -module Sanitize -end diff --git a/lib/sanitize/src/uri_sanitizer.cr b/lib/sanitize/src/uri_sanitizer.cr deleted file mode 100644 index cfda46736b3b..000000000000 --- a/lib/sanitize/src/uri_sanitizer.cr +++ /dev/null @@ -1,107 +0,0 @@ -require "uri" - -# A `URISanitizer` is used to validate and transform a URI based on specified -# rules. -class Sanitize::URISanitizer - # Specifies a whitelist of URI schemes this sanitizer accepts. - # - # If empty, no schemes are accepted (i.e. only relative URIs are valid). - # If `nil`, all schemes are accepted (this setting is potentially dangerous). - # - # Relative URIs are not affected by this setting. - property accepted_schemes : Set(String)? - - # Specifies a whitelist of hosts this sanitizer accepts. - # - # If empty, no hosts are accepted (i.e. only relative URIs are valid). - # If `nil`, all hosts are accepted (default). - # - # The blacklist `rejected_hosts` has precedence over this whitelist. - property accepted_hosts : Set(String)? - - # Specifies a blacklist of hosts this sanitizer rejects. - # - # If empty, no hosts are rejected. - # - # This blacklist has precedence over the whitelist `accepted_hosts`. - property rejected_hosts : Set(String) = Set(String).new - - # Specifies a base URL all relative URLs are resolved against. - # - # If `nil`, relative URLs are not resolved. - property base_url : URI? - - # Configures whether fragment-only URIs are resolved on `base_url`. - # - # ``` - # sanitizer = Sanitize::URISanitizer.new - # sanitizer.base_url = URI.parse("https://example.com/base/") - # sanitizer.sanitize(URI.parse("#foo")) # => "#foo" - # - # sanitizer.resolve_fragment_urls = true - # sanitizer.sanitize(URI.parse("#foo")) # => "https://example.com/base/#foo" - # ``` - property resolve_fragment_urls = false - - def initialize(@accepted_schemes : Set(String)? = Set{"http", "https", "mailto", "tel"}) - end - - # Adds *scheme* to `accepted_schemes`. - def accept_scheme(scheme : String) - schemes = self.accepted_schemes ||= Set(String).new - schemes << scheme - end - - def sanitize(uri : URI) : URI? - unless accepts_scheme?(uri.scheme) - return nil - end - - unless accepts_host?(uri.host) - return nil - end - - uri = resolve_base_url(uri) - - uri - end - - def accepts_scheme?(scheme) - if scheme.nil? - return true - end - - if accepted_schemes = self.accepted_schemes - return accepted_schemes.includes?(scheme) - end - - true - end - - def accepts_host?(host) - if host.nil? - return true - end - - return false if rejected_hosts.includes?(host) - - if accepted_hosts = self.accepted_hosts - return false unless accepted_hosts.includes?(host) - end - - true - end - - def resolve_base_url(uri) - if base_url = self.base_url - unless uri.absolute? || (!resolve_fragment_urls && fragment_url?(uri)) - uri = base_url.resolve(uri) - end - end - uri - end - - private def fragment_url?(uri) - uri.path.empty? && uri.host.nil? && uri.query.nil? && !uri.fragment.nil? - end -end diff --git a/shard.lock b/shard.lock index d80148209315..e7f2ddc86d10 100644 --- a/shard.lock +++ b/shard.lock @@ -8,7 +8,3 @@ shards: git: https://github.com/i3oris/reply.git version: 0.3.1+git.commit.90a7eb5a76048884d5d56bf6b9369f1e67fdbcd7 - sanitize: - git: https://github.com/straight-shoota/sanitize.git - version: 0.1.0+git.commit.75c141b619c77956e88f557149566cd28876398b - diff --git a/shard.yml b/shard.yml index cbc960c0ee15..396d91bdffe2 100644 --- a/shard.yml +++ b/shard.yml @@ -2,7 +2,7 @@ name: crystal version: 1.13.0-dev authors: - - Crystal Core Team +- Crystal Core Team description: | The Crystal standard library and compiler. @@ -15,9 +15,6 @@ dependencies: reply: github: I3oris/reply commit: 90a7eb5a76048884d5d56bf6b9369f1e67fdbcd7 - sanitize: - github: straight-shoota/sanitize - commit: 75c141b619c77956e88f557149566cd28876398b license: Apache-2.0 diff --git a/spec/compiler/crystal/tools/doc/doc_renderer_spec.cr b/spec/compiler/crystal/tools/doc/doc_renderer_spec.cr index 65090c8185f7..d8d179a05d51 100644 --- a/spec/compiler/crystal/tools/doc/doc_renderer_spec.cr +++ b/spec/compiler/crystal/tools/doc/doc_renderer_spec.cr @@ -374,33 +374,7 @@ describe Doc::MarkdDocRenderer do HTML end - describe "renders html with sanitization" do - it_renders nil, %(

Foo

), %(

Foo

) - it_renders nil, %(), %() - it_renders nil, %(

example text

), %(

example text

) - - it_renders nil, %(```crystal\n# \n```), - %(
# <script>alert("hello world")</script>
) - end - - describe "still renders tables despite sanitization" do - table_mkdn = <<-HTML - - - - - - - - - - - - - -
column 1column 2
data 1data 2
data 3data 4
- HTML - - it_renders nil, table_mkdn, table_mkdn + describe "renders html" do + it_renders nil, %(

Foo

), %(

Foo

) end end diff --git a/src/compiler/crystal/tools/doc/markd_doc_renderer.cr b/src/compiler/crystal/tools/doc/markd_doc_renderer.cr index 2ea3348b2b36..f703d7ed787b 100644 --- a/src/compiler/crystal/tools/doc/markd_doc_renderer.cr +++ b/src/compiler/crystal/tools/doc/markd_doc_renderer.cr @@ -1,7 +1,4 @@ -require "sanitize" - class Crystal::Doc::MarkdDocRenderer < Markd::HTMLRenderer - SANITIZER = Sanitize::Policy::HTMLSanitizer.common @anchor_map = Hash(String, Int32).new(0) def initialize(@type : Crystal::Doc::Type, options) @@ -161,24 +158,4 @@ class Crystal::Doc::MarkdDocRenderer < Markd::HTMLRenderer type.lookup_macro(name, args_count) || type.program.lookup_macro(name, args_count) end - - def text(node : Markd::Node, entering : Bool) - output(sanitize(node)) - end - - def html_block(node : Markd::Node, entering : Bool) - newline - content = @options.safe? ? "" : sanitize(node) - literal(content) - newline - end - - def html_inline(node : Markd::Node, entering : Bool) - content = @options.safe? ? "" : sanitize(node) - literal(content) - end - - def sanitize(node : Markd::Node) : String - SANITIZER.process(node.text) - end end