diff --git a/CHANGES.md b/CHANGES.md
index 09776b9e80..37d300dfec 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -10,6 +10,8 @@ This document is intended for Spotless developers.
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `1.27.0`).
 
 ## [Unreleased]
+### Added
+* `PaddedCell.calculateDirtyState` is now defensive about misconfigured character encoding. ([#575](https://github.com/diffplug/spotless/pull/575))
 
 ## [1.30.1] - 2020-05-17
 ### Fixed
diff --git a/lib/src/main/java/com/diffplug/spotless/EncodingErrorMsg.java b/lib/src/main/java/com/diffplug/spotless/EncodingErrorMsg.java
new file mode 100644
index 0000000000..b89400b7c2
--- /dev/null
+++ b/lib/src/main/java/com/diffplug/spotless/EncodingErrorMsg.java
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2016 DiffPlug
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.diffplug.spotless;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+
+import javax.annotation.Nullable;
+
+/**
+ * Builds a diagnostic message for file content which cannot be decoded with the configured
+ * charset, showing how the text around the problem looks under several likely charsets.
+ */
+class EncodingErrorMsg {
+	/** The replacement character which lenient decoding substitutes for undecodable bytes. */
+	static final char UNREPRESENTABLE = '�';
+	/** How many characters of context to show on either side of the problem. */
+	private static final int CONTEXT = 3;
+
+	/**
+	 * Checks whether {@code bytes} can be strictly decoded with {@code charset}.
+	 *
+	 * @param chars the result of leniently decoding {@code bytes} with {@code charset}
+	 * @param bytes the raw content
+	 * @param charset the charset which was used to produce {@code chars}
+	 * @return a message describing the problem, or {@code null} if there is no encoding error
+	 */
+	static @Nullable String msg(String chars, byte[] bytes, Charset charset) {
+		if (chars.indexOf(UNREPRESENTABLE) == -1) {
+			return null;
+		}
+
+		// sometimes the '�' is really in a file, such as for *this* file
+		// so we have to handle that corner case
+		ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
+		CharBuffer charBuf = CharBuffer.allocate(chars.length());
+		CoderResult result = charset.newDecoder()
+				.onMalformedInput(CodingErrorAction.REPORT)
+				.onUnmappableCharacter(CodingErrorAction.REPORT)
+				.decode(byteBuf, charBuf, true);
+		if (!result.isError()) {
+			return null;
+		}
+		// There really is an encoding error. An intentional '�' may precede it, so locate the
+		// problem by how many characters decoded cleanly rather than by the first '�'.
+		return new EncodingErrorMsg(chars, byteBuf, charset, charBuf.position()).message.toString();
+	}
+
+	private final ByteBuffer byteBuf;
+	private final CharBuffer charBuf;
+	private final int unrepresentable;
+	private final StringBuilder message;
+
+	/** Builds the complete message; {@code unrepresentable} is the index in {@code chars} of the problem. */
+	private EncodingErrorMsg(String chars, ByteBuffer byteBuf, Charset charset, int unrepresentable) {
+		this.byteBuf = byteBuf;
+		this.unrepresentable = unrepresentable;
+		// make a new, smaller charBuf better suited to our request
+		charBuf = CharBuffer.allocate(Math.min(unrepresentable + 2 * CONTEXT, chars.length()));
+
+		message = new StringBuilder("Encoding error! ");
+		if (charset.equals(StandardCharsets.UTF_8)) {
+			message.append("Spotless uses UTF-8 by default.");
+		} else {
+			message.append("You configured Spotless to use " + charset.name() + ".");
+		}
+
+		int line = 1;
+		int col = 1;
+		for (int i = 0; i < unrepresentable; ++i) {
+			char c = chars.charAt(i);
+			if (c == '\n') {
+				++line;
+				col = 1;
+			} else if (c != '\r') {
+				++col;
+			}
+		}
+		message.append(" At line " + line + " col " + col + ":");
+
+		// https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
+		LinkedHashSet<Charset> encodings = new LinkedHashSet<>();
+		encodings.add(charset); // the encoding we are using
+		encodings.add(StandardCharsets.UTF_8); // followed by likely encodings
+		addIfAvailable(encodings, "windows-1252");
+		encodings.add(StandardCharsets.ISO_8859_1);
+		addIfAvailable(encodings, "Shift_JIS");
+		addIfAvailable(encodings, "Big5");
+		addIfAvailable(encodings, "Big5-HKSCS");
+		addIfAvailable(encodings, "GBK");
+		addIfAvailable(encodings, "GB2312");
+		addIfAvailable(encodings, "GB18030");
+
+		Iterator<Charset> iterator = encodings.iterator();
+		appendExample(iterator.next(), true);
+		while (iterator.hasNext()) {
+			appendExample(iterator.next(), false);
+		}
+	}
+
+	/** Adds the named charset to {@code charsets} if this JVM supports it. */
+	private static void addIfAvailable(Collection<Charset> charsets, String name) {
+		try {
+			charsets.add(Charset.forName(name));
+		} catch (UnsupportedCharsetException e) {
+			// no worries
+		}
+	}
+
+	/**
+	 * Appends a line showing the text around the problem as decoded by {@code charset}.
+	 * When {@code must} is false, the line is skipped if {@code charset} cannot decode the content.
+	 */
+	private void appendExample(Charset charset, boolean must) {
+		byteBuf.clear();
+		charBuf.clear();
+
+		CharsetDecoder decoder = charset.newDecoder();
+		if (!must) {
+			// bail early if we can
+			CoderResult r = decoder
+					.onMalformedInput(CodingErrorAction.REPORT)
+					.onUnmappableCharacter(CodingErrorAction.REPORT)
+					.decode(byteBuf, charBuf, true);
+			if (r.isError()) {
+				return;
+			}
+		} else {
+			decoder
+					.onMalformedInput(CodingErrorAction.REPLACE)
+					.onUnmappableCharacter(CodingErrorAction.REPLACE)
+					.decode(byteBuf, charBuf, true);
+		}
+		charBuf.flip();
+
+		int start = Math.max(unrepresentable - CONTEXT, 0);
+		int end = Math.min(charBuf.limit(), unrepresentable + CONTEXT + 1);
+		if (start >= end) {
+			// a multi-byte charset can decode to fewer chars than the context window starts at,
+			// in which case there is nothing to show (and subSequence would throw)
+			return;
+		}
+		message.append('\n');
+		message.append(charBuf.subSequence(start, end).toString()
+				.replace('\n', '␤')
+				.replace('\r', '␍')
+				.replace('\t', '⇥'));
+		message.append(" <- ");
+		message.append(charset.name());
+	}
+}
diff --git a/lib/src/main/java/com/diffplug/spotless/PaddedCell.java b/lib/src/main/java/com/diffplug/spotless/PaddedCell.java
index d055d07202..e3283ad11c 100644
--- a/lib/src/main/java/com/diffplug/spotless/PaddedCell.java
+++ b/lib/src/main/java/com/diffplug/spotless/PaddedCell.java
@@ -190,6 +190,11 @@ public static DirtyState calculateDirtyState(Formatter formatter, File file) thr
 
 	public static DirtyState calculateDirtyState(Formatter formatter, File file, byte[] rawBytes) throws IOException {
 		String raw = new String(rawBytes, formatter.getEncoding());
+		// check that all bytes could be decoded with the configured encoding
+		String encodingError = EncodingErrorMsg.msg(raw, rawBytes, formatter.getEncoding());
+		if (encodingError != null) {
+			throw new IllegalArgumentException(encodingError);
+		}
 		String rawUnix = LineEnding.toUnix(raw);
 
 		// enforce the format
diff --git a/lib/src/main/java/com/diffplug/spotless/generic/ReplaceStep.java b/lib/src/main/java/com/diffplug/spotless/generic/ReplaceStep.java
index 85fee5ed04..aaf0dccd51 100644
--- a/lib/src/main/java/com/diffplug/spotless/generic/ReplaceStep.java
+++ b/lib/src/main/java/com/diffplug/spotless/generic/ReplaceStep.java
@@ -37,12 +37,12 @@ public static FormatterStep create(String name, CharSequence target, CharSequenc
 	private static final class State implements Serializable {
 		private static final long serialVersionUID = 1L;
 
-		private final CharSequence target;
-		private final CharSequence replacement;
+		private final String target;
+		private final String replacement;
 
 		State(CharSequence target, CharSequence replacement) {
-			this.target = target;
-			this.replacement = replacement;
+			this.target = target.toString();
+			this.replacement = replacement.toString();
 		}
 
 		FormatterFunc toFormatter() {
diff --git a/plugin-gradle/CHANGES.md b/plugin-gradle/CHANGES.md
index 089a48f211..3e76cef696 100644
--- a/plugin-gradle/CHANGES.md
+++ b/plugin-gradle/CHANGES.md
@@ -3,6 +3,8 @@
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `3.27.0`).
 
 ## [Unreleased]
+### Fixed
+* If the encoding was set incorrectly, `spotlessApply` could clobber special characters. Spotless now prevents this, and helps to suggest the correct encoding. ([#575](https://github.com/diffplug/spotless/pull/575))
 
 ## [4.0.0] - 2020-05-17
 **TLDR: This version improves performance and adds support for the local Gradle Build Cache. You will not need to make any changes in your buildscript.** It is a breaking change only for a few users who have built *other* plugins on top of this one.
diff --git a/plugin-gradle/src/test/java/com/diffplug/gradle/spotless/EncodingTest.java b/plugin-gradle/src/test/java/com/diffplug/gradle/spotless/EncodingTest.java
index a7a2334e5d..9c7a856d86 100644
--- a/plugin-gradle/src/test/java/com/diffplug/gradle/spotless/EncodingTest.java
+++ b/plugin-gradle/src/test/java/com/diffplug/gradle/spotless/EncodingTest.java
@@ -51,8 +51,8 @@ public void globalIsRespected() throws Exception {
 				" encoding 'US-ASCII'",
 				"}");
 		setFile("test.java").toContent("µ");
-		gradleRunner().withArguments("spotlessApply").build();
-		assertFile("test.java").hasContent("??");
+		org.assertj.core.api.Assertions.assertThat(gradleRunner().withArguments("spotlessApply").buildAndFail().getOutput()).contains("Encoding error!");
+		assertFile("test.java").hasContent("µ");
 	}
 
 	@Test
@@ -75,8 +75,8 @@ public void globalIsRespectedButCanBeOverridden() throws Exception {
 				"}");
 		setFile("test.java").toContent("µ");
 		setFile("utf32.encoded").toContent("µ", Charset.forName("UTF-32"));
-		gradleRunner().withArguments("spotlessApply").build();
-		assertFile("test.java").hasContent("??");
-		assertFile("utf32.encoded").hasContent("A", Charset.forName("UTF-32"));
+		org.assertj.core.api.Assertions.assertThat(gradleRunner().withArguments("spotlessApply").buildAndFail().getOutput()).contains("Encoding error!");
+		assertFile("test.java").hasContent("µ");
+		assertFile("utf32.encoded").hasContent("µ", Charset.forName("UTF-32"));
 	}
 }
diff --git a/plugin-maven/CHANGES.md b/plugin-maven/CHANGES.md
index cc53a6ee9b..36ba5dcd3a 100644
--- a/plugin-maven/CHANGES.md
+++ b/plugin-maven/CHANGES.md
@@ -3,6 +3,8 @@
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `1.27.0`).
 
 ## [Unreleased]
+### Fixed
+* If the encoding was set incorrectly, `spotless:apply` could clobber special characters. Spotless now prevents this, and helps to suggest the correct encoding. ([#575](https://github.com/diffplug/spotless/pull/575))
 
 ## [1.31.0] - 2020-05-05
 ### Added
diff --git a/testlib/src/test/java/com/diffplug/spotless/EncodingErrorMsgTest.java b/testlib/src/test/java/com/diffplug/spotless/EncodingErrorMsgTest.java
new file mode 100644
index 0000000000..e1f5d81fa1
--- /dev/null
+++ b/testlib/src/test/java/com/diffplug/spotless/EncodingErrorMsgTest.java
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2016 DiffPlug
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.diffplug.spotless;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import javax.annotation.Nullable;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+/**
+ * Tests for {@link EncodingErrorMsg}. Halfwidth katakana and C1 control characters in the expected
+ * messages are written as unicode escapes so that editors and text normalizers cannot alter them.
+ */
+public class EncodingErrorMsgTest {
+	@Test
+	public void cp1252asUtf8() throws UnsupportedEncodingException {
+		// empty case
+		cp1252asUtf8("", null);
+		// single char
+		cp1252asUtf8("a", null);
+		cp1252asUtf8("°", "Encoding error! Spotless uses UTF-8 by default. At line 1 col 1:\n" +
+				"� <- UTF-8\n" +
+				"° <- windows-1252\n" +
+				"° <- ISO-8859-1\n" +
+				"\uFF70 <- Shift_JIS");
+		// multiline
+		cp1252asUtf8("\n123\nabc\n", null);
+		cp1252asUtf8("\n123\nabc°\nABC", "Encoding error! Spotless uses UTF-8 by default. At line 3 col 4:\n" +
+				"abc�␤AB <- UTF-8\n" +
+				"abc°␤AB <- windows-1252\n" +
+				"abc°␤AB <- ISO-8859-1\n" +
+				"abc\uFF70␤AB <- Shift_JIS");
+	}
+
+	private void cp1252asUtf8(String test, @Nullable String expectedMessage) throws UnsupportedEncodingException {
+		byte[] cp1252 = test.getBytes("cp1252");
+		String asUTF = new String(cp1252, StandardCharsets.UTF_8);
+		String actualMessage = EncodingErrorMsg.msg(asUTF, cp1252, StandardCharsets.UTF_8);
+		Assertions.assertThat(actualMessage).isEqualTo(expectedMessage);
+	}
+
+	@Test
+	public void utf8asCP1252() throws UnsupportedEncodingException {
+		// unfortunately, if you treat UTF8 as Cp1252, it looks weird, but it usually roundtrips faithfully
+		// which makes it hard to detect
+
+		// empty case
+		utf8asCP1252("", null);
+		// single char
+		utf8asCP1252("a", null);
+		utf8asCP1252("°", null);
+		// multibyte UTF-8 can hide too
+		utf8asCP1252("😂", null);
+		// but some will trigger problems we can detect
+		utf8asCP1252("⍻", "Encoding error! You configured Spotless to use windows-1252. At line 1 col 2:\n" +
+				"â�» <- windows-1252\n" +
+				"⍻ <- UTF-8\n" +
+				"â\u008D» <- ISO-8859-1\n" +
+				"竝\uFF7B <- Shift_JIS"); // byte 0x8D has no windows-1252 mapping, so this codepoint does not roundtrip
+		// multiline
+		utf8asCP1252("\n123\nabc\n", null);
+		utf8asCP1252("\n123\nabc°\nABC", null);
+		utf8asCP1252("\n123\nabc😂\nABC", null);
+		utf8asCP1252("\n123\nabc⍻\nABC", "Encoding error! You configured Spotless to use windows-1252. At line 3 col 5:\n" +
+				"bcâ�»␤A <- windows-1252\n" +
+				"bc⍻␤ABC <- UTF-8\n" +
+				"bcâ\u008D»␤A <- ISO-8859-1\n" +
+				"bc竝\uFF7B␤AB <- Shift_JIS");
+	}
+
+	private void utf8asCP1252(String test, @Nullable String expectedMessage) throws UnsupportedEncodingException {
+		byte[] utf8 = test.getBytes(StandardCharsets.UTF_8);
+		String asCp1252 = new String(utf8, "cp1252");
+		String actualMessage = EncodingErrorMsg.msg(asCp1252, utf8, Charset.forName("cp1252"));
+		Assertions.assertThat(actualMessage).isEqualTo(expectedMessage);
+	}
+
+	@Test
+	public void canUseUnrepresentableOnPurpose() throws UnsupportedEncodingException {
+		String pathologic = new String(new char[]{EncodingErrorMsg.UNREPRESENTABLE});
+		byte[] pathologicBytes = pathologic.getBytes(StandardCharsets.UTF_8);
+		String pathologicMsg = EncodingErrorMsg.msg(pathologic, pathologicBytes, StandardCharsets.UTF_8);
+		Assertions.assertThat(pathologicMsg).isNull();
+	}
+
+	@Test
+	public void realErrorAfterUnrepresentableOnPurpose() {
+		// an intentional '�' on line 1, followed by a genuinely undecodable byte on line 2
+		byte[] bytes = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD, '\n', (byte) 0xB0};
+		String asUTF = new String(bytes, StandardCharsets.UTF_8);
+		String actualMessage = EncodingErrorMsg.msg(asUTF, bytes, StandardCharsets.UTF_8);
+		Assertions.assertThat(actualMessage).startsWith("Encoding error! Spotless uses UTF-8 by default. At line 2 col 1:\n" +
+				"�␤� <- UTF-8");
+	}
+}