Merge pull request #575 from diffplug/feat/cautious-encoding

Be proactive about encoding mismatch
diffplug · May 21, 2020 · aad37ce
2 parents 81f84c0 + 94131b0
commit aad37ce
Show file tree

Hide file tree

Showing 8 changed files with 267 additions and 9 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -10,6 +10,8 @@ This document is intended for Spotless developers.
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `1.27.0`).
 
 ## [Unreleased]
+### Added
+* `PaddedCell.calculateDirtyState` is now defensive about misconfigured character encoding. ([#575](https://github.com/diffplug/spotless/pull/575))
 
 ## [1.30.1] - 2020-05-17
 ### Fixed

diff --git a/lib/src/main/java/com/diffplug/spotless/EncodingErrorMsg.java b/lib/src/main/java/com/diffplug/spotless/EncodingErrorMsg.java
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2016 DiffPlug
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.diffplug.spotless;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+
+import javax.annotation.Nullable;
+
+/**
+ * Builds a diagnostic message for content whose bytes cannot be decoded with
+ * the configured charset. The message reports the line and column of the first
+ * problem and shows how the surrounding text decodes under the configured
+ * charset and under a list of likely alternatives.
+ */
+class EncodingErrorMsg {
+	/** The replacement character which shows up in a decoded string where decoding failed. */
+	static final char UNREPRESENTABLE = '�';
+	/** Number of characters of context shown on each side of the problem character. */
+	private static final int CONTEXT = 3;
+
+	/**
+	 * Describes the encoding problem in the given content, if there is one.
+	 *
+	 * @param chars   the result of decoding {@code bytes} with {@code charset}
+	 * @param bytes   the raw content
+	 * @param charset the charset which was used to produce {@code chars}
+	 * @return an error message, or {@code null} if {@code chars} contains no
+	 *         replacement character or {@code bytes} decode without error
+	 */
+	static @Nullable String msg(String chars, byte[] bytes, Charset charset) {
+		int unrepresentable = chars.indexOf(UNREPRESENTABLE);
+		if (unrepresentable == -1) {
+			return null;
+		}
+
+		// sometimes the '�' is really in a file, such as for *this* file
+		// so we have to handle that corner case by decoding again in strict mode
+		ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
+		CharBuffer charBuf = CharBuffer.allocate(chars.length());
+		CoderResult result = charset.newDecoder()
+				.onMalformedInput(CodingErrorAction.REPORT)
+				.onUnmappableCharacter(CodingErrorAction.REPORT)
+				.decode(byteBuf, charBuf, true);
+		if (!result.isError()) {
+			return null;
+		}
+		// there really is an encoding error, so we'll send a message
+		return new EncodingErrorMsg(chars, byteBuf, charset, unrepresentable).message.toString();
+	}
+
+	private final ByteBuffer byteBuf;
+	private final CharBuffer charBuf;
+	private final int unrepresentable;
+	private final StringBuilder message;
+
+	/**
+	 * Assembles the complete message into {@link #message}.
+	 *
+	 * @param chars           the content as decoded with {@code charset}
+	 * @param byteBuf         the raw content, rewound and re-read for every candidate charset
+	 * @param charset         the charset which failed to decode the content
+	 * @param unrepresentable index within {@code chars} of the first replacement character
+	 */
+	private EncodingErrorMsg(String chars, ByteBuffer byteBuf, Charset charset, int unrepresentable) {
+		this.byteBuf = byteBuf;
+		this.unrepresentable = unrepresentable;
+		// make a new, smaller charBuf better suited to our request
+		charBuf = CharBuffer.allocate(Math.min(unrepresentable + 2 * CONTEXT, chars.length()));
+
+		message = new StringBuilder("Encoding error! ");
+		if (charset.equals(StandardCharsets.UTF_8)) {
+			message.append("Spotless uses UTF-8 by default.");
+		} else {
+			message.append("You configured Spotless to use ").append(charset.name()).append('.');
+		}
+
+		// compute the 1-based line and column of the problem; '\r' does not advance the column
+		int line = 1;
+		int col = 1;
+		for (int i = 0; i < unrepresentable; ++i) {
+			char c = chars.charAt(i);
+			if (c == '\n') {
+				++line;
+				col = 1;
+			} else if (c != '\r') {
+				++col;
+			}
+		}
+		message.append("  At line ").append(line).append(" col ").append(col).append(':');
+
+		// https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
+		// insertion-ordered and de-duplicated, so the configured charset is always first
+		LinkedHashSet<Charset> encodings = new LinkedHashSet<>();
+		encodings.add(charset); // the encoding we are using
+		encodings.add(StandardCharsets.UTF_8); // followed by likely encodings
+		addIfAvailable(encodings, "windows-1252");
+		encodings.add(StandardCharsets.ISO_8859_1);
+		addIfAvailable(encodings, "Shift_JIS");
+		addIfAvailable(encodings, "Big5");
+		addIfAvailable(encodings, "Big5-HKSCS");
+		addIfAvailable(encodings, "GBK");
+		addIfAvailable(encodings, "GB2312");
+		addIfAvailable(encodings, "GB18030");
+
+		// the configured charset is always shown, the others only if they decode cleanly
+		Iterator<Charset> iterator = encodings.iterator();
+		appendExample(iterator.next(), true);
+		while (iterator.hasNext()) {
+			appendExample(iterator.next(), false);
+		}
+	}
+
+	/** Adds the named charset to the collection if this JVM supports it. */
+	private static void addIfAvailable(Collection<Charset> charsets, String name) {
+		try {
+			charsets.add(Charset.forName(name));
+		} catch (UnsupportedCharsetException e) {
+			// no worries, this charset is optional and simply won't be suggested
+		}
+	}
+
+	/**
+	 * Appends one line showing the text around the problem as decoded by {@code charset}.
+	 *
+	 * @param charset the charset to decode with
+	 * @param must    if true, undecodable input is replaced and the line is always appended;
+	 *                if false, the line is skipped when the charset reports a decoding error
+	 */
+	private void appendExample(Charset charset, boolean must) {
+		byteBuf.clear();
+		charBuf.clear();
+
+		CharsetDecoder decoder = charset.newDecoder();
+		if (!must) {
+			// bail early if we can
+			CoderResult r = decoder
+					.onMalformedInput(CodingErrorAction.REPORT)
+					.onUnmappableCharacter(CodingErrorAction.REPORT)
+					.decode(byteBuf, charBuf, true);
+			if (r.isError()) {
+				return;
+			}
+		} else {
+			decoder
+					.onMalformedInput(CodingErrorAction.REPLACE)
+					.onUnmappableCharacter(CodingErrorAction.REPLACE)
+					.decode(byteBuf, charBuf, true);
+		}
+		charBuf.flip();
+
+		int end = Math.min(charBuf.limit(), unrepresentable + CONTEXT + 1);
+		// a charset which packs several bytes into one char can decode the whole input
+		// into fewer chars than the window start, so clamp to keep start <= end
+		// (subSequence throws IndexOutOfBoundsException otherwise)
+		int start = Math.min(Math.max(unrepresentable - CONTEXT, 0), end);
+		message.append('\n');
+		message.append(charBuf.subSequence(start, end).toString()
+				.replace('\n', '␤')
+				.replace('\r', '␍')
+				.replace('\t', '⇥'));
+		message.append(" <- ");
+		message.append(charset.name());
+	}
+}
diff --git a/lib/src/main/java/com/diffplug/spotless/PaddedCell.java b/lib/src/main/java/com/diffplug/spotless/PaddedCell.java
@@ -190,6 +190,11 @@ public static DirtyState calculateDirtyState(Formatter formatter, File file) thr
 
 	public static DirtyState calculateDirtyState(Formatter formatter, File file, byte[] rawBytes) throws IOException {
 		String raw = new String(rawBytes, formatter.getEncoding());
+		// check that all characters were encodable
+		String encodingError = EncodingErrorMsg.msg(raw, rawBytes, formatter.getEncoding());
+		if (encodingError != null) {
+			throw new IllegalArgumentException(encodingError);
+		}
 		String rawUnix = LineEnding.toUnix(raw);
 
 		// enforce the format

diff --git a/lib/src/main/java/com/diffplug/spotless/generic/ReplaceStep.java b/lib/src/main/java/com/diffplug/spotless/generic/ReplaceStep.java
@@ -37,12 +37,12 @@ public static FormatterStep create(String name, CharSequence target, CharSequenc
 	private static final class State implements Serializable {
 		private static final long serialVersionUID = 1L;
 
-		private final CharSequence target;
-		private final CharSequence replacement;
+		private final String target;
+		private final String replacement;
 
 		State(CharSequence target, CharSequence replacement) {
-			this.target = target;
-			this.replacement = replacement;
+			this.target = target.toString();
+			this.replacement = replacement.toString();
 		}
 
 		FormatterFunc toFormatter() {

diff --git a/plugin-gradle/CHANGES.md b/plugin-gradle/CHANGES.md
@@ -3,6 +3,8 @@
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `3.27.0`).
 
 ## [Unreleased]
+### Fixed
+* If the encoding was set incorrectly, `spotlessApply` could clobber special characters.  Spotless now prevents this, and helps to suggest the correct encoding. ([#575](https://github.com/diffplug/spotless/pull/575))
 
 ## [4.0.0] - 2020-05-17
 **TLDR: This version improves performance and adds support for the local Gradle Build Cache.  You will not need to make any changes in your buildscript.**  It is a breaking change only for a few users who have built *other* plugins on top of this one.

diff --git a/plugin-gradle/src/test/java/com/diffplug/gradle/spotless/EncodingTest.java b/plugin-gradle/src/test/java/com/diffplug/gradle/spotless/EncodingTest.java
@@ -51,8 +51,8 @@ public void globalIsRespected() throws Exception {
 				"    encoding 'US-ASCII'",
 				"}");
 		setFile("test.java").toContent("µ");
-		gradleRunner().withArguments("spotlessApply").build();
-		assertFile("test.java").hasContent("??");
+		gradleRunner().withArguments("spotlessApply").buildAndFail().getOutput().contains("Encoding error!");
+		assertFile("test.java").hasContent("µ");
 	}
 
 	@Test
@@ -75,8 +75,8 @@ public void globalIsRespectedButCanBeOverridden() throws Exception {
 				"}");
 		setFile("test.java").toContent("µ");
 		setFile("utf32.encoded").toContent("µ", Charset.forName("UTF-32"));
-		gradleRunner().withArguments("spotlessApply").build();
-		assertFile("test.java").hasContent("??");
-		assertFile("utf32.encoded").hasContent("A", Charset.forName("UTF-32"));
+		gradleRunner().withArguments("spotlessApply").buildAndFail().getOutput().contains("Encoding error!");
+		assertFile("test.java").hasContent("µ");
+		assertFile("utf32.encoded").hasContent("µ", Charset.forName("UTF-32"));
 	}
 }
diff --git a/plugin-maven/CHANGES.md b/plugin-maven/CHANGES.md
@@ -3,6 +3,8 @@
 We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `1.27.0`).
 
 ## [Unreleased]
+### Fixed
+* If the encoding was set incorrectly, `spotless:apply` could clobber special characters.  Spotless now prevents this, and helps to suggest the correct encoding. ([#575](https://github.com/diffplug/spotless/pull/575))
 
 ## [1.31.0] - 2020-05-05
 ### Added

diff --git a/testlib/src/test/java/com/diffplug/spotless/EncodingErrorMsgTest.java b/testlib/src/test/java/com/diffplug/spotless/EncodingErrorMsgTest.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2016 DiffPlug
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.diffplug.spotless;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import javax.annotation.Nullable;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+public class EncodingErrorMsgTest {
+	@Test
+	public void cp1252asUtf8() throws UnsupportedEncodingException {
+		// empty case
+		cp1252asUtf8("", null);
+		// single char
+		cp1252asUtf8("a", null);
+		cp1252asUtf8("°", "Encoding error! Spotless uses UTF-8 by default.  At line 1 col 1:\n" +
+				"� <- UTF-8\n" +
+				"° <- windows-1252\n" +
+				"° <- ISO-8859-1\n" +
+				"ｰ <- Shift_JIS");
+		// multiline
+		cp1252asUtf8("\n123\nabc\n", null);
+		cp1252asUtf8("\n123\nabc°\nABC", "Encoding error! Spotless uses UTF-8 by default.  At line 3 col 4:\n" +
+				"abc�␤AB <- UTF-8\n" +
+				"abc°␤AB <- windows-1252\n" +
+				"abc°␤AB <- ISO-8859-1\n" +
+				"abcｰ␤AB <- Shift_JIS");
+	}
+
+	private void cp1252asUtf8(String test, @Nullable String expectedMessage) throws UnsupportedEncodingException {
+		byte[] cp1252 = test.getBytes("cp1252");
+		String asUTF = new String(cp1252, StandardCharsets.UTF_8);
+		String actualMessage = EncodingErrorMsg.msg(asUTF, cp1252, StandardCharsets.UTF_8);
+		Assertions.assertThat(actualMessage).isEqualTo(expectedMessage);
+	}
+
+	@Test
+	public void utf8asCP1252() throws UnsupportedEncodingException {
+		// unfortunately, if you treat UTF8 as Cp1252, it looks weird, but it usually roundtrips faithfully
+		// which makes it hard to detect
+
+		// empty case
+		utf8asCP1252("", null);
+		// single char
+		utf8asCP1252("a", null);
+		utf8asCP1252("°", null);
+		// multibyte UTF-8 can hide too
+		utf8asCP1252("😂", null);
+		// but some will trigger problems we can detect
+		utf8asCP1252("⍻", "Encoding error! You configured Spotless to use windows-1252.  At line 1 col 2:\n" +
+				"â�» <- windows-1252\n" +
+				"⍻ <- UTF-8\n" +
+				"â» <- ISO-8859-1\n" +
+				"竝ｻ <- Shift_JIS"); // there are some codepoints where it doesn't
+		// multiline
+		utf8asCP1252("\n123\nabc\n", null);
+		utf8asCP1252("\n123\nabc°\nABC", null);
+		utf8asCP1252("\n123\nabc😂\nABC", null);
+		utf8asCP1252("\n123\nabc⍻\nABC", "Encoding error! You configured Spotless to use windows-1252.  At line 3 col 5:\n" +
+				"bcâ�»␤A <- windows-1252\n" +
+				"bc⍻␤ABC <- UTF-8\n" +
+				"bcâ»␤A <- ISO-8859-1\n" +
+				"bc竝ｻ␤AB <- Shift_JIS");
+	}
+
+	private void utf8asCP1252(String test, @Nullable String expectedMessage) throws UnsupportedEncodingException {
+		byte[] utf8 = test.getBytes(StandardCharsets.UTF_8);
+		String asCp1252 = new String(utf8, "cp1252");
+		String actualMessage = EncodingErrorMsg.msg(asCp1252, utf8, Charset.forName("cp1252"));
+		Assertions.assertThat(actualMessage).isEqualTo(expectedMessage);
+	}
+
+	@Test
+	public void canUseUnrepresentableOnPurpose() throws UnsupportedEncodingException {
+		String pathologic = new String(new char[]{EncodingErrorMsg.UNREPRESENTABLE});
+		byte[] pathologicBytes = pathologic.getBytes(StandardCharsets.UTF_8);
+		String pathologicMsg = EncodingErrorMsg.msg(pathologic, pathologicBytes, StandardCharsets.UTF_8);
+		Assertions.assertThat(pathologicMsg).isNull();
+	}
+}