Skip to content

Commit

Permalink
Merge pull request #575 from diffplug/feat/cautious-encoding
Browse files Browse the repository at this point in the history
Be proactive about encoding mismatch
  • Loading branch information
nedtwigg committed May 21, 2020
2 parents 81f84c0 + 94131b0 commit aad37ce
Show file tree
Hide file tree
Showing 8 changed files with 267 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ This document is intended for Spotless developers.
We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `1.27.0`).

## [Unreleased]
### Added
* `PaddedCell.calculateDirtyState` is now defensive about misconfigured character encoding. ([#575](https://github.com/diffplug/spotless/pull/575))

## [1.30.1] - 2020-05-17
### Fixed
Expand Down
149 changes: 149 additions & 0 deletions lib/src/main/java/com/diffplug/spotless/EncodingErrorMsg.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
* Copyright 2016 DiffPlug
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.diffplug.spotless;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashSet;

import javax.annotation.Nullable;

/**
 * Builds the diagnostic which is reported when the bytes of a file cannot be decoded
 * with the charset that was used to read it.
 *
 * <p>The only entry point is {@link #msg(String, byte[], Charset)}. The message names the
 * charset in use, the line and column of the problem, and then shows the text around the
 * problem as decoded by the charset in use, followed by every likely alternative charset
 * which is able to decode the sampled bytes without error.
 */
class EncodingErrorMsg {
	/** The character searched for in decoded text as a hint that some bytes could not be decoded. */
	static final char UNREPRESENTABLE = '�';
	/** How many characters of context are shown on each side of the problem character. */
	private static final int CONTEXT = 3;

	/**
	 * Checks whether {@code bytes} really are invalid for {@code charset}.
	 *
	 * @param chars   the text which was obtained by decoding {@code bytes} with {@code charset}
	 * @param bytes   the raw content
	 * @param charset the charset which was used to decode {@code bytes}
	 * @return {@code null} if {@code chars} contains no {@link #UNREPRESENTABLE}, or if it does but
	 *         a strict decode of {@code bytes} succeeds (the character is really part of the content);
	 *         otherwise a multi-line message describing the encoding error
	 */
	static @Nullable String msg(String chars, byte[] bytes, Charset charset) {
		int unrepresentable = chars.indexOf(UNREPRESENTABLE);
		if (unrepresentable == -1) {
			return null;
		}

		// sometimes the '�' is really in a file, such as for *this* file,
		// so decode again in strict mode to find out whether the bytes are actually invalid
		ByteBuffer byteBuf = ByteBuffer.wrap(bytes);
		CharBuffer charBuf = CharBuffer.allocate(chars.length());
		CoderResult result = charset.newDecoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT)
				.decode(byteBuf, charBuf, true);
		if (!result.isError()) {
			return null;
		}

		// The strict decoder stops at the offending bytes, so the number of chars it has written
		// is the index of the real problem. This can be later than the first '�' when the content
		// also contains a legitimate '�'. Only trust it when `chars` agrees that a '�' sits there.
		int errorIndex = charBuf.position();
		if (errorIndex < chars.length() && chars.charAt(errorIndex) == UNREPRESENTABLE) {
			unrepresentable = errorIndex;
		}
		// there really is an encoding error, so we'll send a message
		return new EncodingErrorMsg(chars, byteBuf, charset, unrepresentable).message.toString();
	}

	private final ByteBuffer byteBuf;
	private final CharBuffer charBuf;
	private final int unrepresentable;
	private final StringBuilder message;

	/**
	 * Assembles the complete message into {@link #message}.
	 *
	 * @param chars           the text as decoded by {@code charset}
	 * @param byteBuf         the raw bytes, re-read from the start for every example
	 * @param charset         the charset which failed to decode the bytes
	 * @param unrepresentable index within {@code chars} of the character to report
	 */
	private EncodingErrorMsg(String chars, ByteBuffer byteBuf, Charset charset, int unrepresentable) {
		this.byteBuf = byteBuf;
		this.unrepresentable = unrepresentable;
		// make a new, smaller charBuf which only reaches a little past the problem character
		charBuf = CharBuffer.allocate(Math.min(unrepresentable + 2 * CONTEXT, chars.length()));

		message = new StringBuilder("Encoding error! ");
		if (charset.equals(StandardCharsets.UTF_8)) {
			message.append("Spotless uses UTF-8 by default.");
		} else {
			message.append("You configured Spotless to use ").append(charset.name()).append('.');
		}

		// 1-based line and column of the problem character; '\r' does not advance the column
		int line = 1;
		int col = 1;
		for (int i = 0; i < unrepresentable; ++i) {
			char c = chars.charAt(i);
			if (c == '\n') {
				++line;
				col = 1;
			} else if (c != '\r') {
				++col;
			}
		}
		message.append(" At line ").append(line).append(" col ").append(col).append(':');

		// https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html
		// insertion-ordered and de-duplicated, so the charset in use is always shown first and only once
		LinkedHashSet<Charset> encodings = new LinkedHashSet<>();
		encodings.add(charset); // the encoding we are using
		encodings.add(StandardCharsets.UTF_8); // followed by likely encodings
		addIfAvailable(encodings, "windows-1252");
		encodings.add(StandardCharsets.ISO_8859_1);
		addIfAvailable(encodings, "Shift_JIS");
		addIfAvailable(encodings, "Big5");
		addIfAvailable(encodings, "Big5-HKSCS");
		addIfAvailable(encodings, "GBK");
		addIfAvailable(encodings, "GB2312");
		addIfAvailable(encodings, "GB18030");

		Iterator<Charset> iterator = encodings.iterator();
		appendExample(iterator.next(), true);
		while (iterator.hasNext()) {
			appendExample(iterator.next(), false);
		}
	}

	/** Adds the charset with the given name to {@code charsets}, unless this JVM does not support it. */
	private static void addIfAvailable(Collection<Charset> charsets, String name) {
		try {
			charsets.add(Charset.forName(name));
		} catch (UnsupportedCharsetException e) {
			// no worries, this charset is an optional suggestion
		}
	}

	/**
	 * Appends one line showing the text around the problem as decoded by {@code charset}.
	 *
	 * @param charset the charset to decode the bytes with
	 * @param must    if {@code true} the example is always appended and undecodable bytes are replaced;
	 *                if {@code false} the example is omitted when {@code charset} reports an error
	 */
	private void appendExample(Charset charset, boolean must) {
		byteBuf.clear();
		charBuf.clear();

		CharsetDecoder decoder = charset.newDecoder();
		if (!must) {
			// bail early if we can
			CoderResult r = decoder
					.onMalformedInput(CodingErrorAction.REPORT)
					.onUnmappableCharacter(CodingErrorAction.REPORT)
					.decode(byteBuf, charBuf, true);
			if (r.isError()) {
				return;
			}
		} else {
			decoder
					.onMalformedInput(CodingErrorAction.REPLACE)
					.onUnmappableCharacter(CodingErrorAction.REPLACE)
					.decode(byteBuf, charBuf, true);
		}
		charBuf.flip();

		int start = Math.max(unrepresentable - CONTEXT, 0);
		int end = Math.min(charBuf.limit(), unrepresentable + CONTEXT + 1);
		if (start >= end) {
			// `unrepresentable` is an index into the text as decoded by the charset in use. A multi-byte
			// charset can decode the same bytes into fewer chars than that, in which case the window
			// would lie past the end of the buffer and subSequence would throw. Show the last decoded
			// chars instead.
			start = Math.max(end - (2 * CONTEXT + 1), 0);
		}
		message.append('\n');
		// make whitespace visible so that the example stays on a single line
		message.append(charBuf.subSequence(start, end).toString()
				.replace('\n', '␤')
				.replace('\r', '␍')
				.replace('\t', '⇥'));
		message.append(" <- ");
		message.append(charset.name());
	}
}
5 changes: 5 additions & 0 deletions lib/src/main/java/com/diffplug/spotless/PaddedCell.java
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,11 @@ public static DirtyState calculateDirtyState(Formatter formatter, File file) thr

public static DirtyState calculateDirtyState(Formatter formatter, File file, byte[] rawBytes) throws IOException {
String raw = new String(rawBytes, formatter.getEncoding());
// check that all characters were encodable
String encodingError = EncodingErrorMsg.msg(raw, rawBytes, formatter.getEncoding());
if (encodingError != null) {
throw new IllegalArgumentException(encodingError);
}
String rawUnix = LineEnding.toUnix(raw);

// enforce the format
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ public static FormatterStep create(String name, CharSequence target, CharSequenc
private static final class State implements Serializable {
private static final long serialVersionUID = 1L;

private final CharSequence target;
private final CharSequence replacement;
private final String target;
private final String replacement;

State(CharSequence target, CharSequence replacement) {
this.target = target;
this.replacement = replacement;
this.target = target.toString();
this.replacement = replacement.toString();
}

FormatterFunc toFormatter() {
Expand Down
2 changes: 2 additions & 0 deletions plugin-gradle/CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `3.27.0`).

## [Unreleased]
### Fixed
* If the encoding was set incorrectly, `spotlessApply` could clobber special characters. Spotless now prevents this, and helps to suggest the correct encoding. ([#575](https://github.com/diffplug/spotless/pull/575))

## [4.0.0] - 2020-05-17
**TLDR: This version improves performance and adds support for the local Gradle Build Cache. You will not need to make any changes in your buildscript.** It is a breaking change only for a few users who have built *other* plugins on top of this one.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ public void globalIsRespected() throws Exception {
" encoding 'US-ASCII'",
"}");
setFile("test.java").toContent("µ");
gradleRunner().withArguments("spotlessApply").build();
assertFile("test.java").hasContent("??");
gradleRunner().withArguments("spotlessApply").buildAndFail().getOutput().contains("Encoding error!");
assertFile("test.java").hasContent("µ");
}

@Test
Expand All @@ -75,8 +75,8 @@ public void globalIsRespectedButCanBeOverridden() throws Exception {
"}");
setFile("test.java").toContent("µ");
setFile("utf32.encoded").toContent("µ", Charset.forName("UTF-32"));
gradleRunner().withArguments("spotlessApply").build();
assertFile("test.java").hasContent("??");
assertFile("utf32.encoded").hasContent("A", Charset.forName("UTF-32"));
gradleRunner().withArguments("spotlessApply").buildAndFail().getOutput().contains("Encoding error!");
assertFile("test.java").hasContent("µ");
assertFile("utf32.encoded").hasContent("µ", Charset.forName("UTF-32"));
}
}
2 changes: 2 additions & 0 deletions plugin-maven/CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
We adhere to the [keepachangelog](https://keepachangelog.com/en/1.0.0/) format (starting after version `1.27.0`).

## [Unreleased]
### Fixed
* If the encoding was set incorrectly, `spotless:apply` could clobber special characters. Spotless now prevents this, and helps to suggest the correct encoding. ([#575](https://github.com/diffplug/spotless/pull/575))

## [1.31.0] - 2020-05-05
### Added
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Copyright 2016 DiffPlug
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.diffplug.spotless;

import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import javax.annotation.Nullable;

import org.assertj.core.api.Assertions;
import org.junit.Test;

/**
 * Exercises {@link EncodingErrorMsg#msg} by encoding text with one charset and decoding it with another.
 */
public class EncodingErrorMsgTest {
	/** Content written as cp1252 but read as UTF-8. */
	@Test
	public void cp1252asUtf8() throws UnsupportedEncodingException {
		String singleCharMessage = "Encoding error! Spotless uses UTF-8 by default. At line 1 col 1:\n" +
				"� <- UTF-8\n" +
				"° <- windows-1252\n" +
				"° <- ISO-8859-1\n" +
				"ー <- Shift_JIS";
		String multilineMessage = "Encoding error! Spotless uses UTF-8 by default. At line 3 col 4:\n" +
				"abc�␤AB <- UTF-8\n" +
				"abc°␤AB <- windows-1252\n" +
				"abc°␤AB <- ISO-8859-1\n" +
				"abcー␤AB <- Shift_JIS";

		// nothing at all
		assertCp1252ReadAsUtf8("", null);
		// one character
		assertCp1252ReadAsUtf8("a", null);
		assertCp1252ReadAsUtf8("°", singleCharMessage);
		// several lines
		assertCp1252ReadAsUtf8("\n123\nabc\n", null);
		assertCp1252ReadAsUtf8("\n123\nabc°\nABC", multilineMessage);
	}

	/** Encodes {@code test} as cp1252, decodes it as UTF-8, and compares the resulting message. */
	private void assertCp1252ReadAsUtf8(String test, @Nullable String expectedMessage) throws UnsupportedEncodingException {
		byte[] encoded = test.getBytes("cp1252");
		String decoded = new String(encoded, StandardCharsets.UTF_8);
		Assertions.assertThat(EncodingErrorMsg.msg(decoded, encoded, StandardCharsets.UTF_8))
				.isEqualTo(expectedMessage);
	}

	/**
	 * Content written as UTF-8 but read as cp1252. The result looks weird, but it usually
	 * roundtrips faithfully, which makes the mistake hard to detect.
	 */
	@Test
	public void utf8asCP1252() throws UnsupportedEncodingException {
		String singleCharMessage = "Encoding error! You configured Spotless to use windows-1252. At line 1 col 2:\n" +
				"� <- windows-1252\n" +
				"⍻ <- UTF-8\n" +
				"⍻ <- ISO-8859-1\n" +
				"竝サ <- Shift_JIS";
		String multilineMessage = "Encoding error! You configured Spotless to use windows-1252. At line 3 col 5:\n" +
				"bcâ�»␤A <- windows-1252\n" +
				"bc⍻␤ABC <- UTF-8\n" +
				"bc⍻␤A <- ISO-8859-1\n" +
				"bc竝サ␤AB <- Shift_JIS";

		// nothing at all
		assertUtf8ReadAsCp1252("", null);
		// one character
		assertUtf8ReadAsCp1252("a", null);
		assertUtf8ReadAsCp1252("°", null);
		// a multibyte UTF-8 sequence can go unnoticed as well
		assertUtf8ReadAsCp1252("😂", null);
		// but there are some codepoints which do trigger a problem we can detect
		assertUtf8ReadAsCp1252("⍻", singleCharMessage);
		// several lines
		assertUtf8ReadAsCp1252("\n123\nabc\n", null);
		assertUtf8ReadAsCp1252("\n123\nabc°\nABC", null);
		assertUtf8ReadAsCp1252("\n123\nabc😂\nABC", null);
		assertUtf8ReadAsCp1252("\n123\nabc⍻\nABC", multilineMessage);
	}

	/** Encodes {@code test} as UTF-8, decodes it as cp1252, and compares the resulting message. */
	private void assertUtf8ReadAsCp1252(String test, @Nullable String expectedMessage) throws UnsupportedEncodingException {
		byte[] encoded = test.getBytes(StandardCharsets.UTF_8);
		String decoded = new String(encoded, "cp1252");
		Assertions.assertThat(EncodingErrorMsg.msg(decoded, encoded, Charset.forName("cp1252")))
				.isEqualTo(expectedMessage);
	}

	/** Content which contains the marker character on purpose must not be reported. */
	@Test
	public void canUseUnrepresentableOnPurpose() throws UnsupportedEncodingException {
		String pathologic = String.valueOf(EncodingErrorMsg.UNREPRESENTABLE);
		byte[] pathologicBytes = pathologic.getBytes(StandardCharsets.UTF_8);
		Assertions.assertThat(EncodingErrorMsg.msg(pathologic, pathologicBytes, StandardCharsets.UTF_8))
				.isNull();
	}
}

0 comments on commit aad37ce

Please sign in to comment.