Merge pull request #271 from fixrtm/fix-model-charset-detection

fix: all model files will be read as platform-native charset
fixrtm · Aug 14, 2021 · 6639002 · 6639002
2 parents b083d77 + be2d923
commit 6639002
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 4 deletions.
diff --git a/src/main/java/com/anatawa12/fixRtm/ngtlib/renderer/model/PolygonModelCharsetDetector.kt b/src/main/java/com/anatawa12/fixRtm/ngtlib/renderer/model/PolygonModelCharsetDetector.kt
@@ -0,0 +1,60 @@
+package com.anatawa12.fixRtm.ngtlib.renderer.model
+
+import com.anatawa12.fixRtm.MS932
+import jp.ngt.ngtlib.renderer.model.ModelFormatException
+import java.io.ByteArrayInputStream
+import java.io.IOException
+import java.io.InputStream
+import java.io.SequenceInputStream
+import java.lang.ref.SoftReference
+import java.nio.charset.Charset
+import java.nio.charset.StandardCharsets
+
+/** Guesses the charset of a model file from its leading bytes (UTF-8 first, then MS932). */
+object PolygonModelCharsetDetector {
+    // Allocating a huge array for every model triggers many GCs, so the buffer is cached per thread.
+    // It is no longer needed once model loading has finished, so it is held through a soft reference.
+    private val buffer: ThreadLocal<SoftReference<ByteArray>> = ThreadLocal<SoftReference<ByteArray>>()
+
+    // windows-31j: Shift_JIS with Microsoft Extension. Also known as Microsoft Code Page 932
+    private val tryingCharsets = arrayOf(StandardCharsets.UTF_8, MS932)
+
+    /** Returns this thread's cached 1 MiB buffer, allocating and caching a new one if it is absent or was collected. */
+    private fun getBuffer(): ByteArray =
+        buffer.get()?.get() ?: ByteArray(1024 * 1024).also { buffer.set(SoftReference(it)) }
+
+    /**
+     * Reads up to 1 MiB from [inputStream] and picks the first of [tryingCharsets] that decodes it without
+     * U+FFFD; falls back to UTF-8 if none does, or to the default charset if the stream is empty.
+     * Returns the charset paired with a stream replaying the consumed bytes followed by the rest of
+     * [inputStream]. The replayed bytes live in the shared per-thread buffer, so consume the returned
+     * stream before this thread calls [detectCharset] again.
+     *
+     * @throws ModelFormatException if reading [inputStream] fails.
+     */
+    // TODO: test
+    fun detectCharset(inputStream: InputStream): Pair<Charset, InputStream> {
+        val buf = getBuffer()
+        var filled = 0
+        try {
+            // Fill buf until it is full or the stream ends.
+            while (filled < buf.size) {
+                val read = inputStream.read(buf, filled, buf.size - filled)
+                if (read == -1) break
+                filled += read
+            }
+        } catch (e: IOException) {
+            throw ModelFormatException("On read file for charset detection", e)
+        }
+        // empty: any charset should return empty string so use default one
+        if (filled == 0) return Pair(Charset.defaultCharset(), inputStream)
+        val returnInputStream: InputStream = SequenceInputStream(ByteArrayInputStream(buf, 0, filled), inputStream)
+        // The last 10 chars are ignored so a multi-byte sequence cut at the buffer end is not an error.
+        // dropLast (unlike substring) also accepts text shorter than 10 chars. No U+FFFD means no decoding error.
+        val detected = tryingCharsets.firstOrNull { charset ->
+            '\ufffd' !in String(buf, 0, filled, charset).dropLast(10)
+        }
+        // no charsets are valid: use UTF8
+        return Pair(detected ?: StandardCharsets.UTF_8, returnInputStream)
+    }
+}
diff --git a/src/main/ngtlib-patches/jp/ngt/ngtlib/renderer/model/PolygonModel.java.patch b/src/main/ngtlib-patches/jp/ngt/ngtlib/renderer/model/PolygonModel.java.patch
@@ -13,7 +13,22 @@
     public final List<GroupObject> groupObjects = new ArrayList<>(16);
     protected GroupObject currentGroupObject;
     int lineCount;
-@@ -92,11 +92,11 @@
+@@ -46,11 +46,13 @@
+    protected void init(InputStream[] is) throws ModelFormatException {
+       this.loadModel(is[0]);
+    }
+
+    private void loadModel(InputStream inputStream) {
+-      BufferedReader bufferedreader = new BufferedReader(new InputStreamReader(inputStream));
++      kotlin.Pair<java.nio.charset.Charset, InputStream> pair = com.anatawa12.fixRtm.ngtlib.renderer.model.PolygonModelCharsetDetector.INSTANCE.detectCharset(inputStream);
++      inputStream = pair.component2();
++      BufferedReader bufferedreader = new BufferedReader(new InputStreamReader(inputStream, pair.component1()));
+       Stream<String> stream = bufferedreader.lines();
+       stream.forEachOrdered((line) -> {
+          line = repS.matcher(line).replaceAll(" ").trim();
+          this.parseLine(line, ++this.lineCount);
+       });
+@@ -92,11 +94,11 @@
 
     public final float[] getSize() {
        return this.sizeBox;
@@ -26,7 +41,7 @@
        }
 
        NGTTessellator ngttessellator = NGTTessellator.instance;
-@@ -107,18 +107,18 @@
+@@ -107,18 +109,18 @@
           GL11.glShadeModel(7424);
        }
 
@@ -47,7 +62,7 @@
        }
 
        for(GroupObject groupobject : this.groupObjects) {
-@@ -133,11 +133,11 @@
+@@ -133,11 +135,11 @@
           GL11.glShadeModel(7424);
        }
 
@@ -60,7 +75,7 @@
        }
 
        for(GroupObject groupobject : this.groupObjects) {
-@@ -151,15 +151,15 @@
+@@ -151,15 +153,15 @@
           GL11.glShadeModel(7424);
        }