Skip to content

Commit

Permalink
several more features. Need to re-profile soon.
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Mar 27, 2022
1 parent 151adb1 commit bf87382
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 9.021
* `:trim-leading-whitespace?`, `:trim-trailing-whitespace?`, and `:nil-empty-values?` are
all supported in csv parsing to bring feature set up to par with univocity. These changes
allow all tech.ml.dataset csv-based unit tests to succeed.

## 9.020
* `:column-whitelist` and `:column-blacklist` are now supported for csv parsing.

Expand Down
53 changes: 53 additions & 0 deletions java/tech/v3/datatype/CharBuffer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package tech.v3.datatype;


/**
 * Growable accumulator for the characters of a single text value, with
 * optional whitespace trimming.
 *
 * <p>Leading whitespace is dropped as characters are appended (when
 * {@code trimLeading} is set); trailing whitespace is dropped only when the
 * text is materialized by {@link #toString()} (when {@code trimTrailing} is
 * set). Whitespace here means a space or a horizontal tab.
 *
 * <p>The backing array starts at 32 chars and doubles whenever it fills.
 * {@link #clear()} resets the logical length without releasing the array, so
 * one instance can be reused for many values.
 */
public final class CharBuffer
{
  /** When true, whitespace appended while the buffer is empty is discarded. */
  public final boolean trimLeading;
  /** When true, {@link #toString()} omits trailing whitespace. */
  public final boolean trimTrailing;
  /** When true, {@link #toString()} yields {@code null} instead of {@code ""}. */
  public final boolean nilEmpty;
  // Backing storage; only the first len entries are meaningful.
  char[] buffer;
  // Number of characters currently held.
  int len;

  /**
   * Creates an empty buffer with the given trimming policy.
   *
   * @param _trimLeading  discard whitespace appended before any other character
   * @param _trimTrailing strip trailing whitespace in {@link #toString()}
   * @param _nilEmpty     return {@code null} rather than {@code ""} for empty results
   */
  public CharBuffer(boolean _trimLeading, boolean _trimTrailing, boolean _nilEmpty) {
    this.trimLeading = _trimLeading;
    this.trimTrailing = _trimTrailing;
    this.nilEmpty = _nilEmpty;
    this.buffer = new char[32];
    this.len = 0;
  }

  /**
   * Tests whether a character counts as trimmable whitespace.
   *
   * @param val character to test
   * @return true for a space or a tab, false for anything else
   */
  public final boolean isspace(char val) {
    switch (val) {
      case ' ':
      case '\t':
        return true;
      default:
        return false;
    }
  }

  /**
   * Appends one character, growing the backing array if it is full.
   *
   * <p>If leading trimming is enabled and nothing has been stored yet, a
   * whitespace character is silently ignored.
   *
   * @param val character to append
   */
  public final void append(char val) {
    // Leading whitespace is skipped only while the buffer is still empty.
    if (trimLeading && len == 0 && isspace(val)) {
      return;
    }
    if (len == buffer.length) {
      final char[] grown = new char[buffer.length * 2];
      System.arraycopy(buffer, 0, grown, 0, len);
      buffer = grown;
    }
    buffer[len++] = val;
  }

  /** Empties the buffer; the backing array is retained for reuse. */
  public final void clear() {
    len = 0;
  }

  /**
   * Reports how many characters are currently stored.
   *
   * @return the stored character count, including any trailing whitespace
   *         that {@link #toString()} would strip
   */
  public final int length() {
    return len;
  }

  /**
   * Materializes the buffered characters as a string.
   *
   * <p>Trailing whitespace is excluded when {@code trimTrailing} is set. The
   * buffer contents themselves are not modified.
   *
   * @return the (possibly trimmed) text; when the result is empty, either
   *         {@code null} if {@code nilEmpty} is set or {@code ""} otherwise
   */
  @Override
  public final String toString() {
    int end = len;
    if (trimTrailing) {
      // Walk back over trailing whitespace; stops at 0 for all-blank content.
      while (end > 0 && isspace(buffer[end - 1])) {
        --end;
      }
    }
    if (end > 0) {
      return new String(buffer, 0, end);
    }
    return nilEmpty ? null : "";
  }
}
7 changes: 4 additions & 3 deletions java/tech/v3/datatype/CharReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import java.util.Iterator;
import java.io.EOFException;

public class CharReader
public final class CharReader
{
public final Iterator buffers;
public static final char lf = '\n';
public static final char cr = '\r';

public final char quot;
public final char sep;
public static final long EOF=-1;
Expand Down Expand Up @@ -56,7 +57,7 @@ public final void unread() {
--curPos;
}

public final long csvRead(StringBuilder sb) {
public final long csvRead(CharBuffer sb) {
while(curBuffer != null) {
for(; curPos < buflen; ++curPos) {
final char curChar = curBuffer[curPos];
Expand Down Expand Up @@ -84,7 +85,7 @@ public final long csvRead(StringBuilder sb) {
return EOF;
}

public final long csvReadQuote(StringBuilder sb) throws EOFException {
public final long csvReadQuote(CharBuffer sb) throws EOFException {
while(curBuffer != null) {
for(; curPos < buflen; ++curPos) {
final char curChar = curBuffer[curPos];
Expand Down
37 changes: 27 additions & 10 deletions src/tech/v3/datatype/char_input.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
[tech.v3.parallel.queue-iter :as queue-iter]
[com.github.ztellman.primitive-math :as pmath]
[clojure.set :as set])
(:import [tech.v3.datatype CharReader UnaryPredicate UnaryPredicates$LongUnaryPredicate]
(:import [tech.v3.datatype CharReader UnaryPredicate UnaryPredicates$LongUnaryPredicate
CharBuffer]
[java.io Reader StringReader]
[java.util Iterator Arrays ArrayList List NoSuchElementException]
[java.lang AutoCloseable]
Expand Down Expand Up @@ -97,7 +98,12 @@
(char? v)
v
(string? v)
(first v)
(do
(when-not (== 1 (.length ^String v))
(throw (Exception.
(format "Only single character separators allowed: - \"%s\""
v))))
(first v))
(number? v)
(unchecked-char v)))

Expand All @@ -112,10 +118,12 @@
[[reader->char-buf-iter]].
* `:async?` - default to true - reads the reader in an offline thread into character
buffers."
buffers.
* `:trim-leading-whitespace?` - When true, leading spaces and tabs are ignored. Defaults to true."
^CharReader [rdr & [options]]
(let [quote (->character (get options :quote \"))
separator (->character (get options :separator \,))
trim-leading? (get options :trim-leading-whitespace? true)
async? (get options :async? true)
options (if async?
(assoc options
Expand All @@ -142,7 +150,8 @@
(or (nil? data)
(== 0 (.size data))
(and (== 1 (.size data))
(.equals "" (.get data 0)))))
(or (nil? (.get data 0))
(.equals "" (.get data 0))))))

(def ^{:private true
:tag UnaryPredicate}
Expand All @@ -153,7 +162,7 @@


(defn- read-row
^RowRecord [^CharReader rdr ^StringBuilder sb ^ArrayList row
^RowRecord [^CharReader rdr ^CharBuffer sb ^ArrayList row
^UnaryPredicate filter-fn]
(.clear row)
(let [tag (long (loop [tag (.csvRead rdr sb)
Expand All @@ -164,7 +173,7 @@
(when (== tag SEP)
(when (.unaryLong filter-fn col-idx)
(.add row (.toString sb)))
(.delete sb 0 (.length sb)))
(.clear sb))
(recur (long (if (== tag SEP)
(.csvRead rdr sb)
(.csvReadQuote rdr sb)))
Expand All @@ -174,7 +183,7 @@
(do
(when (.unaryLong filter-fn col-idx)
(.add row (.toString sb)))
(.delete sb 0 (.length sb))
(.clear sb)
tag))))
new-row (.clone row)]
(.clear row)
Expand All @@ -184,7 +193,7 @@


(deftype ^:private CSVReadIter [^CharReader rdr
^StringBuilder sb
^CharBuffer sb
^ArrayList row-builder
^{:unsynchronized-mutable true
:tag RowRecord} cur-row
Expand Down Expand Up @@ -228,11 +237,19 @@
* `:close-reader?` - Close the reader when iteration is finished - defaults to true.
* `:column-whitelist` - Sequence of allowed column names.
* `:column-blacklist` - Sequence of dis-allowed column names. When conflicts with
`:column-whitelist` then `:column-whitelist` wins."
`:column-whitelist` then `:column-whitelist` wins.
* `:trim-leading-whitespace?` - When true, leading spaces and tabs are ignored. Defaults to true.
* `:trim-trailing-whitespace?` - When true, trailing spaces and tabs are ignored. Defaults
to true.
* `:nil-empty-values?` - When true, empty strings are elided entirely and returned as nil
values. Defaults to true."
^Iterator [input & [options]]
(let [rdr (reader->char-reader input options)
sb (StringBuilder.)
sb (CharBuffer. (get options :trim-leading-whitespace? true)
(get options :trim-trailing-whitespace? true)
(get options :nil-empty-values? true))
row (ArrayList.)
nil-empty? (get options :nil-empty-values? true)
next-row (read-row rdr sb row true-unary-predicate)
^RoaringBitmap column-whitelist
(when (or (contains? options :column-whitelist)
Expand Down
11 changes: 11 additions & 0 deletions test/data/datatype_parser.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
id, char, word, bool, boolstr, boolean
1, t, true, true, true, t
2, f, False, true, true, y
3, y, YES, false, false, n
4, n, NO, false, false, f
5, T, positive, true, true, true
6, F, negative, false, false, false
7, Y, yep, true, true, positive
8, N, not, false, false, negative
9, A, pos, false, False, negative
10, z, neg, false, false, negative
10 changes: 9 additions & 1 deletion test/tech/v3/datatype/char_input_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
(deftest funky-csv
(is (= [["a,b" "c\"def\"" "one,two" "\"a,b,c\"def\"g\""]
["abba" "def" "1" "2"]
["df" "ef" "5" ""]]
["df" "ef" "5" nil]]
(-> (read-csv (java.io.File. "test/data/funky.csv"))
(iterator-seq)
(vec)))))
Expand Down Expand Up @@ -79,3 +79,11 @@ air, moon roof, loaded\",4799.00")
(is (= 2 (count csv)))
(is (= ["Year" "Make" "Model"] (first csv)))
(is (= ["1997" "Ford" "E350"] (second csv)))))


(deftest trim-leading
(let [header (first (read-csv-compat (java.io.File. "test/data/datatype_parser.csv")))]
(is (= "word" (header 2))))
(let [header (first (read-csv-compat (java.io.File. "test/data/datatype_parser.csv")
:trim-leading-whitespace? false))]
(is (= " word" (header 2)))))

0 comments on commit bf87382

Please sign in to comment.