Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use Java code to parse timestamp for performance.
The JRuby-based TimestampParser is slower than a pure-Java implementation, but Java's SimpleDateFormat class cannot handle every Ruby pattern. This change therefore inspects the configured format: if its Ruby-style directives (%xx) can all be converted to SimpleDateFormat patterns, the format is converted and parsed in Java; otherwise the existing TimestampParser is used. TimestampParserFactory.java chooses between TimestampParser and JavaTimestampParser based on the format. If the configured format contains no '%' character, JavaTimestampParser is used. If '%' is used and every directive can be replaced with a SimpleDateFormat pattern, JavaTimestampParser is used as well. Otherwise, TimestampParser is used. JavaTimestampParser uses SimpleDateFormat to parse timestamps. It may need a locale to parse text such as the 'MMMM' pattern (e.g. April), so a locale parameter was added to TimestampParser.ParserTask. CsvParserPlugin now uses TimestampParserFactory to obtain its TimestampParser instance. Notes: - It may be better to let users configure java, jruby, or auto for TimestampParserFactory, so that they can roll back to the JRuby implementation. - This change adds support for SimpleDateFormat patterns, which may make configuration more complicated from the user's point of view. - This does not improve performance for patterns that SimpleDateFormat does not support.
- Loading branch information
Showing
8 changed files
with
530 additions
and
2 deletions.
There are no files selected for viewing
74 changes: 74 additions & 0 deletions
74
embulk-core/src/main/java/org/embulk/spi/time/JavaTimestampParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
package org.embulk.spi.time; | ||
|
||
import java.text.AttributedCharacterIterator; | ||
import java.text.DateFormat; | ||
import java.text.ParseException; | ||
import java.text.SimpleDateFormat; | ||
import java.util.Date; | ||
import java.util.Locale; | ||
|
||
class JavaTimestampParser extends TimestampParser | ||
{ | ||
private static final char LOCALE_SEPARATOR = '_'; | ||
|
||
private final SimpleDateFormat dateFormat; | ||
private final boolean timeZoneParsed; | ||
|
||
static Locale toLocale(String localeText) | ||
{ | ||
if (localeText == null || localeText.length() == 0) { | ||
return null; | ||
} | ||
|
||
String language; | ||
int sepPos = localeText.indexOf(LOCALE_SEPARATOR); | ||
if (sepPos == -1) { | ||
return new Locale(localeText); | ||
} else { | ||
language = localeText.substring(0, sepPos); | ||
localeText = localeText.substring(sepPos + 1); | ||
} | ||
|
||
sepPos = localeText.indexOf(LOCALE_SEPARATOR); | ||
return sepPos == -1 ? | ||
new Locale(language, localeText) : | ||
new Locale(language, | ||
localeText.substring(0, sepPos), | ||
localeText.substring(sepPos + 1)); | ||
} | ||
|
||
public JavaTimestampParser(String javaFormatText, ParserTask task) | ||
{ | ||
super(javaFormatText, task); | ||
Locale locale = toLocale(task.getLocale()); | ||
dateFormat = locale == null ? | ||
new SimpleDateFormat(javaFormatText) : | ||
new SimpleDateFormat(javaFormatText, locale); | ||
timeZoneParsed = isTimeZoneParsed(); | ||
|
||
if (!timeZoneParsed) { | ||
dateFormat.setTimeZone(getDefaultTimeZone().toTimeZone()); | ||
} | ||
} | ||
|
||
@Override | ||
public Timestamp parse(String text) throws TimestampParseException | ||
{ | ||
try { | ||
return Timestamp.ofEpochMilli(dateFormat.parse(text).getTime()); | ||
} catch (ParseException e) { | ||
throw new TimestampParseException(e); | ||
} | ||
} | ||
|
||
public String getFormat() | ||
{ | ||
return dateFormat.toPattern(); | ||
} | ||
|
||
boolean isTimeZoneParsed() | ||
{ | ||
AttributedCharacterIterator it = dateFormat.formatToCharacterIterator(new Date()); | ||
return it.getAllAttributeKeys().contains(DateFormat.Field.TIME_ZONE); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
132 changes: 132 additions & 0 deletions
132
embulk-core/src/main/java/org/embulk/spi/time/TimestampParserFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
package org.embulk.spi.time; | ||
|
||
import org.embulk.spi.time.TimestampParser.ParserTask; | ||
|
||
public class TimestampParserFactory | ||
{ | ||
public enum ParserType | ||
{ | ||
AUTO, | ||
JAVA, | ||
JRUBY | ||
} | ||
|
||
private static final char DATE_FORMAT_QUOTE_CHAR = '\''; | ||
private static final String[] RUBY_TO_JAVA_FORMAT_TABLE = new String[128]; | ||
|
||
// Note: Some patterns like %c, %C, %e, %j, %N, %t, %U, and so on are not handled. | ||
static | ||
{ | ||
RUBY_TO_JAVA_FORMAT_TABLE['a'] = "EEE"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['A'] = "EEEE"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['b'] = "MMM"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['B'] = "MMMM"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['d'] = "dd"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['D'] = "MM/dd/yy"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['F'] = "yyyy-MM-dd"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['h'] = "MMM"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['H'] = "HH"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['I'] = "hh"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['m'] = "MM"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['L'] = "SSS"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['M'] = "mm"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['p'] = "a"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['P'] = "a"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['S'] = "ss"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['T'] = "HH:mm:ss"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['x'] = "MM/dd/yy"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['X'] = "HH:mm:ss"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['y'] = "yy"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['Y'] = "yyyy"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['z'] = "Z"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['Z'] = "z"; | ||
RUBY_TO_JAVA_FORMAT_TABLE['%'] = "%"; | ||
} | ||
|
||
private final ParserType parserType; | ||
|
||
public TimestampParserFactory() | ||
{ | ||
this(ParserType.AUTO); | ||
} | ||
|
||
public TimestampParserFactory(ParserType parserType) | ||
{ | ||
this.parserType = parserType; | ||
} | ||
|
||
public TimestampParser newInstance(String format, ParserTask task) | ||
{ | ||
String javaFormat; | ||
|
||
switch(parserType) { | ||
case JAVA: | ||
return new JavaTimestampParser(format, task); | ||
case JRUBY: | ||
return new TimestampParser(format, task); | ||
default: // AUTO | ||
if (format != null && format.indexOf('%') == -1) { | ||
return new JavaTimestampParser(format, task); | ||
} else { | ||
javaFormat = toJavaFormat(format); | ||
return javaFormat != null ? | ||
new JavaTimestampParser(javaFormat, task) : | ||
new TimestampParser(format, task); | ||
} | ||
} | ||
} | ||
|
||
String toJavaFormat(String format) | ||
{ | ||
StringBuilder builder = new StringBuilder(); | ||
int formatLen = format.length(); | ||
boolean quoted = false; | ||
|
||
for (int i = 0;i < formatLen;i++) { | ||
char c = format.charAt(i); | ||
if (isJavaFormatReserved(c)) { | ||
if (!quoted) { | ||
quoted = true; | ||
builder.append(DATE_FORMAT_QUOTE_CHAR); | ||
} | ||
builder.append(c); | ||
} else { | ||
if (quoted) { | ||
quoted = false; | ||
builder.append(DATE_FORMAT_QUOTE_CHAR); | ||
} | ||
if (c == '%' && (i + 1) < formatLen) { | ||
c = format.charAt(i + 1); | ||
String convertedFormat = c < RUBY_TO_JAVA_FORMAT_TABLE.length ? | ||
RUBY_TO_JAVA_FORMAT_TABLE[c] : null; | ||
if (convertedFormat != null) { | ||
builder.append(convertedFormat); | ||
i++; | ||
} else { | ||
// If there is unsupported format, then return null to use Ruby TimestampParser. | ||
return null; | ||
} | ||
} else if (isJavaDateFormatQuoteChar(c)) { | ||
builder.append("''"); | ||
} else { | ||
builder.append(c); | ||
} | ||
} | ||
} | ||
if (quoted) { | ||
builder.append(DATE_FORMAT_QUOTE_CHAR); | ||
} | ||
|
||
return builder.toString(); | ||
} | ||
|
||
private boolean isJavaFormatReserved(char c) | ||
{ | ||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); | ||
} | ||
|
||
private boolean isJavaDateFormatQuoteChar(char c) | ||
{ | ||
return c == DATE_FORMAT_QUOTE_CHAR; | ||
} | ||
} |
133 changes: 133 additions & 0 deletions
133
embulk-core/src/test/java/org/embulk/spi/time/TestJavaTimestampParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
package org.embulk.spi.time; | ||
|
||
import static org.junit.Assert.*; | ||
|
||
import java.util.Locale; | ||
|
||
import org.embulk.EmbulkTestRuntime; | ||
import org.embulk.config.ConfigSource; | ||
import org.embulk.spi.Exec; | ||
import org.junit.Rule; | ||
import org.junit.Test; | ||
|
||
public class TestJavaTimestampParser | ||
{ | ||
@Rule | ||
public EmbulkTestRuntime runtime = new EmbulkTestRuntime(); | ||
|
||
@Test | ||
public void testParse() throws TimestampParseException | ||
{ | ||
// NOTE: SimpleDateFormat cannot handle micro/nano seconds. | ||
JavaTimestampParser parser = new JavaTimestampParser("yyyy-MM-dd HH:mm:ss.SSS z", createTestTask()); | ||
assertEquals("Verify milliseconds is supported.", | ||
Timestamp.ofEpochMilli(1416365189123L), | ||
parser.parse("2014-11-19 02:46:29.123 UTC")); | ||
} | ||
|
||
@Test | ||
public void testParseWithoutTimeZone() throws TimestampParseException | ||
{ | ||
JavaTimestampParser parser = new JavaTimestampParser("yyyy-MM-dd HH:mm:ss.SSS", createTestTask("UTC")); | ||
assertEquals("Verify default timezone is used for no timezone format.", | ||
Timestamp.ofEpochMilli(1416365189123L), | ||
parser.parse("2014-11-19 02:46:29.123")); | ||
} | ||
|
||
@Test | ||
public void testIsTimeZoneParsed() | ||
{ | ||
JavaTimestampParser parser = new JavaTimestampParser("z", createTestTask()); | ||
assertTrue("Verify timezone is parsed", parser.isTimeZoneParsed()); | ||
} | ||
|
||
@Test | ||
public void testIsTimeZoneParsedForRFC822() | ||
{ | ||
JavaTimestampParser parser = new JavaTimestampParser("Z", createTestTask()); | ||
assertTrue("Verify timezone is parsed", parser.isTimeZoneParsed()); | ||
} | ||
|
||
@Test | ||
public void testIsTimeZoneParsedForNoTimezoneFormat() | ||
{ | ||
JavaTimestampParser parser = new JavaTimestampParser("yyyy", createTestTask()); | ||
assertFalse("Verify timezone is parsed", parser.isTimeZoneParsed()); | ||
} | ||
|
||
@Test | ||
public void testToLocaleLanguage() | ||
{ | ||
Locale locale = JavaTimestampParser.toLocale("ja"); | ||
assertEquals("Verify locale language", "ja", locale.getLanguage()); | ||
} | ||
|
||
@Test | ||
public void testToLocaleCountry() | ||
{ | ||
Locale locale = JavaTimestampParser.toLocale("_JP"); | ||
assertEquals("Verify locale language", "", locale.getLanguage()); | ||
assertEquals("Verify locale country", "JP", locale.getCountry()); | ||
} | ||
|
||
@Test | ||
public void testToLocaleLanguageAndCountry() | ||
{ | ||
Locale locale = JavaTimestampParser.toLocale("ja_JP"); | ||
assertEquals("Verify locale language", "ja", locale.getLanguage()); | ||
assertEquals("Verify locale country", "JP", locale.getCountry()); | ||
} | ||
|
||
@Test | ||
public void testToLocaleLanguageAndCountryAndBlank() | ||
{ | ||
Locale locale = JavaTimestampParser.toLocale("ja_JP_"); | ||
assertEquals("Verify locale language", "ja", locale.getLanguage()); | ||
assertEquals("Verify locale country", "JP", locale.getCountry()); | ||
assertEquals("Verify locale country", "", locale.getVariant()); | ||
} | ||
|
||
@Test | ||
public void testToLocaleVariant() | ||
{ | ||
Locale locale = JavaTimestampParser.toLocale("__Var_"); | ||
assertEquals("Verify locale language", "", locale.getLanguage()); | ||
assertEquals("Verify locale country", "", locale.getCountry()); | ||
assertEquals("Verify locale variant", "Var_", locale.getVariant()); | ||
} | ||
|
||
@Test | ||
public void testToLocaleLangCountryVariant() | ||
{ | ||
Locale locale = JavaTimestampParser.toLocale("ja_JP_Var_"); | ||
assertEquals("Verify locale language", "ja", locale.getLanguage()); | ||
assertEquals("Verify locale country", "JP", locale.getCountry()); | ||
assertEquals("Verify locale variant", "Var_", locale.getVariant()); | ||
} | ||
|
||
@Test | ||
public void testNoLocale() | ||
{ | ||
assertNull("Verify null is returned for null", JavaTimestampParser.toLocale(null)); | ||
assertNull("Verify null is returned for zero length", JavaTimestampParser.toLocale("")); | ||
} | ||
|
||
@Test | ||
public void testGetFormat() | ||
{ | ||
JavaTimestampParser parser = new JavaTimestampParser("yyyy", createTestTask()); | ||
assertEquals("Verify format", "yyyy", parser.getFormat()); | ||
} | ||
|
||
private TimestampParser.ParserTask createTestTask() | ||
{ | ||
ConfigSource config = Exec.newConfigSource(); | ||
return config.loadConfig(TimestampParser.ParserTask.class); | ||
} | ||
|
||
private TimestampParser.ParserTask createTestTask(String timeZone) | ||
{ | ||
ConfigSource config = Exec.newConfigSource().set("default_timezone", timeZone); | ||
return config.loadConfig(TimestampParser.ParserTask.class); | ||
} | ||
} |
Oops, something went wrong.