Skip to content

Commit

Permalink
Use Java code to parse timestamp for performance.
Browse files Browse the repository at this point in the history
The JRuby-based TimestampParser is slower than a pure-Java
implementation. However, SimpleDateFormat cannot handle every Ruby
pattern. So this change checks the configured format: if every Ruby
pattern in it (such as %xx) can be converted to a SimpleDateFormat
pattern, the patterns are replaced and the format is parsed in Java.
Otherwise, TimestampParser is used.

TimestampParserFactory.java chooses between TimestampParser and
JavaTimestampParser based on the format. If the configured format does
not contain a '%' character, JavaTimestampParser is used. If '%' is used
and all of the patterns can be replaced with SimpleDateFormat patterns,
JavaTimestampParser is used as well. Otherwise, TimestampParser is used.

JavaTimestampParser uses SimpleDateFormat to parse timestamps. It may
need a locale to parse text fields such as the 'MMMM' pattern
(e.g. April), so a locale parameter was added to
TimestampParser.ParserTask.

CsvParserPlugin uses TimestampParserFactory to get a TimestampParser
instance.

Note:
- It may be better to make TimestampParserFactory configurable with a
  java, jruby, or auto parameter so that users can roll back to the
  JRuby implementation.
- This change adds support for SimpleDateFormat patterns, which may make
  the configuration more complicated from the user's point of view.
- This does not improve performance for patterns that SimpleDateFormat
  does not support.
  • Loading branch information
hata committed Apr 28, 2015
1 parent 44af8b7 commit 26ea959
Show file tree
Hide file tree
Showing 8 changed files with 530 additions and 2 deletions.
@@ -0,0 +1,74 @@
package org.embulk.spi.time;

import java.text.AttributedCharacterIterator;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

/**
 * A {@link TimestampParser} that parses text with {@link SimpleDateFormat}
 * instead of the parent implementation.
 *
 * When the pattern contains no time zone field, the parser's default time
 * zone is applied to the underlying formatter.
 */
class JavaTimestampParser extends TimestampParser
{
    /** Separates language, country and variant in a locale string. */
    private static final char LOCALE_SEPARATOR = '_';

    private final SimpleDateFormat dateFormat;
    private final boolean timeZoneParsed;

    /**
     * Converts text of the form {@code language[_country[_variant]]} into a
     * {@link Locale}.
     *
     * Only the first two separators split the text; anything after the
     * second separator (including further separators) becomes the variant.
     *
     * @param localeText the locale text, may be null or empty
     * @return the locale, or null when {@code localeText} is null or empty
     */
    static Locale toLocale(String localeText)
    {
        if (localeText == null || localeText.isEmpty()) {
            return null;
        }

        final int firstSeparator = localeText.indexOf(LOCALE_SEPARATOR);
        if (firstSeparator < 0) {
            // Language only.
            return new Locale(localeText);
        }

        final String language = localeText.substring(0, firstSeparator);
        final String remainder = localeText.substring(firstSeparator + 1);

        final int secondSeparator = remainder.indexOf(LOCALE_SEPARATOR);
        if (secondSeparator < 0) {
            // Language and country.
            return new Locale(language, remainder);
        }

        final String country = remainder.substring(0, secondSeparator);
        final String variant = remainder.substring(secondSeparator + 1);
        return new Locale(language, country, variant);
    }

    /**
     * Creates a parser for a SimpleDateFormat pattern.
     *
     * @param javaFormatText the SimpleDateFormat pattern
     * @param task supplies the locale text; an absent locale means the
     *             single-argument SimpleDateFormat constructor is used
     */
    public JavaTimestampParser(String javaFormatText, ParserTask task)
    {
        super(javaFormatText, task);

        final Locale locale = toLocale(task.getLocale());
        if (locale != null) {
            dateFormat = new SimpleDateFormat(javaFormatText, locale);
        } else {
            dateFormat = new SimpleDateFormat(javaFormatText);
        }

        timeZoneParsed = isTimeZoneParsed();
        if (!timeZoneParsed) {
            // The pattern carries no zone information, so fall back to the
            // parser's default time zone.
            dateFormat.setTimeZone(getDefaultTimeZone().toTimeZone());
        }
    }

    /**
     * Parses {@code text} with the configured pattern.
     *
     * @param text the text to parse
     * @return the parsed instant with millisecond precision
     * @throws TimestampParseException wrapping the {@link ParseException}
     *         raised by SimpleDateFormat
     */
    @Override
    public Timestamp parse(String text) throws TimestampParseException
    {
        final Date parsed;
        try {
            parsed = dateFormat.parse(text);
        } catch (ParseException e) {
            throw new TimestampParseException(e);
        }
        return Timestamp.ofEpochMilli(parsed.getTime());
    }

    /**
     * @return the pattern string reported by the underlying SimpleDateFormat
     */
    public String getFormat()
    {
        return dateFormat.toPattern();
    }

    /**
     * Tells whether the pattern contains a time zone field.
     *
     * The check formats the current date and looks for a
     * {@link DateFormat.Field#TIME_ZONE} attribute in the result.
     *
     * @return true when the pattern has a time zone field
     */
    boolean isTimeZoneParsed()
    {
        final Date now = new Date();
        final AttributedCharacterIterator formatted = dateFormat.formatToCharacterIterator(now);
        return formatted.getAllAttributeKeys().contains(DateFormat.Field.TIME_ZONE);
    }
}
Expand Up @@ -33,7 +33,7 @@ public TimestampFormatter newFormatter(TimestampFormatter.FormatterTask task)

/**
 * Creates a timestamp parser for this object's {@code format}.
 *
 * The parser is obtained from a {@link TimestampParserFactory} built with
 * its no-argument constructor, so the factory decides which parser
 * implementation handles the format.
 *
 * @param task parser settings handed through to the factory
 * @return the parser created by the factory
 */
public TimestampParser newParser(TimestampParser.ParserTask task)
{
    // Only one return may remain: a second statement after a return is
    // unreachable and does not compile.
    return new TimestampParserFactory().newInstance(format, task);
}

private static Set<String> availableTimeZoneNames = ImmutableSet.copyOf(DateTimeZone.getAvailableIDs());
Expand Down
Expand Up @@ -3,4 +3,13 @@
/**
 * Checked exception thrown when text cannot be parsed as a timestamp.
 */
public class TimestampParseException
        extends Exception
{
    /**
     * Creates an exception with no detail message and no cause.
     */
    public TimestampParseException()
    {
        // The superclass no-argument constructor is invoked implicitly.
    }

    /**
     * Creates an exception that wraps the underlying failure.
     *
     * @param t the cause; its string form becomes the detail message
     */
    public TimestampParseException(Throwable t)
    {
        super(t);
    }
}
Expand Up @@ -18,6 +18,10 @@ public interface ParserTask
@ConfigDefault("\"UTC\"")
public DateTimeZone getDefaultTimeZone();

/**
 * Locale text read from the "locale" config key; the default is an empty
 * string.
 *
 * JavaTimestampParser.toLocale interprets the value as
 * language[_country[_variant]] (for example "ja_JP") and treats an empty
 * value as "no locale given".
 */
@Config("locale")
@ConfigDefault("\"\"")
public String getLocale();

// NOTE(review): annotated with @ConfigInject and has no @Config key, so this
// is presumably supplied by the framework rather than by user configuration
// - confirm.
@ConfigInject
public ScriptingContainer getJRuby();
}
Expand Down Expand Up @@ -59,4 +63,9 @@ public Timestamp parse(String text) throws TimestampParseException

return Timestamp.ofEpochSecond(sec, usec * 1000);
}

/**
 * Exposes this parser's {@code defaultTimeZone} field to classes in the
 * same package.
 *
 * @return the value of the {@code defaultTimeZone} field
 */
DateTimeZone getDefaultTimeZone()
{
    return this.defaultTimeZone;
}
}
@@ -0,0 +1,132 @@
package org.embulk.spi.time;

import org.embulk.spi.time.TimestampParser.ParserTask;

/**
 * Chooses between the SimpleDateFormat-based {@link JavaTimestampParser} and
 * the original {@link TimestampParser} for a given format string.
 *
 * In {@link ParserType#AUTO} mode a format without any '%' is treated as a
 * SimpleDateFormat pattern, and a Ruby-style format is converted to a
 * SimpleDateFormat pattern when every directive in it is supported.
 * Anything else falls back to {@link TimestampParser}.
 */
public class TimestampParserFactory
{
    /** Selects which parser implementation the factory may create. */
    public enum ParserType
    {
        /** Use the Java parser when the format allows it, else TimestampParser. */
        AUTO,
        /** Always use JavaTimestampParser; the format is a SimpleDateFormat pattern. */
        JAVA,
        /** Always use TimestampParser. */
        JRUBY
    }

    private static final char DATE_FORMAT_QUOTE_CHAR = '\'';

    /** Indexed by the directive character that follows '%'; null means unsupported. */
    private static final String[] RUBY_TO_JAVA_FORMAT_TABLE = new String[128];

    // Note: Some patterns like %c, %C, %e, %j, %N, %t, %U, and so on are not handled.
    static
    {
        RUBY_TO_JAVA_FORMAT_TABLE['a'] = "EEE";
        RUBY_TO_JAVA_FORMAT_TABLE['A'] = "EEEE";
        RUBY_TO_JAVA_FORMAT_TABLE['b'] = "MMM";
        RUBY_TO_JAVA_FORMAT_TABLE['B'] = "MMMM";
        RUBY_TO_JAVA_FORMAT_TABLE['d'] = "dd";
        RUBY_TO_JAVA_FORMAT_TABLE['D'] = "MM/dd/yy";
        RUBY_TO_JAVA_FORMAT_TABLE['F'] = "yyyy-MM-dd";
        RUBY_TO_JAVA_FORMAT_TABLE['h'] = "MMM";
        RUBY_TO_JAVA_FORMAT_TABLE['H'] = "HH";
        RUBY_TO_JAVA_FORMAT_TABLE['I'] = "hh";
        RUBY_TO_JAVA_FORMAT_TABLE['m'] = "MM";
        RUBY_TO_JAVA_FORMAT_TABLE['L'] = "SSS";
        RUBY_TO_JAVA_FORMAT_TABLE['M'] = "mm";
        RUBY_TO_JAVA_FORMAT_TABLE['p'] = "a";
        RUBY_TO_JAVA_FORMAT_TABLE['P'] = "a";
        RUBY_TO_JAVA_FORMAT_TABLE['S'] = "ss";
        RUBY_TO_JAVA_FORMAT_TABLE['T'] = "HH:mm:ss";
        RUBY_TO_JAVA_FORMAT_TABLE['x'] = "MM/dd/yy";
        RUBY_TO_JAVA_FORMAT_TABLE['X'] = "HH:mm:ss";
        RUBY_TO_JAVA_FORMAT_TABLE['y'] = "yy";
        RUBY_TO_JAVA_FORMAT_TABLE['Y'] = "yyyy";
        RUBY_TO_JAVA_FORMAT_TABLE['z'] = "Z";
        RUBY_TO_JAVA_FORMAT_TABLE['Z'] = "z";
        RUBY_TO_JAVA_FORMAT_TABLE['%'] = "%";
    }

    private final ParserType parserType;

    /** Creates a factory in {@link ParserType#AUTO} mode. */
    public TimestampParserFactory()
    {
        this(ParserType.AUTO);
    }

    /**
     * Creates a factory with an explicit parser selection mode.
     *
     * @param parserType the selection mode
     */
    public TimestampParserFactory(ParserType parserType)
    {
        this.parserType = parserType;
    }

    /**
     * Creates a parser for {@code format} according to the factory's mode.
     *
     * @param format a Ruby-style format or a SimpleDateFormat pattern
     * @param task parser settings handed to the created parser
     * @return a JavaTimestampParser when the mode and format allow it,
     *         otherwise a TimestampParser
     */
    public TimestampParser newInstance(String format, ParserTask task)
    {
        switch (parserType) {
        case JAVA:
            return new JavaTimestampParser(format, task);
        case JRUBY:
            return new TimestampParser(format, task);
        default: // AUTO
            if (format == null) {
                // Nothing to inspect or convert; hand the value to the
                // original parser instead of failing while converting it.
                return new TimestampParser(format, task);
            }
            if (format.indexOf('%') == -1) {
                return new JavaTimestampParser(format, task);
            }
            String javaFormat = toJavaFormat(format);
            return javaFormat != null ?
                    new JavaTimestampParser(javaFormat, task) :
                    new TimestampParser(format, task);
        }
    }

    /**
     * Converts a Ruby-style format into a SimpleDateFormat pattern.
     *
     * Literal ASCII letters are wrapped in single quotes, a literal single
     * quote is written as two single quotes, and each supported
     * {@code %x} directive is replaced by its SimpleDateFormat equivalent.
     *
     * @param format the Ruby-style format
     * @return the converted pattern, or null when {@code format} is null or
     *         contains a directive that has no entry in the conversion table
     */
    String toJavaFormat(String format)
    {
        if (format == null) {
            return null;
        }

        StringBuilder builder = new StringBuilder();
        int formatLen = format.length();
        boolean quoted = false;

        for (int i = 0; i < formatLen; i++) {
            char c = format.charAt(i);

            if (isJavaDateFormatQuoteChar(c)) {
                // SimpleDateFormat reads '' as a literal quote both inside
                // and outside a quoted section, so the quote state must not
                // change here. Closing the section first would produce three
                // quotes in a row, which SimpleDateFormat pairs up the wrong
                // way and leaves the following letters unquoted.
                builder.append("''");
                continue;
            }

            if (isJavaFormatReserved(c)) {
                // Letters are pattern characters in SimpleDateFormat, so
                // literal letters have to live inside a quoted section.
                if (!quoted) {
                    quoted = true;
                    builder.append(DATE_FORMAT_QUOTE_CHAR);
                }
                builder.append(c);
                continue;
            }

            if (quoted) {
                quoted = false;
                builder.append(DATE_FORMAT_QUOTE_CHAR);
            }

            if (c == '%' && (i + 1) < formatLen) {
                char directive = format.charAt(i + 1);
                String convertedFormat = directive < RUBY_TO_JAVA_FORMAT_TABLE.length ?
                        RUBY_TO_JAVA_FORMAT_TABLE[directive] : null;
                if (convertedFormat == null) {
                    // If there is unsupported format, then return null to use Ruby TimestampParser.
                    return null;
                }
                builder.append(convertedFormat);
                i++; // skip the directive character
            } else {
                builder.append(c);
            }
        }

        if (quoted) {
            builder.append(DATE_FORMAT_QUOTE_CHAR);
        }

        return builder.toString();
    }

    /** Returns true for ASCII letters, which SimpleDateFormat reserves as pattern characters. */
    private boolean isJavaFormatReserved(char c)
    {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    }

    /** Returns true for the single quote used by SimpleDateFormat to delimit literal text. */
    private boolean isJavaDateFormatQuoteChar(char c)
    {
        return c == DATE_FORMAT_QUOTE_CHAR;
    }
}
@@ -0,0 +1,133 @@
package org.embulk.spi.time;

import static org.junit.Assert.*;

import java.util.Locale;

import org.embulk.EmbulkTestRuntime;
import org.embulk.config.ConfigSource;
import org.embulk.spi.Exec;
import org.junit.Rule;
import org.junit.Test;

/**
 * Tests for {@link JavaTimestampParser}: parsing with and without a time
 * zone in the pattern, time zone field detection, locale text conversion,
 * and pattern reporting.
 */
public class TestJavaTimestampParser
{
    @Rule
    public EmbulkTestRuntime runtime = new EmbulkTestRuntime();

    @Test
    public void testParse() throws TimestampParseException
    {
        // NOTE: SimpleDateFormat cannot handle micro/nano seconds.
        JavaTimestampParser parser = new JavaTimestampParser("yyyy-MM-dd HH:mm:ss.SSS z", createTestTask());
        assertEquals("Verify milliseconds is supported.",
                Timestamp.ofEpochMilli(1416365189123L),
                parser.parse("2014-11-19 02:46:29.123 UTC"));
    }

    @Test
    public void testParseWithoutTimeZone() throws TimestampParseException
    {
        JavaTimestampParser parser = new JavaTimestampParser("yyyy-MM-dd HH:mm:ss.SSS", createTestTask("UTC"));
        assertEquals("Verify default timezone is used for no timezone format.",
                Timestamp.ofEpochMilli(1416365189123L),
                parser.parse("2014-11-19 02:46:29.123"));
    }

    @Test
    public void testIsTimeZoneParsed()
    {
        JavaTimestampParser parser = new JavaTimestampParser("z", createTestTask());
        assertTrue("Verify timezone is parsed", parser.isTimeZoneParsed());
    }

    @Test
    public void testIsTimeZoneParsedForRFC822()
    {
        JavaTimestampParser parser = new JavaTimestampParser("Z", createTestTask());
        assertTrue("Verify timezone is parsed", parser.isTimeZoneParsed());
    }

    @Test
    public void testIsTimeZoneParsedForNoTimezoneFormat()
    {
        JavaTimestampParser parser = new JavaTimestampParser("yyyy", createTestTask());
        // The message describes the expectation of assertFalse: no time zone field.
        assertFalse("Verify timezone is not parsed", parser.isTimeZoneParsed());
    }

    @Test
    public void testToLocaleLanguage()
    {
        Locale locale = JavaTimestampParser.toLocale("ja");
        assertEquals("Verify locale language", "ja", locale.getLanguage());
    }

    @Test
    public void testToLocaleCountry()
    {
        Locale locale = JavaTimestampParser.toLocale("_JP");
        assertEquals("Verify locale language", "", locale.getLanguage());
        assertEquals("Verify locale country", "JP", locale.getCountry());
    }

    @Test
    public void testToLocaleLanguageAndCountry()
    {
        Locale locale = JavaTimestampParser.toLocale("ja_JP");
        assertEquals("Verify locale language", "ja", locale.getLanguage());
        assertEquals("Verify locale country", "JP", locale.getCountry());
    }

    @Test
    public void testToLocaleLanguageAndCountryAndBlank()
    {
        // A trailing separator yields an empty variant.
        Locale locale = JavaTimestampParser.toLocale("ja_JP_");
        assertEquals("Verify locale language", "ja", locale.getLanguage());
        assertEquals("Verify locale country", "JP", locale.getCountry());
        assertEquals("Verify locale variant", "", locale.getVariant());
    }

    @Test
    public void testToLocaleVariant()
    {
        // Everything after the second separator, separators included, is the variant.
        Locale locale = JavaTimestampParser.toLocale("__Var_");
        assertEquals("Verify locale language", "", locale.getLanguage());
        assertEquals("Verify locale country", "", locale.getCountry());
        assertEquals("Verify locale variant", "Var_", locale.getVariant());
    }

    @Test
    public void testToLocaleLangCountryVariant()
    {
        Locale locale = JavaTimestampParser.toLocale("ja_JP_Var_");
        assertEquals("Verify locale language", "ja", locale.getLanguage());
        assertEquals("Verify locale country", "JP", locale.getCountry());
        assertEquals("Verify locale variant", "Var_", locale.getVariant());
    }

    @Test
    public void testNoLocale()
    {
        assertNull("Verify null is returned for null", JavaTimestampParser.toLocale(null));
        assertNull("Verify null is returned for zero length", JavaTimestampParser.toLocale(""));
    }

    @Test
    public void testGetFormat()
    {
        JavaTimestampParser parser = new JavaTimestampParser("yyyy", createTestTask());
        assertEquals("Verify format", "yyyy", parser.getFormat());
    }

    /** Builds a parser task from an empty config, so every setting takes its default. */
    private TimestampParser.ParserTask createTestTask()
    {
        ConfigSource config = Exec.newConfigSource();
        return config.loadConfig(TimestampParser.ParserTask.class);
    }

    /** Builds a parser task whose "default_timezone" is set to {@code timeZone}. */
    private TimestampParser.ParserTask createTestTask(String timeZone)
    {
        ConfigSource config = Exec.newConfigSource().set("default_timezone", timeZone);
        return config.loadConfig(TimestampParser.ParserTask.class);
    }
}

0 comments on commit 26ea959

Please sign in to comment.