Permalink
Browse files

JSON SerDe for Hive

This JSON SerDe is designed to allow querying of arbitrary JSON structures. It
supports all Hive types except for UNION.
  • Loading branch information...
1 parent 2b93087 commit ea95716cf22cb0e1bafade7ea3cf1fe44a7e6ebd Jon Natkins committed Aug 29, 2012
Showing with 481 additions and 0 deletions.
  1. +105 −0 hive-serdes/pom.xml
  2. +376 −0 hive-serdes/src/main/java/com/cloudera/hive/serde/JSONSerDe.java
View
105 hive-serdes/pom.xml
@@ -0,0 +1,105 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.cloudera.serde</groupId>
  <artifactId>hive-serdes</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>hive-serdes</name>
  <url>http://www.cloudera.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <eclipse.output.directory>eclipse-classes</eclipse.output.directory>
    <hive.version>0.8.1-cdh4.0.1</hive.version>
    <hadoop.version>2.0.0-cdh4.0.1</hadoop.version>
  </properties>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <version>2.9</version>
        <configuration>
          <buildOutputDirectory>eclipse-classes</buildOutputDirectory>
          <downloadSources>true</downloadSources>
          <downloadJavadocs>false</downloadJavadocs>
        </configuration>
      </plugin>

      <!-- Shade the jar so the Jackson dependency is bundled; Hive and
           Hadoop are 'provided' below and therefore excluded. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>1.7.1</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>

    <pluginManagement>
      <plugins>
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>2.3.2</version>
          <configuration>
            <source>1.6</source>
            <target>1.6</target>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8.2</version>
      <scope>test</scope>
    </dependency>

    <!-- JSON Parser -->
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-core-asl</artifactId>
      <version>1.9.8</version>
    </dependency>

    <!-- Hadoop Dependencies (supplied by the cluster at runtime) -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-serde</artifactId>
      <version>${hive.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
      <releases>
        <enabled>true</enabled>
      </releases>
      <snapshots>
        <enabled>false</enabled>
      </snapshots>
    </repository>
  </repositories>
</project>
View
376 hive-serdes/src/main/java/com/cloudera/hive/serde/JSONSerDe.java
@@ -0,0 +1,376 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.cloudera.hive.serde;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde.Constants;
+import org.apache.hadoop.hive.serde2.SerDe;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeStats;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.codehaus.jackson.map.ObjectMapper;
+
+/**
+ * This SerDe can be used for processing JSON data in Hive. It supports
+ * arbitrary JSON data, and can handle all Hive types except for UNION.
+ * However, the JSON data is expected to be a series of discrete records,
+ * rather than a JSON array of objects.
+ *
+ * The Hive table is expected to contain columns with names corresponding to
+ * fields in the JSON data, but it is not necessary for every JSON field to
+ * have a corresponding Hive column. Those JSON fields will be ignored during
+ * queries.
+ *
+ * Example:
+ *
+ * { "a": 1, "b": [ "str1", "str2" ], "c": { "field1": "val1" } }
+ *
+ * Could correspond to a table:
+ *
+ * CREATE TABLE foo (a INT, b ARRAY<STRING>, c STRUCT<field1:STRING>);
+ *
+ * JSON objects can also be interpreted as a Hive MAP type, so long as the keys
+ * and values in the JSON object are all of the appropriate types. For example,
+ * in the JSON above, another valid table declaration would be:
+ *
+ * CREATE TABLE foo (a INT, b ARRAY<STRING>, c MAP<STRING,STRING>);
+ *
+ * Only STRING keys are supported for Hive MAPs.
+ */
+public class JSONSerDe implements SerDe {
+
+ private StructTypeInfo rowTypeInfo;
+ private ObjectInspector rowOI;
+ private List<String> colNames;
+ private List<Object> row = new ArrayList<Object>();
+
+ /**
+ * An initialization function used to gather information about the table.
+ * Typically, a SerDe implementation will be interested in the list of
+ * column names and their types. That information will be used to help perform
+ * actual serialization and deserialization of data.
+ */
+ @Override
+ public void initialize(Configuration conf, Properties tbl)
+ throws SerDeException {
+ // Get a list of the table's column names.
+ String colNamesStr = tbl.getProperty(Constants.LIST_COLUMNS);
+ colNames = Arrays.asList(colNamesStr.split(","));
+
+ // Get a list of TypeInfos for the columns. This list lines up with
+ // the list of column names.
+ String colTypesStr = tbl.getProperty(Constants.LIST_COLUMN_TYPES);
+ List<TypeInfo> colTypes =
+ TypeInfoUtils.getTypeInfosFromTypeString(colTypesStr);
+
+ rowTypeInfo =
+ (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(colNames, colTypes);
+ rowOI =
+ TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo);
+ }
+
+ /**
+ * This method does the work of deserializing a record into Java objects that
+ * Hive can work with via the ObjectInspector interface. For this SerDe, the
+ * blob that is passed in is a JSON string, and the Jackson JSON parser is
+ * being used to translate the string into Java objects.
+ *
+ * The JSON deserialization works by taking the column names in the Hive
+ * table, and looking up those fields in the parsed JSON object. If the value
+ * of the field is not a primitive, the object is parsed further.
+ */
+ @Override
+ public Object deserialize(Writable blob) throws SerDeException {
+ Map<?,?> root = null;
+ row.clear();
+ try {
+ ObjectMapper mapper = new ObjectMapper();
+ // This is really a Map<String, Object>. For more information about how
+ // Jackson parses JSON in this example, see
+ // http://wiki.fasterxml.com/JacksonDataBinding
+ root = mapper.readValue(blob.toString(), Map.class);
+ } catch (Exception e) {
+ throw new SerDeException(e);
+ }
+
+ Object value= null;
+ for (String fieldName : rowTypeInfo.getAllStructFieldNames()) {
+ try {
+ TypeInfo fieldTypeInfo = rowTypeInfo.getStructFieldTypeInfo(fieldName);
+ value = parseField(root.get(fieldName), fieldTypeInfo);
+ } catch (Exception e) {
+ value = null;
+ }
+ row.add(value);
+ }
+ return row;
+ }
+
+ /**
+ * Parses a JSON object according to the Hive column's type.
+ *
+ * @param field - The JSON object to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return - The parsed value of the field
+ */
+ private Object parseField(Object field, TypeInfo fieldTypeInfo) {
+ switch (fieldTypeInfo.getCategory()) {
+ case PRIMITIVE:
+ // Jackson will return the right thing in this case, so just return
+ // the object
+ return field;
+ case LIST:
+ return parseList(field, (ListTypeInfo) fieldTypeInfo);
+ case MAP:
+ return parseMap(field, (MapTypeInfo) fieldTypeInfo);
+ case STRUCT:
+ return parseStruct(field, (StructTypeInfo) fieldTypeInfo);
+ case UNION:
+ // Unsupported by JSON
+ default:
+ return null;
+ }
+ }
+
+ /**
+ * Parses a JSON object and its fields. The Hive metadata is used to
+ * determine how to parse the object fields.
+ *
+ * @param field - The JSON object to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return - A map representing the object and its fields
+ */
+ private Object parseStruct(Object field, StructTypeInfo fieldTypeInfo) {
+ Map<Object,Object> map = (Map<Object,Object>)field;
+ ArrayList<TypeInfo> structTypes = fieldTypeInfo.getAllStructFieldTypeInfos();
+ ArrayList<String> structNames = fieldTypeInfo.getAllStructFieldNames();
+
+ List<Object> structRow = new ArrayList<Object>(structTypes.size());
+ for (int i = 0; i < structNames.size(); i++) {
+ structRow.add(parseField(map.get(structNames.get(i)), structTypes.get(i)));
+ }
+ return structRow;
+ }
+
+ /**
+ * Parse a JSON list and its elements. This uses the Hive metadata for the
+ * list elements to determine how to parse the elements.
+ *
+ * @param field - The JSON list to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return - A list of the parsed elements
+ */
+ private Object parseList(Object field, ListTypeInfo fieldTypeInfo) {
+ ArrayList<Object> list = (ArrayList<Object>) field;
+ TypeInfo elemTypeInfo = fieldTypeInfo.getListElementTypeInfo();
+
+ for (int i = 0; i < list.size(); i++) {
+ list.set(i, parseField(list.get(i), elemTypeInfo));
+ }
+
+ return list.toArray();
+ }
+
+ /**
+ * Parse a JSON object as a map. This uses the Hive metadata for the map
+ * values to determine how to parse the values. The map is assumed to have
+ * a string for a key.
+ *
+ * @param field - The JSON list to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return
+ */
+ private Object parseMap(Object field, MapTypeInfo fieldTypeInfo) {
+ Map<Object,Object> map = (Map<Object,Object>) field;
+ TypeInfo valueTypeInfo = fieldTypeInfo.getMapValueTypeInfo();
+
+ for (Map.Entry<Object,Object> entry : map.entrySet()) {
+ map.put(entry.getKey(), parseField(entry.getValue(), valueTypeInfo));
+ }
+ return map;
+ }
+
+ /**
+ * Return an ObjectInspector for the row of data
+ */
+ @Override
+ public ObjectInspector getObjectInspector() throws SerDeException {
+ return rowOI;
+ }
+
+ /**
+ * Unimplemented
+ */
+ @Override
+ public SerDeStats getSerDeStats() {
+ return null;
+ }
+
+ /**
+ * JSON is just a textual representation, so our serialized class
+ * is just Text.
+ */
+ @Override
+ public Class<? extends Writable> getSerializedClass() {
+ return Text.class;
+ }
+
+ /**
+ * This method takes an object representing a row of data from Hive, and uses
+ * the ObjectInspector to get the data for each column and serialize it. This
+ * implementation deparses the row into an object that Jackson can easily
+ * serialize into a JSON blob.
+ */
+ @Override
+ public Writable serialize(Object obj, ObjectInspector oi)
+ throws SerDeException {
+ Object deparsedObj = deparseRow(obj, oi);
+ ObjectMapper mapper = new ObjectMapper();
+ try {
+ // Let Jackson do the work of serializing the object
+ return new Text(mapper.writeValueAsString(deparsedObj));
+ } catch (Exception e) {
+ throw new SerDeException(e);
+ }
+ }
+
+ /**
+ * Deparse a Hive object into a Jackson-serializable object. This uses
+ * the ObjectInspector to extract the column data.
+ *
+ * @param obj - Hive object to deparse
+ * @param oi - ObjectInspector for the object
+ * @return - A deparsed object
+ */
+ private Object deparseObject(Object obj, ObjectInspector oi) {
+ switch (oi.getCategory()) {
+ case LIST:
+ return deparseList(obj, (ListObjectInspector)oi);
+ case MAP:
+ return deparseMap(obj, (MapObjectInspector)oi);
+ case PRIMITIVE:
+ return deparsePrimitive(obj, (PrimitiveObjectInspector)oi);
+ case STRUCT:
+ return deparseStruct(obj, (StructObjectInspector)oi, false);
+ case UNION:
+ // Unsupported by JSON
+ default:
+ return null;
+ }
+ }
+
+ /**
+ * Deparses a row of data. We have to treat this one differently from
+ * other structs, because the field names for the root object do not match
+ * the column names for the Hive table.
+ *
+ * @param obj - Object representing the top-level row
+ * @param structOI - ObjectInspector for the row
+ * @return - A deparsed row of data
+ */
+ private Object deparseRow(Object obj, ObjectInspector structOI) {
+ return deparseStruct(obj, (StructObjectInspector)structOI, true);
+ }
+
+ /**
+ * Deparses struct data into a serializable JSON object.
+ *
+ * @param obj - Hive struct data
+ * @param structOI - ObjectInspector for the struct
+ * @param isRow - Whether or not this struct represents a top-level row
+ * @return - A deparsed struct
+ */
+ private Object deparseStruct(Object obj,
+ StructObjectInspector structOI,
+ boolean isRow) {
+ Map<Object,Object> struct = new HashMap<Object,Object>();
+ List<? extends StructField> fields = structOI.getAllStructFieldRefs();
+ for (int i = 0; i < fields.size(); i++) {
+ StructField field = fields.get(i);
+ // The top-level row object is treated slightly differently from other
+ // structs, because the field names for the row do not correctly reflect
+ // the Hive column names. For lower-level structs, we can get the field
+ // name from the associated StructField object.
+ String fieldName = isRow ? colNames.get(i) : field.getFieldName();
+ ObjectInspector fieldOI = field.getFieldObjectInspector();
+ Object fieldObj = structOI.getStructFieldData(obj, field);
+ struct.put(fieldName, deparseObject(fieldObj, fieldOI));
+ }
+ return struct;
+ }
+
+ /**
+ * Deparses a primitive type.
+ *
+ * @param obj - Hive object to deparse
+ * @param oi - ObjectInspector for the object
+ * @return - A deparsed object
+ */
+ private Object deparsePrimitive(Object obj, PrimitiveObjectInspector primOI) {
+ return primOI.getPrimitiveJavaObject(obj);
+ }
+
+ private Object deparseMap(Object obj, MapObjectInspector mapOI) {
+ Map<Object,Object> map = new HashMap<Object,Object>();
+ ObjectInspector mapValOI = mapOI.getMapValueObjectInspector();
+ Map<?,?> fields = mapOI.getMap(obj);
+ for (Map.Entry<?,?> field : fields.entrySet()) {
+ Object fieldName = field.getKey();
+ Object fieldObj = field.getValue();
+ map.put(fieldName, deparseObject(fieldObj, mapValOI));
+ }
+ return map;
+ }
+
+ /**
+ * Deparses a list and its elements.
+ *
+ * @param obj - Hive object to deparse
+ * @param oi - ObjectInspector for the object
+ * @return - A deparsed object
+ */
+ private Object deparseList(Object obj, ListObjectInspector listOI) {
+ List<Object> list = new ArrayList<Object>();
+ List<?> field = listOI.getList(obj);
+ ObjectInspector elemOI = listOI.getListElementObjectInspector();
+ for (Object elem : field) {
+ list.add(deparseObject(elem, elemOI));
+ }
+ return list;
+ }
+}

0 comments on commit ea95716

Please sign in to comment.