Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

JSON SerDe for Hive

This JSON SerDe is designed to allow querying of arbitrary JSON structures. It
supports all Hive types except for UNION.
  • Loading branch information...
commit ea95716cf22cb0e1bafade7ea3cf1fe44a7e6ebd 1 parent 2b93087
Jon Natkins authored
105 hive-serdes/pom.xml
View
@@ -0,0 +1,105 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.cloudera.serde</groupId>
  <artifactId>hive-serdes</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>hive-serdes</name>
  <url>http://www.cloudera.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Keeps Eclipse-compiled classes out of Maven's target/ directory. -->
    <eclipse.output.directory>eclipse-classes</eclipse.output.directory>
    <!-- CDH4-aligned versions; both artifacts resolve from the Cloudera repo below. -->
    <hive.version>0.8.1-cdh4.0.1</hive.version>
    <hadoop.version>2.0.0-cdh4.0.1</hadoop.version>
  </properties>

  <build>
    <plugins>
      <!-- Generates Eclipse project metadata (mvn eclipse:eclipse). -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <version>2.9</version>
        <configuration>
          <buildOutputDirectory>eclipse-classes</buildOutputDirectory>
          <downloadSources>true</downloadSources>
          <downloadJavadocs>false</downloadJavadocs>
        </configuration>
      </plugin>

      <!-- Builds an uber-jar at package time so the SerDe jar bundles its
           non-provided dependencies (i.e. Jackson) and can be ADD JAR'd
           into Hive as a single file. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>1.7.1</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>

    <pluginManagement>
      <plugins>
        <!-- Pin source/target to Java 6, matching the CDH4 runtime. -->
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>2.3.2</version>
          <configuration>
            <source>1.6</source>
            <target>1.6</target>
          </configuration>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8.2</version>
      <scope>test</scope>
    </dependency>

    <!-- JSON Parser; compile scope so the shade plugin bundles it. -->
    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-core-asl</artifactId>
      <version>1.9.8</version>
    </dependency>

    <!-- Hadoop Dependencies; provided scope because Hive supplies these
         classes at runtime — they must not be shaded into the jar. -->
    <dependency>
      <groupId>org.apache.hive</groupId>
      <artifactId>hive-serde</artifactId>
      <version>${hive.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
  </dependencies>

  <!-- CDH artifacts are not in Maven Central; releases only. -->
  <repositories>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
      <releases>
        <enabled>true</enabled>
      </releases>
      <snapshots>
        <enabled>false</enabled>
      </snapshots>
    </repository>
  </repositories>
</project>
376 hive-serdes/src/main/java/com/cloudera/hive/serde/JSONSerDe.java
View
@@ -0,0 +1,376 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.cloudera.hive.serde;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.serde.Constants;
+import org.apache.hadoop.hive.serde2.SerDe;
+import org.apache.hadoop.hive.serde2.SerDeException;
+import org.apache.hadoop.hive.serde2.SerDeStats;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.codehaus.jackson.map.ObjectMapper;
+
+/**
+ * This SerDe can be used for processing JSON data in Hive. It supports
+ * arbitrary JSON data, and can handle all Hive types except for UNION.
+ * However, the JSON data is expected to be a series of discrete records,
+ * rather than a JSON array of objects.
+ *
+ * The Hive table is expected to contain columns with names corresponding to
+ * fields in the JSON data, but it is not necessary for every JSON field to
+ * have a corresponding Hive column. Those JSON fields will be ignored during
+ * queries.
+ *
+ * Example:
+ *
+ * { "a": 1, "b": [ "str1", "str2" ], "c": { "field1": "val1" } }
+ *
+ * Could correspond to a table:
+ *
+ * CREATE TABLE foo (a INT, b ARRAY<STRING>, c STRUCT<field1:STRING>);
+ *
+ * JSON objects can also interpreted as a Hive MAP type, so long as the keys
+ * and values in the JSON object are all of the appropriate types. For example,
+ * in the JSON above, another valid table declaraction would be:
+ *
+ * CREATE TABLE foo (a INT, b ARRAY<STRING>, c MAP<STRING,STRING>);
+ *
+ * Only STRING keys are supported for Hive MAPs.
+ */
+public class JSONSerDe implements SerDe {
+
+ private StructTypeInfo rowTypeInfo;
+ private ObjectInspector rowOI;
+ private List<String> colNames;
+ private List<Object> row = new ArrayList<Object>();
+
+ /**
+ * An initialization function used to gather information about the table.
+ * Typically, a SerDe implementation will be interested in the list of
+ * column names and their types. That information will be used to help perform
+ * actual serialization and deserialization of data.
+ */
+ @Override
+ public void initialize(Configuration conf, Properties tbl)
+ throws SerDeException {
+ // Get a list of the table's column names.
+ String colNamesStr = tbl.getProperty(Constants.LIST_COLUMNS);
+ colNames = Arrays.asList(colNamesStr.split(","));
+
+ // Get a list of TypeInfos for the columns. This list lines up with
+ // the list of column names.
+ String colTypesStr = tbl.getProperty(Constants.LIST_COLUMN_TYPES);
+ List<TypeInfo> colTypes =
+ TypeInfoUtils.getTypeInfosFromTypeString(colTypesStr);
+
+ rowTypeInfo =
+ (StructTypeInfo) TypeInfoFactory.getStructTypeInfo(colNames, colTypes);
+ rowOI =
+ TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(rowTypeInfo);
+ }
+
+ /**
+ * This method does the work of deserializing a record into Java objects that
+ * Hive can work with via the ObjectInspector interface. For this SerDe, the
+ * blob that is passed in is a JSON string, and the Jackson JSON parser is
+ * being used to translate the string into Java objects.
+ *
+ * The JSON deserialization works by taking the column names in the Hive
+ * table, and looking up those fields in the parsed JSON object. If the value
+ * of the field is not a primitive, the object is parsed further.
+ */
+ @Override
+ public Object deserialize(Writable blob) throws SerDeException {
+ Map<?,?> root = null;
+ row.clear();
+ try {
+ ObjectMapper mapper = new ObjectMapper();
+ // This is really a Map<String, Object>. For more information about how
+ // Jackson parses JSON in this example, see
+ // http://wiki.fasterxml.com/JacksonDataBinding
+ root = mapper.readValue(blob.toString(), Map.class);
+ } catch (Exception e) {
+ throw new SerDeException(e);
+ }
+
+ Object value= null;
+ for (String fieldName : rowTypeInfo.getAllStructFieldNames()) {
+ try {
+ TypeInfo fieldTypeInfo = rowTypeInfo.getStructFieldTypeInfo(fieldName);
+ value = parseField(root.get(fieldName), fieldTypeInfo);
+ } catch (Exception e) {
+ value = null;
+ }
+ row.add(value);
+ }
+ return row;
+ }
+
+ /**
+ * Parses a JSON object according to the Hive column's type.
+ *
+ * @param field - The JSON object to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return - The parsed value of the field
+ */
+ private Object parseField(Object field, TypeInfo fieldTypeInfo) {
+ switch (fieldTypeInfo.getCategory()) {
+ case PRIMITIVE:
+ // Jackson will return the right thing in this case, so just return
+ // the object
+ return field;
+ case LIST:
+ return parseList(field, (ListTypeInfo) fieldTypeInfo);
+ case MAP:
+ return parseMap(field, (MapTypeInfo) fieldTypeInfo);
+ case STRUCT:
+ return parseStruct(field, (StructTypeInfo) fieldTypeInfo);
+ case UNION:
+ // Unsupported by JSON
+ default:
+ return null;
+ }
+ }
+
+ /**
+ * Parses a JSON object and its fields. The Hive metadata is used to
+ * determine how to parse the object fields.
+ *
+ * @param field - The JSON object to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return - A map representing the object and its fields
+ */
+ private Object parseStruct(Object field, StructTypeInfo fieldTypeInfo) {
+ Map<Object,Object> map = (Map<Object,Object>)field;
+ ArrayList<TypeInfo> structTypes = fieldTypeInfo.getAllStructFieldTypeInfos();
+ ArrayList<String> structNames = fieldTypeInfo.getAllStructFieldNames();
+
+ List<Object> structRow = new ArrayList<Object>(structTypes.size());
+ for (int i = 0; i < structNames.size(); i++) {
+ structRow.add(parseField(map.get(structNames.get(i)), structTypes.get(i)));
+ }
+ return structRow;
+ }
+
+ /**
+ * Parse a JSON list and its elements. This uses the Hive metadata for the
+ * list elements to determine how to parse the elements.
+ *
+ * @param field - The JSON list to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return - A list of the parsed elements
+ */
+ private Object parseList(Object field, ListTypeInfo fieldTypeInfo) {
+ ArrayList<Object> list = (ArrayList<Object>) field;
+ TypeInfo elemTypeInfo = fieldTypeInfo.getListElementTypeInfo();
+
+ for (int i = 0; i < list.size(); i++) {
+ list.set(i, parseField(list.get(i), elemTypeInfo));
+ }
+
+ return list.toArray();
+ }
+
+ /**
+ * Parse a JSON object as a map. This uses the Hive metadata for the map
+ * values to determine how to parse the values. The map is assumed to have
+ * a string for a key.
+ *
+ * @param field - The JSON list to parse
+ * @param fieldTypeInfo - Metadata about the Hive column
+ * @return
+ */
+ private Object parseMap(Object field, MapTypeInfo fieldTypeInfo) {
+ Map<Object,Object> map = (Map<Object,Object>) field;
+ TypeInfo valueTypeInfo = fieldTypeInfo.getMapValueTypeInfo();
+
+ for (Map.Entry<Object,Object> entry : map.entrySet()) {
+ map.put(entry.getKey(), parseField(entry.getValue(), valueTypeInfo));
+ }
+ return map;
+ }
+
+ /**
+ * Return an ObjectInspector for the row of data
+ */
+ @Override
+ public ObjectInspector getObjectInspector() throws SerDeException {
+ return rowOI;
+ }
+
+ /**
+ * Unimplemented
+ */
+ @Override
+ public SerDeStats getSerDeStats() {
+ return null;
+ }
+
+ /**
+ * JSON is just a textual representation, so our serialized class
+ * is just Text.
+ */
+ @Override
+ public Class<? extends Writable> getSerializedClass() {
+ return Text.class;
+ }
+
+ /**
+ * This method takes an object representing a row of data from Hive, and uses
+ * the ObjectInspector to get the data for each column and serialize it. This
+ * implementation deparses the row into an object that Jackson can easily
+ * serialize into a JSON blob.
+ */
+ @Override
+ public Writable serialize(Object obj, ObjectInspector oi)
+ throws SerDeException {
+ Object deparsedObj = deparseRow(obj, oi);
+ ObjectMapper mapper = new ObjectMapper();
+ try {
+ // Let Jackson do the work of serializing the object
+ return new Text(mapper.writeValueAsString(deparsedObj));
+ } catch (Exception e) {
+ throw new SerDeException(e);
+ }
+ }
+
+ /**
+ * Deparse a Hive object into a Jackson-serializable object. This uses
+ * the ObjectInspector to extract the column data.
+ *
+ * @param obj - Hive object to deparse
+ * @param oi - ObjectInspector for the object
+ * @return - A deparsed object
+ */
+ private Object deparseObject(Object obj, ObjectInspector oi) {
+ switch (oi.getCategory()) {
+ case LIST:
+ return deparseList(obj, (ListObjectInspector)oi);
+ case MAP:
+ return deparseMap(obj, (MapObjectInspector)oi);
+ case PRIMITIVE:
+ return deparsePrimitive(obj, (PrimitiveObjectInspector)oi);
+ case STRUCT:
+ return deparseStruct(obj, (StructObjectInspector)oi, false);
+ case UNION:
+ // Unsupported by JSON
+ default:
+ return null;
+ }
+ }
+
+ /**
+ * Deparses a row of data. We have to treat this one differently from
+ * other structs, because the field names for the root object do not match
+ * the column names for the Hive table.
+ *
+ * @param obj - Object representing the top-level row
+ * @param structOI - ObjectInspector for the row
+ * @return - A deparsed row of data
+ */
+ private Object deparseRow(Object obj, ObjectInspector structOI) {
+ return deparseStruct(obj, (StructObjectInspector)structOI, true);
+ }
+
+ /**
+ * Deparses struct data into a serializable JSON object.
+ *
+ * @param obj - Hive struct data
+ * @param structOI - ObjectInspector for the struct
+ * @param isRow - Whether or not this struct represents a top-level row
+ * @return - A deparsed struct
+ */
+ private Object deparseStruct(Object obj,
+ StructObjectInspector structOI,
+ boolean isRow) {
+ Map<Object,Object> struct = new HashMap<Object,Object>();
+ List<? extends StructField> fields = structOI.getAllStructFieldRefs();
+ for (int i = 0; i < fields.size(); i++) {
+ StructField field = fields.get(i);
+ // The top-level row object is treated slightly differently from other
+ // structs, because the field names for the row do not correctly reflect
+ // the Hive column names. For lower-level structs, we can get the field
+ // name from the associated StructField object.
+ String fieldName = isRow ? colNames.get(i) : field.getFieldName();
+ ObjectInspector fieldOI = field.getFieldObjectInspector();
+ Object fieldObj = structOI.getStructFieldData(obj, field);
+ struct.put(fieldName, deparseObject(fieldObj, fieldOI));
+ }
+ return struct;
+ }
+
+ /**
+ * Deparses a primitive type.
+ *
+ * @param obj - Hive object to deparse
+ * @param oi - ObjectInspector for the object
+ * @return - A deparsed object
+ */
+ private Object deparsePrimitive(Object obj, PrimitiveObjectInspector primOI) {
+ return primOI.getPrimitiveJavaObject(obj);
+ }
+
+ private Object deparseMap(Object obj, MapObjectInspector mapOI) {
+ Map<Object,Object> map = new HashMap<Object,Object>();
+ ObjectInspector mapValOI = mapOI.getMapValueObjectInspector();
+ Map<?,?> fields = mapOI.getMap(obj);
+ for (Map.Entry<?,?> field : fields.entrySet()) {
+ Object fieldName = field.getKey();
+ Object fieldObj = field.getValue();
+ map.put(fieldName, deparseObject(fieldObj, mapValOI));
+ }
+ return map;
+ }
+
+ /**
+ * Deparses a list and its elements.
+ *
+ * @param obj - Hive object to deparse
+ * @param oi - ObjectInspector for the object
+ * @return - A deparsed object
+ */
+ private Object deparseList(Object obj, ListObjectInspector listOI) {
+ List<Object> list = new ArrayList<Object>();
+ List<?> field = listOI.getList(obj);
+ ObjectInspector elemOI = listOI.getListElementObjectInspector();
+ for (Object elem : field) {
+ list.add(deparseObject(elem, elemOI));
+ }
+ return list;
+ }
+}
Please sign in to comment.
Something went wrong with that request. Please try again.