In [19]:
import os
import urllib.request
import zipfile
import json

In [4]:
es_hadoop = urllib.request.URLopener()
es_hadoop.retrieve("http://download.elastic.co/hadoop/elasticsearch-hadoop-6.1.1.zip", "es-hadoop.zip")
with zipfile.ZipFile("es-hadoop.zip","r") as zip_ref:
    zip_ref.extractall()

In [5]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars elasticsearch-hadoop-6.1.1/dist/elasticsearch-spark-20_2.11-6.1.1.jar pyspark-shell'

In [10]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('acsdata').getOrCreate()

In [42]:
es_write_conf_acs = {
	# specify the node that we are sending data to (this should be the master)
	"es.nodes" : 'elasticsearch',
	# specify the port in case it is not the default port
	"es.port" : '9000',
	# specify a resource in the form 'index/doc-type'
	"es.resource" : 'pyxis-demographics/demographics',
	# is the input JSON?
	"es.input.json" : "yes",
	# is there a field in the mapping that should be used to specify the ES document ID
	"es.mapping.id": "id"
}

In [9]:
df = spark.read.json('data.json')
df.printSchema()

root
 |-- estimateAnnotation: string (nullable = true)
 |-- estimateValue: long (nullable = true)
 |-- geoId: string (nullable = true)
 |-- marginAnnotation: string (nullable = true)
 |-- marginOfErrorValue: long (nullable = true)
 |-- variableCode: string (nullable = true)
 |-- vintage: long (nullable = true)



In [22]:
def generate_Additional_Fields(geoId, vintage, variableCode):
    zipCode = geoId.split('US')[1]
    code = variableCode.split('_')
    id = '{}-{}-{}'.format(zipCode, vintage, variableCode)
    return {'id': id, 'zipCode': zipCode, 'variableCode': code[0], 'variableCodeType': code[1]}


In [23]:
generate_Additional_Fields('8600000US006012018B25064_001',1,'c_c')

{'id': '006012018B25064_001-1-c_c',
 'zipCode': '006012018B25064_001',
 'variableCodeType': 'c'}

In [30]:
def acs_formatter(x):
    additional_Fields = generate_Additional_Fields(
        x.geoId, x.vintage, x.variableCode)
    id = additional_Fields['id']
    return (id, json.dumps({
        'id': id,
        'geoId': additional_Fields['zipCode'],
        'variableCodeType': additional_Fields['variableCodeType'],
        'variableCode': additional_Fields['variableCode'],
        'estimateValue': x.estimateValue,
        'vintage': x.vintage
    })
    )

In [37]:
formattedata = df.rdd.map(lambda x:formatter(x))

In [39]:
formattedata.take(1)

[('00601-2018-B25064_001',
  '{"id": "00601-2018-B25064_001", "zipCode": "00601", "variableCodeType": "001", "estimateAnnotation": "", "estimateValue": 363, "geoId": "8600000US00601", "marginAnnotation": "", "marginOfErrorValue": 40, "variableCode": "B25064_001", "vintage": 2018}')]

In [40]:
formattedata.count()

33120

In [43]:
formattedata.saveAsNewAPIHadoopFile(
	path='-',
	outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
	keyClass="org.apache.hadoop.io.NullWritable",
	valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
	conf=es_write_conf_acs)