## Gerar dados com Pyspark no Hadoop e consultá-los com Pyspark no Hadoop

In [None]:
import uuid
import random
import datetime

# Build one million synthetic records in driver memory (codes 4000000..4999999).
# Each record is a plain dict whose keys match the columns of the Spark schema
# declared further down in this notebook.

# Bounds for the random `created_at`, in microseconds since the Unix epoch:
# 2020-01-01T03:00:00Z .. 2022-01-01T02:59:59.999999Z.
# NOTE(review): this looks like calendar years 2020-2021 at UTC-3. Because
# `fromtimestamp` converts to the machine's local time zone, the derived
# `year`/`month` values depend on where this cell runs — confirm.
MIN_TIMESTAMP_US = 1577847600000000
MAX_TIMESTAMP_US = 1641005999999999

data = []
for i in range(4000000, 5000000):
    # Naive local datetime; randint yields microseconds, fromtimestamp wants seconds.
    dt = datetime.datetime.fromtimestamp(random.randint(MIN_TIMESTAMP_US, MAX_TIMESTAMP_US) / 1000000)
    data.append(
        {
            "id": str(uuid.uuid4()),
            "code": i,
            "option": f"option {random.randint(1, 5)}",
            "description": f"description {i}",
            "value": random.gauss(400, 50),
            "rate": random.random(),
            # Both timestamps start out identical.
            "created_at": dt,
            "updated_at": dt,
            # The comparison already yields a bool; no conditional expression needed.
            "status": random.randint(0, 1) == 1,
            # Kept as strings: the write cell partitions the output by these two fields.
            "year": dt.strftime('%Y'),
            "month": dt.strftime('%m'),
        }
    )

In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used to write and read the Avro data set.
spark = (
    SparkSession.builder
    # Run Spark inside this process with two worker threads.
    .master('local[2]')
    # NOTE(review): with a local master the executors presumably live inside the
    # driver JVM, so this setting may have no effect and `spark.driver.memory`
    # may be what was intended — confirm.
    .config('spark.executor.memory', '2g')
    # External Avro data source (Scala 2.12 build, version 3.2.0).
    # NOTE(review): the artifact version presumably has to match the installed
    # Spark version — confirm.
    .config('spark.jars.packages', 'org.apache.spark:spark-avro_2.12:3.2.0')
    .config('spark.sql.avro.compression.codec', 'snappy')
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import *

# Explicit schema for the generated records: every column is nullable, and the
# column names match the keys of the dicts built in the data-generation cell.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ('id', StringType()),
            ('code', LongType()),
            ('option', StringType()),
            ('description', StringType()),
            ('value', DoubleType()),
            ('rate', DoubleType()),
            ('created_at', TimestampType()),
            ('updated_at', TimestampType()),
            ('status', BooleanType()),
            ('year', StringType()),
            ('month', StringType()),
        )
    ]
)

In [None]:
# Turn the in-memory records into a DataFrame using the explicit schema.
df = spark.createDataFrame(data, schema)

# Write snappy-compressed Avro files to HDFS, one directory per year/month.
# "append" adds to whatever is already stored under the path, so re-running
# this cell stores the same `code` values again.
# Optional: add .option("maxRecordsPerFile", 1000) before .save() to cap the
# number of rows written to each file.
(
    df.write
    .format('avro')
    .option('compression', 'snappy')
    .partitionBy("year", "month")
    .mode("append")
    .save('hdfs://dataserver:9000/warehouse')
)

In [None]:
# Read the Avro data set back from HDFS; this rebinds `df`, replacing the
# DataFrame that was built from the in-memory records.
df = spark.read.format("avro").load("hdfs://dataserver:9000/warehouse")

In [None]:
# Count the rows stored under the warehouse path (the notebook displays the
# result because it is the last expression of the cell).
df.count()

In [None]:
# Shut the Spark session down; the remaining cells query the data through Drill.
spark.stop()

## Consultar dados no Hadoop com Drill 

In [None]:
import urllib
import json

In [None]:
# Drill SQL sent to the REST endpoint. It returns three columns, in this order:
#   nk         - the record's `code`
#   metric     - `value` rounded to zero decimals and cast to integer
#   created_at - `created_at` formatted as text with millisecond precision
# NOTE(review): `limit 10` has no `order by`, so which ten rows come back is
# presumably not deterministic — confirm that this is acceptable.
query = """
    select code as nk, cast(round(value, 0) as integer) as metric, to_char(created_at, 'yyyy-MM-dd HH:mm:ss.SSS') as created_at from hdfs.`warehouse` limit 10
"""

In [None]:
# Drill REST endpoint that receives the query (Drill is expected on localhost:8047).
url = 'http://localhost:8047/query.json'

In [None]:
# UTF-8 encoded JSON request body. This rebinds `data`, which earlier held the
# list of generated records; re-running the createDataFrame cell after this one
# would therefore receive these bytes instead of the records.
data = json.dumps({'queryType': 'SQL', 'query': query}).encode('utf-8')

In [None]:
# Declare the request body as JSON.
headers = {'Content-Type': 'application/json'}

In [None]:
# The query is submitted with an HTTP POST.
method = 'POST'

In [None]:
# `import urllib` on its own does not load the `request` submodule, so
# `urllib.request` is only available if some other code happened to import it.
# Import it explicitly so this cell does not depend on that.
import urllib.request

# Assemble the HTTP request for Drill from the pieces defined in the cells above.
request = urllib.request.Request(url=url, data=data, headers=headers, method=method)

In [None]:
# Send the request and keep the whole response body as UTF-8 decoded text; the
# `with` block closes the HTTP response afterwards. No `timeout` is passed to
# `urlopen`.
with urllib.request.urlopen(request) as f:
    response = f.read().decode('utf-8')

## Carregar dados para a memória no formato CSV

In [None]:
import io

In [None]:
# Serialize the Drill result rows into an in-memory text buffer: one line per
# row, fields separated by ';', consumed later by `cursor.copy_from`.

# Parse the response once. A response without a 'rows' key raises KeyError
# here instead of failing later with a less obvious error.
rows = json.loads(response)['rows']

# Always bind `output`, so the COPY cell finds a (possibly empty) buffer even
# when the query returned nothing.
output = io.StringIO()
for row in rows:
    # Skip empty row objects: they carry no fields to load.
    if not row:
        continue
    # Only a JSON null becomes an empty field (loaded as NULL by the COPY cell).
    # A plain truthiness test would also blank out legitimate 0 / False values.
    # Fields follow the key order of the row object.
    # NOTE(review): values are written verbatim, so a value containing ';',
    # a backslash or a newline would corrupt the line — fine for the current
    # query, revisit if free-text columns are added.
    output.write(';'.join('' if row[column] is None else str(row[column]) for column in row) + '\n')

# Rewind so that the consumer reads the buffer from the beginning.
output.seek(0)

In [None]:
import psycopg2

In [None]:
# Open the PostgreSQL connection to the `dw_pags` database.
# NOTE(review): host, user and password are hard-coded in the notebook;
# consider reading them from the environment or a secrets store before sharing it.
connection = psycopg2.connect(host='192.168.43.3',port='5432',dbname='dw_pags',user='postgres',password='123456')

In [None]:
# Single cursor reused by every statement below.
cursor = connection.cursor()

In [None]:
# Staging table for the rows fetched from Drill; its column order
# (nk, metric, created_at) matches the column order of the Drill SELECT.
# NOTE(review): `nk` is `integer` while the Spark schema declares `code` as
# LongType; the generated codes (4000000-4999999) fit, larger ones would not.
# NOTE(review): `nk` is unique, and the warehouse is written in append mode, so
# a re-run of the write cell could make Drill return repeated codes — confirm
# that duplicates cannot reach this table.
cursor.execute('create temp table sgt_test(nk integer unique, metric bigint, created_at timestamp)')

In [None]:
# Commit the transaction that created the staging table.
connection.commit()

In [None]:
# Bulk-load the ';'-separated buffer into the staging table; empty fields are
# read as NULL. No column list is passed, so the fields of each line are
# expected in the table's column order (nk, metric, created_at).
cursor.copy_from(file=output, table='sgt_test', sep=';', null='')

In [None]:
# Commit the rows loaded into the staging table.
connection.commit()

In [None]:
# Upsert the staged rows into `dim_test`, keyed on `nk`: rows with a new key are
# inserted, rows whose key already exists get `metric` and `created_at`
# overwritten with the staged values.
# NOTE(review): `on conflict(nk)` presumably needs a unique constraint or index
# on dim_test.nk; that table is not created in this notebook — confirm.
cursor.execute('''
    insert into dim_test(nk, metric, created_at)
        select nk
             , metric
             , created_at
          from sgt_test
    on conflict(nk)
    do update set metric = excluded.metric, created_at = excluded.created_at
''')

In [None]:
# Commit the upsert into dim_test.
connection.commit()

In [None]:
# Release the cursor now that all statements have run.
cursor.close()

In [None]:
# Close the database connection; the temporary staging table belongs to this
# session.
connection.close()