## Gerar dados com PySpark no Hadoop e consultá-los com PySpark no Hadoop

In [None]:
import uuid
import random
import datetime

def _random_row(code):
    """Build one synthetic record for the sequential ``code``.

    The timestamp is drawn first, then the remaining random fields in dict
    order, so the sequence of ``random`` calls per row is: timestamp,
    option, value, rate, status.
    """
    # Bounds are epoch microseconds; fromtimestamp() yields a naive datetime
    # in the machine's local time zone.
    micros = random.randint(1577847600000000, 1641005999999999)
    stamp = datetime.datetime.fromtimestamp(micros / 1000000)
    return {
        "id": str(uuid.uuid4()),
        "code": code,
        "option": "option {0}".format(random.randint(1, 5)),
        "description": "description {0}".format(code),
        "value": random.gauss(400, 50),
        "rate": random.random(),
        # Both audit columns share the same generated instant.
        "created_at": stamp,
        "updated_at": stamp,
        "status": random.randint(0, 1) == 1,
        # Zero-padded strings, later used as partition columns.
        "year": stamp.strftime('%Y'),
        "month": stamp.strftime('%m'),
    }


# One record per code in [1000000, 2000000).
data = [_random_row(code) for code in range(1000000, 2000000)]

In [57]:
import uuid
import pandas
# Line-of-business lookup rows. Each id is a name-based (version 3) UUID
# computed from a seed string in the DNS namespace, so it is the same on
# every run.
d = [
    {"id": str(uuid.uuid3(uuid.NAMESPACE_DNS, seed)), "name": name}
    for seed, name in (
        ("Serviço de Atendimento ao Consumidor", "SAC"),
        ("Ouvidoria", "Ouvidoria"),
        ("Suporte Técnico", "Suporte Técnico"),
        # NOTE(review): the seed here is the literal text "None", unrelated
        # to the row name — confirm this is intended before changing it,
        # since the id is used as a key downstream.
        ("None", 'Vendas'),
    )
]

# Persist the lookup table next to the notebook as a parquet file.
df = pandas.DataFrame(d)
df.to_parquet('tb_lob.parquet')

In [None]:
from pyspark.sql import SparkSession

# Local Spark session (two worker threads) with the Avro data source package
# pulled in and snappy set as the Avro compression codec.
spark = (
    SparkSession.builder
    .master('local[2]')
    .config('spark.executor.memory', '2g')
    .config('spark.jars.packages', 'org.apache.spark:spark-avro_2.12:3.2.0')
    .config('spark.sql.avro.compression.codec', 'snappy')
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import *

# Explicit schema for the generated rows; every column is nullable.
# Field order matches the key order of the generated dictionaries.
schema = StructType(
    [
        StructField(column, dtype, True)
        for column, dtype in (
            ('id', StringType()),
            ('code', LongType()),
            ('option', StringType()),
            ('description', StringType()),
            ('value', DoubleType()),
            ('rate', DoubleType()),
            ('created_at', TimestampType()),
            ('updated_at', TimestampType()),
            ('status', BooleanType()),
            ('year', StringType()),
            ('month', StringType()),
        )
    ]
)

In [None]:
# Materialize the generated rows as a DataFrame and append them to the HDFS
# warehouse as snappy-compressed Avro, partitioned by year and month.
df = spark.createDataFrame(data, schema)
(
    df.write
    .format('avro')
    .option('compression', 'snappy')
    .partitionBy("year", "month")
    .save('hdfs://dataserver:9000/warehouse', mode="append")
)
# Optional writer setting, currently disabled:
#   .option("maxRecordsPerFile", 1000)

In [None]:
# Read the Avro dataset back from the HDFS warehouse path.
df = (
    spark.read
    .format("avro")
    .load("hdfs://dataserver:9000/warehouse")
)

In [None]:
# Count the rows of the DataFrame loaded from the warehouse.
df.count()

In [None]:
# Shut down the Spark session now that the PySpark part is finished.
spark.stop()

## Consultar dados no Hadoop com Drill 

In [5]:
import io
import json
import logging
import re
import urllib
import urllib.request

import psycopg2

In [6]:
def read_file_system(file_system_query, parameter):
    """Run a SQL query through the Apache Drill REST API and return its rows.

    The Drill status page (http://localhost:8047/status) is checked first;
    the query is then POSTed as JSON to http://localhost:8047/query.json.

    Args:
        file_system_query: SQL text. ``str.format`` placeholders in it are
            filled from ``parameter``.
        parameter: Mapping of placeholder names to values, expanded as
            keyword arguments to ``str.format``.

    Returns:
        The value stored under the ``rows`` key of Drill's JSON response.

    Raises:
        Exception: If the status page does not contain ``Running!`` or the
            response has ``queryState == 'FAILED'``.

        Errors raised by ``urllib.request.urlopen``, ``json.loads`` or the
        key lookups propagate unchanged (they are all ``Exception``
        subclasses), keeping their original type and traceback instead of
        being re-wrapped in a bare ``Exception``.
    """
    # Make sure Drill is up before submitting anything.
    status_request = urllib.request.Request(url='http://localhost:8047/status', method='GET')
    with urllib.request.urlopen(status_request) as f:
        status_page = f.read().decode('utf-8')
    if not re.search('Running!', status_page):
        raise Exception('Apache Drill not is running!')

    payload = json.dumps(
        {'queryType': 'SQL', 'query': file_system_query.format(**parameter)}
    ).encode('utf-8')
    query_request = urllib.request.Request(
        url='http://localhost:8047/query.json',
        data=payload,
        headers={'Content-Type': 'application/json'},
        method='POST',
    )
    with urllib.request.urlopen(query_request) as f:
        response = f.read().decode('utf-8')

    # Decode the response once and reuse it for both the state check and
    # the row extraction.
    result = json.loads(response)
    if result['queryState'] == 'FAILED':
        raise Exception('Query failed!')

    return result['rows']

In [7]:
def json_to_csv(data):
    """Serialize a list of row dicts into ``;``-separated text.

    Each row becomes one line holding the ``str()`` of its values in dict
    order, joined by ``;`` and terminated by a newline. Only ``None`` is
    written as an empty field; falsy but real values such as ``0``,
    ``0.0`` and ``False`` are kept (they used to be blanked out, which
    turned them into NULLs once loaded with ``null=''``).

    Values are not escaped: a value containing ``;`` or a newline will
    produce a malformed line.

    Args:
        data: Iterable of dicts, one per row.

    Returns:
        An ``io.StringIO`` positioned at offset 0, ready to be read.
    """
    output = io.StringIO()
    for row in data:
        fields = ['' if cell is None else str(cell) for cell in row.values()]
        output.write(';'.join(fields) + '\n')
    # Rewind so the consumer reads from the first line.
    output.seek(0)
    return output

In [8]:
def load_pgsql(stage_table_creation, output, table_merge):
    """Bulk-load delimited text into a staging table and merge it.

    Steps, each committed separately on one connection:
      1. execute ``stage_table_creation``;
      2. ``COPY`` the contents of ``output`` into the staging table
         (``;`` separator, empty string read as NULL);
      3. execute ``table_merge``.

    Args:
        stage_table_creation: DDL creating the staging table. The staging
            table name is taken from the first ``table <name>`` occurrence
            in this text.
        output: Readable file-like object with ``;``-separated lines.
        table_merge: SQL statement moving staged rows into the target.

    Raises:
        ValueError: If no table name can be extracted from
            ``stage_table_creation``; raised before any connection is
            opened.

        Errors from psycopg2 propagate unchanged. The connection is always
        closed, including on failure, so uncommitted work is discarded.
    """
    # Resolve the staging table name up front so a malformed DDL fails
    # with a clear message before the database is touched.
    match = re.search('table ([a-zA-Z0-9_-]+)', stage_table_creation)
    if match is None:
        raise ValueError('Could not find a table name in stage_table_creation')
    stage_table = match.group(1)

    # NOTE(review): connection settings, including the password, are
    # hard-coded here; consider moving them to configuration.
    connection = psycopg2.connect(host='192.168.43.3',port='5432',dbname='dw_pags',user='postgres',password='123456')
    try:
        with connection.cursor() as cursor:
            cursor.execute(stage_table_creation)
            connection.commit()

            cursor.copy_from(file=output, table=stage_table, sep=';', null='')
            connection.commit()

            cursor.execute(table_merge)
            connection.commit()
    finally:
        # Release the connection on both success and failure.
        connection.close()

In [56]:
import datetime

# Driver: query Drill for the line-of-business rows, convert them to
# delimited text and upsert them into PostgreSQL.
if __name__ == '__main__':
    
    # NOTE(review): the threshold is CRITICAL, so the logger.warning(...) call
    # near the end of this block would be filtered out if this configuration
    # takes effect — confirm that is intended.
    logging.basicConfig(level=logging.CRITICAL, format='[%(asctime)s %(levelname)s %(name)s] %(message)s')
    logger = logging.getLogger(__name__)
    
    ################################################### parameter ###################################################
    interval = 86400  # window length in seconds (86400 s = 24 h)
    period = 2  # number of consecutive windows to generate
    # Start from today's midnight (naive local time).
    end_interval = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)

    # Advance in steps of `interval` until end_interval is no longer before
    # the current time; start_interval is one step earlier.
    while end_interval < datetime.datetime.now():
        end_interval += datetime.timedelta(seconds=interval)
    start_interval = end_interval - datetime.timedelta(seconds=interval)

    # One dict per window; window i is shifted back by i intervals and both
    # bounds are formatted as 'YYYY-MM-DD HH:MM:SS'.
    parameter = []
    for i in range(period):
        parameter.append(
            {
                "start_interval": (start_interval - datetime.timedelta(seconds=interval * i)).strftime('%Y-%m-%d %H:%M:%S'),
                 "end_interval": (end_interval - datetime.timedelta(seconds=interval * i) - datetime.timedelta(seconds=0)).strftime('%Y-%m-%d %H:%M:%S')
            }
        )
        
    # NOTE(review): this rebinding discards the window list built above, so
    # the loop at the bottom runs exactly once, with a dict whose only key is
    # "interaction_number".
    parameter = [{"interaction_number": 1}]
    #################################################################################################################

    # Drill query over the parquet file; it contains no str.format
    # placeholders, so the parameter values do not change the query text.
    file_system_query = """
        select * from dfs.`/home/danilo/Documents/estudos/python/example/tb_lob.parquet`
    """
    
    # Temporary staging table that receives the copied rows.
    stage_table_creation = """
        create temp table sgt_lob(
            nk character varying unique,
            name character varying
        )
    """
    
    # Upsert from the staging table into dim_lob keyed on nk: on conflict,
    # name and updated_at are updated only when the stored name differs from
    # the incoming one or is null.
    table_merge = """
        insert into dim_lob(nk, name)
            select nk
                 , name
              from sgt_lob
        on conflict(nk)
        do update set name = excluded.name, updated_at = now()
            where dim_lob.name <> excluded.name
               or dim_lob.name is null
    """
    
    # For each parameter dict: query, convert, load. An empty result only
    # logs a warning. Any exception is logged at CRITICAL and not re-raised,
    # which also stops the remaining iterations.
    try:
        for value in parameter:
            data = read_file_system(file_system_query, value)
            if data:
                output = json_to_csv(data)
                load_pgsql(stage_table_creation, output, table_merge)
            else:
                logger.warning('Query returns empty!')
    except Exception as e:
        logger.critical(e)