In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark
import pandas as pd
import os
from pyspark.sql.functions import *
import requests
from urllib.request import urlopen

In [None]:
# Environment the Spark driver needs before the session is created:
# Hadoop client config, JDK location, the Hadoop user to act as, and the
# Python interpreter PySpark workers should use.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'JAVA_HOME': '/usr/local/jdk8u222-b10/',
    'HADOOP_USER_NAME': 'hive',
    'PYSPARK_PYTHON': '/home/anaconda3/envs/main/bin/python',
})

# Driver sizing, notebook display behaviour and Hive metastore wiring.
conf = pyspark.SparkConf()
conf.set('spark.driver.maxResultSize', '0')
conf.set('spark.driver.memory', '5g')
conf.set('spark.sql.repl.eagerEval.enabled', 'true')
conf.set('spark.sql.warehouse.dir', '/users/hive/warehouse/')
conf.set('hive.strict.managed.tables', 'false')
conf.set('hive.metastore.uris', 'thrift://masternode.bigdata:9083')
conf.set('metastore.client.capability.check', 'false')

# Local-mode session (all cores) with Hive support so saveAsTable targets
# the metastore configured above.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("view_air_quality_factor_3")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Explicit schema for the air-quality table: 39 columns, all nullable.
# Column order matters because the CSV staging file is read positionally
# (no header), so keep this list in the same order as the exported data.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        # --- station / location ---
        ("location_id", StringType()),
        ("location_type", StringType()),
        ("location_name_th", StringType()),
        ("province_id", StringType()),
        ("province_name", StringType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        # --- observation time ---
        ("datetime_", TimestampType()),
        ("time24hr", IntegerType()),
        ("dayofweek", IntegerType()),
        ("month", IntegerType()),
        # --- measurements ---
        ("wind_speed", FloatType()),
        ("wind_direction", FloatType()),
        ("air_temperature", FloatType()),
        ("srad", FloatType()),
        ("so2", FloatType()),
        ("relative_humidity", FloatType()),
        ("rainfall", FloatType()),
        ("pm25", FloatType()),
        ("pm10", FloatType()),
        ("o3", FloatType()),
        ("nard", FloatType()),
        ("nox", FloatType()),
        ("no2", FloatType()),
        ("no_", FloatType()),
        ("co", FloatType()),
        ("bp", FloatType()),
        # --- index description ---
        ("index_color", StringType()),
        ("index_name", StringType()),
        ("desc_", StringType()),
        ("physical", StringType()),
        ("suggestion", StringType()),
        ("confidence", StringType()),
        ("aqi", FloatType()),
        ("aqi_level", IntegerType()),
        ("stationpressure", FloatType()),
        ("vaporpressure", FloatType()),
        ("index_symbol", StringType()),
        # --- bookkeeping ---
        ("update_dt", TimestampType()),
    ]
])

## Read JSON

In [None]:
# Fetch a JSON payload over HTTP and turn its 'data' list into a Spark
# DataFrame using the explicit schema.
# NOTE(review): this URL is a COVID-19 case-list endpoint, while `schema`
# describes air-quality columns — confirm the intended source.
url = "https://covid19.ddc.moph.go.th/api/Cases/round-1to2-line-lists"
# A timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(url, timeout=60)
# Surface HTTP errors (4xx/5xx) here instead of as a confusing JSON/KeyError below.
r.raise_for_status()
df = spark.createDataFrame(r.json()['data'], schema=schema)

In [None]:
# Round-trip the exported chunk through pandas purely to drop malformed
# lines (wrong field count) before Spark reads it.
#
# 5.csv is written without a header row, and the Spark reader that consumes
# /tmp/5.csv does not enable the header option, so:
#   * header=None     — otherwise the first data row is consumed as column
#                       labels (and mangled for duplicates / blanks);
#   * dtype=str and keep_default_na=False — pass every field through as the
#                       original text so pandas type inference does not
#                       rewrite values (e.g. integers with gaps as "1.0");
#   * header=False on write — keep the staging file headerless.
df = pd.read_csv(
    "5.csv",
    sep="|",
    header=None,
    dtype=str,
    keep_default_na=False,
    on_bad_lines='skip',
)
df.to_csv('/tmp/5.csv', sep="|", index=False, header=False)
df.head(1)

In [None]:
# Load the pipe-delimited staging file with the explicit schema.
# The former `index=False` argument was a pandas `to_csv` keyword, not a
# Spark CSV option, so it has been dropped.
dup = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("sep", "|")
    .load("file:///tmp/5.csv")
)

In [None]:
# Append the staged rows to the Hive table.
# NOTE(review): the following cell overwrites this same table from the same
# DataFrame; presumably only one of the two is meant to run per chunk — confirm.
dup.write.saveAsTable("default.view_air_quality_factor_3", mode="append")

In [None]:
# Replace the Hive table's contents with the staged rows.
# NOTE(review): destructive — this discards anything already in the table,
# including rows appended by the previous cell. Run deliberately.
dup.write.saveAsTable("default.view_air_quality_factor_3", mode="overwrite")

## Get Data From Hive With Trino

In [None]:
import trino
import pandas as pd

In [None]:
# Connection settings for the Trino coordinator that fronts the Hive catalog.
# NOTE(review): 8443 is commonly a TLS port, but no scheme/auth is passed
# here — confirm the client default matches the server.
trino_params = {
    "host": "172.16.23.200",
    "port": 8443,
    "user": "hive",
    "catalog": "hive",
    "schema": "default",
}
conn = trino.dbapi.connect(**trino_params)

In [None]:
# Pull one 1,000,000-row slice (rows after the first 4,000,000) of the source
# view into pandas.
# NOTE(review): there is no ORDER BY, so OFFSET/LIMIT slices are not guaranteed
# to be stable or non-overlapping between runs — confirm whether an ordering
# key should be added.
chunk_query = "SELECT * FROM default.view_air_quality_factor_2 offset 4000000 limit 1000000"
df = pd.read_sql(sql=chunk_query, con=conn)

In [None]:
# Sanity check: (row count, column count) of the chunk just fetched.
df.shape

In [None]:
# Dump the fetched chunk as a pipe-delimited file with no header row and no
# index column; the CSV-cleaning cell reads this file back by the same name.
df.to_csv(
    '5.csv',
    header=False,
    index=False,
    sep="|",
)