## วิธีติดตั้ง PySpark
- Version ของ Spark ที่ติดตั้งบนเครื่อง กับ Version ของ pyspark ที่ติดตั้งผ่าน pip ต้องเป็น Version เดียวกัน
- Version ที่ Trino อ่านได้จะเป็น Version 3.1.3 ขึ้นไป 
```bash
# Install Java (required by Spark). Run only the line for your distribution.
$ yum -y install java-1.8.0-openjdk        # CentOS
$ apt install default-jdk scala git -y     # Ubuntu
# Spark 3.1.3 is an older release; the Apache archive keeps every release,
# while the dlcdn mirror only hosts the current ones.
$ wget https://archive.apache.org/dist/spark/spark-3.1.3/spark-3.1.3-bin-hadoop2.7.tgz
$ tar xvf spark-3.1.3-bin-hadoop2.7.tgz
$ sudo mv spark-3.1.3-bin-hadoop2.7/ /opt/spark
# Optional: open the file to review it by hand; the echo commands below
# append the required lines automatically.
$ vim ~/.bashrc
# Single quotes keep $PATH and $SPARK_HOME literal, so they are expanded when
# the shell start-up file is sourced, not now while SPARK_HOME is still unset.
$ echo 'export SPARK_HOME=/opt/spark' >> ~/.bashrc
$ echo 'export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin' >> ~/.bashrc
# NOTE(review): ~/.bashrc and ~/.profile point PYSPARK_PYTHON at different
# interpreters; confirm which one has pyspark installed.
$ echo 'export PYSPARK_PYTHON=/root/anaconda3/bin/python' >> ~/.bashrc
$ echo 'export SPARK_HOME=/opt/spark' >> ~/.profile
$ echo 'export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin' >> ~/.profile
$ echo 'export PYSPARK_PYTHON=/usr/bin/python3' >> ~/.profile
$ source ~/.bashrc
$ source ~/.profile
# The pip package version must match the Spark version installed above.
$ pip install pyspark==3.1.3
```
--------------------------
## Schema ที่ใช้งานได้
```bash
org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
```
## หากใช้ไม่ได้ให้ใช้คำสั่งนี้
```sql
ALTER TABLE potential_plant SET SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe';
```


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark
import pandas as pd
import os

### PySpark Config

In [None]:
# Two alternative ways to build the SparkSession. Only ONE of them may run in
# a Python process: getOrCreate() returns the already-running session when one
# exists, so a second builder cannot change static settings (driver memory,
# JVM environment variables) once the first builder has started the JVM.
# Restart the kernel after changing CONFIG_VARIANT.
CONFIG_VARIANT = 2  # 1 = options set on the builder, 2 = SparkConf + env vars (the one mostly used)

if CONFIG_VARIANT == 1:
    # Variant 1: every option is passed directly to the builder.
    # NOTE(review): the HDFS commands elsewhere in this file use
    # /user/hive/warehouse (no "s"); confirm this warehouse path.
    spark = (
        SparkSession.builder
        .master('local[*]')
        .config("hive.metastore.uris", "thrift://hive-metastore:9083")
        .config("spark.sql.warehouse.dir", "/users/hive/warehouse/")
        .appName('myappname')
        .enableHiveSupport()
        .getOrCreate()
    )
else:
    # Variant 2 (the one mostly used): environment variables plus a SparkConf.
    # The environment must be set before the session is created so that the
    # JVM started for the session sees these values.
    os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf'
    os.environ['JAVA_HOME'] = '/usr/local/jdk8u222-b10'
    os.environ['HADOOP_USER_NAME'] = 'hive'
    os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python'

    conf = pyspark.SparkConf().setAll([
        # Optional: pin the metastore client version if it must match the server.
        # ('spark.sql.hive.metastore.version', '2.3.9'),
        ('spark.driver.maxResultSize', '0'),  # 0 = no limit on collected result size
        ('spark.driver.memory', '2g'),
        ('spark.sql.repl.eagerEval.enabled', 'true'),  # render DataFrames when a cell ends with one
        ('hive.strict.managed.tables', 'false'),
        ('hive.metastore.uris', 'thrift://nn01.bigdata:9083'),
        ('metastore.client.capability.check', 'false'),
    ])
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("NookTest")
        .config(conf=conf)
        .enableHiveSupport()
        .getOrCreate()
    )

### Schema Create Parquet Type

In [None]:
# Explicit column schema (name and Spark type) applied when the CSV is read;
# every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("year", StringType()),
        ("weeknum", IntegerType()),
        ("province", StringType()),
        ("new_case", IntegerType()),
        ("total_case", IntegerType()),
        ("new_case_excludeabroad", IntegerType()),
        ("total_case_excludeabroad", IntegerType()),
        ("new_death", IntegerType()),
        ("total_death", IntegerType()),
        ("update_date", TimestampType()),
    )
])

### Get Data From API And Compare With Pandas

In [None]:
# Download the JSON payload into a pandas DataFrame, then dump it to a local
# ';'-separated CSV file without the pandas index column.
covid_api_url = "https://covid19.ddc.moph.go.th/api/Cases/today-cases-by-provinces"
df = pd.read_json(covid_api_url)
df.to_csv(path_or_buf='/tmp/tbl_covid_0.csv', index=False, sep=";")

### Read File after Pandas to_csv by Sep=";"

In [None]:
# Load the CSV produced by the pandas to_csv step into a Spark DataFrame.
# - The path points at /tmp, which is where that step writes the file.
# - header=True skips the column-name row that pandas writes by default;
#   without it that row would be read as a data record.
# - The pandas-only `index=False` argument is not a Spark CSV option, so it
#   is not passed here.
df = spark.read.format("csv").load(
    "file:///tmp/tbl_covid_0.csv",
    schema=schema,
    sep=";",
    header=True,
)
df.show()

### Save To Hive

In [None]:
# Write the DataFrame to the table nook.tbl_covid, overwriting existing data.
df.write.saveAsTable("nook.tbl_covid", mode="overwrite")

### Remove Table Files in HDFS (via OS Command)

In [None]:
def removeFile():
    """Recursively delete everything under the nook.tbl_covid directory in HDFS.

    Runs ``hdfs dfs -rm -r`` as an argument list instead of a shell string,
    so ``hdfs`` receives the ``*`` pattern verbatim rather than after local
    shell expansion. The call stays best-effort: a failure is reported with
    a printed message and never raises.

    Returns:
        None
    """
    import subprocess  # local import keeps this cell self-contained

    command = [
        "hdfs", "dfs", "-rm", "-r",
        "/user/hive/warehouse/nook.db/tbl_covid/*",
    ]
    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        # The hdfs executable is not on PATH; report it instead of crashing.
        print("hdfs executable not found on PATH; nothing was removed")
        return
    if completed.returncode != 0:
        print(f"hdfs dfs -rm exited with status {completed.returncode}")


removeFile()

### List Table Files in HDFS (via OS Command)

In [None]:
def listFile():
    """Run ``hdfs dfs -ls`` on the nook.tbl_covid directory through the shell.

    The command's exit status is discarded; nothing is returned.
    """
    list_command = "hdfs dfs -ls /user/hive/warehouse/nook.db/tbl_covid/"
    os.system(list_command)


listFile()

In [None]:
# Read back two rows from the saved table and print them.
preview_query = "select * from nook.tbl_covid limit 2"
df = spark.sql(preview_query)
df.show()

### Add Columns

In [None]:
# Add a string column named Testcols to nook.tbl_covid.
add_column_sql = "ALTER TABLE nook.tbl_covid ADD columns (Testcols string)"
x = spark.sql(add_column_sql)

In [None]:
# Select one row from the table; the bare name on the last line makes the
# notebook display the DataFrame.
single_row_query = 'SELECT * FROM nook.tbl_covid limit 1'
a = spark.sql(single_row_query)
a

In [None]:
# Write the one-row DataFrame to nook.tbl_covid2, overwriting existing data.
covid2_writer = a.write.mode("overwrite")
covid2_writer.saveAsTable("nook.tbl_covid2")

In [None]:
# Drop the copy created by saveAsTable("nook.tbl_covid2"). The name here must
# match that table exactly (no extra underscore), and IF EXISTS keeps a re-run
# of this cell from failing once the table is already gone.
a = spark.sql("DROP TABLE IF EXISTS nook.tbl_covid2")