In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark
import pandas as pd
from urllib.parse import quote
import os
import requests
from datetime import datetime
import pymysql
from sqlalchemy import create_engine

In [None]:
# SQLAlchemy engine for the source MySQL database (user "root", schema "test").
# The password is taken from the MYSQL_PASSWORD environment variable when set.
# NOTE(review): the fallback is the literal that used to be hard-coded here;
# it is kept only so existing runs keep working - provide MYSQL_PASSWORD and
# remove the fallback so the secret no longer lives in the notebook.
_mysql_password = os.environ.get('MYSQL_PASSWORD', 'P@ssw0rd')
# safe='' percent-encodes every reserved character (quote() leaves '/' alone by
# default), so a password containing '@', ':' or '/' cannot break the URL.
mysqlcon = create_engine(
    'mysql+pymysql://root:%s@192.168.10.22/test' % quote(_mysql_password, safe='')
)

In [None]:
# Capture the wall-clock time once when this cell runs; `today` is the
# "YYYY-MM-DD HH:MM:SS" text form of that instant (date and time, despite the name).
now = datetime.now()
today = format(now, "%Y-%m-%d %H:%M:%S")

In [None]:
# Process environment for the Hadoop/Spark client libraries.
# Presumably these must be in place before the session below is created -
# keep this cell ahead of any Spark usage.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'JAVA_HOME': '/usr/local/jdk8u222-b10',
    'HADOOP_USER_NAME': 'hive',
    'PYSPARK_PYTHON': '/HDFS01/anaconda3/envs/main/bin/python',
})

# Spark / Hive settings applied to the session, kept as one name -> value table.
spark_settings = {
    'spark.driver.maxResultSize': '0',
    'spark.driver.memory': '2g',
    "spark.driver.allowMultipleContexts": "true",
    'spark.sql.repl.eagerEval.enabled': 'true',
    'hive.strict.managed.tables': 'false',
    'hive.metastore.uris': 'thrift://nn01.bigdata:9083',
    'metastore.client.capability.check': 'false',
}
conf = pyspark.SparkConf().setAll(list(spark_settings.items()))

# Local-mode session with Hive support; getOrCreate() reuses a session that
# already exists in this kernel instead of building a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("read_mysql_to_hive")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Column layout applied to the rows read from MySQL, as (name, Spark type)
# pairs in the order the columns are expected. Every field is nullable.
# NOTE(review): job_id, pid and queued_by_job_id are declared FloatType although
# their names suggest integer identifiers - presumably to tolerate NULLs coming
# through pandas; confirm the float precision is acceptable for these values.
_FIELD_SPECS = [
    ("task_id", StringType()),
    ("dag_id", StringType()),
    ("run_id", StringType()),
    ("start_date", StringType()),
    ("end_date", StringType()),
    ("duration", DoubleType()),
    ("state", StringType()),
    ("try_number", IntegerType()),
    ("hostname", StringType()),
    ("unixname", StringType()),
    ("job_id", FloatType()),
    ("pool", StringType()),
    ("queue", StringType()),
    ("priority_weight", IntegerType()),
    ("operator", StringType()),
    ("queued_dttm", StringType()),
    ("pid", FloatType()),
    ("max_tries", IntegerType()),
    ("executor_config", StringType()),
    ("pool_slots", IntegerType()),
    ("queued_by_job_id", FloatType()),
    ("external_executor_id", StringType()),
    ("trigger_id", StringType()),
    ("trigger_timeout", StringType()),
    ("next_method", StringType()),
    ("next_kwargs", StringType()),
]

cols = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _FIELD_SPECS]
)

In [None]:
def get_mysql_data(sql="SELECT * FROM data limit 50", con=None):
    """Run a query against the source database and return the rows as a pandas DataFrame.

    Parameters
    ----------
    sql : str, optional
        Query to execute. Defaults to the original 50-row sample of the
        ``data`` table.
    con : connection or engine, optional
        Anything ``pandas.read_sql_query`` accepts as ``con``. When omitted,
        the notebook-level ``mysqlcon`` engine is used, which matches the
        previous hard-wired behaviour.

    Returns
    -------
    pandas.DataFrame
        The query result, one row per record.
    """
    if con is None:
        # Resolved at call time so the engine cell only has to run before the
        # first call, not before this definition.
        con = mysqlcon
    mysql_frame = pd.read_sql_query(sql, con=con)
    print("Read Data From MySQL ...")
    return mysql_frame

In [None]:
def write_to_hive():
    """Copy the MySQL sample into the Hive table ``pyspark.eg_mysql``.

    Steps, in order:
      1. fetch the rows with ``get_mysql_data()``;
      2. build a Spark DataFrame using the ``cols`` schema and add an
         ``update_date`` column holding the notebook-level ``today`` string;
      3. look for ``eg_mysql`` in the ``pyspark`` database and write with
         mode "append" when it is already there, "overwrite" otherwise.

    The table is partitioned by ``dag_id``. Returns nothing.
    """
    source_rows = get_mysql_data()
    stamped = spark.createDataFrame(source_rows, schema = cols).withColumn("update_date", lit(today))

    # Existence check via SHOW TABLES: any matching row means the table is present.
    matching_tables = spark.sql("show tables in pyspark").filter("tableName == 'eg_mysql'")
    table_exists = matching_tables.count() > 0

    if table_exists:
        save_mode, banner = "append", "Save to Hive: Append Mode"
    else:
        save_mode, banner = "overwrite", "Save to Hive: Overwrite Mode"
    print(banner)

    # Single writer chain; only the save mode differs between the two cases.
    writer = stamped.write.mode(save_mode).partitionBy("dag_id")
    writer.saveAsTable("pyspark.eg_mysql")

In [None]:
# Entry point: run the MySQL -> Hive copy when this code executes as the main
# module, and skip it when the code is imported from elsewhere.
if __name__ == '__main__':
    write_to_hive()