# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType
from time import sleep
import shutil
import pyspark
import pandas as pd
import dask.dataframe as dd
import os
from datetime import datetime
import numpy as np
import sys
import glob

# Spark environment: these process-level variables are set before the
# session below is created.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'JAVA_HOME': '/usr/local/jdk8u222-b10',
    'HADOOP_USER_NAME': 'hive',
    'PYSPARK_PYTHON': '/HDFS01/anaconda3/envs/main/bin/python',
})

# Driver, warehouse and Hive metastore settings handed to the session builder.
spark_settings = [
    ('spark.driver.maxResultSize', '0'),
    ('spark.driver.memory', '4g'),
    ('spark.sql.repl.eagerEval.enabled', 'true'),
    ('hive.strict.managed.tables', 'false'),
    ('spark.sql.warehouse.dir', '/user/hive/warehouse/ais/a10'),
    ('hive.metastore.uris', 'thrift://nn01.bigdata:9083'),
    ('metastore.client.capability.check', 'false'),
]
conf = pyspark.SparkConf().setAll(spark_settings)

# Module-level session (local master, Hive support enabled) used by MainLog.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("log_cgn")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

class MainLog:
    """Load space-delimited ``.tmp`` log files into the Hive table ``ais.a10``.

    Instantiating the class runs the whole job: the source directory is
    scanned for ``.tmp`` files and, when at least one exists, up to
    ``max_files`` of them are parsed with pandas, written through the
    module-level ``spark`` session and then moved into the ``tmp``
    sub-directory of the source directory.  When no file is found the
    process exits via ``sys.exit``.

    Args:
        source_dir: Directory to scan.  Defaults to the class attribute
            ``source_dir``.
        max_files: Maximum number of files ingested per run.  Defaults to
            the class attribute ``max_files``.
    """

    # Directory scanned for incoming ``.tmp`` log files.
    source_dir = "/HDFS01/airflow/notebooks/Pasit/AIS/"
    # Upper bound on the number of files ingested in one run.
    max_files = 3
    # Output columns, in the positional order expected by the Spark schema.
    _COLUMNS = (
        "datetime", "a10_ip", "hostname_a10", "protocal",
        "src_nat", "src_port", "dect_nat", "dect_port",
        "dest_ip", "dest_port", "number", "method", "url", "month",
    )
    # Source-column positions that both record types have in common.
    _SHARED_SOURCE_COLUMNS = (5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 18)

    def __init__(self, source_dir=None, max_files=None) -> None:
        if source_dir is not None:
            self.source_dir = source_dir
        if max_files is not None:
            self.max_files = max_files
        self.countFile()

    def _list_temp_files(self):
        """Return the sorted names of the regular ``.tmp`` files in ``source_dir``.

        The extension test is case-insensitive.  This single listing is used
        both for the "anything to do?" check and for the processing loop so
        the two can never disagree.  A missing directory yields an empty list.
        """
        try:
            names = os.listdir(self.source_dir)
        except FileNotFoundError:
            return []
        return sorted(
            name
            for name in names
            if os.path.splitext(name)[-1].lower() == '.tmp'
            and os.path.isfile(os.path.join(self.source_dir, name))
        )

    def countFile(self):
        """Run the task when ``.tmp`` files exist, otherwise exit the process.

        Raises:
            SystemExit: When the source directory holds no ``.tmp`` file.
        """
        if self._list_temp_files():
            self.RunTask()
        else:
            sys.exit('---------- file not found ----------')

    def getSchema(self):
        """Return the all-string, all-nullable Spark schema of the output table."""
        return StructType(
            [StructField(name, StringType(), True) for name in self._COLUMNS]
        )

    def getDT(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def check_sep_ip(self, row):
        """Split ``address:port`` strings into parallel address and port lists.

        The text after the last ``:`` is the port and everything before it
        is the address, so addresses that themselves contain ``:`` stay
        intact.  A value without any ``:`` yields an empty address and the
        whole value as the port.

        Args:
            row: Iterable of strings (a pandas Series in this job).

        Returns:
            Tuple ``(ip_list, port_list)`` of two equally long lists.
        """
        ip_list = []
        port_list = []
        for value in row:
            address, _, port = value.rpartition(':')
            ip_list.append(address)
            port_list.append(port)
        return ip_list, port_list

    @staticmethod
    def _clean(frame):
        """Cast every cell to ``str`` and strip ``[``, ``]`` and ``"`` characters."""
        return frame.astype(str).replace(r'[\[\]"]', '', regex=True)

    def _to_records(self, raw, month_col, method_col=None, url_col=None):
        """Map positional source columns of ``raw`` onto the named output columns.

        ``method_col`` / ``url_col`` left as ``None`` fill the corresponding
        output column with the literal string ``'nan'``.
        """
        placeholder = str(np.nan)
        out = pd.DataFrame()
        out['datetime'] = raw[5] + ' ' + raw[6] + ' ' + raw[7] + ' ' + raw[8]
        out['a10_ip'] = raw[9]
        out['hostname_a10'] = raw[10]
        out['protocal'] = raw[13]
        out['src_nat'], out['src_port'] = self.check_sep_ip(raw[14])
        out['dect_nat'], out['dect_port'] = self.check_sep_ip(raw[15])
        out['dest_ip'], out['dest_port'] = self.check_sep_ip(raw[16])
        out['number'] = raw[18]
        out['method'] = placeholder if method_col is None else raw[method_col]
        out['url'] = placeholder if url_col is None else raw[url_col]
        out['month'] = raw[month_col]
        return out

    def _parse_file(self, path):
        """Read one log file and return its type-H rows followed by its type-D rows.

        Each line is split on single spaces; source column 12 decides the
        record type ("H" or "D").  The result is a pandas DataFrame whose
        columns are in schema order.
        """
        # sep="\r" keeps every line in a single column for the split below.
        lines = pd.read_csv(path, sep="\r", header=None)
        data = lines[0].str.split(' ', expand=True)
        record_type = data[12]
        shared = list(self._SHARED_SOURCE_COLUMNS)
        # na=False treats rows with a missing type column as "no match".
        raw_h = data.loc[record_type.str.contains("H", na=False), shared + [19, 20, 23]]
        raw_d = data.loc[record_type.str.contains("D", na=False), shared + [22]]
        print('--------Map and clean field special charactor----------')
        raw_h = self._clean(raw_h.reset_index(drop=True))
        raw_d = self._clean(raw_d.reset_index(drop=True))
        print('--------Map to new Dataframe (D, H)----------')
        records_h = self._to_records(raw_h, month_col=23, method_col=19, url_col=20)
        # Type D rows carry no method/url; the month sits one column earlier.
        records_d = self._to_records(raw_d, month_col=22)
        # Plain pandas concat: the frame has to be fully materialised for
        # spark.createDataFrame anyway, so a Dask round trip adds nothing.
        return pd.concat([records_h, records_d], ignore_index=True)

    def _save(self, result):
        """Append to ``ais.a10`` when it exists, otherwise create it."""
        table_exists = (
            spark.sql('show tables in ais').filter("tableName == 'a10'").count() > 0
        )
        mode = 'append' if table_exists else 'overwrite'
        result.write.mode(mode).saveAsTable('ais.a10')

    def RunTask(self):
        """Ingest up to ``max_files`` ``.tmp`` files and archive each one.

        Files are handled in sorted name order.  The Spark session is
        stopped when the loop ends, including when a file fails; the
        exception itself still propagates to the caller.
        """
        print('Start Task at:', self.getDT())
        try:
            pending = self._list_temp_files()[:self.max_files]
            for count, file in enumerate(pending, start=1):
                print(f'--------Read Temp File: {file}----------')
                print("Number of file: ", count)
                data_concat = self._parse_file(os.path.join(self.source_dir, file))
                # Spark DataFrame
                result = spark.createDataFrame(data_concat, schema=self.getSchema())
                print('--------Save to HDFS by Parqurt----------')
                self._save(result)
                # Archive only after a successful write so a failed file is
                # picked up again on the next run.
                self.moveTemp(file)
                print('Successfully Transfer Log Total files:', count)
        finally:
            spark.stop()

    def getEndTask(self):
        """Return the current local time, formatted like :meth:`getDT`."""
        return self.getDT()

    def moveTemp(self, src):
        """Move file ``src`` from the source directory into its ``tmp`` sub-directory.

        The archive directory is created first: if it were missing,
        ``shutil.move`` would rename the file to ``tmp`` and every later
        move would overwrite it.
        """
        archive_dir = os.path.join(self.source_dir, "tmp")
        os.makedirs(archive_dir, exist_ok=True)
        shutil.move(os.path.join(self.source_dir, src), archive_dir)

if __name__ == "__main__":
    # Constructing MainLog runs the whole job (or exits when no file exists).
    obj = MainLog()
    finished_at = obj.getEndTask()
    print(finished_at)