# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType
from datetime import datetime, timedelta
import shutil
import pyspark
import pandas as pd
import os
import sys
import glob
import logging

# Spark environment: these variables are set before the session below is built.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'JAVA_HOME': '/usr/local/jdk8u222-b10',
    'HADOOP_USER_NAME': 'hive',
    'PYSPARK_PYTHON': '/HDFS01/anaconda3/envs/main/bin/python',
})

# Driver and Hive metastore settings handed to the session builder.
conf = pyspark.SparkConf()
conf.setAll([
    ('spark.driver.maxResultSize', '0'),
    ('spark.driver.memory', '4g'),
    ('spark.sql.repl.eagerEval.enabled', 'true'),
    ('hive.strict.managed.tables', 'false'),
    ('hive.metastore.uris', 'thrift://nn01.bigdata:9083'),
    ('metastore.client.capability.check', 'false'),
])

# Local-mode session with Hive support; reused by the code below as `spark`.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("log_cgn")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

#-------------Define Global-----------
# Directory scanned for input log files; MainLog.moveTemp moves processed
# files to the "tmp" entry inside this directory.
STRPATH = "/HDFS01/airflow/notebooks/Pasit/AIS"
# Glob pattern MainLog.countFile uses to check whether any input is waiting.
EXTPATH = "*.tmp"
#-------------Define Global-----------

class MainLog:
    """Batch job that loads ``.tmp`` log files from ``STRPATH`` into Hive.

    Every line of a file is split on single spaces.  Rows whose token 12
    contains ``"D"`` and rows whose token 12 contains ``"H"`` are mapped onto
    the same 27-column, all-string schema and written to ``ais.cgn_d`` and
    ``ais.cgn_h`` respectively (both partitioned by ``dt``).  Once both writes
    have finished the file is moved to ``STRPATH/tmp``.

    Instantiating the class runs the whole job: ``__init__`` calls
    ``countFile``, which calls ``RunTask`` when input files are present.
    """

    # Output columns in the positional order used for the Spark schema.
    _COLUMNS = (
        "parameter", "start_date", "stop_date", "duration", "hostname",
        "protocal", "srcip", "srcport", "srcnatip", "srcnatport",
        "dstip", "dstport", "dstnatip", "dstnatport", "user",
        "sent", "received", "domain", "method", "url",
        "uri", "rm", "locations", "dt", "mon",
        "start_filter", "stop_filter",
    )

    def __init__(self) -> None:
        self.countFile()

    def countFile(self):
        """Run the task if ``STRPATH`` holds files matching ``EXTPATH``.

        Exits the interpreter (``sys.exit``) with a message when nothing
        matches.
        """
        # glob.glob replaces the undocumented glob.glob1 (deprecated in
        # Python 3.13); the directory part is escaped so that it is still
        # taken literally, as glob1 did.
        pending = glob.glob(os.path.join(glob.escape(STRPATH), EXTPATH))
        if not pending:
            sys.exit('---------- file not found ----------')
        self.RunTask()

    def getSchema(self):
        """Return the Spark schema: one nullable string field per column."""
        return StructType(
            [StructField(name, StringType(), True) for name in self._COLUMNS]
        )

    def getDT(self):
        """Return ``[timestamp, dt]`` for the current local time.

        ``timestamp`` is ``YYYY-MM-DD HH:MM:SS``; ``dt`` is ``YYYYMMDDHH00``
        (the current hour with the minutes fixed to ``00``) and is used as
        the partition value.
        """
        now = datetime.now()
        return [now.strftime("%Y-%m-%d %H:%M:%S"), now.strftime("%Y%m%d%H00")]

    def writelog(self, msg: str, level: str):
        """Write *msg* through the root logger.

        ``level == 'info'`` logs at INFO; any other value logs at ERROR.
        """
        # basicConfig does nothing once the root logger has handlers, so the
        # log file name is fixed by the first call in the process.
        logging.basicConfig(filename=f"cgn_log_{self.getDT()[0]}.log", level=logging.INFO)
        if level == 'info':
            logging.info(msg)
        else:
            logging.error(msg)

    def check_sep_ip(self, row):
        """Split every ``address:port`` value of *row* at its last colon.

        Splitting at the last colon keeps addresses that themselves contain
        colons intact.  A value without any colon is returned as the address
        with an empty port (previously it ended up in the port list with an
        empty address).

        Args:
            row: pandas Series of values; each one is converted with ``str``.

        Returns:
            Tuple ``(ip_list, port_list)`` of two equally long lists.
        """
        ip_list = []
        port_list = []
        for value in row.astype(str):
            host, sep, port = value.rpartition(':')
            if sep:
                ip_list.append(host)
                port_list.append(port)
            else:
                ip_list.append(value)
                port_list.append('')
        return ip_list, port_list

    def RunTask(self):
        """Process every ``.tmp`` file in ``STRPATH``, then stop Spark.

        Any exception is logged at error level and re-raised; the Spark
        session is stopped in all cases.
        """
        processed = 0
        self.writelog(f"{self.getDT()[0]} Start Task", "info")
        print('Start Task at:', self.getDT()[0])
        try:
            for file in os.listdir(f"{STRPATH}/"):
                if os.path.splitext(file)[-1].lower() != '.tmp':
                    continue
                processed += 1
                self._process_file(file)
                self.writelog(f"{self.getDT()[0]} successfully transfer log data file: {processed}", "info")
        except Exception as exc:
            self.writelog(f"{self.getDT()[0]} task failed: {exc}", "error")
            raise
        finally:
            # Release the session even when a file fails.
            spark.stop()

    def _process_file(self, file):
        """Parse one file, append its D/H rows to Hive and archive the file."""
        self.writelog(f"{self.getDT()[0]} Read Temp File:{file}", "info")
        # NOTE(review): sep="\r" appears to be used so that each line is read
        # as a single column before it is split on spaces -- confirm.
        raw = pd.read_csv(f"{STRPATH}/{file}", sep="\r", header=None)
        data = raw[0].str.split(' ', expand=True)
        # na=False: rows with a missing token 12 belong to neither group.
        rows_h = data.loc[data[12].str.contains("H", na=False)]
        rows_d = data.loc[data[12].str.contains("D", na=False)]

        self.writelog(f"{self.getDT()[0]}: clean field and remove special charactor", "info")
        rows_h = self._clean(rows_h)
        rows_d = self._clean(rows_d)

        # One partition value per file so D and H rows always share it.
        dt = self.getDT()[1]
        self.writelog(f"{self.getDT()[0]} overwrite to new dataframe (D)", "info")
        frame_d = self._build_d_frame(rows_d, dt)
        self.writelog(f"{self.getDT()[0]} overwrite to new dataframe (H)", "info")
        frame_h = self._build_h_frame(rows_h, dt)

        self.writelog(f"{self.getDT()[0]} send dataframe to pyspark and assign schema", "info")
        result_d = spark.createDataFrame(frame_d, schema=self.getSchema())
        result_h = spark.createDataFrame(frame_h, schema=self.getSchema())

        self._save_table(result_d, 'cgn_d', 'D')
        self._save_table(result_h, 'cgn_h', 'H')
        self.moveTemp(file)

    def _clean(self, frame):
        """Return *frame* re-indexed from 0, as strings, without ``[``, ``]`` and ``"``."""
        return (
            frame.reset_index(drop=True)
            .astype(str)
            .replace(r'[\[\]"]', '', regex=True)
        )

    def _common_columns(self, src, dt):
        """Return the columns that D and H rows take from the same tokens."""
        srcip, srcport = self.check_sep_ip(src[14])
        srcnatip, srcnatport = self.check_sep_ip(src[15])
        dstip, dstport = self.check_sep_ip(src[16])
        dstnatip, dstnatport = self.check_sep_ip(src[17])
        return {
            'parameter': src[12],
            'hostname': src[10],
            'protocal': src[13],
            'srcip': srcip,
            'srcport': srcport,
            'srcnatip': srcnatip,
            'srcnatport': srcnatport,
            'dstip': dstip,
            'dstport': dstport,
            'dstnatip': dstnatip,
            'dstnatport': dstnatport,
            'user': src[18],
            'domain': "",
            'uri': "",
            'rm': "",
            'dt': dt,
        }

    def _finish_frame(self, columns, src):
        """Build the output frame with the columns in schema order."""
        return pd.DataFrame(columns, index=src.index)[list(self._COLUMNS)]

    def _build_d_frame(self, src, dt):
        """Map cleaned "D" rows onto the output columns."""
        stamp = src[5] + ' ' + src[6] + ' ' + src[7] + ' ' + src[8]
        columns = self._common_columns(src, dt)
        columns.update({
            'start_date': "",
            'stop_date': stamp,
            'duration': src[19],
            'sent': src[20],
            'received': src[21],
            'method': "",
            'url': "",
            'locations': src[22],
            # NOTE(review): 'mon' receives the same token as 'locations' --
            # confirm this is intended.
            'mon': src[22],
            'start_filter': "",
            'stop_filter': pd.to_datetime(stamp).astype(str),
        })
        return self._finish_frame(columns, src)

    def _build_h_frame(self, src, dt):
        """Map cleaned "H" rows onto the output columns."""
        stamp = src[5] + ' ' + src[6] + ' ' + src[7] + ' ' + src[8]
        columns = self._common_columns(src, dt)
        columns.update({
            'start_date': stamp,
            'stop_date': "",
            'duration': "",
            'sent': src[21],
            'received': src[22],
            'method': src[19],
            'url': src[20],
            'locations': src[23],
            # NOTE(review): 'mon' receives the same token as 'locations' --
            # confirm this is intended.
            'mon': src[23],
            'start_filter': pd.to_datetime(stamp).astype(str),
            'stop_filter': "",
        })
        return self._finish_frame(columns, src)

    def _save_table(self, frame, table, label):
        """Append *frame* to ``ais.<table>``, creating the table if missing."""
        exists = spark.sql('show tables in ais').filter(f"tableName == '{table}'").count() > 0
        if exists:
            self.writelog(f"{self.getDT()[0]} save data to hdfs ({label}) type: append", "info")
            mode = 'append'
        else:
            self.writelog(f"{self.getDT()[0]} save data to hdfs ({label}) type: create new table", "info")
            mode = 'overwrite'
        frame.write.mode(mode).partitionBy('dt').saveAsTable(f'ais.{table}')

    #def toLzo(self, src):
        #os.system(f"lzop -c {src} > {src}.lzo")

    def getEndTask(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return self.getDT()[0]

    def moveTemp(self, src):
        """Move the processed file *src* from ``STRPATH`` into ``STRPATH/tmp``."""
        archive = os.path.join(STRPATH, "tmp")
        # Without an existing directory shutil.move would rename the file to
        # "tmp" itself, and the next processed file would overwrite it.
        os.makedirs(archive, exist_ok=True)
        shutil.move(f"{STRPATH}/{src}", archive)
        self.writelog(f"{self.getDT()[0]} move file {src} after save to hdfs", "info")

if __name__ == '__main__':
    # Constructing MainLog runs the job: its __init__ calls countFile.
    job = MainLog()
    finished_at = job.getEndTask()
    print(finished_at)
    print("""+-----------+\nSuccess All!\n+-----------+""")