In [None]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark
import pandas as pd
import os
import requests
from datetime import datetime, timedelta
import dask.dataframe as dd
from time import time
import numpy as np

In [None]:
# Process-level environment for the Hadoop/Spark client libraries.
# These are set before the session below is created.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'JAVA_HOME': '/usr/local/jdk8u222-b10',
    'HADOOP_USER_NAME': 'hive',
    'PYSPARK_PYTHON': '/HDFS01/anaconda3/envs/main/bin/python',
})

# Spark / Hive metastore settings handed to the session builder.
conf = (
    pyspark.SparkConf()
    .set('spark.driver.maxResultSize', '0')
    .set('spark.driver.memory', '2g')
    .set('spark.sql.repl.eagerEval.enabled', 'true')
    .set('hive.strict.managed.tables', 'false')
    .set('hive.metastore.uris', 'thrift://nn01.bigdata:9083')
    .set('metastore.client.capability.check', 'false')
)

# Local-mode session with Hive support; `getOrCreate` reuses an already
# running session in this kernel instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("testreplace")
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
def get_datetime():
    """Return the current local time formatted as ``YYYYmmddHH00``.

    The minutes part is the literal ``00``, so every call made within the
    same clock hour yields the same string.

    Returns:
        str: 12-character timestamp, e.g. ``"202208100100"``.
    """
    hour_stamp_format = "%Y%m%d%H00"
    return datetime.now().strftime(hour_stamp_format)

In [None]:
def check_sep_ip(row):
    """Split ``address:port`` strings into parallel address and port lists.

    The port is whatever follows the *last* ``:``, so addresses that contain
    colons themselves (e.g. IPv6) keep their full text as the address part.

    Args:
        row: Iterable of values (typically a pandas Series of strings).

    Returns:
        tuple[list[str], list[str]]: ``(ip_list, port_list)``, each with one
        entry per input element, in input order.

        * ``"1.2.3.4:80"`` -> address ``"1.2.3.4"``, port ``"80"``
        * a value without any ``:`` -> address is the value, port is ``""``
        * a non-string value (``None``/``NaN``) -> address ``""``, port ``""``
    """
    ip_list = []
    port_list = []
    for value in row:
        if not isinstance(value, str):
            # Missing entry: emit empty placeholders instead of failing, so the
            # two lists stay aligned with the input.
            ip_list.append('')
            port_list.append('')
            continue

        address, separator, port = value.rpartition(':')
        if not separator:
            # No ':' at all: the whole value is the address and there is no
            # port (rpartition would otherwise put the value in the port slot).
            address, port = value, ''

        ip_list.append(address)
        port_list.append(port)

    return ip_list, port_list

In [None]:
# Load the raw dump without a header row. The separator is '\r' rather than a
# character that occurs inside a record -- presumably so that each line ends up
# whole in column 0 (the next cell tokenises column 0 itself); confirm.
data = pd.read_csv(
    '/HDFS01/airflow/notebooks/Pasit/AIS/2022081001_mpm_10.tmp',
    sep='\r',
    header=None,
)

In [None]:
# Tokenise column 0 on single spaces; `expand=True` spreads the tokens over
# integer-labelled columns, which the following cells address by number.
# NOTE: this rebinds `data`, so re-running this cell on its own would re-split
# the first token column instead of the raw lines -- re-run the load cell first.
data = data[0].str.split(pat=' ', expand=True)

In [None]:
# Split the records into two frames by the token in column 12 (the value later
# stored as `parameter`): rows containing "H" and rows containing "D".
# `.eq(True)` turns missing results (rows where column 12 is absent) into
# False so those rows are dropped from both frames.
df_H = data.loc[data[12].str.contains("H").eq(True)].reset_index(drop=True)
df_D = data.loc[data[12].str.contains("D").eq(True)].reset_index(drop=True)

In [None]:
def _strip_brackets_and_quotes(frame):
    """Return ``frame`` cast to ``str`` with all ``[``, ``]`` and ``"`` removed.

    Args:
        frame: pandas DataFrame of tokenised records.

    Returns:
        pandas.DataFrame: a new frame; every cell is a string (missing values
        become their string form through ``astype(str)``) with the three
        characters deleted wherever they occur.
    """
    # Raw string: "\[" in a normal literal is an invalid escape sequence.
    # One character class deletes the three characters in a single pass, which
    # gives the same result as three consecutive single-character removals.
    return frame.astype(str).replace(r'[\[\]"]', '', regex=True)


df_H = _strip_brackets_and_quotes(df_H)
df_D = _strip_brackets_and_quotes(df_D)

In [None]:
# Build the output frame for the "H" records.
# Columns are created in the same order as the fields of `schema`; keep it
# that way when adding or moving columns.

# Tokens 5-8 together form the record timestamp; it is stored both as the raw
# text (`start_date`) and as a parsed, normalised string (`start_filter`).
h_timestamp = df_H[5] + ' ' + df_H[6] + ' ' + df_H[7] + ' ' + df_H[8]

df_H1 = pd.DataFrame()
df_H1['parameter'] = df_H[12]
df_H1['start_date'] = h_timestamp
df_H1['stop_date'] = ""      # not populated for "H" records
df_H1['duration'] = ""       # not populated for "H" records
df_H1['hostname'] = df_H[10]
df_H1['protocal'] = df_H[13]  # column name spelled as in the target schema
df_H1['srcip'], df_H1['srcport'] = check_sep_ip(df_H[14])
df_H1['srcnatip'], df_H1['srcnatport'] = check_sep_ip(df_H[15])
df_H1['dstip'], df_H1['dstport'] = check_sep_ip(df_H[16])
df_H1['dstnatip'], df_H1['dstnatport'] = check_sep_ip(df_H[17])
# Taken from the "H" frame itself; reading it from df_D would align users of
# unrelated "D" records by row position.
df_H1['user'] = df_H[18]
df_H1['sent'] = df_H[21]
df_H1['received'] = df_H[22]
df_H1['domain'] = ""
df_H1['method'] = df_H[19]
df_H1['url'] = df_H[20]
df_H1['uri'] = ""
df_H1['rm'] = ""
df_H1['locations'] = df_H[23]
# Partition value. There is no `self` in a notebook cell, so the notebook's
# own get_datetime() is used.
# NOTE(review): assumes the YYYYmmddHH00 stamp is the intended `dt` -- confirm.
df_H1['dt'] = get_datetime()
# NOTE(review): `mon` is filled from the same token as `locations` -- confirm.
df_H1['mon'] = df_H[23]
df_H1['start_filter'] = pd.to_datetime(h_timestamp).astype(str)
df_H1['stop_filter'] = ""

In [None]:
# Build the output frame for the "D" records.
# Columns are created in the same order as the fields of `schema`; keep it
# that way when adding or moving columns.

# Tokens 5-8 together form the record timestamp; it is stored both as the raw
# text (`stop_date`) and as a parsed, normalised string (`stop_filter`).
d_timestamp = df_D[5] + ' ' + df_D[6] + ' ' + df_D[7] + ' ' + df_D[8]

df_D1 = pd.DataFrame()
df_D1['parameter'] = df_D[12]
df_D1['start_date'] = ""     # not populated for "D" records
df_D1['stop_date'] = d_timestamp
df_D1['duration'] = df_D[19]
df_D1['hostname'] = df_D[10]
df_D1['protocal'] = df_D[13]  # column name spelled as in the target schema
df_D1['srcip'], df_D1['srcport'] = check_sep_ip(df_D[14])
df_D1['srcnatip'], df_D1['srcnatport'] = check_sep_ip(df_D[15])
df_D1['dstip'], df_D1['dstport'] = check_sep_ip(df_D[16])
df_D1['dstnatip'], df_D1['dstnatport'] = check_sep_ip(df_D[17])
df_D1['user'] = df_D[18]
df_D1['sent'] = df_D[20]
df_D1['received'] = df_D[21]
df_D1['domain'] = ""
df_D1['method'] = ""
df_D1['url'] = ""
df_D1['uri'] = ""
df_D1['rm'] = ""
df_D1['locations'] = df_D[22]
# Partition value. There is no `self` in a notebook cell, so the notebook's
# own get_datetime() is used.
# NOTE(review): assumes the YYYYmmddHH00 stamp is the intended `dt` -- confirm.
df_D1['dt'] = get_datetime()
# NOTE(review): `mon` is filled from the same token as `locations` -- confirm.
df_D1['mon'] = df_D[22]
df_D1['start_filter'] = ""
df_D1['stop_filter'] = pd.to_datetime(d_timestamp).astype(str)

In [None]:
# Spark schema shared by both output tables: every field is a nullable string.
# The names are listed in the order the pandas frames create their columns.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "parameter",
        "start_date",
        "stop_date",
        "duration",
        "hostname",
        "protocal",
        "srcip",
        "srcport",
        "srcnatip",
        "srcnatport",
        "dstip",
        "dstport",
        "dstnatip",
        "dstnatport",
        "user",
        "sent",
        "received",
        "domain",
        "method",
        "url",
        "uri",
        "rm",
        "locations",
        "dt",
        "mon",
        "start_filter",
        "stop_filter",
    )
])

In [None]:
# Convert the pandas "H" frame into a Spark DataFrame using the shared schema.
dh = spark.createDataFrame(data=df_H1, schema=schema)

In [None]:
# Convert the pandas "D" frame into a Spark DataFrame using the shared schema.
# NOTE: binding the name `dd` here hides the `dask.dataframe` alias imported
# at the top of the notebook for every cell that runs after this one.
dd = spark.createDataFrame(data=df_D1, schema=schema)

In [None]:
# Persist the "H" records as table `ais.cgn_h`, partitioned by `dt`.
# NOTE(review): with mode 'overwrite', saveAsTable appears to replace the
# whole existing table, not only the `dt` partition being written -- confirm
# that dropping earlier partitions is intended.
(
    dh.write
    .mode('overwrite')
    .partitionBy('dt')
    .saveAsTable('ais.cgn_h')
)

In [None]:
# Persist the "D" records as table `ais.cgn_d`, partitioned by `dt`.
# NOTE(review): with mode 'overwrite', saveAsTable appears to replace the
# whole existing table, not only the `dt` partition being written -- confirm
# that dropping earlier partitions is intended.
(
    dd.write
    .mode('overwrite')
    .partitionBy('dt')
    .saveAsTable('ais.cgn_d')
)