## Amazon S3 Server Access Log Format
https://docs.aws.amazon.com/AmazonS3/latest/dev/LogFormat.html

In [None]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.regexp_extract
import org.apache.spark.sql.types._

In [None]:
// Single string column that holds one raw access-log line per row.
val schema_csv = StructType(Array(StructField("events", StringType, nullable = true)))

// Read the file with the text source instead of the CSV source: the CSV
// reader (even with a tab delimiter) would split any line that happens to
// contain a tab and keep only the first piece. The text source returns every
// line whole in one string column, which is then renamed after schema_csv.
val df_logs = spark.read
                    .text("./assets/s3_server_access_log.txt")
                    .toDF(schema_csv.fieldNames: _*)

In [None]:
// Print the first raw log line in full (no column truncation) as a sanity check.
df_logs.select("events").show(numRows = 1, truncate = false)

In [None]:
// Capture patterns shared by several fields of an access-log line.
val token = """(\S+)"""               // one run of non-whitespace characters
val quotedOrDash = """(-|"[^"]+")"""  // a bare dash or a double-quoted string

// Ordered (column name, capture pattern) pairs, one per log field.
// Every pattern contains exactly one capturing group, so a field's 1-based
// position in this list is also its regex group number. Keeping names and
// patterns together means adding or removing a field cannot shift the
// group numbers of the columns that follow it.
val s3LogFields = Seq(
    "bucket_owner"        -> token,
    "bucket"              -> token,
    "time"                -> """\[([^\]]+)\]""",
    "remote_ip"           -> token,
    "requester"           -> token,
    "request_id"          -> token,
    "operation"           -> token,
    "key"                 -> token,
    "request_uri"         -> """(-|"-"|"\S+ \S+ (?:-|\S+)")""",
    "http_status"         -> token,
    "error_code"          -> token,
    "bytes_sent"          -> token,
    "object_size"         -> token,
    "total_time"          -> token,
    "turn_around_time"    -> token,
    "referer"             -> quotedOrDash,
    "user_agent"          -> quotedOrDash,
    "version_id"          -> token,
    "host_id"             -> token,
    "signature_version"   -> token,
    "cipher_suite"        -> token,
    "authentication_type" -> token,
    "host_header"         -> token,
    "tls_version"         -> token
)

// Consecutive fields are separated by exactly one whitespace character.
val reg = s3LogFields.map { case (_, pattern) => pattern }.mkString("""\s""")

println(reg)

// Project every capture group into its own string column, named after the field.
// regexp_extract yields an empty string for a line the pattern does not match.
val df_logs_parsed = df_logs.select(
    s3LogFields.zipWithIndex.map { case ((name, _), idx) =>
        regexp_extract($"events", reg, idx + 1).as(name)
    }: _*
)
df_logs_parsed.show(1, false)
