# Analysis on the most common eventName: `RunInstances`

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.datasets import make_blobs
from sklearn import preprocessing

import polars as pl


The following required CPU features were not detected:
    ssse3, sse4.1, sse4.2, popcnt
Continuing to use this version of Polars on this processor will likely result in a crash.
Install the `polars-lts-cpu` package instead of `polars` to run Polars with better compatibility.

Hint: If you are on an Apple ARM machine (e.g. M1) this is likely due to running Python under Rosetta.
It is recommended to install a native version of Python that does not run under Rosetta x86-64 emulation.




In [10]:
cloudtrails_log_schema = pl.Struct({
    "eventVersion": pl.Utf8,
    "userIdentity": pl.Struct({
        "type": pl.Utf8,
        "principalId": pl.Utf8,
        "arn": pl.Utf8,
        "accountId": pl.Utf8,
        "invokedby": pl.Utf8,
        "accesskeyId": pl.Utf8,
        "userName": pl.Utf8,
    }),
    "sessionContext": pl.Struct({
        "attributes": pl.Struct({
            "mfaAuthenticated": pl.Utf8,
            "creationDate": pl.Utf8,
        }),
        "sessionIssuer": pl.Struct({
            "type": pl.Utf8,
            "principalId": pl.Utf8,
            "arn": pl.Utf8,
            "accountId": pl.Utf8,
            "userName": pl.Utf8,
        }),
    }),
    "eventTime": pl.Utf8,
    "eventSource": pl.Utf8,
    "eventName": pl.Utf8,
    "awsRegion": pl.Utf8,
    "sourceIPAddress": pl.Utf8,
    "userAgent": pl.Utf8,
    "errorCode": pl.Utf8,
    "errorMessage": pl.Utf8,
    "requestParameters": pl.Utf8,
    "responseElements": pl.Utf8,
    "additionalEventData": pl.Utf8,
    "requestId": pl.Utf8,
    "eventId": pl.Utf8,
    "resources": pl.List(pl.Struct({
        "ARN": pl.Utf8,
        "accountId": pl.Utf8,
        "type": pl.Utf8,
    })),
    "eventType": pl.Utf8,
    "apiVersion": pl.Utf8,
    "readonly": pl.Utf8,
    "recipientAccountId": pl.Utf8,
    "serviceEventDetails": pl.Utf8,
    "sharedEventId": pl.Utf8,
    "vpcEndpointId": pl.Utf8,
})

In [2]:
df = pl.read_ndjson("../data/raw/flaws_cloudtrail02.ndjson")

In [6]:
df.head(2)

userAgent,eventID,userIdentity,eventType,sourceIPAddress,eventName,eventSource,recipientAccountId,requestParameters,awsRegion,requestID,responseElements,eventVersion,eventTime,errorMessage,errorCode,apiVersion
str,str,struct[6],str,str,str,str,str,struct[44],str,str,struct[9],str,str,str,str,str
"""[S3Console/0.4…","""3038ebd2-c98a-…","{""Root"",""811596193553"",""arn:aws:iam::811596193553:root"",""811596193553"",null,{{""false"",""2017-02-12T19:57:05Z""}}}","""AwsApiCall""","""255.253.125.11…","""ListBuckets""","""s3.amazonaws.c…","""811596193553""","{{null,null},null,null,null,{null},{null},null,{null},{{{null,null},null,null}},{null},null,null,null,{null},{null},null,{null},null,null,null,{null},{null},{null},null,{null},null,{null},null,{null},{null},{null},{null},{null},{null},{null},null,{null,null},null,{null},null,null,{null},null,null}","""us-east-1""","""83A6C73FE87F51…","{null,null,null,null,null,null,null,null,{null,null,null,null,null,null}}","""1.04""","""2017-02-12T19:…",,,
"""console.amazon…","""22a0d9b1-deea-…","{""Root"",""811596193553"",""arn:aws:iam::811596193553:root"",""811596193553"",""ASIA79EXPHZ8SRL55OOE"",{{""false"",""2017-02-12T19:57:05Z""}}}","""AwsApiCall""","""255.253.125.11…","""GetAccountPass…","""iam.amazonaws.…","""811596193553""","{{null,null},null,null,null,{null},{null},null,{null},{{{null,null},null,null}},{null},null,null,null,{null},{null},null,{null},null,null,null,{null},{null},{null},null,{null},null,{null},null,{null},{null},{null},{null},{null},{null},{null},null,{null,null},null,{null},null,null,{null},null,null}","""us-east-1""","""b833be53-f15d-…","{null,null,null,null,null,null,null,null,{null,null,null,null,null,null}}","""1.02""","""2017-02-12T19:…","""The Password P…","""NoSuchEntityEx…",


In [5]:
df.filter(pl.col("responseElements").is_null()).count()

userAgent,eventID,userIdentity,eventType,sourceIPAddress,eventName,eventSource,recipientAccountId,requestParameters,awsRegion,requestID,responseElements,eventVersion,eventTime,errorMessage,errorCode,apiVersion
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
199668,199668,199668,199668,199668,199668,199668,199668,199668,199668,199608,199668,199668,199668,37161,38232,4438


In [13]:
df.select(pl.col("userAgent").value_counts()).unnest("userAgent")

userAgent,count
str,u32
"""[aws-cli/1.16.…",2
"""aws-cli/1.11.7…",3
"""[aws-cli/1.11.…",2
"""aws-cli/1.14.6…",12
"""aws-cli/1.14.5…",23
…,…
"""[aws-cli/1.16.…",14
"""[aws-cli/1.11.…",1
"""aws-cli/1.16.8…",57
"""[aws-cli/1.16.…",2
