## SEC-351 Getting Started with ML & Data Science for Security

In [None]:
# Load necessary modules
import pandas as pd
import numpy as np
import json
from matplotlib import pyplot
import seaborn as sns

from statsmodels.graphics.tsaplots import plot_acf

### ** CHANGE THIS S3 BUCKET PATH **
Note the expected file format:  
3 columns - eventtime, arn, eventname  (any order, but those column names in header)

In [None]:
# Download the CloudTrail CSV extract from S3 and save it locally as logs.csv
! aws s3 cp s3://reinvent2019-builder-working/CT-4779150f72fb-2acct4regions.csv logs.csv

In [None]:
# Load the downloaded extract (logs.csv) into a DataFrame
data = pd.read_csv('logs.csv')

Some Date/Time functionality to allow easier sorting and filtering

In [None]:
# Convert the eventtime column to pandas datetime values
data['eventtime']= pd.to_datetime(data['eventtime']) 

In [None]:
# Order the rows by event time
data = data.sort_values(by='eventtime')

In [None]:
# Derive string bucket columns from the event timestamp:
#   YearMonth    -> '<year>-<month>'        (numbers are not zero-padded)
#   YearMonthDay -> '<year>-<month>-<day>'
event_times = pd.to_datetime(data['eventtime'])
data['YearMonth'] = event_times.map(lambda ts: f'{ts.year}-{ts.month}')
data['YearMonthDay'] = event_times.map(lambda ts: f'{ts.year}-{ts.month}-{ts.day}')

In [None]:
# Preview the first rows, including the new YearMonth / YearMonthDay columns
data.head()

#### Let's start to find interesting pairs of users and API calls
#### Group data by day, arn, and event - counting occurrences

In [None]:
# Count events per (day, arn, API) combination; the result is a Series
# indexed by those three keys.
group_keys = ['YearMonthDay', 'arn', 'eventname']
grouped_data = data.groupby(group_keys)['eventname'].count()

In [None]:
# Wrap the grouped counts in a single-column DataFrame for display
grouped_data_df = grouped_data.to_frame()

In [None]:
# Preview the grouped counts
grouped_data_df.head()

In [None]:
# Write and read this DF. A quick workaround for fully populating all the aggregation columns
# (each CSV row carries its own day, arn and API values).
# header=False: the cell that reads this file back supplies its own column
# names via names=..., so a header row would be ingested as a data row
# (current pandas writes a header for a Series by default).
grouped_data.to_csv('grouped_data_arn_api.csv', header=False)

In [None]:
# Read the aggregated counts back with explicit column names; the 'date'
# column becomes the index and is parsed as dates.
colnames=['date', 'arn', 'api', 'count']
userApiCounts = pd.read_csv('grouped_data_arn_api.csv', names=colnames, index_col='date', parse_dates=True)

In [None]:
# Preview the per-day arn/api counts
userApiCounts.head()

### Count the number of API calls made by each user (arn) and select the top roles for analysis

In [None]:
# Total the counts per arn, then rank the arns from most to least active
calls_per_arn = userApiCounts.groupby('arn')['count'].sum()
count_roles = calls_per_arn.sort_values(ascending=False)

In [None]:
# Turn the ranked Series into a two-column DataFrame (arn, count) and show
# the ten highest totals
count_roles_df = count_roles.reset_index()
count_roles_df.head(10)

In [None]:
# Keep the first ten rows of the ranking and extract their arn values
# as an array
top_arns_df = count_roles_df.iloc[:10]
top_arns = top_arns_df.arn.values

In [None]:
# Display the selected arns
top_arns

#### A few other data exploration steps
These are not critical to our specific output, but they demonstrate some sorting and exploration

In [None]:
# Sort the rows by their index (the date)
userApiCounts = userApiCounts.sort_index()

In [None]:
# Preview the earliest rows after sorting
userApiCounts.head()

In [None]:
# Take a look at a specific example:
# build a boolean mask for one assumed-role arn and keep only its rows
resource_abacus_arn = 'arn:aws:sts::002726030336:assumed-role/AwsSecurityAudit/ResourceAbacus'
ResAbac_df = userApiCounts['arn'].eq(resource_abacus_arn)
df_test = userApiCounts.loc[ResAbac_df]

In [None]:
# look at specific API
# df_test is indexed by date, so pick the row by position explicitly with
# .iloc; a bare [10] on a non-integer index is treated as a label lookup by
# current pandas (the positional fallback was deprecated and then removed).
df_test['api'].iloc[10]

In [None]:
# Narrow the example role's rows down to a single API call and preview them
ResAbac_df = df_test['api'].eq('DescribeDBInstances')
df_test2 = df_test.loc[ResAbac_df]
df_test2.head(10)

### Graph top arn/api pairs

In [None]:
# Plot one time series per (arn, API) pair for every arn in top_arns.
# Many figures are opened on purpose, so silence matplotlib's
# "too many open figures" warning.
pyplot.rcParams.update({'figure.max_open_warning': 0})

# Columns to plot for each pair; one subplot row per column.
features_of_interest = ['count']

for arn in top_arns:
    # Rows belonging to the current arn (the source data is userApiCounts;
    # the previous code referenced an undefined name `df` here).
    df_test = userApiCounts[userApiCounts['arn'] == arn]
    for api in df_test['api'].unique():
        # Rows for this arn restricted to a single API call
        df_test2 = df_test[df_test['api'] == api]
        pyplot.figure(figsize=(12, 3 * len(features_of_interest)))
        ax0 = None
        for row, f in enumerate(features_of_interest):
            if row == 0:
                ax0 = pyplot.subplot(len(features_of_interest), 1, row + 1)
            else:
                # Later rows share the x axis of the first subplot
                pyplot.subplot(len(features_of_interest), 1, row + 1, sharex=ax0)
            df_test2[f].plot()
            pyplot.title(arn+'  '+'api='+api, y=0.85, loc='right')
        pyplot.subplots_adjust(hspace=0.05)

### Pick a series with a repeating pattern to model on
We're going to look at user.arn Meta31 and the API call DescribeDBInstances

In [None]:
# Looking here specifically for /AwsSecurityAudit/Meta31
# NOTE(review): index 4 assumes Meta31 is the fifth-ranked arn in this
# dataset -- confirm before running against other data.
top_arns[4]

In [None]:
# Once we have a particular user.arn, let's look at which API calls are most common:
# select that arn's rows and list them from the highest count down
Meta31_df = userApiCounts['arn'].eq(top_arns[4])
df_test = userApiCounts.loc[Meta31_df]
df_test.sort_values('count', ascending=False).head()

In [None]:
#df_assume = df['arn'] == top_arns[4]
#df_test = df[df_assume]

In [None]:
# DescribeDBInstances looks interesting, so let's build a temporary dataframe
#  with just the Meta31 calls to DescribeDBInstances
# (boolean mask first, then the filtered rows)
Meta31_DescDB_df = df_test['api'].eq('DescribeDBInstances')
df_test2 = df_test.loc[Meta31_DescDB_df]

In [None]:
# Let's plot the graph of Meta31-DescribeDBInstances
# The title is built from the pair that df_test2 was actually filtered on,
# rather than from whatever `arn` / `api` values an earlier loop left behind.
meta31_arn = top_arns[4]
meta31_api = 'DescribeDBInstances'

# Columns to plot; one subplot row per column.
features_of_interest = ['count']

pyplot.figure(figsize=(12, 3 * len(features_of_interest)))
ax0 = None
for row, f in enumerate(features_of_interest):
    if row == 0:
        ax0 = pyplot.subplot(len(features_of_interest), 1, row + 1)
    else:
        # Later rows share the x axis of the first subplot
        pyplot.subplot(len(features_of_interest), 1, row + 1, sharex=ax0)
    df_test2[f].plot()
    pyplot.title(meta31_arn+'  '+'api='+meta31_api, y=0.85, loc='right')

In [None]:
# Looking at some of the data: the five earliest rows ('date' is the index name)
df_test2.sort_values(by=['date']).head(5)

In [None]:
# Let's write this data to a local file so we have it available
# (the date index is written as the first column)
df_test2.to_csv('AwsSecurityAudit_Meta31_DescribeDBInstances.csv')

In [None]:
# Reload the saved series. Note this rebinds Meta31_DescDB_df, which earlier
# held a boolean mask, to the DataFrame read from the file.
Meta31_DescDB_df = pd.read_csv('AwsSecurityAudit_Meta31_DescribeDBInstances.csv')

### Let's use autocorrelation to look for repeating patterns
Note: There are other methods for doing this, but we're going to demonstrate one of them

In [None]:
# Autocorrelation of the count series with itself shifted by 2 rows
Meta31_DescDB_df['count'].autocorr(lag=2)

In [None]:
# Plot the autocorrelation function of the count series for lags up to 30
plot_acf(Meta31_DescDB_df['count'], lags=30)

### Use the max correlation coefficient to select series with a repeating pattern
Collect and print those graphs.
This is how we could visually identify those user.arn + API call pairs that suggest
automated roles

In [None]:

# For every (arn, API) pair of the top arns, compute the autocorrelation of
# the count series at several lags and plot only the pairs whose strongest
# absolute autocorrelation reaches the cut-off.

# Highest lag to test: lags 1..7 (7 days in a week).
# NOTE: autocorr shifts by rows; the series only has a row for days on which
# the pair was seen, so a lag equals a number of days only when no day is missing.
MAX_LAG = 7
# Arbitrary cut-off: plot pairs whose max abs(autocorr) is >= this value.
MIN_ABS_AUTOCORR = 0.5

# Columns to plot for each selected pair; one subplot row per column.
features_of_interest = ['count']

for arn in top_arns:
    # Rows belonging to the current arn
    topArnsAPIs_df = userApiCounts[userApiCounts['arn'] == arn]
    for api in topArnsAPIs_df['api'].unique():
        # Rows for this arn restricted to a single API call
        df_test2 = topArnsAPIs_df[topArnsAPIs_df['api'] == api]

        # abs(autocorr) for each lag; autocorr yields NaN when it cannot be
        # computed (e.g. too few rows or a constant series).
        lag_corrs = pd.Series(
            [abs(df_test2['count'].autocorr(lag=lag)) for lag in range(1, MAX_LAG + 1)]
        )
        # Series.max skips NaN entries (and is NaN only if every lag is NaN),
        # unlike the built-in max whose result depends on where NaN appears.
        max_corr = lag_corrs.max()

        # Skip pairs with no computable or insufficient autocorrelation
        if pd.isna(max_corr) or max_corr < MIN_ABS_AUTOCORR:
            continue

        # Plot graphs (same as above) for the pairs that passed the cut-off
        pyplot.figure(figsize=(12, 3 * len(features_of_interest)))
        ax0 = None
        for row, f in enumerate(features_of_interest):
            if row == 0:
                ax0 = pyplot.subplot(len(features_of_interest), 1, row + 1)
            else:
                # Later rows share the x axis of the first subplot
                pyplot.subplot(len(features_of_interest), 1, row + 1, sharex=ax0)
            df_test2[f].plot()
            pyplot.title(arn+'  '+'api='+api, y=0.85, loc='right')
        pyplot.subplots_adjust(hspace=0.05)