In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import plotly.graph_objects as go

In [24]:
# Default location of the IDS log CSV used throughout this notebook.
IDS_LOGS_URL = 'https://raw.githubusercontent.com/data-challengers/DC4/main/DC4-data/IDS-logs.csv'


def read_ids_logs(url=IDS_LOGS_URL):
    """
    Load the IDS log CSV into a dataframe.

    The first CSV column is parsed as datetimes, and every space is removed
    from the column names (e.g. 'packet info' becomes 'packetinfo'). Cell values
    are left untouched.

    :param: url: path or URL of the CSV file; defaults to the notebook's IDS log
    :return: dataframe with a datetime first column and space-free column names
    """
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default behaviour), so it is no longer passed.
    df = pd.read_csv(url, parse_dates=[0])
    # Literal replacement: strip every space from the header names.
    df.columns = df.columns.str.replace(' ', '', regex=False)
    return df

# Load the IDS log once; `df` is the working dataframe used by the cells below.
df = read_ids_logs()
# print(df)

In [25]:
def get_internal_sources(df, internal_prefix='172'):
    """
    Flag each log entry as coming from an internal source or not.

    The text before the first '.' of `sourceIP` (the first octet) is compared
    with `internal_prefix`. The result is stored in a new boolean column
    `internal_source`. The dataframe is modified in place and nothing is returned.

    Also prints the unique first octets found, as a quick sanity check.

    :param: df: dataframe with a string column 'sourceIP'
    :param: internal_prefix: first octet (as a string) that marks an internal address
    """
    # Vectorised split instead of a row-wise apply. An address without any '.'
    # is kept whole rather than losing its last character to a [0:-1] slice.
    first_octet = df['sourceIP'].str.split('.', n=1).str[0]
    print('Unique IP sources: ', first_octet.unique())

    # The comparison already yields booleans (missing addresses compare False).
    df['internal_source'] = first_octet == internal_prefix

# Adds the boolean `internal_source` column to `df` in place (the function returns nothing).
get_internal_sources(df)
# print(df)

Unique IP sources:  ['172' '10']


In [26]:
classifications = df['classification'].unique()
print('Unique classifications: ', classifications)

# Create classification-specific dfs. The labels keep the leading space that
# the raw values carry (see the printed unique classifications).
general_protocol = df.loc[df['classification'] == ' Generic Protocol Command Decode']
privacy_violations = df.loc[df['classification'] == ' Potential Corporate Privacy Violation']
leaks = df.loc[df['classification'] == ' Attempted Information Leak']
poten_bad = df.loc[df['classification'] == ' Potentially Bad Traffic']
misc = df.loc[df['classification'] == ' Misc activity']
# Check if there are any NA classifications
na_class = df[df['classification'].isnull()]
print(na_class)

# Percentage of entries from internal vs. non-internal sources, first for the
# full log and then for each classification subset. One loop replaces six
# copy-pasted blocks; the report order and printed text are unchanged.
internal_share_reports = [
    ('Full df', df),
    ('Leaks df', leaks),
    ('privacy_violations df', privacy_violations),
    ('Potentially bad df', poten_bad),
    ('misc df', misc),
    ('general_protocol df', general_protocol),
]
for report_name, report_df in internal_share_reports:
    percent = report_df['internal_source'].value_counts(normalize=True) * 100
    print('\n' + report_name + ': internal sources percentages: \n', percent)


Unique classifications:  [' Generic Protocol Command Decode'
 ' Potential Corporate Privacy Violation' ' Misc activity'
 ' Attempted Information Leak' ' Potentially Bad Traffic']
Empty DataFrame
Columns: [time, sourceIP, sourcePort, destIP, destPort, classification, priority, label, packetinfo, packetinfocont'd, xref, internal_source]
Index: []

Full df: internal sources percentages: 
 True     66.044681
False    33.955319
Name: internal_source, dtype: float64

Leaks df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64

privacy_violations df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64

Potentially bad df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64

misc df: internal sources percentages: 
 False    99.965414
True      0.034586
Name: internal_source, dtype: float64

general_protocol df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64


In [27]:
# TODO: build a percentage stacked bar chart in Plotly (this will probably take a while).

In [28]:
# Round times down to the hour so entries can be grouped for plotting
df['hour'] = df['time'].dt.floor('h')
# Count how many entries of each classification fall in each hour
grouped_class_df = df.groupby(['classification', 'hour']).size().reset_index(name='count')
print(grouped_class_df)

                 classification                hour  count
0    Attempted Information Leak 2012-04-05 21:00:00     16
1    Attempted Information Leak 2012-04-05 23:00:00     20
2    Attempted Information Leak 2012-04-06 00:00:00    213
3    Attempted Information Leak 2012-04-06 01:00:00     82
4    Attempted Information Leak 2012-04-06 03:00:00     35
..                          ...                 ...    ...
85      Potentially Bad Traffic 2012-04-05 21:00:00     16
86      Potentially Bad Traffic 2012-04-05 23:00:00     16
87      Potentially Bad Traffic 2012-04-06 00:00:00     68
88      Potentially Bad Traffic 2012-04-06 01:00:00     52
89      Potentially Bad Traffic 2012-04-06 03:00:00     20

[90 rows x 3 columns]


In [29]:
def fill_df_nas(df, time_col, group_col, group_arr, freq='h'):
    """
    Expands dataframe to include every x-axis value for every group, leaving
    NAs in the remaining columns where a (time, group) pair has no observation.

    The x-axis values are all time points from the minimum to the maximum of
    `time_col`, spaced by `freq`. Observed times only line up with this grid if
    they sit on the same `freq` steps counted from the minimum time.

    Useful for Plotly graphs in mode='lines+markers', where NAs show up as
    gaps instead of lines joining distant points.

    :param: df: dataframe of interest
    :param: time_col: string name of column that contains time variable (or generally, the x variable)
    :param: group_col: string name of column that contains the groups to plot over different traces
    :param: group_arr: list or numpy array of all unique observations in df['group_col']
    :param: freq: pandas frequency string for the spacing of the time grid (default hourly)
    :return: dataframe with one row per (time point, group) pair, ordered by
        time and then by the order of `group_arr`; an empty input is returned
        as an empty copy
    """
    # An empty frame has no min/max time to build a range from.
    if df.empty:
        return df.copy()

    # Full time grid between the first and last observed time points.
    # Lowercase 'h' is used because the uppercase 'H' alias is deprecated.
    time_range = pd.date_range(df[time_col].min(), df[time_col].max(), freq=freq)

    # Every time point is repeated once per group, and the groups are tiled to
    # match, so the two columns together enumerate all (time, group) pairs.
    grid = pd.DataFrame({
        time_col: time_range.repeat(len(group_arr)),
        group_col: np.tile(group_arr, len(time_range)),
    })

    # The left merge keeps the grid's row order, so the input needs no sorting.
    return pd.merge(grid, df, on=[time_col, group_col], how='left')


In [30]:
# PLOTLY GRAPH OF CLASSIFICATIONS OVER TIME 
def classifications_over_time(df, classes=None):
    """
    Show a Plotly line chart of hourly counts, one trace per classification.

    Each trace is drawn in 'lines+markers' mode with connectgaps=False, so
    missing counts appear as breaks in the line rather than being bridged.

    :param: df: dataframe with 'classification', 'hour' and 'count' columns
    :param: classes: optional iterable of classification labels to plot, in
        legend order; defaults to the notebook-level `classifications` array
    """
    if classes is None:
        # Fall back to the array computed earlier in the notebook.
        classes = classifications

    fig = go.Figure()
    # Plot every classification as its own line
    for label in classes:
        # Filter once per classification and reuse the subset for both axes
        subset = df.loc[df['classification'] == label]
        fig.add_trace(go.Scatter(
            x=subset['hour'],
            y=subset['count'],
            name=label,
            connectgaps=False,
            mode='lines+markers')
        )
    # Title and axis labels so the figure can be read on its own
    fig.update_layout(
        title='IDS log classifications over time',
        xaxis_title='Hour',
        yaxis_title='Number of log entries',
    )
    fig.show()

    
# Create df for figure: expand the hourly counts so every classification has a
# row for every hour in range (hours without observations get NA counts), then plot.
fig_df = fill_df_nas(grouped_class_df, 'hour', 'classification', classifications)
classifications_over_time(fig_df)