In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
import plotly.graph_objects as go

In [51]:
IDS_LOGS_URL = 'https://raw.githubusercontent.com/data-challengers/DC4/b0c7eb4c6e4aef805bb78a2f714534c8c8915a61/DC4-data/IDS-logs.csv'


def read_ids_logs(url=IDS_LOGS_URL):
    """
    Load the IDS log CSV into a dataframe and tidy its columns.
    :param: url: path, URL or file-like object handed to pd.read_csv;
        defaults to the pinned IDS-logs.csv on GitHub
    :return: dataframe with the first column parsed as datetimes, the
        columns at positions 11-20 removed, and spaces stripped from the
        remaining column names
    """
    # low_memory=False makes pandas infer dtypes from the whole file at once,
    # which avoids the "mixed types" DtypeWarning raised by the junk columns.
    raw = pd.read_csv(url, parse_dates=[0], low_memory=False)
    # drop random terrible empty columns, curse you bad csvs
    # (a slice rather than explicit positions, so a file with fewer than 21
    # columns does not raise IndexError)
    cleaned = raw.drop(columns=raw.columns[11:21])
    # header names carry stray spaces (e.g. ' sourceIP'); remove them
    cleaned.columns = cleaned.columns.str.replace(' ', '')
    return cleaned

# Load the IDS log once; the cells below add columns to this `df` in place.
df = read_ids_logs()


Columns (11,13,15,17,18,19,20) have mixed types.Specify dtype option on import or set low_memory=False.



In [52]:
def get_internal_sources(df):
    """
    Flags, in place, which rows originate from an internal address.
    Adds a boolean column 'internal_source' that is True when the text before
    the first '.' in 'sourceIP' equals '172'. Also prints the distinct
    leading octets found. Returns None.
    :param: df: dataframe with a string column named 'sourceIP'
    """
    # Text before the first '.', computed column-wise instead of row by row.
    # An address without a dot is kept whole (not truncated), and a missing
    # address stays missing rather than raising.
    first_octet = df['sourceIP'].str.split('.', n=1).str[0]
    print(first_octet.unique())

    # Create boolean internal? var (missing addresses compare as False)
    df['internal_source'] = first_octet == '172'

# Adds the boolean 'internal_source' column to `df` in place and prints the
# distinct leading octets of 'sourceIP'.
get_internal_sources(df)

['172' '10']


In [53]:
# Every distinct classification label present in the log.
classifications = df['classification'].unique()
print('Unique classifications: ', classifications)

# One dataframe per classification.
# NOTE: the labels are matched with their leading space, as they appear in the data.
class_col = df['classification']
general_protocol = df[class_col == ' Generic Protocol Command Decode']
privacy_violations = df[class_col == ' Potential Corporate Privacy Violation']
leaks = df[class_col == ' Attempted Information Leak']
poten_bad = df[class_col == ' Potentially Bad Traffic']
misc = df[class_col == ' Misc activity']
# Sanity check: list any rows whose classification is missing
na_class = df[class_col.isna()]
print(na_class)

Unique classifications:  [' Generic Protocol Command Decode'
 ' Potential Corporate Privacy Violation' ' Misc activity'
 ' Attempted Information Leak' ' Potentially Bad Traffic']
Empty DataFrame
Columns: [time, sourceIP, sourcePort, destIP, destPort, classification, priority, label, packetinfo, packetinfocont'd, xref, internal_source]
Index: []


In [54]:
# get percentage of entries that are from internal sources
percent = df['internal_source'].value_counts(normalize=True) * 100
print('Full df: internal sources percentages: \n',percent)

# percent of leaks
percent = leaks['internal_source'].value_counts(normalize=True) * 100
print('\nLeaks df: internal sources percentages: \n',percent)

# percent of privacy_violations
percent = privacy_violations['internal_source'].value_counts(normalize=True) * 100
print('\nprivacy_violations df: internal sources percentages: \n',percent)

# percent of poten_bad
percent = poten_bad['internal_source'].value_counts(normalize=True) * 100
print('\nPotentially bad df: internal sources percentages: \n',percent)

# percent of misc
percent = misc['internal_source'].value_counts(normalize=True) * 100
print('\nmisc df: internal sources percentages: \n',percent)

# percent of poten_bad
percent = general_protocol['internal_source'].value_counts(normalize=True) * 100
print('\ngeneral_protocol df: internal sources percentages: \n',percent)


Full df: internal sources percentages: 
 True     70.361922
False    29.638078
Name: internal_source, dtype: float64

Leaks df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64

privacy_violations df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64

Potentially bad df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64

misc df: internal sources percentages: 
 False    99.943715
True      0.056285
Name: internal_source, dtype: float64

general_protocol df: internal sources percentages: 
 True    100.0
Name: internal_source, dtype: float64


In [55]:
# print(misc) 



# Port 6667 = IRC.
# IRC is a chat protocol; it is also used to control botnets.
# Potentially: external servers are issuing commands to the workstations.
# Or: the workstations are the ones trying to send commands.
# Potentially: something is searching for open ports on the workstations to find vulnerabilities.

In [56]:
# Round times to the hour (to be grouped for plotting)
df['hour'] = df['time'].dt.floor('h')
# Create count aggregate of classification appearances over time
grouped_class_df = df.groupby(['classification', 'hour']).size().reset_index(name='count')
grouped_class_df.dtypes
print(grouped_class_df)

                 classification                hour  count
0    Attempted Information Leak 2012-04-05 21:00:00     16
1    Attempted Information Leak 2012-04-05 23:00:00     20
2    Attempted Information Leak 2012-04-06 00:00:00    213
3    Attempted Information Leak 2012-04-06 01:00:00     82
4    Attempted Information Leak 2012-04-06 03:00:00     35
..                          ...                 ...    ...
67      Potentially Bad Traffic 2012-04-05 21:00:00     16
68      Potentially Bad Traffic 2012-04-05 23:00:00     16
69      Potentially Bad Traffic 2012-04-06 00:00:00     68
70      Potentially Bad Traffic 2012-04-06 01:00:00     52
71      Potentially Bad Traffic 2012-04-06 03:00:00     20

[72 rows x 3 columns]


In [58]:
# SOPHIE'S FUNCTION:
def fill_df_nas(df, time_col, group_col, group_arr, sort=True):
    """
    Expands dataframe to include all x-axis values for every group, and
    fills dataframes with NAs when there are no observations for the specified group.
    Useful for Plotly graphs in mode='lines+markers'
    :param: df: dataframe of interest
    :param: time_col: string name of column that contains time variable (or generally, the x variable)
    :param: group_col: string name of column that contains the groups to plot over different traces
    :param: group_arr: list or numpy array of all unique observations in df['group_col']
    :param: sort: when True (default) the x values are emitted in ascending order, so each
        group's rows are already in plotting order; when False they keep the order in which
        they first appear in df
    :return: dataframe with one row per (x value, group) pair, x values outermost, carrying
        the remaining columns of df where a matching row exists and NaN elsewhere
    """
    # Distinct x values. Missing values are dropped so that the two columns
    # built below always have the same length.
    time_values = df[time_col].dropna()
    if sort:
        # A line trace connects points in row order, so the x values must be
        # monotonic or the line doubles back on itself.
        time_values = time_values.sort_values()
    time_values = time_values.unique()

    # Cartesian product: each x value repeated once per group, with the full
    # list of groups tiled underneath it.
    grid = pd.DataFrame({
        time_col: time_values.repeat(len(group_arr)),
        group_col: np.tile(group_arr, len(time_values)),
    })
    # Left merge keeps every grid row; combinations absent from df get NaN.
    return pd.merge(grid, df, on=[time_col, group_col], how='left')


# Give every classification a row for every hour (NaN count where it has none)
fig_df = fill_df_nas(
    df=grouped_class_df,
    time_col='hour',
    group_col='classification',
    group_arr=classifications,
)
print(fig_df)


                   hour                          classification  count
0   2012-04-05 21:00:00         Generic Protocol Command Decode  896.0
1   2012-04-05 21:00:00   Potential Corporate Privacy Violation    NaN
2   2012-04-05 21:00:00                           Misc activity  224.0
3   2012-04-05 21:00:00              Attempted Information Leak   16.0
4   2012-04-05 21:00:00                 Potentially Bad Traffic   16.0
..                  ...                                     ...    ...
150 2012-04-07 08:00:00         Generic Protocol Command Decode  808.0
151 2012-04-07 08:00:00   Potential Corporate Privacy Violation    NaN
152 2012-04-07 08:00:00                           Misc activity  322.0
153 2012-04-07 08:00:00              Attempted Information Leak    NaN
154 2012-04-07 08:00:00                 Potentially Bad Traffic    NaN

[155 rows x 3 columns]


In [60]:
# PLOTLY GRAPH OF CLASSIFICATIONS OVER TIME 

fig = go.Figure()
# plot all classifications as different lines
for classification in classifications:
    # Rows for this classification only, in chronological order: a line trace
    # connects points in row order, so unsorted hours would draw a line that
    # doubles back on itself.
    trace_df = fig_df.loc[fig_df['classification'] == classification].sort_values('hour')
    fig.add_trace(go.Scatter(
        x=trace_df['hour'],
        y=trace_df['count'],
        name=classification,
        # hours with a NaN count stay as visible breaks in the line
        connectgaps=False,
        mode='lines+markers')
    )
# A figure should be readable on its own: title and axis labels.
fig.update_layout(
    title='IDS alert classifications per hour',
    xaxis_title='Hour',
    yaxis_title='Number of log entries',
)
fig.show()