In [None]:
import pandas as pd
import numpy as np
import polars as pl

In [None]:
!pip install polars

In [None]:
import polars as pl

In [None]:
# Load the training dataset
df = pl.read_csv(source="UNSW_NB15_training-set.csv")

# Column name -> dtype mapping inferred by Polars
df.schema

In [None]:
# Preview the first five rows (five is the default for `head`)
df.head()

In [None]:
# List every column name in the loaded dataset
df.columns

In [None]:
# Dataset dimensions: (number of rows, number of columns)
df.shape

In [None]:
# Two columns describe what each session is:
#   label      -> 1 = attack, 0 = normal
#   attack_cat -> type of attack

# Distinct attack categories present in the data
df.select(pl.col("attack_cat")).unique()

In [None]:
# Sanity check: show the class of `df` (created by `pl.read_csv` above)
# before it is converted to pandas.
type(df)

In [None]:
# Convert the Polars frame to pandas; the pandas-style calls and the Altair
# charts in the cells below all read from `df_pd`.
df_pd = df.to_pandas()

In [None]:
# Same distinct-category check as before, now on the pandas copy
df_pd.loc[:, 'attack_cat'].unique()

In [None]:
# Number of sessions (rows) per attack category, most frequent first
df_pd['attack_cat'].value_counts()

In [None]:
!pip install altair

In [None]:
pip install --upgrade typing_extensions

In [None]:
!pip show typing_extensions

In [None]:
pip install altair-saver

In [None]:
# Visualize with Altair - Attack Category Distribution

import altair as alt

In [None]:
import altair as alt

# Lift Altair's default row cap so the charts below can be built from the
# full `df_pd` frame instead of raising a max-rows error.
alt.data_transformers.disable_max_rows()

In [None]:
# Bar chart: number of sessions per attack category, tallest bar first,
# one colour per category.
alt.Chart(df_pd, title='Attack Category Distribution', width=700).mark_bar().encode(
    alt.X('attack_cat:N', sort='-y'),
    alt.Y('count():Q'),
    alt.Color('attack_cat:N'),
)

In [None]:
# Compare attack vs. normal sessions.
#
# Add a readable 'is_attack' column derived from the binary 'label' column:
# 0 becomes 'Normal' and 1 becomes 'Attack'.
df_pd['is_attack'] = df_pd['label'].map(
    {
        0: 'Normal',
        1: 'Attack',
    }
)

In [None]:
import altair as alt

# Bar chart: how many sessions are normal vs. attack
alt.Chart(df_pd, title='Normal vs. Attack Session Distribution', width=400).mark_bar().encode(
    alt.X('is_attack:N', title='Session Type'),
    alt.Y('count():Q'),
    alt.Color('is_attack:N'),
)

In [None]:
# Compare session duration between attack and normal sessions

alt.Chart(df_pd, title='Session Duration by Attack Label', width=400).mark_boxplot().encode(
    alt.X('is_attack:N', title='Session Type'),
    alt.Y('dur:Q', title='Session Duration'),
    alt.Color('is_attack:N'),
)

In [None]:
# Compare source bytes sent (sbytes) between attack and normal sessions.
# Helpful for spotting heavy hitters such as brute-force or large-payload attacks.

alt.Chart(df_pd, title='Source Bytes by Attack Label', width=400).mark_boxplot().encode(
    alt.X('is_attack:N', title='Session Type'),
    alt.Y('sbytes:Q', title='Source Bytes Sent'),
    alt.Color('is_attack:N'),
)

In [None]:
# Compare a categorical feature: protocol (proto).
# One panel per session type shows which protocols (e.g. TCP, UDP) each uses most.

alt.Chart(df_pd, title='Protocol Usage by Attack vs Normal', width=400).mark_bar().encode(
    alt.X('proto:N', title='Protocol'),
    alt.Y('count():Q'),
    alt.Color('is_attack:N'),
    alt.Column('is_attack:N'),
)

In [None]:
# Names of the ten most frequent protocols, as a plain Python list

top_protocols = (
    df_pd['proto']
    .value_counts()
    .nlargest(10)
    .index
    .tolist()
)

In [None]:
# Keep only the sessions whose protocol is one of the top protocols

df_top_proto = df_pd.loc[df_pd['proto'].isin(top_protocols)]

In [None]:
# Bar chart of session counts per protocol, restricted to the top protocols
# in `df_top_proto`, with one panel per session type.
# The chart title previously read "Attacj"; corrected to "Attack".
alt.Chart(df_top_proto).mark_bar().encode(
    x=alt.X('proto:N', title='Top Protocols'),
    y='count():Q',
    color='is_attack:N',
    column='is_attack:N'
).properties(
    title='Top Protocol Usage by Attack vs Normal',
    width=400
)

In [None]:
# Columns written to the export file (presumably for Tableau, going by the
# output file name): the derived session type, the attack category and a
# few traffic features.
export_cols = [
    'is_attack',
    'attack_cat',
    'dur',
    'proto',
    'sbytes',
    'dbytes',
    'ct_src_ltm',
    'ct_dst_ltm',
]

# Write just those columns, without the pandas index
df_pd.loc[:, export_cols].to_csv('unsw_nb15_tableau_export.csv', index=False)