# Heat Map Generation Tool

## Data Preparation

### Importing data set

In [None]:
# Colab-only helper used to attach Google Drive to this runtime
from google.colab import drive

# The Drive paths used later in this notebook start with this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# Labels for the 47 feature columns, in file order (the CSV is read without a header row).
# NOTE(review): 'ct_src_ ltm' contains a space. It is reproduced unchanged because the
# label is written into every CSV header exported later in this notebook.
feature_names = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
    'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
    'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
]

# The last two columns carry the attack category name and the 0/1 data label
column_names = feature_names + ['attack_cat', 'Label']

# Load the raw file and keep it untouched; all later steps work on the copy
raw_csv_path = '/content/drive/My Drive/UNSW-NB15_4.csv'
df = pd.read_csv(raw_csv_path, names=column_names, header=None)
df_copy = df.copy()
df_copy.head()

### Encode non-numeric values

In [None]:
# Labels of every column that is neither int nor float, leaving out the attack category
# (it has to stay as text for the class split later on).
# Only the labels are stored: the encoding loop iterates over column names, so keeping
# a second copy of the column data alive is unnecessary.
non_numeric_cols = df_copy.select_dtypes(exclude=[int, float]).columns.drop('attack_cat')

# Force these three columns to text, presumably so that each holds a single value type
# before label encoding - TODO confirm.
# NOTE(review): the selection above is taken before these casts, so a column that was
# read as purely numeric would be cast to text here but not label-encoded. Confirm that
# all three are read as non-numeric for this file.
for text_column in ('sport', 'dsport', 'ct_ftp_cmd'):
    df_copy[text_column] = df_copy[text_column].astype(str)

In [None]:
# Replace each selected column with integer codes, keeping one fitted encoder per column
label_encoders = {}
for col in non_numeric_cols:
    # Register the encoder under its column name before fitting it
    encoder = label_encoders[col] = LabelEncoder()
    print(col)
    encoded_values = encoder.fit_transform(df_copy[col])
    df_copy[col] = encoded_values

df_copy.head()

### Normalise Data

In [None]:
# Features only: the two label columns are set aside so they are not rescaled
to_normalise = df_copy.drop(columns=['attack_cat', 'Label'])

# Min-max scale every feature column, then multiply the scaled values by 255
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(to_normalise)
df_normal = pd.DataFrame(scaled_values, columns=to_normalise.columns).mul(255)

# Any missing value that remains after scaling is replaced with 0
df_normal = df_normal.fillna(0)

# Re-attach the label columns to the right of the scaled features
label_columns = df_copy[['attack_cat', 'Label']]
df_normal = pd.concat([df_normal, label_columns], axis=1)
df_normal.head()

### Split Data

In [None]:
# Non-attack rows are picked out by the binary label
df_nonattack = df_normal.loc[df_normal['Label'] == 0]

# Attack rows are picked out by category name. Surrounding whitespace is trimmed once
# here and the trimmed series is reused for every comparison below.
# NOTE(review): each comparison is an exact match, so rows whose category is spelt any
# other way (for example a plural form) end up in none of these subsets - verify
# against the raw labels.
attack_category = df_normal['attack_cat'].str.strip()

df_fuzzers = df_normal.loc[attack_category == 'Fuzzers']
df_analysis = df_normal.loc[attack_category == 'Analysis']
df_backdoors = df_normal.loc[attack_category == 'Backdoor']
df_dos = df_normal.loc[attack_category == 'DoS']
df_exploits = df_normal.loc[attack_category == 'Exploits']
df_generic = df_normal.loc[attack_category == 'Generic']
df_recon = df_normal.loc[attack_category == 'Reconnaissance']
df_shellcode = df_normal.loc[attack_category == 'Shellcode']
df_worms = df_normal.loc[attack_category == 'Worms']

### Oversampling using SMOTE (Analysis, Backdoors, Shellcode)

In [None]:
# For each oversampled class, three values are derived in turn:
#   proportion_*  - rows of the class in this CSV divided by a fixed class total
#                   (presumably images x 20 rows across all CSVs - TODO confirm)
#   num_samples_* - that proportion applied to 800 * 20 rows
#   OS_size_*     - the row count requested from SMOTE: the value above truncated
#                   to an integer, plus one

# Analysis (the misspelt name OS_size_anaylsis is kept because the SMOTE cell reads it)
proportion_analysis = len(df_analysis) / (132 * 20)
num_samples_analysis = 800 * 20 * proportion_analysis
OS_size_anaylsis = 1 + int(num_samples_analysis)

# Backdoors
proportion_backdoors = len(df_backdoors) / (120 * 20)
num_samples_backdoors = 800 * 20 * proportion_backdoors
OS_size_backdoors = 1 + int(num_samples_backdoors)

# Shellcode
proportion_shellcode = len(df_shellcode) / (74 * 20)
num_samples_shellcode = 800 * 20 * proportion_shellcode
OS_size_shellcode = 1 + int(num_samples_shellcode)

In [None]:
# SMOTE is run as a two-class problem per attack type: the non-attack rows come first,
# the rows of the attack class are appended after them, and the index is renumbered.
target_columns = ['attack_cat', 'Label']

# Analysis: combined frame, its label columns, its features, and the 0/1 target
df_analysis_SMOTE = pd.concat([df_nonattack, df_analysis], ignore_index=True)
df_analysis_y = df_analysis_SMOTE[target_columns]
df_analysis_X = df_analysis_SMOTE.drop(columns=target_columns)
y_analysis = df_analysis_y['Label']

# Backdoors
df_backdoors_SMOTE = pd.concat([df_nonattack, df_backdoors], ignore_index=True)
df_backdoors_y = df_backdoors_SMOTE[target_columns]
df_backdoors_X = df_backdoors_SMOTE.drop(columns=target_columns)
y_backdoors = df_backdoors_y['Label']

# Shellcode
df_shellcode_SMOTE = pd.concat([df_nonattack, df_shellcode], ignore_index=True)
df_shellcode_y = df_shellcode_SMOTE[target_columns]
df_shellcode_X = df_shellcode_SMOTE.drop(columns=target_columns)
y_shellcode = df_shellcode_y['Label']

In [None]:
def _fit_smote(features, labels, attack_rows_wanted):
    """Run SMOTE with a requested row count for class 1 and a fixed seed of 42.

    Returns the sampler, followed by the resampled features and resampled labels.
    """
    sampler = SMOTE(sampling_strategy={1: attack_rows_wanted}, random_state=42)
    resampled_features, resampled_labels = sampler.fit_resample(features, labels)
    return sampler, resampled_features, resampled_labels


# One independent SMOTE run per oversampled attack class
smote_analysis, X_analysis_resampled, y_analysis_resampled = _fit_smote(
    df_analysis_X, y_analysis, OS_size_anaylsis)

smote_backdoors, X_backdoors_resampled, y_backdoors_resampled = _fit_smote(
    df_backdoors_X, y_backdoors, OS_size_backdoors)

smote_shellcode, X_shellcode_resampled, y_shellcode_resampled = _fit_smote(
    df_shellcode_X, y_shellcode, OS_size_shellcode)

In [None]:
def _attack_rows_only(resampled_features, resampled_labels, feature_columns, original_targets):
    """Rebuild a labelled frame from SMOTE output and keep only the attack rows.

    Parameters
    ----------
    resampled_features, resampled_labels
        The two results returned by ``fit_resample``.
    feature_columns
        Column labels of the feature frame that was passed to SMOTE.
    original_targets
        The pre-SMOTE frame holding the 'attack_cat' and 'Label' columns.

    Returns
    -------
    A frame with the feature columns plus 'attack_cat' and 'Label', restricted to
    rows whose 'Label' is 1.
    """
    frame = pd.DataFrame(resampled_features, columns=feature_columns)

    # The category name must be read from an attack row. The pre-SMOTE frame lists the
    # non-attack rows first, so its row 0 carries the non-attack category value rather
    # than the name of the class being oversampled.
    is_attack = original_targets['Label'] == 1
    frame['attack_cat'] = original_targets.loc[is_attack, 'attack_cat'].iloc[0]

    frame['Label'] = resampled_labels
    return frame[frame['Label'] == 1]


# Original plus synthetic rows for each oversampled attack class
df_analysis_OS = _attack_rows_only(
    X_analysis_resampled, y_analysis_resampled, df_analysis_X.columns, df_analysis_y)

df_backdoors_OS = _attack_rows_only(
    X_backdoors_resampled, y_backdoors_resampled, df_backdoors_X.columns, df_backdoors_y)

df_shellcode_OS = _attack_rows_only(
    X_shellcode_resampled, y_shellcode_resampled, df_shellcode_X.columns, df_shellcode_y)

### Generate random sample of non-attack from entire dataset

#### Generate the non-attack CSV subsets from all four source files before proceeding

In [None]:
# Write the non-attack rows of this source file (number 4) to a CSV without the index.
# The cell that reads the subsets back needs all four such files, so the matching
# files 1-3 must already have been produced before continuing.
nonattack_subset_file = 'non_attack_output_csv4.csv'
df_nonattack.to_csv(nonattack_subset_file, index=False)

In [None]:
# Load the four non-attack subsets, in file-number order
na_1, na_2, na_3, na_4 = map(pd.read_csv, (
    'non_attack_output_csv1.csv',
    'non_attack_output_csv2.csv',
    'non_attack_output_csv3.csv',
    'non_attack_output_csv4.csv',
))

In [None]:
# Stack the subsets pairwise, then stack the two halves, renumbering the index each time
combined_na_1, combined_na_2 = (
    pd.concat(pair, ignore_index=True)
    for pair in ([na_1, na_2], [na_3, na_4])
)
final_na = pd.concat([combined_na_1, combined_na_2], ignore_index=True)

# Draw 20000 rows at random; the fixed seed makes the draw repeatable
sampled_na = final_na.sample(20000, random_state=42)

In [None]:
# Save the random non-attack sample to Drive (index omitted) for numeric comparison
sample_export_path = '/content/drive/My Drive/RTP_figs/numeric_datasets/non-attack_random_sample.csv'
sampled_na.to_csv(sample_export_path, index=False)

### Export Numeric Data for Comparative Analysis

In [None]:
# First 16000 non-attack rows of this source file
subset_nonattack = df_nonattack.head(16000)

# (frame, file name) pairs, written in this order without the index.
# NOTE(review): backdoors is written as file 3 while every other class uses 4. The cell
# that reads the subsets back loads only backdoors files 1-3, so this may be deliberate
# - confirm before changing it.
exports = (
    (subset_nonattack, 'nonattack_output_OS.csv'),
    (df_dos, 'dos_output4.csv'),
    (df_analysis_OS, 'analysis_output4.csv'),
    (df_backdoors_OS, 'backdoors_output3.csv'),
    (df_shellcode_OS, 'shellcode_output4.csv'),
    (df_fuzzers, 'fuzzers_output4.csv'),
    (df_exploits, 'exploits_output4.csv'),
    (df_generic, 'generic_output4.csv'),
    (df_recon, 'recon_output4.csv'),
)
for frame, file_name in exports:
    frame.to_csv(file_name, index=False)

#### Generate the per-class CSV subsets from every source file before proceeding

In [None]:
# Load the per-class subsets written for each source file, in file-number order.
# Backdoors has three files; every other class has four.
dos_1, dos_2, dos_3, dos_4 = map(pd.read_csv, (
    'dos_output1.csv', 'dos_output2.csv', 'dos_output3.csv', 'dos_output4.csv'))

analysis_1, analysis_2, analysis_3, analysis_4 = map(pd.read_csv, (
    'analysis_output1.csv', 'analysis_output2.csv', 'analysis_output3.csv', 'analysis_output4.csv'))

backdoors_1, backdoors_2, backdoors_3 = map(pd.read_csv, (
    'backdoors_output1.csv', 'backdoors_output2.csv', 'backdoors_output3.csv'))

shellcode_1, shellcode_2, shellcode_3, shellcode_4 = map(pd.read_csv, (
    'shellcode_output1.csv', 'shellcode_output2.csv', 'shellcode_output3.csv', 'shellcode_output4.csv'))

exploits_1, exploits_2, exploits_3, exploits_4 = map(pd.read_csv, (
    'exploits_output1.csv', 'exploits_output2.csv', 'exploits_output3.csv', 'exploits_output4.csv'))

generic_1, generic_2, generic_3, generic_4 = map(pd.read_csv, (
    'generic_output1.csv', 'generic_output2.csv', 'generic_output3.csv', 'generic_output4.csv'))

recon_1, recon_2, recon_3, recon_4 = map(pd.read_csv, (
    'recon_output1.csv', 'recon_output2.csv', 'recon_output3.csv', 'recon_output4.csv'))

fuzzers_1, fuzzers_2, fuzzers_3, fuzzers_4 = map(pd.read_csv, (
    'fuzzers_output1.csv', 'fuzzers_output2.csv', 'fuzzers_output3.csv', 'fuzzers_output4.csv'))

In [None]:
def _stack(*frames):
    """Place the given frames one under another, in order, with a renumbered index."""
    return pd.concat(list(frames), ignore_index=True)


# Each class is stacked in file-number order: files 1+2, files 3+4, then both halves
combined_dos_1 = _stack(dos_1, dos_2)
combined_dos_2 = _stack(dos_3, dos_4)
final_dos = _stack(combined_dos_1, combined_dos_2)

combined_analysis_1 = _stack(analysis_1, analysis_2)
combined_analysis_2 = _stack(analysis_3, analysis_4)
final_analysis = _stack(combined_analysis_1, combined_analysis_2)

# Backdoors has only three files, so the third is appended directly
combined_backdoors_1 = _stack(backdoors_1, backdoors_2)
final_backdoors = _stack(combined_backdoors_1, backdoors_3)

combined_shellcode_1 = _stack(shellcode_1, shellcode_2)
combined_shellcode_2 = _stack(shellcode_3, shellcode_4)
final_shellcode = _stack(combined_shellcode_1, combined_shellcode_2)

combined_exploits_1 = _stack(exploits_1, exploits_2)
combined_exploits_2 = _stack(exploits_3, exploits_4)
final_exploits = _stack(combined_exploits_1, combined_exploits_2)

combined_generic_1 = _stack(generic_1, generic_2)
combined_generic_2 = _stack(generic_3, generic_4)
final_generic = _stack(combined_generic_1, combined_generic_2)

combined_recon_1 = _stack(recon_1, recon_2)
combined_recon_2 = _stack(recon_3, recon_4)
final_recon = _stack(combined_recon_1, combined_recon_2)

combined_fuzzers_1 = _stack(fuzzers_1, fuzzers_2)
combined_fuzzers_2 = _stack(fuzzers_3, fuzzers_4)
final_fuzzers = _stack(combined_fuzzers_1, combined_fuzzers_2)

In [None]:
# Export the combined numeric data for each class to Drive, without the index.
numeric_dir = '/content/drive/My Drive/RTP_figs/numeric_datasets'
max_rows_per_class = 16000

# Every attack class is limited to its first 16000 rows. Reconnaissance was previously
# the only class written without this limit; it now follows the same rule. head() returns
# the whole frame when it holds fewer rows, so a class below the limit is exported unchanged.
attack_exports = {
    'dos_output_OS.csv': final_dos,
    'analysis_output_OS.csv': final_analysis,
    'backdoors_output_OS.csv': final_backdoors,
    'shellcode_output_OS.csv': final_shellcode,
    'exploits_output_OS.csv': final_exploits,
    'generic_output_OS.csv': final_generic,
    'recon_output_OS.csv': final_recon,
    'fuzzers_output_OS.csv': final_fuzzers,
}
for file_name, frame in attack_exports.items():
    frame.head(max_rows_per_class).to_csv(f'{numeric_dir}/{file_name}', index=False)

# The non-attack subset was already cut to 16000 rows when it was created
subset_nonattack.to_csv(f'{numeric_dir}/nonattack_output_OS.csv', index=False)

## Generate Heat Maps

### Analysis

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_analysis_OS) // 20
startNumber = 608  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_analysis_OS.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Analysis/Analysis{file_number}.png')
    plt.close(fig)

### Backdoors

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_backdoors_OS) // 20
startNumber = 377  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_backdoors_OS.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Backdoors/Backdoors{file_number}.png')
    plt.close(fig)

### Shellcode

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_shellcode_OS) // 20
startNumber = 616  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_shellcode_OS.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Shellcode/Shellcode{file_number}.png')
    plt.close(fig)

### Non-attack



In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(sampled_na) // 20
startNumber = 1  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = sampled_na.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Nonattack_sampled/non_attack_sampled_{file_number}.png')
    plt.close(fig)

### Denial of Service (DoS)

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_dos) // 20
startNumber = 572  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_dos.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/DoS/DoS{file_number}.png')
    plt.close(fig)

### Fuzzers

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_fuzzers) // 20
startNumber = 486  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_fuzzers.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Fuzzers/Fuzzers{file_number}.png')
    plt.close(fig)

### Exploits

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_exploits) // 20
startNumber = 271  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_exploits.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Exploits/Exploits{file_number}.png')
    plt.close(fig)

### Generic

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_generic) // 20
startNumber = 377  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_generic.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Generic/Generic{file_number}.png')
    plt.close(fig)

### Reconnaissance

In [None]:
# One image per complete group of 20 rows; a trailing partial group is not drawn
num_iterations = len(df_recon) // 20
startNumber = 522  # number used in the first file name written by this cell

# Convert the first 47 columns to 8-bit values once, before the loop
pixel_rows = df_recon.iloc[:, :47].values.astype(np.uint8)

for image_offset, first_row in enumerate(range(0, num_iterations * 20, 20)):
    image_array = pixel_rows[first_row:first_row + 20]

    # Four-digit, zero-padded file number
    file_number = f'{startNumber + image_offset:04d}'

    # NOTE(review): imshow receives no vmin/vmax, so colours are scaled per image rather
    # than on one fixed range - confirm this is intended.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.imshow(image_array, cmap='jet', aspect='auto')
    ax.axis('off')
    fig.savefig(f'/content/drive/My Drive/RTP_figs/final/Recon/Recon{file_number}.png')
    plt.close(fig)