# 02: Perform Data Profiling
This notebook checks the shape, feature names, dtypes, statistics, missing value counts, infinite value counts and class distribution (binary label + attack categories) of DataFrame.

## 2.1: Import Library

In [1]:
import numpy as np
import pandas as pd

## 2.2: Load CSV File

In [2]:
df = pd.read_csv('../data/processed/UNSW-NB15.csv', na_values=['-'], low_memory=False)

In [3]:
df.head()

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0


## 2.3: Shape

In [4]:
number_of_records = df.shape[0]
number_of_features = df.shape[1]
print(f"Number of records: {number_of_records:,}")
print(f"Number of features: {number_of_features}")

Number of records: 2,540,047
Number of features: 49


## 2.4: Feature Names

In [5]:
df.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime',
       'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'Label'],
      dtype='str')

## 2.5: Info

In [6]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 2540047 entries, 0 to 2540046
Data columns (total 49 columns):
 #   Column            Dtype  
---  ------            -----  
 0   srcip             str    
 1   sport             str    
 2   dstip             str    
 3   dsport            str    
 4   proto             str    
 5   state             str    
 6   dur               float64
 7   sbytes            int64  
 8   dbytes            int64  
 9   sttl              int64  
 10  dttl              int64  
 11  sloss             int64  
 12  dloss             int64  
 13  service           str    
 14  Sload             float64
 15  Dload             float64
 16  Spkts             int64  
 17  Dpkts             int64  
 18  swin              int64  
 19  dwin              int64  
 20  stcpb             int64  
 21  dtcpb             int64  
 22  smeansz           int64  
 23  dmeansz           int64  
 24  trans_depth       int64  
 25  res_bdy_len       int64  
 26  Sjit              float64

## 2.6: Statistics

In [7]:
df.describe()

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,ct_flw_http_mthd,is_ftp_login,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,Label
count,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,...,1191902.0,1110168.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0,2540047.0
mean,0.6587916,4339.6,36427.59,62.78197,30.76681,5.163921,16.32944,36956450.0,2450861.0,33.28884,...,0.2345856,0.0396994,9.206988,8.988958,6.439103,6.900986,4.642139,3.592729,6.845886,0.126487
std,13.92493,56405.99,161096.0,74.62277,42.85089,22.51707,56.59474,118604300.0,4224863.0,76.28388,...,0.7940924,0.1996589,10.83676,10.82249,8.162034,8.205062,8.477579,6.174445,11.25828,0.3323975
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,0.001037,200.0,178.0,31.0,29.0,0.0,0.0,135396.3,11915.94,2.0,...,0.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0
50%,0.015861,1470.0,1820.0,31.0,29.0,3.0,4.0,589303.8,589317.9,12.0,...,0.0,0.0,5.0,5.0,3.0,4.0,1.0,1.0,2.0,0.0
75%,0.2145545,3182.0,14894.0,31.0,29.0,7.0,14.0,2039923.0,2925974.0,44.0,...,0.0,0.0,10.0,10.0,6.0,7.0,2.0,1.0,5.0,0.0
max,8786.638,14355770.0,14657530.0,255.0,254.0,5319.0,5507.0,5988000000.0,128761900.0,10646.0,...,36.0,4.0,67.0,67.0,67.0,67.0,67.0,60.0,67.0,1.0


## 2.7: Missing Values

In [8]:
missing_values = df.isna().sum()
missing_values[missing_values > 0]

sport                     2
dsport                    7
service             1246397
ct_flw_http_mthd    1348145
is_ftp_login        1429879
attack_cat          2218764
dtype: int64

## 2.8: Infinite Values

In [9]:
infinite_count = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
print(f"Number of infinite values: {infinite_count}")

Number of infinite values: 0


## 2.9: Class Distribution

In [10]:
df["Label"].value_counts()

Label
0    2218764
1     321283
Name: count, dtype: int64

In [11]:
df["attack_cat"].value_counts()

attack_cat
Generic             215481
Exploits             44525
 Fuzzers             19195
DoS                  16353
 Reconnaissance      12228
 Fuzzers              5051
Analysis              2677
Backdoor              1795
Reconnaissance        1759
 Shellcode            1288
Backdoors              534
Shellcode              223
Worms                  174
Name: count, dtype: int64