In [1]:
import pandas as pd
import numpy as np

from analysis_src.basic_data_inspection import DataInspector, DataTypeInspectionStrategy, SummaryStatisticsInspectionStrategy
from analysis_src.missing_value_analysis import SimpleMissingValueAnalysis

In [2]:
# Location of the extracted ASNM-CDX-2009 CSV, relative to this notebook.
dataset_path = "../extracted_data/ASNM-CDX-2009.csv"

# The file is semicolon-delimited, so the separator must be given explicitly.
df = pd.read_csv(filepath_or_buffer=dataset_path, delimiter=";")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,id,label_2,label_poly,SrcIP,DstIP,SrcPort,DstPort,SrcMAC,DstMAC,SrcIPInVlan,...,GaussProds4All[2],GaussProds4All[3],GaussProds8All[0],GaussProds8All[1],GaussProds8All[2],GaussProds8All[3],GaussProds8All[4],GaussProds8All[5],GaussProds8All[6],GaussProds8All[7]
0,1,False,0_other,10.1.60.203,10.1.60.73,63637,5222,00:0C:29:D7:B7:89,00:1A:6C:FB:3D:F0,True,...,39.854715,13.226058,30.142188,27.376107,65.213653,28.129521,98.002262,34.486082,26.990536,16.26257
1,2,False,0_apache,10.1.10.69,10.1.60.187,2201,80,00:1E:7A:21:B4:00,00:1A:6C:FB:3D:F0,False,...,42.446332,10.750317,25.283971,25.283971,15.870392,175.667483,27.907473,62.828639,26.862012,14.79745
2,3,False,0_apache,10.1.10.69,10.1.60.187,2201,80,00:1E:7A:21:B4:00,00:1A:6C:FB:3D:F0,False,...,62.828639,12.236273,25.275107,26.156893,175.667483,27.907473,182.848444,26.254394,26.862012,14.79745
3,4,False,0_other,10.1.10.69,10.1.60.187,2202,443,00:1E:7A:21:B4:00,00:1A:6C:FB:3D:F0,False,...,42.162261,11.164318,25.275107,24.760861,25.046723,478.845737,99.88519,19.688286,31.250377,14.821825
4,5,False,0_apache,10.1.10.69,10.1.60.187,2203,80,00:1E:7A:21:B4:00,00:1A:6C:FB:3D:F0,False,...,62.828639,12.236273,25.275107,26.156893,175.667483,27.907473,182.848444,26.254394,26.862012,14.79745


In [3]:
# Basic data inspection, part 1: column data types and non-null counts.
# Build the strategy first, hand it to the inspector, then run the
# inspection on the loaded frame (the report is printed as cell output).
dtype_strategy = DataTypeInspectionStrategy()
data_inspector = DataInspector(dtype_strategy)
data_inspector.execute_inspection(df)


Data Types and Non-Null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5771 entries, 0 to 5770
Columns: 877 entries, id to GaussProds8All[7]
dtypes: bool(5), float64(331), int64(536), object(5)
memory usage: 38.4+ MB
None

Printing all the columns and their data type:
Column: id, Data type: int64
Column: label_2, Data type: bool
Column: label_poly, Data type: object
Column: SrcIP, Data type: object
Column: DstIP, Data type: object
Column: SrcPort, Data type: int64
Column: DstPort, Data type: int64
Column: SrcMAC, Data type: object
Column: DstMAC, Data type: object
Column: SrcIPInVlan, Data type: bool
Column: DstIPInVlan, Data type: bool
Column: InPkt1s10i[0], Data type: int64
Column: InPkt1s10i[1], Data type: int64
Column: InPkt1s10i[2], Data type: int64
Column: InPkt1s10i[3], Data type: int64
Column: InPkt1s10i[4], Data type: int64
Column: InPkt1s10i[5], Data type: int64
Column: InPkt1s10i[6], Data type: int64
Column: InPkt1s10i[7], Data type: int64
Column: InPkt1s10i[8], 

In [4]:
# Basic data inspection, part 2: dataset summary statistics.
# The inspector created in the previous cell is reused; only its strategy
# is swapped before running the inspection on the same frame.
summary_strategy = SummaryStatisticsInspectionStrategy()
data_inspector.set_strategy(summary_strategy)
data_inspector.execute_inspection(df)


Summary Statistics (Numerical Features):
                id       SrcPort      DstPort  InPkt1s10i[0]  InPkt1s10i[1]  \
count  5771.000000   5771.000000  5771.000000    5771.000000    5771.000000   
mean   2886.000000  39220.685843   340.984925       7.046959       7.456247   
std    1666.088533  24358.193471   725.021824       7.289772       9.367462   
min       1.000000   1032.000000    25.000000       0.000000       0.000000   
25%    1443.500000   3981.500000    80.000000       2.000000       3.000000   
50%    2886.000000  54457.000000    80.000000       5.000000       4.000000   
75%    4328.500000  56996.000000   443.000000       8.000000       7.000000   
max    5771.000000  65533.000000  5222.000000      34.000000      41.000000   

       InPkt1s10i[2]  InPkt1s10i[3]  InPkt1s10i[4]  InPkt1s10i[5]  \
count    5771.000000    5771.000000    5771.000000    5771.000000   
mean        7.514815       6.808525       7.348986       6.960319   
std         9.934251       9.687397    

## **INSIGHTS**

### 1. Data Types and Null Counts
- The Dataset contains 5771 entries.
- **Data Types**: Of the 877 columns, 5 are boolean, 5 are object, 331 are float64, and 536 are int64.
    - There are no null values. This is expected: the ASNM-CDX-2009 dataset was collected by monitoring network traffic, so the chance of null values being present is near zero.

### 2. Summary Statistics:
- **Numerical Features**:
    Most of the numerical features such as 

In [5]:
# Missing-value analysis: run the simple analyzer over the full frame.
# For this dataset the cell output reports "No missing values."
analysis = SimpleMissingValueAnalysis()
analysis.analyze(df)

No missing values.
