In [1]:
import pandas as pd 
import numpy as np 

import zipfile
import os

In [2]:
# Archive holding the CTU-13 capture, and the folder it is unpacked into
zip_file_path = '1-Neris-20110810.binetflow.parquet.zip'
extracted_folder = 'data/'

# Unpack every member of the archive into the target folder
with zipfile.ZipFile(zip_file_path) as zip_ref:
    zip_ref.extractall(path=extracted_folder)

# Show what the target folder holds now. os.listdir reports the whole folder,
# so files already present there are listed alongside the archive members.
extracted_files = os.listdir(extracted_folder)
print("Extracted files:", extracted_files)

Extracted files: ['1-Neris-20110810.binetflow.parquet', 'modified_utc_dataset.csv']


In [3]:
# Optional one-time setup: uncomment to install fastparquet, a parquet engine
# that pd.read_parquet can use when none is available in this environment.
#!conda install fastparquet

In [4]:
# Load the extracted flow records into a DataFrame and preview the first four rows
data = pd.read_parquet(path='data/1-Neris-20110810.binetflow.parquet')
print(data.head(n=4))

        dur proto    dir state  stos  dtos  tot_pkts  tot_bytes  src_bytes  \
0  1.026539   tcp     ->  S_RA   0.0   0.0         4        276        156   
1  1.009595   tcp     ->  S_RA   0.0   0.0         4        276        156   
2  3.056586   tcp     ->  SR_A   0.0   0.0         3        182        122   
3  3.111769   tcp     ->  SR_A   0.0   0.0         3        182        122   

                                    label                  Family  
0  flow=Background-Established-cmpgw-CVUT  20110810.binetflow.csv  
1  flow=Background-Established-cmpgw-CVUT  20110810.binetflow.csv  
2             flow=Background-TCP-Attempt  20110810.binetflow.csv  
3             flow=Background-TCP-Attempt  20110810.binetflow.csv  


# Let's see if we need to handle the missing values 
# CTU-13 Dataset Column Descriptions

The CTU-13 dataset contains network traffic data with various attributes. Here is a detailed description of each column in the dataset:

| Column Name | Description |
|-------------|-------------|
| **dur**     | **Duration**: The duration of the flow in seconds. |
| **proto**   | **Protocol**: The network protocol used (e.g., TCP, UDP). |
| **dir**     | **Direction**: The direction of the traffic flow between the two endpoints (e.g., `->` means the flow goes from the source to the destination, `<->` means traffic in both directions). |
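| **state**   | **State**: The state of the network flow. For TCP, the value lists the flags seen from the source, an underscore, then the flags seen from the destination (S = SYN, A = ACK, R = RST). Common values include: <br> - **S_RA**: source sent SYN; destination replied with RST+ACK <br> - **SR_A**: source sent SYN and RST; destination replied with ACK <br> - **SR_SA**: source sent SYN and RST; destination replied with SYN+ACK |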
| **stos**    | **Source Type of Service**: The IP Type of Service (ToS) value of packets sent by the source. |
| **dtos**    | **Destination Type of Service**: The IP Type of Service (ToS) value of packets sent by the destination. |
| **tot_pkts**| **Total Packets**: The total number of packets in the flow. |
| **tot_bytes**| **Total Bytes**: The total number of bytes in the flow. |
| **src_bytes**| **Source Bytes**: The number of bytes sent by the source. |
| **label**   | **Label**: Descriptive label for the flow, indicating whether it's background traffic, an established flow, or part of a specific attack scenario. For example: <br> - `flow=Background-Established-cmpgw-CVUT` <br> - `flow=Background-TCP-Attempt` <br> - `flow=Background-TCP-Established` |
| **Family**  | **Family**: The family or scenario to which this flow belongs. It often includes the file name where the data was recorded (e.g., `20110810.binetflow.csv`). |


In [5]:
# Summarise dtypes and non-null counts per column to see which columns have missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621173 entries, 0 to 1621172
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype   
---  ------     --------------    -----   
 0   dur        1621173 non-null  float32 
 1   proto      1621173 non-null  category
 2   dir        1621173 non-null  category
 3   state      1621172 non-null  category
 4   stos       1616028 non-null  float32 
 5   dtos       1529939 non-null  float32 
 6   tot_pkts   1621173 non-null  int32   
 7   tot_bytes  1621173 non-null  int64   
 8   src_bytes  1621173 non-null  int64   
 9   label      1621173 non-null  category
 10  Family     1621173 non-null  category
dtypes: category(5), float32(3), int32(1), int64(2)
memory usage: 58.8 MB


In [6]:
# Give the columns descriptive names, then discard the 'Family' column.
data.rename(
    columns=dict(
        dur='Duration',
        proto='Protocol',
        dir='Direction',
        state='State',
        stos='Source_Type_of_Service',
        dtos='Destination_Type_of_Service',
        tot_pkts='Total_Packets',
        tot_bytes='Total_Bytes',
        src_bytes='Source_Bytes',
        label='Label',
    ),
    inplace=True,
)

columns_to_drop = ['Family']

# errors='ignore' skips labels that are no longer present, so re-running
# this cell after the column is gone does not raise.
data.drop(columns=columns_to_drop, errors='ignore', inplace=True)

print(data.head(n=2))


   Duration Protocol Direction State  Source_Type_of_Service  \
0  1.026539      tcp        ->  S_RA                     0.0   
1  1.009595      tcp        ->  S_RA                     0.0   

   Destination_Type_of_Service  Total_Packets  Total_Bytes  Source_Bytes  \
0                          0.0              4          276           156   
1                          0.0              4          276           156   

                                    Label  
0  flow=Background-Established-cmpgw-CVUT  
1  flow=Background-Established-cmpgw-CVUT  


In [7]:
# Persist the renamed frame as CSV, leaving the row index out of the file
new_file_path = 'data/modified_utc_dataset.csv'
data.to_csv(path_or_buf=new_file_path, index=False)