# Pobieranie danych

In [3]:
!gdown http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip
!unzip MachineLearningCSV.zip

Downloading...
From: http://205.174.165.80/CICDataset/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV.zip
To: /content/MachineLearningCSV.zip
100% 235M/235M [02:03<00:00, 1.91MB/s]
Archive:  MachineLearningCSV.zip
   creating: MachineLearningCVE/
  inflating: MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv  
  inflating: MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv  


# Importy i ważne zmienne


In [81]:
import numpy as np
import pandas as pd
import os


from fastai.tabular.all import df_shrink
dir_path = './MachineLearningCVE'

drop_columns = [ # niepotrzebne kolumny zawierające metadane
    "Flow ID",
    'Fwd Header Length.1',
    "Source IP", "Src IP",
    "Source Port", "Src Port",
    "Destination IP", "Dst IP",
    "Destination Port", "Dst Port",
    "Timestamp",
]


# Czytanie ścieżek do plików i danych

In [82]:
def read_paths(dir_path):
  dspaths = []
  for dirname, _, filenames in os.walk(dir_path):
      for filename in filenames:
          if filename.endswith('.csv'):
              pds = os.path.join(dirname, filename)
              dspaths.append(pds)
              print(pds)
  return dspaths

def read_files(dspaths):
  individual_dfs = [pd.read_csv(dsp, sep=',', encoding='utf-8') for dsp in dspaths]
  [i.shape for i in individual_dfs]
  return individual_dfs

dspaths = read_paths(dir_path)
individual_dfs = read_files(dspaths)

./MachineLearningCVE/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
./MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv
./MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv
./MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
./MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
./MachineLearningCVE/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
./MachineLearningCVE/Tuesday-WorkingHours.pcap_ISCX.csv
./MachineLearningCVE/Friday-WorkingHours-Morning.pcap_ISCX.csv


# Usuwanie niepotrzebnych kolumn

In [83]:
def dropy(individual_dfs):
  for df in individual_dfs:
      df.columns = df.columns.str.strip() # sometimes there's leading / trailing whitespace
      df.drop(columns=drop_columns, inplace=True, errors='ignore')
  [i.shape for i in individual_dfs]
  return individual_dfs

individual_dfs = dropy(individual_dfs)

# Downsizing

In [None]:
individual_dfs[0].dtypes

In [84]:
individual_dfs = [df_shrink(df) for df in individual_dfs]

In [70]:
individual_dfs[0].dtypes

Flow Duration                     int32
Total Fwd Packets                 int16
Total Backward Packets            int32
Total Length of Fwd Packets       int32
Total Length of Bwd Packets       int32
                                 ...   
Idle Mean                       float32
Idle Std                        float32
Idle Max                          int32
Idle Min                          int32
Label                          category
Length: 77, dtype: object

# Usuwanie niezdefiniowanych wartości

In [85]:
def drop_nan(individual_dfs):
  for df in individual_dfs:
      df.replace([np.inf, -np.inf], np.nan, inplace=True)
      # print(df.isna().any(axis=1).sum(), "rows with at least one NaN to remove")
      df.dropna(inplace=True)
  [i.shape for i in individual_dfs]
  return individual_dfs

individual_dfs = drop_nan(individual_dfs)

# Usuwanie zduplikowanych wierszy

In [86]:
def drop_dupes(individual_dfs):
  for df in individual_dfs:
      print(df.duplicated().sum(), "fully duplicate rows to remove")
      df.drop_duplicates(inplace=True)
      df.reset_index(inplace=True, drop=True)
  [i.shape for i in individual_dfs]
  return individual_dfs

individual_dfs = drop_dupes(individual_dfs)

80765 fully duplicate rows to remove
106415 fully duplicate rows to remove
70650 fully duplicate rows to remove
166574 fully duplicate rows to remove
4447 fully duplicate rows to remove
14411 fully duplicate rows to remove
55931 fully duplicate rows to remove
14873 fully duplicate rows to remove


# Konwersja z csv na parquet

In [None]:
def convert(individual_dfs):
  for i, df in enumerate(individual_dfs):
      df.to_parquet(f"./MachineLearningCVE{dspaths[i].split('/')[-1].replace('.csv', '.parquet')}")

convert(individual_dfs)

# Czytanie plików i misc

In [80]:
data2 = pd.read_parquet('MachineLearningCVEMonday-WorkingHours.pcap_ISCX.parquet', engine='pyarrow')
data2.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,49188,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,49188,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,49486,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
