# Importing Libraries

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

# Connection to drive

In [None]:
# Detect whether the notebook is running on Google Colab by probing for the
# Colab-only package. IN_COLAB is set to True when the import succeeds and to
# False otherwise.
try:
    from google.colab import drive
    IN_COLAB = True
    print("Running on Google Colab. ")
except ImportError:
    # Only a missing package means "not on Colab"; a bare except would also
    # swallow KeyboardInterrupt/SystemExit and misreport them.
    IN_COLAB = False
    print("Not running on Google Colab. ")

In [3]:
# Mount Google Drive at /content/gdrive; nothing happens when IN_COLAB is False.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')

In [4]:
# Make the dataset folder the working directory: the Drive path when running
# on Colab, the local ./dataset folder otherwise.
dataset_dir = '/content/gdrive/MyDrive/Tesi/dataset' if IN_COLAB else './dataset'
os.chdir(dataset_dir)

# Downloading, Reading and Merging Data

In [5]:
def move_directories(orig_path, dest_path):
    """Move every sub-directory of ``orig_path`` into ``dest_path``.

    Regular files directly inside ``orig_path`` are left where they are.
    ``dest_path`` is created (including parents) when it does not exist.

    Args:
        orig_path: Directory whose immediate sub-directories are moved.
        dest_path: Directory that receives the moved sub-directories.

    Raises:
        OSError: If a target directory with the same name already exists and
            is not empty, or if the move itself fails.
    """
    # Local import: shutil is not among the notebook's top-level imports.
    import shutil

    os.makedirs(dest_path, exist_ok=True)

    for item in os.listdir(orig_path):
        orig_item_path = os.path.join(orig_path, item)

        # Only folders are moved; plain files are skipped.
        if not os.path.isdir(orig_item_path):
            continue

        dest_item_path = os.path.join(dest_path, item)

        # An empty placeholder folder with the same name is replaced.
        # os.rmdir raises OSError on a non-empty folder, so existing data is
        # never overwritten and shutil.move never nests the source inside it.
        if os.path.isdir(dest_item_path):
            os.rmdir(dest_item_path)

        # shutil.move falls back to copy + delete when source and destination
        # are on different filesystems, where a plain os.rename fails.
        shutil.move(orig_item_path, dest_item_path)
        print(f"Moved directory: {orig_item_path} -> {dest_item_path}")

In [None]:
import kagglehub

# Download the LUFlow dataset only when no local copy is present.
path_to_check = "./LUFlow/"
dest_path = './LUFlow'

# On a first run the folder does not exist yet and os.listdir would raise
# FileNotFoundError; a missing folder simply means there is nothing to reuse.
if os.path.isdir(path_to_check):
    subdirs = [d for d in os.listdir(path_to_check) if os.path.isdir(os.path.join(path_to_check, d))]
else:
    subdirs = []

# Sub-directories that already contain at least one entry
non_empty_subdirs = [d for d in subdirs if os.listdir(os.path.join(path_to_check, d))]

if non_empty_subdirs:
    print("Non-empty subdirectories:", non_empty_subdirs)
    print("Skip downloading.")
else:
    print("All subdirectories are empty.")
    print("Download dataset.")

    # Download latest version
    path = kagglehub.dataset_download("mryanm/luflow-network-intrusion-detection-data-set")

    print("Path to dataset files:", path)

    # Move the downloaded folders into the dataset directory
    move_directories(path, dest_path)

In [None]:
# Load every CSV file found under dest_path into a single DataFrame `df`.

# Optional filter: set to a substring such as '2021.02' to load only the files
# whose name contains it; None loads every CSV file.
subset = None

df_list = []

for root, dirs, files in os.walk(dest_path):
    # os.walk yields entries in arbitrary order; sorting in place makes the
    # traversal, and therefore the row order of df, reproducible.
    dirs.sort()
    for file in sorted(files):
        # Select files by extension. The previous test also called
        # os.path.isfile(file) on the bare name, i.e. relative to the working
        # directory rather than to `root`, so a CSV could be skipped whenever
        # an unrelated file with the same name existed in the working directory.
        if not file.lower().endswith('.csv'):
            continue
        if subset is not None and subset not in file:
            continue

        # Read in chunks of 10000 rows to limit the parser's peak memory use.
        chunk_iter = pd.read_csv(os.path.join(root, file), chunksize=10000)
        df_list.extend(chunk_iter)

        del chunk_iter
        gc.collect()

if not df_list:
    # pd.concat would raise a generic "No objects to concatenate" error.
    raise ValueError(f"No CSV files found under {dest_path!r}")

df = pd.concat(df_list, ignore_index=True)

# Release the per-chunk frames now that they have been merged into df
del df_list
gc.collect()

# Statistic Analysis

- For selected columns, examine central tendency and spread (mean and variance)
- Benign vs. malicious percentage
- Analyze the trends of benign and malicious classes over the three different years
- Check whether the quantity of missing data is relevant
- Correlation matrix (the code was already written in the initial code)

In [None]:
#df.head()

In [None]:
#df.info()

## Description of the dataset columns

In [None]:
# Summary statistics of the DataFrame; as the last expression of the cell the
# result is rendered as the cell output.
df.describe()

## Classes distribution (Benign and Malicious)

In [None]:
# Share of each class in the 'label' column, expressed as a percentage
perc = df['label'].value_counts(normalize=True).mul(100)
print(perc)

# Draw the shares as a pie chart, working on the returned Axes object
pie_ax = perc.plot.pie(autopct='%1.1f%%', startangle=90, cmap='viridis')
pie_ax.set_title("Percentage distribution of classes")
pie_ax.set_ylabel("")  # Hide the y-axis label
plt.show()

## Columns values distribution

Note: the cell below is commented out because the machine was not able to run it.

In [12]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# selected_columns = ['bytes_in', 'bytes_out', 'num_pkts_out', 'num_pkts_in', 'proto', 'time_start']

# # Set up the subplot grid
# n_columns = len(selected_columns)
# n_rows = (n_columns + 1) // 2  # Arrange the subplots in two columns

# fig, axes = plt.subplots(n_rows, 2, figsize=(12, 5 * n_rows))
# axes = axes.flatten()  # Flatten the 2-D array to 1-D for simpler access

# for i, col in enumerate(selected_columns):
#     sns.histplot(df[col], kde=True, ax=axes[i])
#     axes[i].set_title(f"Distribution of column {col}")

# # Hide any empty subplots
# for j in range(i + 1, len(axes)):
#     axes[j].axis('off')

# plt.tight_layout()
# plt.show()

## Correlation matrix

In [None]:
# Leave out the label and the IP address columns, then compute the pairwise
# correlation of the remaining columns.
excluded_columns = ['label', 'src_ip', 'dest_ip']
df_numerical = df.drop(columns=excluded_columns)
correlation_matrix = df_numerical.corr()

# Render the matrix as an annotated heatmap on an explicit figure/axes pair
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Correlation matrix")
plt.show()

## Number of NaN values

In [None]:
# Count the missing values per column, then add the per-column counts together
nan_per_column = df.isna().sum()
total_nan = nan_per_column.sum()
print("Total NaN values in DataFrame:", total_nan)

In [None]:
# Boolean mask marking the rows that contain at least one missing value
has_nan = df.isna().any(axis=1)
rows_with_nan = df[has_nan]
print(rows_with_nan)