# Importing Libraries

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Connection to drive

In [None]:
# Detect whether the notebook runs on Google Colab by probing for its
# `google.colab` package. Only an import failure means "not on Colab";
# any other error (or a keyboard interrupt) is allowed to propagate.
try:
    from google.colab import drive
    IN_COLAB = True
    print("Running on Google Colab. ")
except ImportError:
    IN_COLAB = False
    print("Not running on Google Colab. ")

In [3]:
# Mount Google Drive under /content/gdrive, but only when the Colab check
# above set IN_COLAB to True (the google.colab package exists only there).
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')

In [4]:
# Make the dataset folder the working directory: the Drive location on
# Colab, the local ./dataset folder otherwise.
os.chdir('/content/gdrive/MyDrive/Tesi/dataset' if IN_COLAB else './dataset')

# Downloading, Reading and Merging Data

In [5]:
def move_directories(orig_path, dest_path):
    """Move every sub-directory of `orig_path` into `dest_path`.

    Plain files directly inside `orig_path` are left where they are.

    Parameters
    ----------
    orig_path : str
        Directory whose immediate sub-directories are moved.
    dest_path : str
        Directory that receives them; created if it does not exist.

    Raises
    ------
    FileExistsError
        If `dest_path` already holds a non-empty entry with the same name
        as a directory that should be moved.
    """
    # Local import: shutil.move also works when source and destination live
    # on different filesystems, where os.rename raises OSError.
    import shutil

    os.makedirs(dest_path, exist_ok=True)

    # Iterate over all files and folders in the source directory
    for item in os.listdir(orig_path):
        orig_item_path = os.path.join(orig_path, item)

        # Only folders are moved
        if not os.path.isdir(orig_item_path):
            continue

        dest_item_path = os.path.join(dest_path, item)

        if os.path.isdir(dest_item_path) and not os.listdir(dest_item_path):
            # An empty placeholder folder is replaced; without this step
            # shutil.move would nest the source inside it.
            os.rmdir(dest_item_path)
        elif os.path.exists(dest_item_path):
            raise FileExistsError(
                f"Destination already exists and is not an empty directory: {dest_item_path}"
            )

        shutil.move(orig_item_path, dest_item_path)
        print(f"Moved directory: {orig_item_path} -> {dest_item_path}")

In [None]:
# Download the LUFlow dataset unless ./LUFlow already holds data.
path_to_check = "./LUFlow/"
dest_path = './LUFlow'

# A missing folder (fresh checkout) is treated like a folder without data
# instead of letting os.listdir raise FileNotFoundError.
if os.path.isdir(path_to_check):
    subdirs = [d for d in os.listdir(path_to_check) if os.path.isdir(os.path.join(path_to_check, d))]
else:
    subdirs = []

# Sub-directories that contain at least one entry
non_empty_subdirs = [d for d in subdirs if os.listdir(os.path.join(path_to_check, d))]

if non_empty_subdirs:
    print("Non-empty subdirectories:", non_empty_subdirs)
    print("Skip downloading.")
else:
    print("All subdirectories are empty.")
    print("Download dataset.")

    # Imported here so the notebook still runs without kagglehub installed
    # when the data is already present.
    import kagglehub

    # Download latest version
    path = kagglehub.dataset_download("mryanm/luflow-network-intrusion-detection-data-set")

    print("Path to dataset files:", path)

    move_directories(path, dest_path)

In [None]:
# Read a subset of the CSV files and split them, file-wise, into a training
# frame (`df`) and a held-out frame (`df_test`).
df_list = []
max_files = 6  # at most this many files are loaded

# os.walk yields entries in arbitrary order; sorting directories and file
# names makes the selected subset and the split the same on every run.
for root, dirs, files in os.walk(dest_path):
    dirs.sort()
    for file in sorted(files):
        if len(df_list) >= max_files:
            break
        # keep only files whose name contains both 'csv' and '2021.01'
        if 'csv' in file and '2021.01' in file:
            df_list.append(pd.read_csv(os.path.join(root, file)))
    if len(df_list) >= max_files:
        break

perc = 0.8

# Number of files that go into the training frame
files_perc = int(len(df_list) * perc)
if files_perc == 0 or files_perc == len(df_list):
    raise ValueError(
        f"Found {len(df_list)} matching CSV file(s) under {dest_path!r}; "
        "at least one training file and one test file are required."
    )
print(f'Num files: {files_perc}')

# The first `files_perc` files form the training frame, the rest the test
# frame. `df` must NOT be rebuilt from the full list afterwards, otherwise
# the test files would also be part of the training data.
df = pd.concat(df_list[:files_perc], ignore_index=True)
df_test = pd.concat(df_list[files_perc:], ignore_index=True)

len(df.columns)

In [None]:
# Print a summary of `df` (columns, dtypes, non-null counts)
df.info()

# Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of rows for each distinct value of the 'label' column
label_counts = df['label'].value_counts()
total_rows = sum(label_counts)

# Pie chart; every wedge is annotated with its absolute count and its share
plt.figure(figsize=(8, 8))
plt.pie(
    label_counts,
    labels=label_counts.index,
    autopct=lambda share: '{:.0f}\n({:.1f}%)'.format(share * total_rows / 100, share),
)
plt.title('Distribution of Labels')
plt.show()

# Data Preprocessing

In [10]:
# Discard the rows labelled 'outlier' and renumber the remaining rows from 0
outliers = df['label'].eq('outlier')
df = df.loc[~outliers].reset_index(drop=True)

In [11]:
# Same filter for the test frame: remove 'outlier' rows, then reset the index
outliers = df_test['label'].eq('outlier')
df_test = df_test.loc[~outliers].reset_index(drop=True)

In [12]:
# Columns removed from both the training and the test frame
columns_to_drop = [
    'avg_ipt', 'dest_ip', 'dest_port', 'entropy', 'src_ip',
    'src_port', 'time_end', 'total_entropy', 'duration',
]

df, df_test = (frame.drop(columns=columns_to_drop) for frame in (df, df_test))

In [None]:
# Remove every column that has a missing value in EITHER frame.
# Dropping per frame could leave `df` and `df_test` with different columns,
# which breaks prediction later on; a shared column list keeps them aligned.
nan_columns = df.columns[df.isnull().any()].union(df_test.columns[df_test.isnull().any()])

df = df.drop(columns=nan_columns, errors='ignore')
df_test = df_test.drop(columns=nan_columns, errors='ignore')

# Remaining missing values per column of the training frame (all zero now)
missing_values = df.isnull().sum()

print(missing_values)

In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Learn the label -> integer mapping from the training labels only ...
df['label'] = label_encoder.fit_transform(df['label'])
# ... and reuse that same mapping for the test labels. Refitting on the test
# frame could assign different integers to the same class whenever the two
# frames do not contain exactly the same set of labels. `transform` raises
# ValueError if the test frame holds a label that was never seen in training.
df_test['label'] = label_encoder.transform(df_test['label'])

In [None]:
# Rows per value of the 'label' column, after outlier removal and encoding
label_counts = df['label'].value_counts()
n_rows = sum(label_counts)


def _count_and_share(share):
    """Wedge annotation: absolute count on the first line, percentage below."""
    return '{:.0f}\n({:.1f}%)'.format(share * n_rows / 100, share)


plt.figure(figsize=(8, 8))
plt.pie(label_counts, labels=label_counts.index, autopct=_count_and_share)
plt.title('Distribution of Labels')
plt.show()

In [None]:
# Show the column labels that remain in `df`
df.columns

## Construction of Interval Information Granules

### Selecting index using time windows

In [17]:
# Window lengths, in the units of the 'time_start' column
time_slices = [16, 128, 256, 1024, 2048, 3016]

# matching_indices_slices[ts][group] -> list of row positions of `df` that
# fall into that group for window length `ts`
matching_indices_slices = {}

# Read the column once as plain Python values. Iterating with df.iterrows()
# builds a full Series for every row, which is far slower and may upcast the
# values; the resulting groups are the same.
time_start_values = df['time_start'].tolist()

for ts in time_slices:
    # upper time limit of the current group
    limit = time_start_values[0] + ts
    # running group number for this window length
    num_group = 0
    # the first row always opens group 0
    current_group = [0]
    matching_indices_slices[ts] = {num_group: current_group}

    for position in range(1, len(time_start_values)):
        time_start = time_start_values[position]
        if time_start >= limit:
            # the row reaches the limit: move the limit and open a new group
            limit = time_start + ts
            num_group += 1
            current_group = [position]
            matching_indices_slices[ts][num_group] = current_group
        else:
            # otherwise the row joins the current group
            current_group.append(position)


### Drop Label column

In [18]:
# Features are all columns except 'label'; the target is the 'label' column
X_train, y_train = df.drop(columns='label'), df['label']

X_test, y_test = df_test.drop(columns='label'), df_test['label']

In [None]:
# Display the training features (`df` without the 'label' column)
X_train

In [None]:
# Display the training target (the 'label' column of `df`)
y_train

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm
import math

### Creating interval information granules

The granules are built using the principle of justifiable granularity.


Both interval bounds are optimised: the lower bound `a` and the upper bound `b` together define which frames are included in the granule.

In [None]:
# Build one interval granule per time-window group.
# For each group the interval (a_opt, b_opt] around the mean of 'time_start'
# is chosen by maximising coverage * specificity separately for each bound.
X_train_granule = {}   # [time_slice][group] -> feature rows inside the granule
y_train_granule = {}   # [time_slice][group] -> labels of those rows
experimental_data = {} # [time_slice][group] -> all feature rows of the group

alpha = 2.0  # decay rate of the specificity term exp(-alpha * |bound - m|)
l = 100      # number of candidate bounds evaluated on each side of the mean

for time_slice in tqdm(time_slices):
    X_train_granule[time_slice] = {}
    y_train_granule[time_slice] = {}
    experimental_data[time_slice] = {}

    for group, group_indices in matching_indices_slices[time_slice].items():
        # rows of the group defined in the time-window step
        group_data = X_train.iloc[group_indices]
        experimental_data[time_slice][group] = group_data

        N = len(group_data)

        # if the group has only one element, the granule is the element itself
        if N == 1:
            X_train_granule[time_slice][group] = group_data
            y_train_granule[time_slice][group] = y_train.loc[group_data.index]
            continue

        time_start = group_data['time_start']
        m = time_start.mean()
        y_max = time_start.max()

        # step between consecutive candidate bounds (same step on both sides)
        delta_y = (y_max - m)/l

        V_b_opt = float('-inf')
        V_a_opt = float('-inf')

        for h in range(0, l):
            b = m+h*delta_y
            a = m-h*delta_y

            # coverage: share of the group's rows in (m, b] and in (a, m];
            # summing the boolean mask counts rows without building a frame
            cov_b = ((time_start > m) & (time_start <= b)).sum()/N
            cov_a = ((time_start > a) & (time_start <= m)).sum()/N

            # specificity: decreases as the bound moves away from the mean
            sp_b = math.exp(-alpha*abs(m-b))
            sp_a = math.exp(-alpha*abs(a-m))

            V_b = cov_b*sp_b
            V_a = cov_a*sp_a

            if V_a > V_a_opt:
                a_opt = a
                V_a_opt = V_a

            if V_b > V_b_opt:
                b_opt = b
                V_b_opt = V_b

        # rows whose time_start lies inside the optimal interval
        in_granule = (time_start > a_opt) & (time_start <= b_opt)
        filtered_granule = group_data[in_granule]

        if filtered_granule.empty:
            continue  # Skip this group if no data fits the condition

        X_train_granule[time_slice][group] = filtered_granule

        # Labels of the selected rows and their most frequent value (mode)
        selected_rows = y_train.loc[filtered_granule.index]
        most_frequent_value = selected_rows.mode().iloc[0]

        # Every row of the granule gets the modal label. The labels are stored
        # in a new Series: writing them back into `y_train` would also change
        # df['label'], make each time slice depend on the ones processed
        # before it, and give different results when the cell is re-run.
        y_train_granule[time_slice][group] = pd.Series(
            most_frequent_value,
            index=selected_rows.index,
            name=selected_rows.name,
            dtype=selected_rows.dtype,
        )

### Training of Random Forest models

In [None]:
from tqdm import tqdm

# list_reports[time_slice] -> classification report (DataFrame) on the test set
# best_models[time_slice]  -> best Random Forest found by the grid search
list_reports = {}
best_models = {}

# Random Forest with GridSearchCV
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
# A fixed random_state makes the fitted forests, and therefore the reports,
# reproducible from one run to the next.
rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_params, cv=10, scoring='accuracy')


for time_slice in tqdm(time_slices):
    # Stack the granules of this time slice into one training set
    y_train_list = pd.concat(y_train_granule[time_slice].values(), ignore_index=True)
    X_train_granule_df = pd.concat(X_train_granule[time_slice].values(), ignore_index=True)

    rf_grid.fit(X_train_granule_df, y_train_list)
    rf_best_model = rf_grid.best_estimator_

    # Evaluate the selected model on the held-out test data
    rf_pred = rf_best_model.predict(X_test)
    report = classification_report(y_test, rf_pred, output_dict=True)

    list_reports[time_slice] = pd.DataFrame(report).transpose()
    best_models[time_slice] = rf_best_model

# Model Performance

In [24]:
# Leave the dataset folder: outputs are written next to it
os.chdir('..')

# Create the 'reports' and 'best_models' folders unless they already exist
for output_dir in ('reports', 'best_models'):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

In [25]:
import time
import joblib

# Timestamp that names the output folders of this run
string = time.strftime("%Y%m%d-%H%M%S")
#string = 'justifiablegranularity_complete'

# One sub-folder per run under ./reports and ./best_models
for output_root in ('reports', 'best_models'):
    os.mkdir(f'./{output_root}/{string}')

# Write one CSV per time slice into the run's report folder
os.chdir(f'./reports/{string}')
for slice_key, slice_report in list_reports.items():
    slice_report.to_csv(f'report_{slice_key}.csv')

# Switch to the run's model folder and store one joblib file per time slice
os.chdir(f'../../best_models/{string}')
for slice_key, fitted_model in best_models.items():
    joblib.dump(fitted_model, f'model_{slice_key}_{string}.joblib')

In [None]:
# Plot one metric of the classification reports against the time slices.
# The slices are placed at equal distances on the x axis and labelled with
# their values.
equidistant_x = np.arange(len(time_slices))

# Row and column of the report table that are plotted. In the table built by
# pd.DataFrame(report).transpose() the 'accuracy' row carries the accuracy in
# every column, so `feature_to_print` only matters for other rows
# (for example 'macro avg' or 'weighted avg').
row_to_print = 'accuracy'
feature_to_print = 'precision'

# Label the figure with what is really plotted; previously the accuracy was
# plotted under the title 'precision'.
if row_to_print == 'accuracy':
    plotted_metric = row_to_print
else:
    plotted_metric = f'{row_to_print} {feature_to_print}'

plt.plot(equidistant_x, [list_reports[i].loc[row_to_print, feature_to_print] for i in time_slices])

plt.title(f'{plotted_metric} over time_slices')
plt.xticks(equidistant_x, time_slices)
plt.xlabel('time slices')
plt.ylabel(plotted_metric)
plt.show()

In [None]:
# Print the classification report of every time slice, separated by blank lines
for time_slice in time_slices:
    slice_report = list_reports[time_slice]
    print(f'Report {time_slice}: \n{slice_report}')
    print('\n')