# Importing Libraries

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Connection to drive

In [None]:
# Detect whether the notebook is running on Google Colab by probing for the
# Colab-only package. Only an import failure means "not on Colab"; any other
# exception (e.g. a KeyboardInterrupt) is allowed to propagate instead of
# being silently swallowed by a bare `except:`.
try:
    from google.colab import drive
    IN_COLAB = True
    print("Running on Google Colab. ")
except ImportError:
    IN_COLAB = False
    print("Not running on Google Colab. ")

Running on Google Colab. 


In [None]:
# Mount Google Drive, but only when the IN_COLAB flag is set.
if IN_COLAB:
    from google.colab import drive
    # The Drive contents are mounted at /content/gdrive.
    drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# On Colab, switch the working directory to the dataset folder on the mounted
# Drive so that the relative paths used later resolve against it.
# NOTE(review): the path is hard-coded to one user's Drive layout.
if IN_COLAB:
    os.chdir('/content/gdrive/MyDrive/Tesi/dataset')

# Reading and Merging Data

In [None]:
import zipfile

extract_dir = "./LUFlow"

# Unpack the archive only once: if the target folder is already present the
# extraction is skipped and a short notice is printed instead.
if os.path.exists(extract_dir):
    print('Already extracted')
else:
    zip_file_path = "./LUFlow.zip"

    # Extract every member of the archive into the target folder
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        archive.extractall(extract_dir)

    print("Files extracted successfully.")

Already extracted


In [None]:
# Load a subset of the extracted CSV files and split them, file-wise, into a
# training frame (`df`) and a held-out test frame (`df_test`).
df_list = []
max_files = 6  # upper bound on the number of CSV files that are loaded

# os.walk yields entries in arbitrary (file-system) order. Sorting both the
# directories and the file names makes the selection of files, and therefore
# the train/test split, reproducible across runs.
for root, dirs, files in os.walk(extract_dir):
    dirs.sort()
    for file in sorted(files):
        # Keep only CSV files whose name contains '2020.08', up to max_files.
        # NOTE(review): os.path.isfile(file) tests the bare file name against
        # the current working directory, not against `root` — confirm that
        # this is the intended check.
        if (len(df_list) < max_files
                and not os.path.isfile(file)
                and 'csv' in file
                and '2020.08' in file):
            df_list.append(pd.read_csv(os.path.join(root, file)))

perc = 0.8

# Number of files assigned to the training split
files_perc = int(len(df_list) * perc)
print(f'Num files: {files_perc}')

if files_perc == 0 or files_perc == len(df_list):
    raise ValueError(
        f'Cannot build a train/test split from {len(df_list)} file(s) with perc={perc}'
    )

# The training frame must contain ONLY the training files: concatenating the
# whole list here would leak every test file into the training data.
df = pd.concat(df_list[:files_perc], ignore_index=True)
df_test = pd.concat(df_list[files_perc:], ignore_index=True)

len(df.columns)

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
df.info()

# Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Number of rows per label value
label_counts = df['label'].value_counts()


def _slice_caption(p):
    """Return the caption of one pie slice: absolute count and percentage."""
    return '{:.0f}\n({:.1f}%)'.format(p * sum(label_counts) / 100, p)


plt.figure(figsize=(8, 8))
plt.pie(label_counts, labels=label_counts.index, autopct=_slice_caption)

# Title the chart and render it
plt.title('Distribution of Labels')
plt.show()

# Data Preprocessing

In [None]:
# Boolean mask of the training rows labelled 'outlier'
outliers = df['label'].eq('outlier')

# Keep the remaining rows and renumber them from zero
df = df.loc[~outliers].reset_index(drop=True)

In [None]:
# Boolean mask of the test rows labelled 'outlier'
outliers = df_test['label'].eq('outlier')

# Keep the remaining rows and renumber them from zero
df_test = df_test.loc[~outliers].reset_index(drop=True)

In [None]:
# Columns removed from both splits before modelling
columns_to_drop = [
    'avg_ipt',
    'dest_ip',
    'dest_port',
    'entropy',
    'src_ip',
    'src_port',
    'time_end',
    'total_entropy',
    'duration',
]

# Apply the same removal to the training and the test frame
df = df.drop(columns_to_drop, axis=1)
df_test = df_test.drop(columns_to_drop, axis=1)

In [None]:
# Remove every column that has a missing value in EITHER split. Dropping the
# columns independently per frame could leave the training and test sets with
# different feature columns, which breaks prediction later on.
nan_columns = df.columns[df.isnull().any()].union(
    df_test.columns[df_test.isnull().any()]
)

df = df.drop(columns=nan_columns, errors='ignore')
df_test = df_test.drop(columns=nan_columns, errors='ignore')

# Sanity check: per-column count of missing values left in the training frame
missing_values = df.isnull().sum()

print(missing_values)

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Learn the label -> integer mapping on the training labels only ...
df['label'] = label_encoder.fit_transform(df['label'])
# ... and reuse that SAME mapping for the test labels. Calling fit_transform
# again would refit the encoder on the test labels, so a class could receive a
# different integer in each split whenever their label sets differ.
df_test['label'] = label_encoder.transform(df_test['label'])

In [None]:
# Number of rows per (encoded) label value
label_counts = df['label'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    label_counts,
    labels=label_counts.index,
    # Caption of each slice: absolute count on top, percentage below
    autopct=lambda p: '{:.0f}\n({:.1f}%)'.format(p * sum(label_counts) / 100, p),
)
# Title the chart and render it
plt.title('Distribution of Labels')
plt.show()

In [None]:
# Display the columns left in the training frame after preprocessing.
df.columns

## Construction of Interval Information Granules

### Selecting index using time windows

In [None]:
# Window lengths, in the same unit as the 'time_start' column
time_slices = [16, 128, 256, 1024, 2048, 3016]

# time slice -> {group number -> list of row positions belonging to the group}
matching_indices_slices = {}

# Plain array of the start times. Iterating over it avoids materialising one
# Series per row (as DataFrame.iterrows does) for every window length, and it
# yields row POSITIONS, which is what the later `.iloc` look-ups expect.
time_start_values = df['time_start'].to_numpy()

for ts in time_slices:
  # upper time limit of the current group
  limit = time_start_values[0] + ts
  # number of the group currently being filled
  num_group = 0
  groups = {num_group: []}

  for i, time_start in enumerate(time_start_values):
    if time_start >= limit:
      # The row falls outside the current window: open a new group whose
      # window starts at this row's time_start.
      limit = time_start + ts
      num_group += 1
      groups[num_group] = [i]
    else:
      # The row still belongs to the current window.
      groups[num_group].append(i)

  matching_indices_slices[ts] = groups


### Drop Label column

In [None]:
# Training split: every column except the target forms X, the target forms y
X_train = df.drop(columns=['label'])
y_train = df.loc[:, 'label']

# Test split, separated the same way
X_test = df_test.drop(columns=['label'])
y_test = df_test.loc[:, 'label']

In [None]:
# Display the training feature frame.
X_train

In [None]:
# Display the training target series.
y_train

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm import tqdm
import math

### Creating interval information granules

Each granule is built using the principle of justifiable granularity, which balances coverage against specificity.


In [None]:
# Build one interval information granule per time-window group.
# For every group the upper bound b of the interval [mean, b] is chosen among
# `l` candidates so that coverage(b) * specificity(b) is maximal; the rows
# whose time_start lies at or below the best bound form the granule.
X_train_granule = {}
y_train_granule = {}

# time slice -> {group -> all rows of the group, before the bound is applied}
experimental_data = {}

alpha = 2.0  # decay rate of the specificity term
l = 100      # number of candidate bounds evaluated per group

for time_slice in tqdm(time_slices):
  X_train_granule[time_slice] = {}
  y_train_granule[time_slice] = {}

  experimental_data[time_slice] = {}

  for group in matching_indices_slices[time_slice]:
    # row positions of the previously defined group
    group_indeces = matching_indices_slices[time_slice][group]

    group_data = X_train.iloc[group_indeces]
    experimental_data[time_slice][group] = group_data

    # Only 'time_start' drives the optimisation, so the statistics are taken
    # on that single column and the values are extracted once per group.
    time_start = group_data['time_start']
    time_start_values = time_start.to_numpy()

    N = len(group_data)

    m = time_start.mean()
    y_max = time_start.max()

    # step between two consecutive candidate bounds
    delta_y = (y_max - m) / l

    # Start from the mean so that b_opt is always defined, even when no
    # candidate improves on the initial score.
    b_opt = m
    V_b_opt = float('-inf')

    for h in range(0, l):
      b = m + h * delta_y

      # coverage: fraction of the group's rows at or below the candidate bound
      # TODO: decide whether the bound should be inclusive (<=) or strict (<)
      cov = (time_start_values <= b).sum() / N

      # specificity: decays exponentially with the distance from the mean
      sp = math.exp(-alpha * abs(m - b))

      V_b = cov * sp

      if V_b > V_b_opt:
        b_opt = b
        V_b_opt = V_b

    # TODO: decide whether the bound should be inclusive (<=) or strict (<)
    X_train_granule[time_slice][group] = group_data[time_start <= b_opt]
    # The granule's index holds row labels, so the targets are looked up by label.
    y_train_granule[time_slice][group] = y_train.loc[X_train_granule[time_slice][group].index]


### Training of Random Forest models

In [None]:
from tqdm import tqdm

# Train one grid-searched Random Forest per time slice on the granulated
# training data and evaluate it on the common test set.
list_reports = {}  # time slice -> classification report as a DataFrame
best_models = {}   # time slice -> best estimator found by the grid search

# Random Forest with GridSearchCV
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]}
# A fixed random_state makes the fitted forests, and therefore the reported
# scores, reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_params, cv=10, scoring='accuracy')


for time_slice in tqdm(time_slices):
  # Stack the granules of every group into one training set. The dict views
  # are turned into lists because pd.concat expects a sequence or a mapping.
  y_train_list = pd.concat(list(y_train_granule[time_slice].values()), ignore_index=True)
  X_train_granule_df = pd.concat(list(X_train_granule[time_slice].values()), ignore_index=True)

  rf_grid.fit(X_train_granule_df, y_train_list)
  rf_best_model = rf_grid.best_estimator_

  rf_pred = rf_best_model.predict(X_test)
  report = classification_report(y_test, rf_pred, output_dict=True)

  # One row per class / average, one column per metric
  list_reports[time_slice] = pd.DataFrame(report).transpose()
  best_models[time_slice] = rf_best_model

# Model Performance

In [None]:
import time
import joblib

# Persist the reports and the fitted models in folders named after the
# current timestamp.
timestr = time.strftime("%Y%m%d-%H%M%S")

reports_dir = os.path.join('.', 'reports', timestr)
models_dir = os.path.join('.', 'best_models', timestr)

# makedirs also creates the missing parent folders ('reports', 'best_models'),
# which os.mkdir would fail on.
os.makedirs(reports_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Files are written through explicit paths rather than os.chdir, so the
# kernel's working directory is left untouched and the cell can be re-run.
for report in list_reports:
  list_reports[report].to_csv(os.path.join(reports_dir, f'report_{report}.csv'))

for model in best_models:
  joblib.dump(best_models[model], os.path.join(models_dir, f'model_{model}_{timestr}.joblib'))

In [None]:
# Plot one metric of the classification reports against the time slices.
# The time slices are placed at equidistant x positions and labelled with
# their actual values.
equidistant_x = np.arange(len(time_slices))
feature_to_print = 'precision'
# Row of the report the metric is read from. The 'accuracy' row cannot be
# used: it carries the accuracy value in every column, so a plot titled
# 'precision' would actually show accuracy.
row_to_print = 'macro avg'

plt.plot(equidistant_x, [list_reports[i].loc[row_to_print, feature_to_print] for i in time_slices])

plt.title(f'{feature_to_print} ({row_to_print}) over time_slices')
plt.xticks(equidistant_x, time_slices)
plt.xlabel('time slices')
plt.ylabel(f'{feature_to_print}')
plt.show()

In [None]:
# Print the classification report of every time slice, each followed by two blank lines.
for time_slice in time_slices:
  report_text = f'Report {time_slice}: \n{list_reports[time_slice]}'
  print(report_text + '\n\n')