In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# List all device folders
device_folders = ['Device1', 'Device2', 'Device3', 'Device4', 'Device5', 'Device6', 'Device7', 'Device8', 'Device9']

# Devices that are loaded without the Mirai section (only benign + gafgyt).
devices_without_mirai = {'Device3', 'Device7'}

# Attack CSV stems per botnet sub-folder, in the order they are concatenated.
mirai_attacks = ['ack', 'scan', 'syn', 'udp', 'udpplain']
gafgyt_attacks = ['combo', 'scan', 'tcp', 'udp', 'junk']

# Custom bar colors. A device with Mirai data has 11 traffic types
# (benign + 5 mirai + 5 gafgyt), so the palette holds 11 entries to keep
# every bar a distinct color.
colors = ['steelblue', 'mediumseagreen', 'darkorange', 'royalblue',
          'lightcoral', 'mediumorchid', 'goldenrod', 'teal', 'slategray', 'tomato',
          'sienna']


def traffic_sources(device_folder):
    """Return the ``(label, csv_path)`` pairs to load for one device.

    The list starts with the benign capture, followed by the Mirai captures
    (omitted for devices in ``devices_without_mirai``) and then the Gafgyt
    captures. ``label`` is the value written to the ``type`` column.
    """
    botnets = []
    if device_folder not in devices_without_mirai:
        botnets.append(('mirai', mirai_attacks))
    botnets.append(('gafgyt', gafgyt_attacks))

    sources = [('benign', f'./dataset/{device_folder}/benign_traffic.csv')]
    for botnet, attacks in botnets:
        for attack in attacks:
            sources.append((f'{botnet}_{attack}',
                            f'./dataset/{device_folder}/{botnet}/{attack}.csv'))
    return sources


def load_device_traffic(device_folder):
    """Read every CSV of a device and stack them into one labelled DataFrame.

    Each file's rows get a ``type`` column holding the file's label. The
    result has a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if any of the expected CSV files is missing.
    """
    frames = []
    for label, csv_path in traffic_sources(device_folder):
        frame = pd.read_csv(csv_path)
        frame['type'] = label
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)


def save_distribution_plot(label_counts, device_folder):
    """Save a bar chart of ``label_counts`` as ``<device_folder>_distribution_plot.png``."""
    fig, ax = plt.subplots(figsize=(12, 8))
    label_counts.plot(kind='bar', color=colors, ax=ax)

    # Add count numbers on top of each bar
    for position, count in enumerate(label_counts):
        ax.text(position, count + 10, str(count), ha='center', color='black', fontweight='bold')

    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    ax.set_title(f'Distribution of Labels - {device_folder}')

    # Customize the plot aesthetics
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_ylim(top=max(label_counts) * 1.1)  # headroom for the count labels
    ax.grid(axis='y', linestyle='--')

    # Save the plot with a higher DPI setting, then release the figure so
    # figures do not accumulate across devices.
    fig.savefig(f'{device_folder}_distribution_plot.png', dpi=300, bbox_inches='tight')
    plt.close(fig)


# Devices that could not be processed because a CSV file was missing.
skipped_devices = []

# Iterate over each device folder
for device_folder in device_folders:
    try:
        concatenated_df = load_device_traffic(device_folder)
    except FileNotFoundError as err:
        # Skip this device, but say so instead of failing silently.
        print(f'Skipping {device_folder}: {err}')
        skipped_devices.append(device_folder)
        continue

    # Count the occurrences of each label and plot them
    label_counts = concatenated_df['type'].value_counts()
    save_distribution_plot(label_counts, device_folder)

    # Save the concatenated DataFrame in Parquet format, with the row number
    # stored as a leading 'Unnamed: 0' column.
    concatenated_df = concatenated_df.reset_index().rename(columns={'index': 'Unnamed: 0'})
    concatenated_df.to_parquet(f'{device_folder}.parquet', index=False)

if skipped_devices:
    print(f"Created parquet files for {len(device_folders) - len(skipped_devices)} of "
          f"{len(device_folders)} devices; skipped: {', '.join(skipped_devices)}")
else:
    print("All Device parquet files created successfully!")


All Device parquet files created successfully!
