# Exploratory Data Analysis for CoT-KG Network Intrusion Detection

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_processing.preprocess import load_and_preprocess_data

%matplotlib inline
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later
# removed; newer releases ship it as 'seaborn-v0_8'. Pick whichever name the
# installed matplotlib actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## Load Data

In [None]:
# Load the CSV through the project's preprocessing helper.
raw_csv_path = '../data/raw/CICIDS2017.csv'
df = load_and_preprocess_data(raw_csv_path)

print(f"Dataset shape: {df.shape}")

# Last expression of the cell: rendered as the cell output.
df.head()

## Data Overview

In [None]:
# Print the summary produced by df.info() (presumably column dtypes and
# non-null counts, assuming df is a pandas DataFrame).
df.info()

## Class Distribution

In [None]:
# Bar chart of how many rows carry each value of the 'Label' column.
class_counts = df['Label'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Network Traffic Classes')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
# Tilt the class names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Statistical Summary

In [None]:
# Last expression of the cell: the result of df.describe() is rendered as the
# cell output.
df.describe()

## Correlation Analysis

In [None]:
# Pairwise correlations are computed over the numeric columns only.
numeric_cols = df.select_dtypes(include=[np.number]).columns
corr_matrix = df.loc[:, numeric_cols].corr()

# Large canvas; cell annotations are switched off.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap of Numeric Features')
heatmap_fig.tight_layout()
plt.show()

## Feature Distributions

In [None]:
def plot_feature_distributions(df, features, n_cols=3):
    """Plot a histogram with a KDE overlay for each requested feature.

    The plots are laid out on a grid with ``n_cols`` columns and as many rows
    as are needed to fit every feature; grid cells left over in the last row
    are removed from the figure.

    Args:
        df: Table indexed by column name; ``df[feature]`` is handed to
            ``sns.histplot``.
        features: Iterable of column names to plot. When it is empty no
            figure is created.
        n_cols: Number of columns in the subplot grid.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    features = list(features)
    if not features:
        # Nothing to draw; plt.subplots rejects a grid with zero rows.
        return

    n_rows = (len(features) - 1) // n_cols + 1
    # squeeze=False always returns a 2-D array of Axes, so flatten() also
    # works when the grid collapses to a single row, column or cell.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, feature in zip(axes, features):
        sns.histplot(df[feature], ax=ax, kde=True)
        ax.set_title(feature)
        # The title already names the feature, so drop the x-axis label.
        ax.set_xlabel('')

    # Remove the grid cells that no feature was assigned to.
    for unused_ax in axes[len(features):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

# Columns whose distributions are plotted below (a hand-picked subset).
features_to_plot = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Fwd Packet Length Max',
    'Bwd Packet Length Max',
]

plot_feature_distributions(df, features=features_to_plot)

## Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Prepare the data. Only numeric/boolean columns are used as predictors
# (mirroring the numeric filter of the correlation analysis), since the
# classifier cannot be fitted on string/object columns.
# NOTE(review): assumes load_and_preprocess_data has already removed NaN and
# infinite values - confirm, otherwise rf.fit will raise.
X = df.drop(columns='Label').select_dtypes(include=[np.number, 'bool'])
y = LabelEncoder().fit_transform(df['Label'])

# Train a Random Forest classifier. n_jobs=-1 builds the trees on all
# available cores; the fixed random_state keeps the run reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X, y)

# Rank the features by importance and keep the 20 highest-scoring ones.
importances = (
    pd.DataFrame({'feature': X.columns, 'importance': rf.feature_importances_})
    .sort_values('importance', ascending=False)
    .head(20)
)

# Plot feature importances
plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=importances)
plt.title('Top 20 Most Important Features')
plt.tight_layout()
plt.show()

## Pairplot of Top Features

In [None]:
# The five highest-ranked features plus the class label used for colouring.
top_features = importances['feature'].head(5).tolist() + ['Label']

# A pairplot draws one panel per feature pair, so feeding it every row of a
# large table is extremely slow. Shuffle the rows and keep at most
# PAIRPLOT_ROWS_PER_CLASS per label, so that rare classes stay visible while
# dominant classes are capped.
PAIRPLOT_ROWS_PER_CLASS = 1000
pairplot_data = (
    df[top_features]
    .sample(frac=1, random_state=42)
    .groupby('Label')
    .head(PAIRPLOT_ROWS_PER_CLASS)
)

sns.pairplot(pairplot_data, hue='Label')
plt.tight_layout()
plt.show()

## Conclusion

Based on our exploratory data analysis, we can draw the following conclusions:

1. The dataset is imbalanced, with some classes being significantly underrepresented.
2. There are strong correlations between certain features, which might indicate redundancy.
3. Many features have skewed distributions, which might require normalization or transformation.
4. The most important features identified by the Random Forest classifier are good candidates for our initial focus in the CoT-KG model.
5. The pairplot of top features shows clear separation between some classes, indicating that these features are indeed informative for classification.

Next steps:
1. Address class imbalance through techniques like oversampling or undersampling.
2. Consider feature selection or dimensionality reduction to address multicollinearity.
3. Apply appropriate scaling or transformation to the skewed features.
4. Incorporate the insights from this EDA into the design of our CoT prompts and knowledge graph structure.