# 01 - Data Exploration: CICIDS2017 Dataset

This notebook performs exploratory data analysis (EDA) on the CICIDS2017 intrusion detection dataset.

## Objectives:
1. Load and inspect the dataset
2. Understand data structure and types
3. Analyze label distribution
4. Identify missing values and outliers
5. Visualize feature distributions
6. Explore correlations between features

## 1. Setup and Imports

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import yaml

# --- Notebook-wide settings ---
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet, then the seaborn palette,
# then the default figure size.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (14, 8)

# pandas display options: max_columns and width are set to None,
# max_rows to 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

print("‚úì Libraries imported successfully")

## 2. Load Configuration

In [None]:
# Parse the YAML configuration; the path is relative to the notebook's
# working directory.
config_file = '../config/config.yaml'
with open(config_file, 'r') as config_stream:
    config = yaml.safe_load(config_stream)

# Pull out the two settings used by the rest of the notebook.
paths_cfg = config['paths']
dataset_cfg = config['dataset']
RAW_DATA_PATH = paths_cfg['data_raw']
LABEL_COLUMN = dataset_cfg['label_column']

print(f"Raw data path: {RAW_DATA_PATH}")
print(f"Label column: {LABEL_COLUMN}")

## 3. Load Dataset

We start by loading a single file for exploration; Section 12 shows how to load all files if needed.

In [None]:
# Discover the raw CSV files.
# Path.glob() yields entries in arbitrary, filesystem-dependent order, so the
# result is sorted: the numbered listing and the "first file" picked via
# csv_files[0] are then the same on every machine and every run.
csv_files = sorted(Path(RAW_DATA_PATH).glob('*.csv'))

print(f"Found {len(csv_files)} CSV files:")
for idx, file in enumerate(csv_files, 1):
    size_mb = file.stat().st_size / (1024 * 1024)  # bytes -> mebibytes
    print(f"  {idx}. {file.name} ({size_mb:.2f} MB)")

In [None]:
# Load the first CSV file for initial exploration.
# The following cells read `df`, so when no file is available we stop here with
# an explicit error instead of only printing a hint and failing in the next cell
# with an unrelated NameError.
if not csv_files:
    print("‚úó No CSV files found. Please extract the dataset first.")
    raise FileNotFoundError(f"No CSV files found in {RAW_DATA_PATH!r}")

sample_file = csv_files[0]
print(f"Loading: {sample_file.name}")
# low_memory=False turns off pandas' chunked parsing, which can otherwise infer
# mixed dtypes within a single column.
df = pd.read_csv(sample_file, encoding='utf-8', low_memory=False)
print(f"‚úì Loaded {len(df):,} records")

## 4. Basic Dataset Information

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"  - Rows (samples): {n_rows:,}")
print(f"  - Columns (features): {n_cols}")

In [None]:
# Preview the top of the frame; the bare last expression is what the cell displays.
section_title = "\n=== First 5 Rows ==="
print(section_title)
df.head(n=5)

In [None]:
# Numbered listing of every column, in the order they appear in the frame.
print("\n=== Column Names ===")
column_names = list(df.columns)
print(f"Total columns: {len(column_names)}")
for position, column in enumerate(column_names, start=1):
    print(f"{position:2d}. {column}")

In [None]:
# How many columns share each dtype, followed by the per-column dtypes
# (displayed as the cell's last expression).
print("\n=== Data Types ===")
column_dtypes = df.dtypes
print(column_dtypes.value_counts())
print("\nColumn types:")
column_dtypes

In [None]:
# Structural summary of the frame; DataFrame.info writes it to stdout.
section_title = "\n=== Dataset Info ==="
print(section_title)
df.info(buf=None)

## 5. Label Distribution Analysis

In [None]:
# Normalise the column names (strip surrounding whitespace) before looking up
# the configured label column by name.
df.columns = df.columns.str.strip()

# Label distribution: absolute count and percentage share of each class.
if LABEL_COLUMN in df.columns:
    print("\n=== Label Distribution ===")
    # Count once and derive the percentages from those counts, instead of
    # scanning the whole column a second time with value_counts(normalize=True).
    label_counts = df[LABEL_COLUMN].value_counts()
    label_percentages = label_counts / label_counts.sum() * 100

    label_df = pd.DataFrame({
        'Count': label_counts,
        'Percentage': label_percentages
    })

    print(label_df)
    print(f"\nTotal unique labels: {df[LABEL_COLUMN].nunique()}")
else:
    # NOTE: label_counts is left undefined on this path; the cells that plot or
    # summarise the label distribution require it.
    print(f"‚úó Label column '{LABEL_COLUMN}' not found")

In [None]:
# Two-panel view of the class balance: counts as bars, shares as a pie.
plt.figure(figsize=(14, 6))

# Left panel: bar chart of absolute counts
ax_bar = plt.subplot(1, 2, 1)
label_counts.plot(kind='bar', color='steelblue', ax=ax_bar)
ax_bar.set_title('Label Distribution (Count)', fontsize=14, fontweight='bold')
ax_bar.set_xlabel('Attack Type', fontsize=12)
ax_bar.set_ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')  # acts on the current axes, i.e. the bar panel
ax_bar.grid(axis='y', alpha=0.3)

# Right panel: pie chart of percentage shares
ax_pie = plt.subplot(1, 2, 2)
label_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, ax=ax_pie)
ax_pie.set_title('Label Distribution (Percentage)', fontsize=14, fontweight='bold')
ax_pie.set_ylabel('')

plt.tight_layout()
plt.show()

# Ratio between the largest and the smallest class
largest_class = label_counts.max()
smallest_class = label_counts.min()
imbalance_ratio = largest_class / smallest_class
print(f"\n‚ö† Class Imbalance Ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 10:
    print("  ‚Üí High imbalance detected! Consider using SMOTE or other sampling techniques.")

## 6. Missing Values Analysis

In [None]:
# Per-column count and percentage of null cells.
print("\n=== Missing Values Analysis ===")
row_count = len(df)
missing_values = df.isnull().sum()
missing_percentage = (missing_values / row_count) * 100

missing_summary = pd.DataFrame({
    'Missing_Count': missing_values,
    'Percentage': missing_percentage
})

# Keep only the affected columns, worst first.
has_missing = missing_summary['Missing_Count'] > 0
missing_df = missing_summary[has_missing].sort_values('Missing_Count', ascending=False)

if len(missing_df) == 0:
    print("\n‚úì No missing values found!")
else:
    print(f"\nColumns with missing values: {len(missing_df)}")
    print(missing_df)

    # Bar chart of the missing percentage per affected column
    plt.figure(figsize=(12, 6))
    ax = missing_df['Percentage'].plot(kind='bar', color='coral')
    ax.set_title('Missing Values Percentage by Column', fontsize=14, fontweight='bold')
    ax.set_xlabel('Column', fontsize=12)
    ax.set_ylabel('Missing %', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.show()

## 7. Duplicate Records Analysis

In [None]:
# Count fully duplicated rows and express them as a share of all rows.
print("\n=== Duplicate Records ===")
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
duplicate_percentage = (duplicates / len(df)) * 100

print(f"Duplicate rows: {duplicates:,} ({duplicate_percentage:.2f}%)")

if duplicates > 0:
    print("  ‚Üí Duplicates will be removed during preprocessing")

## 8. Numerical Features Analysis

In [None]:
# Collect the numeric columns once and reuse that sub-frame for the statistics.
numeric_frame = df.select_dtypes(include=[np.number])
numerical_cols = numeric_frame.columns.tolist()
print(f"\n=== Numerical Features ===")
print(f"Total numerical features: {len(numerical_cols)}")

# Summary statistics, displayed as the cell's last expression
print("\nBasic Statistics:")
numeric_frame.describe()

In [None]:
# Count +/-inf entries per numeric column; only columns with at least one hit
# are kept in inf_counts.
print("\n=== Infinite Values ===")
per_column_hits = ((name, np.isinf(df[name]).sum()) for name in numerical_cols)
inf_counts = {name: hits for name, hits in per_column_hits if hits > 0}

if not inf_counts:
    print("‚úì No infinite values found")
else:
    print(f"\nColumns with infinite values: {len(inf_counts)}")
    # Show at most the ten most affected columns.
    worst_first = sorted(inf_counts.items(), key=lambda item: item[1], reverse=True)
    for col, count in worst_first[:10]:
        print(f"  {col}: {count:,}")

In [None]:
# Histograms for a hand-picked set of features.
print("\n=== Feature Distributions ===")

# Candidate features to visualize
key_features = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Flow Bytes/s',
    'Flow Packets/s'
]

# Keep only the candidates that are actually present in the frame
existing_features = [name for name in key_features if name in df.columns]

if not existing_features:
    print("‚ö† Key features not found in dataset")
else:
    max_panels = 6  # the 2 x 3 grid below
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))
    axes = axes.flatten()

    for panel, feature in zip(axes, existing_features[:max_panels]):
        # Infinite values are turned into NaN and dropped before plotting
        finite_values = df[feature].replace([np.inf, -np.inf], np.nan).dropna()

        panel.hist(finite_values, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
        panel.set_title(f'{feature}', fontsize=12, fontweight='bold')
        panel.set_xlabel('Value')
        panel.set_ylabel('Frequency')
        panel.grid(alpha=0.3)

    # Drop the grid cells that received no histogram
    for unused_panel in axes[len(existing_features):max_panels]:
        fig.delaxes(unused_panel)

    plt.tight_layout()
    plt.suptitle('Distribution of Key Features', y=1.02, fontsize=16, fontweight='bold')
    plt.show()

## 9. Correlation Analysis

In [None]:
# Correlation matrix of the numeric features, computed on a random sample.
print("\n=== Correlation Analysis ===")
print("Computing correlation matrix (this may take a moment)...")

# Draw at most 10000 rows (fixed seed), turn +/-inf into NaN and drop every
# row that still contains a NaN.
sample_size = min(10000, len(df))
df_sample = (
    df[numerical_cols]
    .sample(n=sample_size, random_state=42)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

correlation_matrix = df_sample.corr()
corr_rows, corr_cols = correlation_matrix.shape
print(f"‚úì Correlation matrix computed ({corr_rows}x{corr_cols})")

In [None]:
# Heatmap restricted to the 20 sampled features with the largest variance.
top_features = df_sample.var().nlargest(20).index
corr_subset = correlation_matrix.loc[top_features, top_features]

plt.figure(figsize=(14, 12))
heatmap_ax = sns.heatmap(
    corr_subset,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
heatmap_ax.set_title('Correlation Matrix (Top 20 Features by Variance)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# List feature pairs whose absolute correlation exceeds 0.95.
print("\n=== High Correlations (>0.95) ===")

feature_names = correlation_matrix.columns
n_features = len(feature_names)

# Walk the upper triangle only (second index > first index) so each pair is
# visited once and the diagonal is skipped.
high_corr_pairs = [
    (feature_names[first], feature_names[second], correlation_matrix.iloc[first, second])
    for first in range(n_features)
    for second in range(first + 1, n_features)
    if abs(correlation_matrix.iloc[first, second]) > 0.95
]

if not high_corr_pairs:
    print("‚úì No highly correlated features found")
else:
    print(f"\nFound {len(high_corr_pairs)} highly correlated feature pairs:")
    # Strongest correlations first; only the first ten are printed.
    strongest_first = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
    for feat1, feat2, corr in strongest_first[:10]:
        print(f"  {feat1[:40]:40s} <-> {feat2[:40]:40s} : {corr:.3f}")
    print("\n  ‚Üí Consider removing one feature from each pair during feature selection")

## 10. Attack Types vs Features

In [None]:
# Compare feature distributions across attack types with per-label box plots.
print("\n=== Feature Analysis by Attack Type ===")

if LABEL_COLUMN in df.columns and existing_features:
    # The first two key features are compared, one panel each.
    features_to_compare = existing_features[:2]

    # The five most frequent labels: value_counts() orders by descending count,
    # whereas unique() would only give the first five labels encountered.
    top_labels = df[LABEL_COLUMN].value_counts().index[:5]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for idx, feature in enumerate(features_to_compare):
        plot_data = []
        labels = []

        for label in top_labels:
            # Rows of this label, with +/-inf and NaN removed from the feature.
            values = (
                df.loc[df[LABEL_COLUMN] == label, feature]
                .replace([np.inf, -np.inf], np.nan)
                .dropna()
            )
            # Cap at 1000 points per label. The cap is taken from the *cleaned*
            # series: sizing it from the unfiltered rows makes sample() raise
            # ValueError whenever values were dropped from a class with fewer
            # than 1000 rows. The fixed seed keeps the plot reproducible.
            values = values.sample(n=min(1000, len(values)), random_state=42)
            plot_data.append(values)
            labels.append(label)

        axes[idx].boxplot(plot_data)
        # Name the boxes through the tick labels rather than boxplot's `labels`
        # keyword, which recent Matplotlib releases deprecate.
        axes[idx].set_xticklabels(labels)
        axes[idx].set_title(f'{feature} by Attack Type', fontsize=12, fontweight='bold')
        axes[idx].set_xlabel('Attack Type')
        axes[idx].set_ylabel(feature)
        axes[idx].tick_params(axis='x', rotation=45)
        axes[idx].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

## 11. Summary and Recommendations

In [None]:
# Final report: recaps the figures computed in the earlier cells and lists the
# preprocessing recommendations.
rule = "="*70

print("\n" + rule)
print("EXPLORATORY DATA ANALYSIS - SUMMARY")
print(rule)

# --- Dataset overview ---
total_samples, total_features = df.shape
print(f"\nüìä Dataset Overview:")
print(f"  ‚Ä¢ Total samples: {total_samples:,}")
print(f"  ‚Ä¢ Total features: {total_features}")
print(f"  ‚Ä¢ Numerical features: {len(numerical_cols)}")
print(f"  ‚Ä¢ Target classes: {df[LABEL_COLUMN].nunique()}")

# --- Data quality ---
print(f"\nüîç Data Quality:")
total_missing = df.isnull().sum().sum()
missing_share = (total_missing/df.size)*100
print(f"  ‚Ä¢ Missing values: {total_missing:,} ({missing_share:.2f}%)")
print(f"  ‚Ä¢ Duplicate rows: {duplicates:,} ({duplicate_percentage:.2f}%)")
print(f"  ‚Ä¢ Columns with infinite values: {len(inf_counts)}")

# --- Class imbalance (first / last entries of label_counts) ---
print(f"\n‚ö†Ô∏è Class Imbalance:")
print(f"  ‚Ä¢ Imbalance ratio: {imbalance_ratio:.2f}:1")
most_common, most_common_n = label_counts.index[0], label_counts.iloc[0]
print(f"  ‚Ä¢ Most common class: {most_common} ({most_common_n:,} samples)")
least_common, least_common_n = label_counts.index[-1], label_counts.iloc[-1]
print(f"  ‚Ä¢ Least common class: {least_common} ({least_common_n:,} samples)")

# --- Feature correlation ---
print(f"\nüîó Feature Correlation:")
print(f"  ‚Ä¢ Highly correlated pairs (>0.95): {len(high_corr_pairs)}")

# --- Recommendations ---
print(f"\nüìù Recommendations:")
for recommendation in (
    f"  1. ‚úì Remove duplicate rows during preprocessing",
    f"  2. ‚úì Handle missing values (drop or impute)",
    f"  3. ‚úì Replace infinite values with NaN",
    f"  4. ‚úì Apply SMOTE or other sampling for class imbalance",
    f"  5. ‚úì Remove highly correlated features",
    f"  6. ‚úì Normalize/standardize numerical features",
    f"  7. ‚úì Consider feature selection (PCA, feature importance)",
):
    print(recommendation)

print("\n" + rule)
print("‚úì Exploratory Data Analysis Complete!")
print("Next step: Run 02_data_preprocessing.ipynb")
print(rule)

## 12. Optional: Load and Explore All Files

In [None]:
# Uncomment the block below to load ALL CSV files (warning: this can be slow and memory-intensive)

# print("\n=== Loading All CSV Files ===")
# dfs = []
# 
# for file in csv_files:
#     print(f"Loading {file.name}...")
#     df_temp = pd.read_csv(file, encoding='utf-8', low_memory=False)
#     dfs.append(df_temp)
#     print(f"  Loaded {len(df_temp):,} records")
# 
# df_all = pd.concat(dfs, ignore_index=True)
# print(f"\n‚úì Total records across all files: {len(df_all):,}")
# 
# # Analyze combined dataset
# print("\n=== Combined Label Distribution ===")
# print(df_all[LABEL_COLUMN].value_counts())