# DeceptiCloud Data Analysis

This notebook analyzes the results from DeceptiCloud experiments comparing:
- **Adaptive**: DQN agent dynamically selecting honeypots
- **Static**: Fixed honeypot deployment (baseline)

## Objectives
1. Load and visualize experiment results
2. Compare adaptive vs static performance
3. Statistical significance testing
4. Reward trajectory analysis
5. Action distribution analysis
6. Attack frequency and honeypot-attack match-rate analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a
# 12x6 inch default figure size for every figure created below.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Marks that the import/configuration cell ran to completion.
print("Libraries imported successfully")

## 1. Load Experiment Data

Load both summary and per-timestep results from the adaptive and static experiments.

In [None]:
# Load adaptive (DQN) results. These files are required: pd.read_csv raises
# FileNotFoundError if the adaptive experiment has not produced them yet.
adaptive_summary = pd.read_csv('../results/results_summary.csv')
adaptive_timestep = pd.read_csv('../results/results_per_timestep.csv')

# Static baseline results are optional.
static_summary_path = '../results/static_results_summary.csv'
static_timestep_path = '../results/static_results_per_timestep.csv'

# has_static is True only when BOTH static files exist, so any code guarded by
# has_static can rely on static_summary and static_timestep being loaded.
# Checking just the summary file would crash on a missing per-timestep file.
missing_static = [path for path in (static_summary_path, static_timestep_path)
                  if not os.path.exists(path)]
has_static = not missing_static

if has_static:
    static_summary = pd.read_csv(static_summary_path)
    static_timestep = pd.read_csv(static_timestep_path)
else:
    print("Warning: Static baseline results not found. Run static experiment first.")
    print("To run static experiment: python scripts/run_static_experiment.py")
    print(f"Missing file(s): {', '.join(missing_static)}")

# Row counts: one summary row per episode, one per-timestep row per step.
print(f"Adaptive episodes: {len(adaptive_summary)}")
print(f"Adaptive timesteps: {len(adaptive_timestep)}")
if has_static:
    print(f"Static episodes: {len(static_summary)}")
    print(f"Static timesteps: {len(static_timestep)}")

# Display first few rows as a quick sanity check of the loaded columns
print("\nAdaptive Summary (first 5 rows):")
print(adaptive_summary.head())

## 2. Reward Trajectories

Visualize how total reward per episode evolves over training, alongside the agent's epsilon (exploration rate) for each episode.

In [None]:
# Side-by-side panels: per-episode reward (left) and exploration rate (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: one reward curve per experiment. The static baseline is drawn
# only when its results were loaded (has_static).
reward_curves = [(adaptive_summary, 'o', 'Adaptive (DQN)')]
if has_static:
    reward_curves.append((static_summary, 's', 'Static Baseline'))
for curve_df, curve_marker, curve_label in reward_curves:
    ax1.plot(curve_df['episode'], curve_df['total_reward'],
             marker=curve_marker, label=curve_label, linewidth=2)
ax1.set(xlabel='Episode', ylabel='Total Reward', title='Reward per Episode')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: epsilon value recorded for each adaptive episode.
ax2.plot(adaptive_summary['episode'], adaptive_summary['epsilon'],
         marker='o', color='orange', linewidth=2)
ax2.set(xlabel='Episode', ylabel='Epsilon (Exploration Rate)',
        title='Agent Exploration Over Time')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Reward summary statistics, one section per loaded experiment. The static
# heading carries a leading newline so the two sections are visually separated.
stat_sections = [("Adaptive Summary Statistics:", adaptive_summary)]
if has_static:
    stat_sections.append(("\nStatic Summary Statistics:", static_summary))
for section_heading, section_df in stat_sections:
    section_rewards = section_df['total_reward']
    print(section_heading)
    print(f"  Mean reward: {section_rewards.mean():.2f}")
    print(f"  Std reward: {section_rewards.std():.2f}")
    print(f"  Min reward: {section_rewards.min():.2f}")
    print(f"  Max reward: {section_rewards.max():.2f}")

## 3. Action Distribution Analysis

Analyze which honeypots the agent deployed most frequently.

In [None]:
# Display name and bar colour for each action id. Colours are keyed by action
# id (not by bar position) so an action keeps its colour even when another
# action never occurs in the data.
action_names = {0: 'Do Nothing', 1: 'Deploy Cowrie (SSH)', 2: 'Deploy Web'}
action_colors = {0: 'gray', 1: 'skyblue', 2: 'coral'}

# Count action frequencies, ordered by action id
adaptive_actions = adaptive_timestep['action'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Action distribution (bar chart)
ax = axes[0]
actions = [action_names.get(i, f'Action {i}') for i in adaptive_actions.index]
bar_colors = [action_colors.get(i, 'lightgray') for i in adaptive_actions.index]
ax.bar(actions, adaptive_actions.values, color=bar_colors)
ax.set_ylabel('Frequency')
ax.set_title('Agent Action Distribution')
ax.grid(True, alpha=0.3, axis='y')

# Plot 2: Action over time
ax = axes[1]
# Moving-average window: 50 steps, or a fifth of the data for short runs.
# Clamped to at least 1 because `len // 5` is 0 for fewer than 5 rows.
window = max(1, min(50, len(adaptive_timestep) // 5))
if len(adaptive_timestep) > window:
    # The smoothed series is stored on the frame, as before, in case other
    # cells read the 'action_smooth' column; re-running this cell just
    # overwrites it with the same values.
    adaptive_timestep['action_smooth'] = adaptive_timestep['action'].rolling(window=window).mean()
    ax.plot(adaptive_timestep.index, adaptive_timestep['action_smooth'], linewidth=2)
    ax.set_xlabel('Timestep')
    ax.set_ylabel('Action (smoothed)')
    ax.set_title(f'Action Trend (Moving Avg, window={window})')
    # The y-axis is the mean action id, so ticks sit at the three action ids.
    ax.set_yticks([0, 1, 2])
    ax.set_yticklabels(['Do Nothing', 'SSH', 'Web'])
    ax.grid(True, alpha=0.3)
else:
    # Too few rows for a trend line: say so instead of leaving a blank panel.
    ax.set_title('Action Trend')
    ax.text(0.5, 0.5, 'Not enough timesteps for a trend',
            ha='center', va='center', transform=ax.transAxes)

plt.tight_layout()
plt.show()

# Text summary: count and share of all timesteps for each action taken.
print("Action Distribution:")
for action, count in adaptive_actions.items():
    pct = count / len(adaptive_timestep) * 100
    print(f"  {action_names.get(action, f'Action {action}')}: {count} ({pct:.1f}%)")

## 4. Attack Detection Analysis

Analyze how often SSH and web attacks occur, and how often the matching honeypot was deployed at the time of the attack (the match rate).

In [None]:
# Attack totals per type.
# NOTE(review): the sums and the `== 1` comparisons below agree only if the
# attack columns are 0/1 indicators per timestep -- TODO confirm against the
# experiment logger.
ssh_attacks = adaptive_timestep['ssh_attack'].sum()
web_attacks = adaptive_timestep['web_attack'].sum()
total_timesteps = len(adaptive_timestep)

# Calculate matches: timesteps where an attack coincided with the honeypot
# id this analysis pairs with it (1 for SSH attacks, 2 for web attacks).
ssh_matches = ((adaptive_timestep['ssh_attack'] == 1) &
               (adaptive_timestep['current_honeypot'] == 1)).sum()
web_matches = ((adaptive_timestep['web_attack'] == 1) &
               (adaptive_timestep['current_honeypot'] == 2)).sum()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One fixed colour per attack type, shared by both panels.
ssh_color, web_color = 'skyblue', 'coral'

# Plot 1: Attack frequency
ax = axes[0]
attack_counts = [ssh_attacks, web_attacks]
ax.bar(['SSH Attacks', 'Web Attacks'], attack_counts, color=[ssh_color, web_color])
ax.set_ylabel('Number of Timesteps with Attack')
ax.set_title('Attack Type Frequency')
ax.grid(True, alpha=0.3, axis='y')

# Plot 2: Match rate (honeypot matched to attack)
ax = axes[1]
match_rates = []
labels = []
# Colours are collected alongside the bars so that a lone Web bar (no SSH
# attacks recorded) is still drawn in the Web colour.
match_colors = []
if ssh_attacks > 0:
    ssh_match_rate = ssh_matches / ssh_attacks * 100
    match_rates.append(ssh_match_rate)
    labels.append(f'SSH Match Rate\n({ssh_matches}/{ssh_attacks})')
    match_colors.append(ssh_color)
if web_attacks > 0:
    web_match_rate = web_matches / web_attacks * 100
    match_rates.append(web_match_rate)
    labels.append(f'Web Match Rate\n({web_matches}/{web_attacks})')
    match_colors.append(web_color)

ax.set_title('Honeypot-Attack Match Rate')
if match_rates:
    ax.bar(labels, match_rates, color=match_colors)
    ax.set_ylabel('Match Rate (%)')
    ax.set_ylim([0, 100])
    ax.grid(True, alpha=0.3, axis='y')
else:
    # No attacks of either type: explain the empty panel.
    ax.text(0.5, 0.5, 'No attacks recorded',
            ha='center', va='center', transform=ax.transAxes)

plt.tight_layout()
plt.show()

# Share of timesteps with each attack type; guarded so an empty results file
# does not divide by zero.
if total_timesteps > 0:
    ssh_share = ssh_attacks / total_timesteps * 100
    web_share = web_attacks / total_timesteps * 100
else:
    ssh_share = web_share = 0.0

print(f"Total timesteps: {total_timesteps}")
print(f"SSH attacks detected: {ssh_attacks} ({ssh_share:.1f}%)")
print(f"Web attacks detected: {web_attacks} ({web_share:.1f}%)")
# Match rates are defined (and printed) only for attack types that occurred.
if ssh_attacks > 0:
    print(f"SSH match rate: {ssh_matches}/{ssh_attacks} ({ssh_match_rate:.1f}%)")
if web_attacks > 0:
    print(f"Web match rate: {web_matches}/{web_attacks} ({web_match_rate:.1f}%)")

## 5. Statistical Comparison

Compare adaptive vs static performance using statistical tests.

In [None]:
if has_static:
    # Per-episode total rewards of the two experiments being compared.
    adaptive_rewards = adaptive_summary['total_reward']
    static_rewards = static_summary['total_reward']

    # Comparison bar chart: mean total reward with +/- one standard deviation
    fig, ax = plt.subplots(figsize=(10, 6))

    methods = ['Adaptive (DQN)', 'Static Baseline']
    mean_rewards = [adaptive_rewards.mean(), static_rewards.mean()]
    std_rewards = [adaptive_rewards.std(), static_rewards.std()]

    bars = ax.bar(methods, mean_rewards, yerr=std_rewards, capsize=10,
                  color=['skyblue', 'lightcoral'], alpha=0.8)
    ax.set_ylabel('Mean Total Reward')
    ax.set_title('Adaptive vs Static Performance Comparison')
    ax.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars. The plus-minus sign is written as an escape
    # (\u00b1) so it cannot be corrupted by a file-encoding round trip.
    for bar, mean, std in zip(bars, mean_rewards, std_rewards):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{mean:.2f}\u00b1{std:.2f}',
                ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # Statistical tests
    print("=" * 60)
    print("STATISTICAL COMPARISON")
    print("=" * 60)

    # T-test (parametric). scipy's default is Student's t-test, which assumes
    # equal population variances; pass equal_var=False for Welch's variant if
    # the two reward variances differ substantially.
    t_stat, t_pval = stats.ttest_ind(adaptive_rewards, static_rewards)
    print("\nIndependent t-test:")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {t_pval:.4f}")
    if t_pval < 0.05:
        print("  Result: Statistically significant difference (p < 0.05)")
    else:
        print("  Result: No significant difference (p >= 0.05)")

    # Mann-Whitney U test (non-parametric)
    u_stat, u_pval = stats.mannwhitneyu(adaptive_rewards, static_rewards,
                                        alternative='two-sided')
    print("\nMann-Whitney U test (non-parametric):")
    print(f"  U-statistic: {u_stat:.4f}")
    print(f"  p-value: {u_pval:.4f}")
    if u_pval < 0.05:
        print("  Result: Statistically significant difference (p < 0.05)")
    else:
        print("  Result: No significant difference (p >= 0.05)")

    # Effect size (Cohen's d) using the pooled standard deviation weighted by
    # each group's degrees of freedom, so unequal episode counts are handled:
    #   s_p = sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
    # count() ignores missing values, matching pandas' var()/mean().
    n_adaptive = adaptive_rewards.count()
    n_static = static_rewards.count()
    if n_adaptive >= 2 and n_static >= 2:
        pooled_std = np.sqrt(((n_adaptive - 1) * adaptive_rewards.var() +
                              (n_static - 1) * static_rewards.var()) /
                             (n_adaptive + n_static - 2))
    else:
        # Sample variance is undefined with fewer than two observations.
        pooled_std = np.nan

    if pooled_std > 0:  # False for both 0 and NaN
        cohens_d = (adaptive_rewards.mean() - static_rewards.mean()) / pooled_std
        print(f"\nEffect Size (Cohen's d): {cohens_d:.4f}")
        # Conventional bands: 0.2 = small, 0.5 = medium, 0.8 = large.
        if abs(cohens_d) < 0.2:
            print("  Interpretation: Negligible effect")
        elif abs(cohens_d) < 0.5:
            print("  Interpretation: Small effect")
        elif abs(cohens_d) < 0.8:
            print("  Interpretation: Medium effect")
        else:
            print("  Interpretation: Large effect")
    else:
        cohens_d = np.nan
        print("\nEffect Size (Cohen's d): undefined "
              "(pooled standard deviation is zero or unavailable)")

else:
    print("Skipping statistical comparison - static baseline data not available")