<a href="https://colab.research.google.com/github/fjadidi2001/AD_Prediction/blob/main/CyberAttackDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# FlowChart

In [1]:
from graphviz import Digraph

dot = Digraph(comment='Cyber Attack Detection Flowchart')
dot.node('A', 'Start')
dot.node('B', 'Define Problem & Scope')
dot.node('C', 'Collect Dataset')
dot.node('D', 'Preprocess Data')
dot.node('E', 'Perform EDA')
dot.node('F', 'Select Model')
dot.node('G', 'Train Model')
dot.node('H', 'Evaluate Model')
dot.node('I', 'Optimize Model')
dot.node('J', 'Deploy Model')
dot.node('K', 'Monitor & Retrain')
dot.node('L', 'End')

dot.edges(['AB', 'BC', 'CD', 'DE', 'EF', 'FG', 'GH', 'HJ', 'JK', 'KL'])
dot.edge('H', 'I', label='If performance poor')
dot.edge('I', 'G', label='Retrain')

dot.render('flowchart', format='png', view=True)

'flowchart.png'

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
from google.colab import drive
import torch
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive
drive.mount('/content/drive')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Path to the zip file
zip_path = '/content/drive/MyDrive/network-intrusion-dataset.zip'
extract_dir = '/content/cicids2017/'

# Unzip the dataset
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# List of CSV files
csv_files = [
    'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
    'Friday-WorkingHours-Morning.pcap_ISCX.csv',
    'Monday-WorkingHours.pcap_ISCX.csv',
    'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
    'Tuesday-WorkingHours.pcap_ISCX.csv',
    'Wednesday-workingHours.pcap_ISCX.csv'
]

# Step 1: Load and Combine Datasets
def load_and_combine_data():
    data_frames = []
    base_path = os.path.join(extract_dir, 'MachineLearningCVE')
    for file in csv_files:
        file_path = os.path.join(base_path, file)
        df = pd.read_csv(file_path, encoding='latin1')
        data_frames.append(df)
    combined_df = pd.concat(data_frames, ignore_index=True)
    return combined_df

print("Loading and combining datasets...")
df = load_and_combine_data()
print(f"Dataset shape: {df.shape}")

# Step 2: Initial Data Inspection
def inspect_data():
    print("\nStep 2: Initial Data Inspection")
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nColumn names:")
    print(df.columns.tolist())
    print("\nData types:")
    print(df.dtypes)
    print("\nBasic statistics:")
    print(df.describe())

    # Save column names for reference
    plt.figure(figsize=(15, 5))
    plt.bar(range(len(df.columns)), [1] * len(df.columns))
    plt.xticks(range(len(df.columns)), df.columns, rotation=90)
    plt.title("Dataset Columns")
    plt.tight_layout()
    plt.savefig('columns.png')
    plt.close()

inspect_data()

# Step 3: Check for Missing Values
def check_missing_values():
    print("\nStep 3: Check for Missing Values")
    missing_values = df.isnull().sum()
    print("\nMissing values per column:")
    print(missing_values[missing_values > 0])

    # Visualize missing values
    plt.figure(figsize=(15, 5))
    missing_values[missing_values > 0].plot(kind='bar')
    plt.title("Missing Values per Column")
    plt.xlabel("Columns")
    plt.ylabel("Number of Missing Values")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('missing_values.png')
    plt.close()

check_missing_values()

# Step 4: Handle Missing Values
def handle_missing_values():
    print("\nStep 4: Handle Missing Values")
    global df
    # Replace inf values with NaN
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # Impute numerical columns with median
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    imputer = SimpleImputer(strategy='median')
    df[numerical_cols] = imputer.fit_transform(df[numerical_cols])
    # Verify no missing values remain
    print("\nMissing values after imputation:")
    print(df.isnull().sum().sum())

handle_missing_values()

# Step 5: Analyze Class Distribution
def analyze_class_distribution():
    print("\nStep 5: Analyze Class Distribution")
    # Map labels to Attack (1) and Non-Attack (0)
    df['Label'] = df[' Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
    class_counts = df['Label'].value_counts()
    print("\nClass distribution:")
    print(class_counts)

    # Visualize class distribution
    plt.figure(figsize=(8, 6))
    sns.countplot(x='Label', data=df)
    plt.title("Class Distribution (0: Non-Attack, 1: Attack)")
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.savefig('class_distribution.png')
    plt.close()

    # Detailed attack type distribution
    attack_types = df[df['Label'] == 1][' Label'].value_counts()
    print("\nAttack types distribution:")
    print(attack_types)

    plt.figure(figsize=(10, 6))
    attack_types.plot(kind='bar')
    plt.title("Distribution of Attack Types")
    plt.xlabel("Attack Type")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('attack_types_distribution.png')
    plt.close()

analyze_class_distribution()

# Step 6: Feature Correlation Analysis
def correlation_analysis():
    print("\nStep 6: Feature Correlation Analysis")
    # Select numerical features
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    corr_matrix = df[numerical_cols].corr()

    # Visualize correlation matrix
    plt.figure(figsize=(15, 10))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title("Correlation Matrix of Numerical Features")
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.close()

correlation_analysis()

# Step 7: Feature Distribution Analysis
def feature_distribution():
    print("\nStep 7: Feature Distribution Analysis")
    # Select a few key features for visualization
    key_features = [' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', ' Flow Bytes/s']
    for feature in key_features:
        plt.figure(figsize=(8, 6))
        sns.histplot(df[feature], bins=50, kde=True)
        plt.title(f"Distribution of {feature}")
        plt.xlabel(feature)
        plt.ylabel("Frequency")
        plt.savefig(f'distribution_{feature.replace(" ", "_")}.png')
        plt.close()

feature_distribution()

# Step 8: Data Preprocessing
def preprocess_data():
    print("\nStep 8: Data Preprocessing")
    global df
    # Drop original label column
    df = df.drop(' Label', axis=1)
    # Normalize numerical features
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    numerical_cols = [col for col in numerical_cols if col != 'Label']  # Exclude Label
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

    # Convert features to text for Hugging Face model
    def features_to_text(row):
        text = " ".join([f"{col}:{row[col]}" for col in numerical_cols])
        return text

    df['text'] = df.apply(features_to_text, axis=1)
    print("\nSample text representation:")
    print(df['text'].iloc[0])

    # Save preprocessed dataset
    df[['text', 'Label']].to_csv('preprocessed_cicids2017.csv', index=False)
    print("\nPreprocessed dataset saved as 'preprocessed_cicids2017.csv'")

preprocess_data()

# Step 9: Summary of Preprocessed Data
def summarize_preprocessed_data():
    print("\nStep 9: Summary of Preprocessed Data")
    print("\nShape of preprocessed dataset:")
    print(df[['text', 'Label']].shape)
    print("\nSample data:")
    print(df[['text', 'Label']].head())

summarize_preprocessed_data()

Mounted at /content/drive
Using device: cpu
Loading and combining datasets...


FileNotFoundError: [Errno 2] No such file or directory: '/content/cicids2017/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'