<a href="https://colab.research.google.com/github/gj0210/CMP7239/blob/main/vish%201%20Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import warnings
import zipfile
import os

# Suppress a common warning from imblearn when using SMOTE
warnings.filterwarnings("ignore", category=UserWarning)

# --- 1. DATA LOADING AND INITIAL ANALYSIS ---
print("--- Step 1: Loading Dataset and Initial Analysis ---")

# The path to your uploaded zip file.
zip_file_path = 'archive(4).zip'
csv_file_name = 'CloudWatch_Traffic_Web_Attack.csv'
extracted_dir = 'extracted_data'
extracted_file_path = os.path.join(extracted_dir, csv_file_name)

try:
    # Create directory for extraction if it doesn't exist
    if not os.path.exists(extracted_dir):
        os.makedirs(extracted_dir)

    # Extract the CSV file from the zip archive
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extract(csv_file_name, extracted_dir)
    print(f"Successfully extracted {csv_file_name} from {zip_file_path} to {extracted_dir}")

    # Read the CSV file into a pandas DataFrame.
    # We will also parse the 'time' column as datetime objects
    df = pd.read_csv(extracted_file_path)
    print("Dataset loaded successfully.")
    print("\nInitial Data Preview:")
    display(df.head())
    print("\nDataFrame Info:")
    df.info()

    # Check the class distribution. The 'rule_names' column contains our labels.
    print("\nInitial Class Distribution:")
    print(df['rule_names'].value_counts())

    # We will create a binary target variable: 1 for 'Suspicious Web Traffic' and 0 otherwise.
    df['is_attack'] = df['rule_names'].apply(lambda x: 1 if 'Suspicious Web Traffic' in x else 0)
    print("\nBinary Target Variable Distribution:")
    print(df['is_attack'].value_counts())

except FileNotFoundError:
    print(f"Error: The file at '{zip_file_path}' was not found. Please ensure the file path is correct.")
    exit()
except KeyError:
    print(f"Error: The file '{csv_file_name}' was not found inside the zip archive '{zip_file_path}'.")
    exit()


# --- 2. DATA PREPARATION & HANDLING CLASS IMBALANCE ---
print("\n--- Step 2: Data Preparation & Handling Class Imbalance ---")

# Select features (X) and target (y). We'll drop non-numerical or irrelevant columns.
# Drop columns that are not suitable for direct use in the model and the target column itself.
columns_to_drop = ['creation_time', 'end_time', 'src_ip', 'src_ip_country_code',
                     'protocol', 'dst_ip', 'rule_names', 'observation_name',
                     'source.meta', 'source.name', 'time', 'detection_types', 'is_attack']

# Filter out columns that don't exist in the DataFrame
existing_columns_to_drop = [col for col in columns_to_drop if col in df.columns]

X = df.drop(columns=existing_columns_to_drop)
y = df['is_attack']


# Split the data into training and testing sets before balancing.
# This ensures the test set remains a true, untouched representation of the original data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training set size before balancing: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print("\nTraining set distribution before SMOTE:")
print(y_train.value_counts())

# Apply SMOTE to the training data to balance the classes.
print("\nApplying SMOTE to training data...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("\nTraining set distribution after SMOTE:")
print(y_train_resampled.value_counts())

# --- 3. MODEL TRAINING ---
print("\n--- Step 3: Model Training ---")

# Train Decision Tree Classifier (our baseline)
print("Training Decision Tree model...")
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_resampled, y_train_resampled)

# Train Random Forest Classifier (our main model)
print("Training Random Forest Classifier model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_resampled, y_train_resampled)

# --- 4. MODEL EVALUATION ---
print("\n--- Step 4: Model Evaluation ---")

# Evaluate Decision Tree on the original, unbalanced test set.
print("=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=")
print("     Decision Tree Performance     ")
print("=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=")
y_pred_dt = dt_model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt, target_names=['Normal', 'Attack']))
print("Overall Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))

# Evaluate Random Forest on the original, unbalanced test set.
print("\n\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=")
print("      Random Forest Performance      ")
print("=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=")
y_pred_rf = rf_model.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf, target_names=['Normal', 'Attack']))
print("Overall Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

--- Step 1: Loading Dataset and Initial Analysis ---
Successfully extracted CloudWatch_Traffic_Web_Attack.csv from archive(4).zip to extracted_data
Dataset loaded successfully.

Initial Data Preview:


Unnamed: 0,bytes_in,bytes_out,creation_time,end_time,src_ip,src_ip_country_code,protocol,response.code,dst_port,dst_ip,rule_names,observation_name,source.meta,source.name,time,detection_types
0,5602,12990,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,147.161.161.82,AE,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
1,30912,18186,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.33.6,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
2,28506,13468,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.212.255,CA,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
3,30546,14278,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,136.226.64.114,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
4,6526,13892,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.240.79,NL,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule



DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   bytes_in             282 non-null    int64 
 1   bytes_out            282 non-null    int64 
 2   creation_time        282 non-null    object
 3   end_time             282 non-null    object
 4   src_ip               282 non-null    object
 5   src_ip_country_code  282 non-null    object
 6   protocol             282 non-null    object
 7   response.code        282 non-null    int64 
 8   dst_port             282 non-null    int64 
 9   dst_ip               282 non-null    object
 10  rule_names           282 non-null    object
 11  observation_name     282 non-null    object
 12  source.meta          282 non-null    object
 13  source.name          282 non-null    object
 14  time                 282 non-null    object
 15  detection_types      282 non-null    obj

ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead

In [8]:
from sklearn.datasets import make_classification
from collections import Counter
from imblearn.over_sampling import SMOTE
import pandas as pd

# Generate a synthetic dataset with 2 classes and imbalance
X_synth, y_synth = make_classification(n_classes=2, class_sep=2,
                           weights=[0.9, 0.1], n_informative=3, n_redundant=1,
                           flip_y=0, n_features=20, n_clusters_per_class=1,
                           n_samples=1000, random_state=42)

print("Original dataset shape %s" % Counter(y_synth))

# Apply SMOTE to the synthetic dataset
smote_synth = SMOTE(random_state=42)
X_res_synth, y_res_synth = smote_synth.fit_resample(X_synth, y_synth)

print("Resampled dataset shape %s" % Counter(y_res_synth))

# You can then use X_res_synth and y_res_synth to train your models
# For example:
# dt_model_synth = DecisionTreeClassifier(random_state=42)
# dt_model_synth.fit(X_res_synth, y_res_synth)

Original dataset shape Counter({np.int64(0): 900, np.int64(1): 100})
Resampled dataset shape Counter({np.int64(0): 900, np.int64(1): 900})
