In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the balanced dataset
df = pd.read_csv("Balanced_Autonomous_Vehicle_Security_Dataset.csv")

In [3]:
# Encode categorical variables (Label, Sensor Status, Attack Type, Protocol, Vehicle Usage, Car Sensor Type, Connection Type)
label_encoder = LabelEncoder()

# Encode the necessary columns
df["Sensor Status"] = label_encoder.fit_transform(df["Sensor Status"])
df["Vehicle Usage"] = label_encoder.fit_transform(df["Vehicle Usage"])
df["Attack Type"] = label_encoder.fit_transform(df["Attack Type"])
df["Protocol"] = label_encoder.fit_transform(df["Protocol"])
df["Car Sensor Type"] = label_encoder.fit_transform(df["Car Sensor Type"])
df["Connection Type"] = label_encoder.fit_transform(df["Connection Type"])
df["Label"] = label_encoder.fit_transform(df["Label"])

# Separate features and target variable
X = df.drop(columns=["Vehicle ID", "Location", "Label"])  # Drop ID and Location columns as they are not useful for the model
y = df["Label"]


In [5]:
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Increase min_samples_split and min_samples_leaf
rf_model = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=10, min_samples_leaf=5, random_state=42)


In [13]:
# Adding some noise to the data by randomly changing a small percentage of the features
import numpy as np

noise_level = 0.05
noise = np.random.normal(0, noise_level, X_train.shape)
X_train_noisy = X_train + noise


In [14]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation to avoid overfitting
cv_scores = cross_val_score(rf_model, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean() * 100:.2f}%")


Cross-validation scores: [0.99875  1.       1.       0.996875 0.999875]
Mean cross-validation score: 99.91%


In [16]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("Balanced_Autonomous_Vehicle_Security_Dataset.csv")

# Preprocessing and encoding (similar as before)
label_encoder = LabelEncoder()
df["Sensor Status"] = label_encoder.fit_transform(df["Sensor Status"])
df["Vehicle Usage"] = label_encoder.fit_transform(df["Vehicle Usage"])
df["Attack Type"] = label_encoder.fit_transform(df["Attack Type"])
df["Protocol"] = label_encoder.fit_transform(df["Protocol"])
df["Car Sensor Type"] = label_encoder.fit_transform(df["Car Sensor Type"])
df["Connection Type"] = label_encoder.fit_transform(df["Connection Type"])
df["Label"] = label_encoder.fit_transform(df["Label"])

# Prepare features and labels
X = df.drop(columns=["Vehicle ID", "Location", "Label"])
y = df["Label"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model with adjusted parameters to prevent overfitting
rf_model = RandomForestClassifier(n_estimators=50, max_depth=6, min_samples_split=10, min_samples_leaf=5, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 99.99%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4003
           1       1.00      1.00      1.00      3997

    accuracy                           1.00      8000
   macro avg       1.00      1.00      1.00      8000
weighted avg       1.00      1.00      1.00      8000



In [18]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 99.99%
