<a href="https://colab.research.google.com/github/chennurignaneshwar/Intrusion-detection-system/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install PyShark for .pcap file processing
!pip install pyshark

# Install Scapy for advanced packet processing
!pip install scapy

# Install TensorFlow (already installed on Colab, but for the latest version)
!pip install tensorflow

# Install tshark
!apt-get install -y tshark

# Add tshark to PATH
import os
os.environ['PATH'] += ":/usr/bin/tshark"

from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/MyDrive/traffic/traffic.pcap'

#PACKET LEVEL FEATURE EXTRACTION


import pyshark
import asyncio

# Load .pcap file, using the data_path variable defined earlier
# Wrap the FileCapture in a coroutine
async def capture_packets(data_path):
  """Read a capture file with PyShark and extract packet-level features.

  The coroutine contains no ``await``; it is declared ``async`` only so the
  caller can drive it on the notebook's event loop.

  Args:
    data_path: Path of the .pcap file handed to ``pyshark.FileCapture``.

  Returns:
    A list with one dict per usable packet, with the keys ``timestamp``,
    ``source_ip``, ``destination_ip``, ``protocol`` and ``packet_size``.
    Packets that lack one of the accessed attributes are skipped.
  """
  capture = pyshark.FileCapture(data_path)
  data = []
  try:
    for packet in capture:
      try:
        data.append({
            'timestamp': packet.sniff_time,
            'source_ip': packet.ip.src,
            'destination_ip': packet.ip.dst,
            'protocol': packet.transport_layer,
            'packet_size': int(packet.length)
        })
      except AttributeError:
        # The packet is missing an attribute read above (presumably it has
        # no IP layer, e.g. ARP), so it cannot contribute a feature row.
        continue
  finally:
    # Always release the capture, even if iteration fails part-way through.
    capture.close()
  return data

import nest_asyncio
import pandas as pd

# Reuse the event loop the notebook kernel is already running instead of
# creating a second one.
loop = asyncio.get_running_loop()

# run_until_complete() cannot normally be called on a loop that is already
# running; the nest_asyncio patch allows this nested use.
nest_asyncio.apply()
data = loop.run_until_complete(capture_packets(data_path))

# One DataFrame row per extracted packet.
df = pd.DataFrame(data)
df.head()

Collecting pyshark
  Downloading pyshark-0.6-py3-none-any.whl.metadata (806 bytes)
Collecting appdirs (from pyshark)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Downloading pyshark-0.6-py3-none-any.whl (41 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: appdirs, pyshark
Successfully installed appdirs-1.4.4 pyshark-0.6
Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scapy-2.6.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.6.1
Reading package lists... Done
Building dependency tree.

MessageError: Error: credential propagation was unsuccessful

In [None]:
# CONVERSION TO TIME SERIES DATA:-

# Index the packets by capture time. The result gets its own name instead of
# overwriting `df`, so this cell can be re-run without a KeyError on 'timestamp'.
packets_by_time = df.set_index('timestamp')
packets_by_time.index = pd.to_datetime(packets_by_time.index)

# Aggregate into 5-second windows. All three statistics come from
# `packet_size`, which is set on every row; counting `protocol` would skip
# packets whose transport layer is missing (None).
# ('5s' replaces the deprecated '5S' alias.)
time_series = packets_by_time['packet_size'].resample('5s').agg(['mean', 'sum', 'count'])
time_series.columns = ['avg_packet_size', 'total_packet_size', 'packet_count']
time_series = time_series.reset_index()
time_series.head()

In [None]:
#PREPROCESSING:--------------
from sklearn.preprocessing import MinMaxScaler

feature_columns = ['avg_packet_size', 'total_packet_size', 'packet_count']

# Replace every missing value with 0 (modifies time_series in place).
time_series.fillna(0, inplace=True)

# Min-max scale the three feature columns and wrap the result in a DataFrame
# that keeps the same column names.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(time_series[feature_columns])
normalized_df = pd.DataFrame(normalized_data, columns=feature_columns)
normalized_df.head()

In [None]:
# SEPARATION OF INDIVIDUAL ATTACK CLASS INSTANCES

import numpy as np

# Function to generate labels with attack samples
def generate_labels(df, attack_ratio=0.2, seed=None):
    """Assign synthetic 'normal'/'attack' labels to the rows of ``df``.

    The labels are drawn at random; they are not derived from the traffic.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        attack_ratio: Fraction of rows labelled 'attack', between 0 and 1.
            The attack count is ``int(attack_ratio * len(df))``.
        seed: Optional integer seed. When given, the same rows are picked on
            every call. When None, NumPy's global random state is used.

    Returns:
        A NumPy string array of length ``len(df)`` holding 'normal' or 'attack'.

    Raises:
        ValueError: If ``attack_ratio`` is outside [0, 1].
    """
    if not 0 <= attack_ratio <= 1:
        raise ValueError(f"attack_ratio must be between 0 and 1, got {attack_ratio}")

    n_samples = len(df)
    # '<U6' holds both 'normal' and 'attack' and stays a string dtype for n_samples == 0.
    labels = np.full(n_samples, 'normal', dtype='<U6')

    n_attacks = int(attack_ratio * n_samples)
    if n_attacks:
        rng = np.random if seed is None else np.random.RandomState(seed)
        attack_indices = rng.choice(n_samples, n_attacks, replace=False)
        labels[attack_indices] = 'attack'
    return labels

# Seed NumPy's global random state so the randomly drawn labels are the same
# on every run of the notebook.
np.random.seed(42)
time_series['label'] = generate_labels(time_series)

# Split the normalized features by label. normalized_df and time_series are
# row-aligned, so the boolean masks built from time_series select matching rows.
normal_traffic = normalized_df[time_series['label'] == 'normal']
attack_traffic = normalized_df[time_series['label'] == 'attack']

# Print class distribution
print("\nCounts of Instances:")
print(f"Normal Traffic: {len(normal_traffic)}")
print(f"Attack Traffic: {len(attack_traffic)}")

# No intrusion verdict is printed here: it depends on the model's predictions
# (`y_pred`), which do not exist until the evaluation cell has run.



In [None]:

#Feature Selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif

from functools import partial

# Features and binary target (1 = attack, 0 = normal).
X = normalized_df
y = (time_series['label'] == 'attack').astype(int)

# Keep the 2 features with the highest mutual information with the target.
# mutual_info_classif is randomized, so its seed is fixed to make the
# selected features the same on every run.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)
selector = SelectKBest(seeded_mutual_info, k=2)
selected_features = selector.fit_transform(X, y)
selected_features_df = pd.DataFrame(selected_features, columns=['Feature1', 'Feature2'])
selected_features_df.head()


In [None]:
#TRAINING VALIDATION AND TESTING

from sklearn.model_selection import train_test_split

# First cut: 70% of the samples for training, 30% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    selected_features_df,
    y,
    random_state=42,
    test_size=0.3,
)
# Second cut: the held-out part is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=42,
    test_size=0.5,
)

In [None]:
# BUILD RNN

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

import numpy as np

# Reshape to (samples, 1, features): every sample becomes a sequence with a
# single timestep. np.asarray with a "-1" feature axis accepts both the
# DataFrames from train_test_split and arrays that were already reshaped, so
# this cell can be re-run safely.
X_train = np.asarray(X_train).reshape((len(X_train), 1, -1))
X_val = np.asarray(X_val).reshape((len(X_val), 1, -1))
X_test = np.asarray(X_test).reshape((len(X_test), 1, -1))

# Two stacked LSTM layers followed by a sigmoid unit for the binary
# normal/attack decision.
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(32),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The model is not fitted here; the next cell trains it with class weights.
# Fitting in both cells trained the network twice and left the first run out
# of the recorded `history`.
model.summary()

In [None]:
#TRAINING THE MODEL:----------------------------
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# 'balanced' class weights, computed from the training labels only so that
# validation and test labels do not influence training.
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Key the weights by the actual class label rather than by position, so the
# mapping stays correct even if a class is missing from the training split.
class_weights_dict = {
    int(label): float(weight) for label, weight in zip(classes, class_weights)
}

# Train the model with class weights; `history` feeds the accuracy plot later.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10, batch_size=32,
    class_weight=class_weights_dict
)


In [None]:
#EVALUATING AND VISUALIZING RESULTS:--------------------------------------
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Class encoding used when the target was built: 0 = normal, 1 = attack.
class_ids = [0, 1]
class_names = ["Normal", "Attack"]

# Threshold the sigmoid outputs at 0.5 and flatten the predictions to 1-D.
y_pred = (model.predict(X_test) > 0.5).astype(int).reshape(-1)

# Display unique values
print("Unique values in y_test:", np.unique(y_test))
print("Unique values in y_pred:", np.unique(y_pred))

# An intrusion is reported when at least one test sample is predicted as
# class 1 (attack).
if 1 in y_pred:
    print("\n🚨 Intrusion Detected! 🚨\n")
else:
    print("\n✅ No Intrusion Detected. ✅\n")

# Always report both classes. Deriving `labels` from y_pred would drop or
# mislabel a class whenever the model predicts only one of them.
# zero_division=0 reports 0 for metrics that are undefined for a class.
report = classification_report(
    y_test, y_pred,
    labels=class_ids, target_names=class_names, zero_division=0
)
print(report)

# Confusion Matrix Visualization (fixed 2x2 layout so the tick labels always match)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

# Training vs Validation Accuracy per epoch
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training vs Validation Accuracy')
plt.show()