In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from keras.models import Sequential
from keras.layers import Dense
from IPython.display import Audio, display
from google.colab import files

def load_files():
    """Prompt for the three input files through Colab's upload widget.

    The prompts appear in a fixed order: phishing dataset (CSV), Wireshark
    dataset (CSV), then the alert sound (WAV). If several files are selected
    in a single prompt, only the first key of the returned mapping is used.

    Returns:
        A ``(phishing_file, wireshark_file, sound_file)`` tuple holding the
        first key returned by ``files.upload()`` for each prompt.

    Raises:
        ValueError: If a prompt is dismissed without uploading anything.
    """
    def _upload_one(prompt, description):
        # Show the prompt, run one upload dialog, and return the first key.
        print(prompt)
        uploaded = files.upload()
        if not uploaded:
            # Fail here with a clear message instead of an IndexError on [0].
            raise ValueError(f"No {description} was uploaded.")
        return next(iter(uploaded))

    phishing_file = _upload_one(
        "Upload your phishing dataset (CSV format):", "phishing dataset"
    )
    wireshark_file = _upload_one(
        "Upload your Wireshark dataset (CSV format):", "Wireshark dataset"
    )
    sound_file = _upload_one(
        "Upload your sound file (WAV format):", "sound file"
    )

    return phishing_file, wireshark_file, sound_file

# Ask the user for the two CSV datasets and the alert sound
phishing_file, wireshark_file, sound_file = load_files()

# Read the phishing capture into a DataFrame
phishing_df = pd.read_csv(phishing_file)

# Column whose text is used to derive the labels (adjust this if necessary)
label_column_name = 'Info'  # Change if needed
if label_column_name not in phishing_df.columns:
    raise ValueError(f"Label column '{label_column_name}' not found in dataset. Available columns: {phishing_df.columns}")

# Keywords that mark a row as positive when they occur in the label column
phishing_keywords = ['DHCP', 'Publish', 'Request', 'Ack', 'Seq', 'Win', 'TSval', 'TSecr']

# A row is labelled 1 when its text matches any keyword (case-insensitive);
# missing text counts as no match and is labelled 0.
keyword_pattern = '|'.join(phishing_keywords)
keyword_hits = phishing_df[label_column_name].str.contains(
    keyword_pattern, case=False, na=False
)
phishing_labels = keyword_hits.astype(int)

# Training needs both a positive and a negative class
if phishing_labels.nunique() == 1:
    raise ValueError("The dataset does not contain both classes. Please check the label generation logic.")

# Everything except the label column becomes a model feature
phishing_features = phishing_df.drop(columns=[label_column_name])

# Replace each text column with integer codes assigned by pd.factorize
object_columns = phishing_features.select_dtypes(include=['object']).columns
for column_name in object_columns:
    column_codes = pd.factorize(phishing_features[column_name])[0]
    phishing_features[column_name] = column_codes

# Hold out 20% for testing, keeping the class ratio the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    phishing_features,
    phishing_labels,
    test_size=0.2,
    random_state=42,
    stratify=phishing_labels,
)

# Oversample the training split with SMOTE so both classes are balanced
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Tree-based models, both with library-default hyperparameters
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train_resampled, y_train_resampled)

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train_resampled, y_train_resampled)


def _build_dense_binary_classifier(n_features):
    # One hidden Dense(32, relu) layer followed by a sigmoid output unit,
    # compiled for binary cross-entropy with the Adam optimizer.
    network = Sequential([
        Dense(32, activation='relu', input_shape=(n_features,)),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


# Keras models. NOTE: despite the variable names, both are the same plain
# feed-forward Dense architecture; neither has convolutional or recurrent layers.
cnn_model = _build_dense_binary_classifier(X_train.shape[1])
cnn_model.fit(X_train_resampled, y_train_resampled, epochs=10, verbose=1)

rnn_model = _build_dense_binary_classifier(X_train.shape[1])
rnn_model.fit(X_train_resampled, y_train_resampled, epochs=10, verbose=1)

def detect_phishing_attack(packet_data, source_info):
    """Score one packet with every trained model and alert on any positive vote.

    Args:
        packet_data: Data for a single packet, in a form ``pd.DataFrame``
            accepts; it is restricted to the training feature columns and
            flattened into one row before prediction.
        source_info: Text describing the packet origin, echoed in the alert.

    Returns:
        "Phishing attack detected!" if at least one model predicts class 1,
        otherwise "No phishing attack detected."
    """
    # Align the input with the training columns and flatten it to shape (1, n)
    aligned = pd.DataFrame(packet_data, columns=phishing_features.columns)
    model_input = aligned.values.reshape(1, -1)

    # One vote per model; the Keras outputs are thresholded at 0.5
    votes = (
        lgb_model.predict(model_input),
        gb_model.predict(model_input),
        (cnn_model.predict(model_input) > 0.5).astype(int),
        (rnn_model.predict(model_input) > 0.5).astype(int),
    )

    if not any(vote[0] == 1 for vote in votes):
        return "No phishing attack detected."

    print(f"ALERT: Phishing attack detected! Source: {source_info}")
    try:
        # Play the uploaded sound file; a playback failure must not stop detection
        display(Audio(sound_file, autoplay=True))
    except Exception as e:
        print(f"Error playing sound: {e}")
    return "Phishing attack detected!"

# Load Wireshark dataset
wireshark_df = pd.read_csv(wireshark_file)

# Print the available columns in the Wireshark dataset
print("Available columns in the Wireshark dataset:", wireshark_df.columns)

# Columns taken from each Wireshark row; every other training feature is fed as 0.
# Only columns that are also training features reach the models.
relevant_feature_columns = ['Source', 'Info']  # Use 'Source' for IP and 'Info' for relevant details

# Check if all relevant columns are in the Wireshark dataset
missing_columns = [col for col in relevant_feature_columns if col not in wireshark_df.columns]
if missing_columns:
    raise ValueError(f"The following relevant columns are missing from the Wireshark dataset: {missing_columns}")

# Rebuild the value -> integer code mapping that was applied to the training
# features. pd.factorize numbers values in order of first appearance, so running
# it again on the untouched phishing_df columns reproduces the training codes.
# (Factorizing a single packet value on its own would always yield code 0.)
encoded_columns = [col for col in phishing_features.columns if col in relevant_feature_columns]
categorical_columns = phishing_df[encoded_columns].select_dtypes(include=['object']).columns
category_codes = {}
for col in categorical_columns:
    training_uniques = pd.factorize(phishing_df[col])[1]
    category_codes[col] = {value: code for code, value in enumerate(training_uniques)}


def _encode_packet_value(column, value):
    # Translate one raw packet value into the numeric form used during training.
    codes = category_codes.get(column)
    if codes is not None:
        # Values never seen in training (and NaN) get -1, the sentinel
        # pd.factorize itself uses for missing values.
        return codes.get(value, -1)
    # Column was numeric in training: pass the number through (NaN if unparsable).
    return pd.to_numeric(value, errors='coerce')


for _, row in wireshark_df.iterrows():
    packet_data = {
        col: _encode_packet_value(col, row[col]) if col in relevant_feature_columns else 0
        for col in phishing_features.columns
    }

    # Single-row frame for this packet, with any NaN replaced by 0
    packet_data_full = pd.DataFrame([packet_data]).fillna(0)

    # Call the detection function
    source_info = f"Source IP: {row['Source']}, Info: '{row['Info']}'"
    print(detect_phishing_attack(packet_data_full, source_info))