In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# **Loading Dataset**

In [None]:
# Read the flow records from the Kaggle input directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/ssh123/SSH.csv")
df.head(n=5)

# **Data Analysis**

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Remove the "Flow ID" and " Timestamp" columns in a single call
# (note the leading space in " Timestamp", as in the CSV header).
df = df.drop(columns=["Flow ID", " Timestamp"])

In [None]:
# Count the missing values in every column.
df.isna().sum()

#Here we can see that there are no null values, so we don't need to impute any columns.

#The only remaining issue is the data type of some columns. Because these columns help to detect the vulnerability, we can't replace them with any other values.

#So we use one-hot encoding to represent the values.

In [None]:
# Show summary statistics for the columns of the frame.
df.describe()

Our target feature


In [None]:
# Summary statistics computed separately for each value of the target column " Label".
df.groupby([' Label']).describe()

**OneHot technique to work with object datatypes**

Here we retain these columns because, according to the papers we read, they have an effect on detecting the vulnerability.

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encode the " Protocol" column; the generated column names are
# " Protocol_<value>" (get_feature_names_out prefixes the input column name).
one_hot_encoder = OneHotEncoder(sparse_output=False)
protocol_encoded = one_hot_encoder.fit_transform(df[[" Protocol"]])
protocol_encoded_df = pd.DataFrame(
    protocol_encoded,
    columns=one_hot_encoder.get_feature_names_out([" Protocol"]),
)

# Build df2 as a new frame: every original column except " Protocol",
# followed by the one-hot columns. `df` itself is left untouched, so this
# cell can be re-run safely.
df2 = pd.concat(
    [df.reset_index(drop=True).drop(columns=[" Protocol"]), protocol_encoded_df],
    axis=1,
)

# Use one encoder per column. Re-fitting a single LabelEncoder three times
# would leave it fitted on the last column only, so the integer -> class-name
# mapping of " Label" could no longer be recovered.
label_encoder = LabelEncoder()
df2[" Label"] = label_encoder.fit_transform(df2[" Label"])

source_ip_encoder = LabelEncoder()
df2[" Source IP"] = source_ip_encoder.fit_transform(df2[" Source IP"])

destination_ip_encoder = LabelEncoder()
df2[" Destination IP"] = destination_ip_encoder.fit_transform(df2[" Destination IP"])

# Preview the encoded frame (rich display of the first rows instead of
# printing the whole DataFrame).
df2.head()

In [None]:
# Per-class summary statistics of the encoded frame (" Label" is now integer-coded).
df2.groupby([' Label']).describe()

# **Balancing the dataset**

In [None]:
# Separate the target column " Label" from the remaining predictor columns.
result = df2[" Label"]
features = df2.drop(columns=[" Label"])

In [None]:
# Randomly duplicate rows of every class except the majority class until all
# classes have as many rows as the majority class.
# RandomOverSampler is already imported in the first cell.
# random_state is fixed (same seed as the models below) so that the resampled
# data, and everything derived from it, is reproducible across runs.
var = RandomOverSampler(sampling_strategy="not majority", random_state=42)
feature_balance, result_balance = var.fit_resample(features, result)


In [None]:
# Put the resampled predictors and the resampled target side by side in one frame.
df_merged = pd.concat(
    objs=[feature_balance, result_balance],
    axis="columns",
)

In [None]:
# Per-class summary statistics of the resampled frame.
df_merged.groupby([" Label"]).describe()

In [None]:
# Column names, dtypes and non-null counts of the resampled frame.
df_merged.info()

# Histogram

In [None]:
import matplotlib.pyplot as plt

In [None]:
cols = [" Source IP", " Destination IP", " Source Port", " Destination Port", "Total Length of Fwd Packets", " Total Length of Bwd Packets"," SYN Flag Count"," RST Flag Count", " ACK Flag Count", " Label"]

# (encoded label, legend name, colour) for each class.
# NOTE(review): assumes the label encoding maps 0/1/2 to these class names
# in this order, and that the dataset spells the attacks "Patator" -- confirm
# against the classes_ of the encoder fitted on " Label".
class_styles = [
    (0, "BENIGN", "blue"),
    (1, "FTP-Patator", "red"),
    (2, "SSH-Patator", "black"),
]

# One figure per feature (the last entry of `cols` is the target and is
# skipped), overlaying the per-class distributions.
for label in cols[:-1]:
    for class_id, class_name, colour in class_styles:
        plt.hist(
            df_merged.loc[df_merged[" Label"] == class_id, label],
            color=colour,
            label=class_name,
            alpha=0.7,
            density=True,
        )
    plt.title(label)
    # density=True normalises each histogram, so the y axis is a density,
    # not a raw count.
    plt.ylabel("Density")
    plt.xlabel(label)
    plt.legend()
    plt.show()

# **Correlation Matrix**

In [None]:
# Pairwise correlation between the columns of the encoded frame
# (DataFrame.corr with its default settings); displayed as the cell output.
corr = df2.corr()
corr

In [None]:
import seaborn as sns

# Draw the correlation matrix as an annotated heatmap on a large canvas.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data=corr, annot=True, cmap="winter", ax=ax)
fig.suptitle(t="heatmap", color="purple", fontsize=20);

# **Data PreProcessing**

In [None]:
# Predictor columns kept for modelling.
feature_columns = [
    " Source IP",
    " Destination IP",
    " Source Port",
    " Destination Port",
    " Protocol_0",
    " Protocol_6",
    " Protocol_17",
    "Total Length of Fwd Packets",
    " Total Length of Bwd Packets",
    " SYN Flag Count",
    " RST Flag Count",
    " ACK Flag Count",
]

# NOTE(review): feature_balance / result_balance are re-assigned here from df2,
# i.e. from the data as it was before oversampling, replacing the resampled
# frames created in the balancing section.
clean_df = df2[feature_columns + [" Label"]]
result_balance = clean_df[" Label"]
feature_balance = clean_df.drop(columns=[" Label"])

# **Data Training**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree

# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions of the full data in both partitions, so every class is
# represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    feature_balance,
    result_balance,
    test_size=0.3,
    random_state=42,
    stratify=result_balance,
)

# feature_balance / result_balance are built from df2 (not from the oversampled
# frame), so the classes are balanced here, and only on the training partition.
# Oversampling after the split keeps duplicated rows out of the test set,
# which would otherwise inflate the reported scores.
# RandomOverSampler is imported in the first cell.
X_train, y_train = RandomOverSampler(
    sampling_strategy="not majority", random_state=42
).fit_resample(X_train, y_train)

In [None]:
# Preview the training predictors.
X_train.head()

In [None]:
# Preview the test predictors.
X_test.head()

In [None]:
# Preview the training targets.
y_train.head()

In [None]:
# Preview the test targets.
y_test.head()

# Machine Learning Models

# Decision Tree

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Build the classifier with a fixed seed, then fit it on the training split.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X=X_train, y=y_train)

# Predict the held-out rows and report the share of correct predictions.
y_pred = decision_tree_model.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100}%")

In [None]:
# Draw the fitted decision tree with nodes coloured by majority class.
plt.figure(figsize=(15, 10))
tree.plot_tree(
    decision_tree_model,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate feature_names
    # strictly as a list and reject a pandas Index.
    feature_names=list(feature_balance.columns),
    class_names=["0", "1", "2"],
)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Per-class precision / recall / F1 for the predictions currently held in y_pred.
report_text = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report_text}")

In [None]:
# Rows are the actual classes, columns the predicted classes.
cof_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n{cof_matrix}")

In [None]:
# Heatmap of the decision-tree confusion matrix.
fig, ax = plt.subplots(figsize=(8, 8))
# fmt="d" prints the integer counts in full; seaborn's default ".2g" format
# abbreviates counts of three or more digits to scientific notation.
# ax=ax draws on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(cof_matrix, annot=True, fmt="d", cmap="coolwarm", ax=ax)
fig.suptitle(t="Confusion Matrix", color="orange", fontsize=20)
# Trailing semicolon suppresses the repr of the returned list in the notebook.
ax.set(xlabel="Predicted Label", ylabel="Actual label");

# **Random Forest**

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Assuming you have a dataset with features (X) and labels (y)
# Replace X and y with your actual feature and label data

# Split the dataset into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print(f"Accuracy: {accuracy*100}%")

**Model Evaluation**

Metrics used for model evaluation:
    a) Classification Report
    b) Confusion Matrix
    c) Accuracy Score

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Per-class precision / recall / F1 for the predictions currently held in y_pred.
report_text = classification_report(y_test, y_pred)
print(f"Classification Report:\n{report_text}")

In [None]:
# Rows are the actual classes, columns the predicted classes.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n{cf_matrix}")

**Visualization of CF matrix**

In [None]:
# Heatmap of the random-forest confusion matrix.
fig, ax = plt.subplots(figsize=(8, 8))
# fmt="d" prints the integer counts in full; seaborn's default ".2g" format
# abbreviates counts of three or more digits to scientific notation.
# ax=ax draws on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="coolwarm", ax=ax)
fig.suptitle(t="Confusion Matrix", color="orange", fontsize=20)
# Trailing semicolon suppresses the repr of the returned list in the notebook.
ax.set(xlabel="Predicted Label", ylabel="Actual label");

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X=X_train, y=y_train)

# y_pred now holds the naive-Bayes predictions for the held-out rows.
y_pred = naive_bayes_model.predict(X_test)

# Share of correctly classified test rows, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy*100}%")



In [None]:
# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

In [None]:
# Heatmap of the naive-Bayes confusion matrix.
fig, ax = plt.subplots(figsize=(8, 8))
# fmt="d" prints the integer counts in full; seaborn's default ".2g" format
# abbreviates counts of three or more digits to scientific notation.
# ax=ax draws on the axes created above instead of relying on the implicit
# "current axes".
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="coolwarm", ax=ax)
fig.suptitle(t="Confusion Matrix", color="orange", fontsize=20)
# Trailing semicolon suppresses the repr of the returned list in the notebook.
ax.set(xlabel="Predicted Label", ylabel="Actual label");