In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [83]:
df = pd.read_csv(r"C:\Users\Fatma PC\Desktop\networksecurity_project\data\processed\snmp_merged_dataset.csv")

In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3528 entries, 0 to 3527
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   srcip        3528 non-null   object 
 1   srcport      3528 non-null   int64  
 2   dstip        3528 non-null   object 
 3   dstport      3528 non-null   int64  
 4   proto        3528 non-null   object 
 5   dur          3528 non-null   float64
 6   sbytes       3528 non-null   int64  
 7   dbytes       3528 non-null   int64  
 8   sttl         3528 non-null   float64
 9   dttl         3528 non-null   float64
 10  service      3528 non-null   object 
 11  Label        3528 non-null   int64  
 12  source_file  3528 non-null   object 
dtypes: float64(3), int64(5), object(5)
memory usage: 358.4+ KB


In [85]:
df.duplicated().sum()

np.int64(0)

In [86]:
df.describe()

Unnamed: 0,srcport,dstport,dur,sbytes,dbytes,sttl,dttl,Label
count,3528.0,3528.0,3528.0,3528.0,3528.0,3528.0,3528.0,3528.0
mean,25059.098639,23140.967687,25.335956,72098.370465,0.0,82.929989,82.929989,0.377551
std,25319.91955,24055.872104,303.239649,93153.90724,0.0,41.299983,41.299983,0.484843
min,53.0,53.0,0.0,71.0,0.0,1.0,1.0,0.0
25%,53.0,53.0,0.0,82.0,0.0,64.0,64.0,0.0
50%,33232.0,5355.0,0.58233,164.0,0.0,64.0,64.0,0.0
75%,50521.0,47128.5,1.77058,186228.0,0.0,128.0,128.0,1.0
max,65487.0,65429.0,5710.19594,372572.0,0.0,255.0,255.0,1.0


In [87]:
df=df.drop(['source_file'],axis=1)

In [88]:
df['Label'].value_counts()

Label
0    2196
1    1332
Name: count, dtype: int64

In [89]:
df.isnull().sum()

srcip      0
srcport    0
dstip      0
dstport    0
proto      0
dur        0
sbytes     0
dbytes     0
sttl       0
dttl       0
service    0
Label      0
dtype: int64

In [90]:
df.head()

Unnamed: 0,srcip,srcport,dstip,dstport,proto,dur,sbytes,dbytes,sttl,dttl,service,Label
0,10.0.2.6,43217,10.0.2.9,161,udp,1.936872,179134,0,64.0,64.0,snmp,1
1,10.0.2.9,161,10.0.2.6,54065,udp,4.046434,186228,0,128.0,128.0,snmp,1
2,10.0.2.9,161,10.0.2.6,57854,udp,1.261654,186228,0,128.0,128.0,snmp,1
3,10.0.2.6,39846,10.0.2.9,161,udp,2.920267,179134,0,64.0,64.0,snmp,1
4,10.0.2.6,49923,10.0.2.9,161,udp,3.221906,179248,0,64.0,64.0,snmp,1


In [91]:
df=df.drop(['srcip','dstip','service','dbytes'],axis=1)

In [92]:
df.head()

Unnamed: 0,srcport,dstport,proto,dur,sbytes,sttl,dttl,Label
0,43217,161,udp,1.936872,179134,64.0,64.0,1
1,161,54065,udp,4.046434,186228,128.0,128.0,1
2,161,57854,udp,1.261654,186228,128.0,128.0,1
3,39846,161,udp,2.920267,179134,64.0,64.0,1
4,49923,161,udp,3.221906,179248,64.0,64.0,1


In [93]:
df['proto'].unique()

array(['udp'], dtype=object)

In [94]:
df=df.drop(['proto'],axis=1)

In [95]:
correlation = df.corr()['Label'].abs().sort_values(ascending=False)
correlation

Label      1.000000
sbytes     0.990646
sttl       0.248330
dttl       0.248330
dur        0.053846
srcport    0.050866
dstport    0.008956
Name: Label, dtype: float64

In [96]:
df=df.drop(['dstport'],axis=1)

In [97]:
y = df["Label"].astype(int)
x=df.drop(['Label'],axis=1)


In [98]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y
)

In [99]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [100]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print(pd.Series(y_train_smote).value_counts())



After SMOTE:
Label
0    1757
1    1757
Name: count, dtype: int64


In [101]:
from sklearn.feature_selection import SelectKBest, f_classif

X_fs = X_train_smote  # features AFTER train-test split & SMOTE
y_fs = y_train_smote

selector = SelectKBest(score_func=f_classif, k="all")
selector.fit(X_fs, y_fs)

anova_scores = pd.DataFrame({
    "Feature": x.columns,
    "ANOVA_F_score": selector.scores_
}).sort_values(by="ANOVA_F_score", ascending=False)

print(anova_scores)


   Feature  ANOVA_F_score
2   sbytes  166801.877634
4     dttl     283.234441
3     sttl     283.234441
1      dur      14.952718
0  srcport      14.671227


In [102]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)


In [103]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

rf.fit(X_train_smote, y_train_smote)

y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, digits=4))



=== Random Forest Results ===
Accuracy: 0.9971671388101983
[[439   0]
 [  2 265]]
              precision    recall  f1-score   support

           0     0.9955    1.0000    0.9977       439
           1     1.0000    0.9925    0.9962       267

    accuracy                         0.9972       706
   macro avg     0.9977    0.9963    0.9970       706
weighted avg     0.9972    0.9972    0.9972       706



In [104]:
from sklearn.svm import SVC

svm = SVC(
    kernel="rbf",
    C=1.0,
    gamma="scale",
    random_state=42
)

svm.fit(X_train_scaled, y_train_smote)

y_pred_svm = svm.predict(X_test_scaled)

print("\n=== SVM Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, digits=4))



=== SVM Results ===
Accuracy: 0.9971671388101983
[[439   0]
 [  2 265]]
              precision    recall  f1-score   support

           0     0.9955    1.0000    0.9977       439
           1     1.0000    0.9925    0.9962       267

    accuracy                         0.9972       706
   macro avg     0.9977    0.9963    0.9970       706
weighted avg     0.9972    0.9972    0.9972       706



In [105]:
import os, pickle

ART_DIR = os.path.join("..", "artifacts")
os.makedirs(ART_DIR, exist_ok=True)

FEATURES = ["srcport", "dur", "sbytes", "sttl", "dttl"]  # must match your training

# model variable name must match your notebook (change rf_model if needed)
with open(os.path.join(ART_DIR, "rf_model.pkl"), "wb") as f:
    pickle.dump(rf, f)   # <-- change to your model variable name

with open(os.path.join(ART_DIR, "features.pkl"), "wb") as f:
    pickle.dump(FEATURES, f)

print("Saved: artifacts/rf_model.pkl and artifacts/features.pkl")


Saved: artifacts/rf_model.pkl and artifacts/features.pkl


In [106]:
import os
print(os.listdir("artifacts"))


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'artifacts'

In [None]:
import pickle

with open("artifacts/features.pkl", "rb") as f:
    FEATURES = pickle.load(f)

with open("artifacts/rf_model.pkl", "rb") as f:
    rf = pickle.load(f)

print("Loaded features:", FEATURES)
print("Loaded model:", type(rf))


Loaded features: ['srcport', 'dur', 'sbytes', 'sttl', 'dttl']
Loaded model: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
