In [2]:
# Libraries to help with reading and manipulating data
import pandas as pd
import numpy as np

# Libaries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# To tune model, get different metric scores, and split data
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
)
from sklearn import metrics

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To impute missing values
from sklearn.impute import SimpleImputer

# To oversample and undersample data
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# To do hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)





In [3]:
# To suppress scientific notations
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To suppress warnings
import warnings

warnings.filterwarnings("ignore")

# This will help in making the Python code more structured automatically (good coding practice)
#%load_ext nb_black

In [4]:
df = pd.read_csv('Kddtrain_finalbinary.csv') ##  Complete the code to read the data
df.head(5)

Unnamed: 0,duration_real,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creation,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,3,28,1,43,76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,69,147,0.0,0.0,0.0,0.0,1.0,0.0,0.01,255,211,0.83,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0,1,29,1,340,1275,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3,0.0,0.0,0.0,0.0,1.0,0.0,0.67,98,255,1.0,0.0,0.01,0.02,0.0,0.0,0.01,0.01,0
2,0,1,29,1,347,6239,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,7,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7,255,1.0,0.0,0.14,0.04,0.0,0.0,0.0,0.0,0
3,0,1,25,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9,0.0,0.33,1.0,0.33,1.0,0.0,0.78,118,79,0.33,0.04,0.01,0.03,0.0,0.0,0.89,0.66,1
4,0,1,29,1,369,238,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,7,253,1.0,0.0,0.14,0.21,0.0,0.0,0.43,0.91,0


In [5]:
df_test = pd.read_csv('Kddtest_finalbinary.csv') ##  Complete the code to read the data
df_test.head(5)

Unnamed: 0,duration_real,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creation,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,2,1,5,1,2036,375,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,6,62,1.0,0.0,0.17,0.03,0.0,0.0,0.0,0.0,0
1,0,1,29,1,314,12923,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,35,37,0.0,0.0,0.0,0.0,1.0,0.0,0.05,160,255,1.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0
2,0,1,29,1,287,2513,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,7,7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16,229,1.0,0.0,0.06,0.04,0.0,0.0,0.0,0.0,0
3,0,1,29,1,217,10490,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,255,1.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0
4,1,1,5,1,1023,336,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,150,98,0.65,0.03,0.01,0.0,0.01,0.01,0.0,0.0,0


In [6]:
# Checking number of rows and columns in the training data
df.shape

(79919, 42)

In [7]:
# Checking  number of rows and columns in the test data
df_test.shape

(11212, 42)

In [8]:
# Creating copy of the training data
data_training = df.copy()

In [9]:
# Creating copy of the training data
data_test = df_test.copy()

In [10]:
# Checking data format/types of the columns in the dataset
print(data_training.info())
print(data_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79919 entries, 0 to 79918
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration_real                79919 non-null  int64  
 1   protocol_type                79919 non-null  int64  
 2   service                      79919 non-null  int64  
 3   flag                         79919 non-null  int64  
 4   src_bytes                    79919 non-null  int64  
 5   dst_bytes                    79919 non-null  int64  
 6   land                         79919 non-null  int64  
 7   wrong_fragment               79919 non-null  int64  
 8   urgent                       79919 non-null  int64  
 9   hot                          79919 non-null  int64  
 10  num_failed_logins            79919 non-null  int64  
 11  logged_in                    79919 non-null  int64  
 12  num_compromised              79919 non-null  int64  
 13  root_shell      

In [11]:
# Cheking Duplicacy
data_training.duplicated().sum()
data_test.duplicated().sum()

0

In [12]:
print(data_training.duplicated().sum(), data_test.duplicated().sum())

13 0


In [13]:
#drop duplicates on training data
data_training = data_training.drop_duplicates()

In [14]:
print(data_training.duplicated().sum(), data_test.duplicated().sum())

0 0


In [15]:
# Check missing entries in the train data
data_training.isnull().sum().sum()

0

In [16]:
# Check missing entries in the testing data
data_test.isnull().sum().sum() ##  Complete the code to check missing entries in the test data

0

In [17]:
data_training.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration_real,79906.0,444.75,3249.54,0.0,0.0,0.0,0.0,42908.0
protocol_type,79906.0,1.422,0.773,1.0,1.0,1.0,1.0,3.0
service,79906.0,20.119,13.163,1.0,4.0,29.0,29.0,71.0
flag,79906.0,1.288,1.047,1.0,1.0,1.0,1.0,12.0
src_bytes,79906.0,67328.131,7369352.637,0.0,43.0,215.0,309.0,1379963888.0
dst_bytes,79906.0,30062.998,5048565.21,0.0,0.0,326.0,1536.0,1309937401.0
land,79906.0,0.0,0.009,0.0,0.0,0.0,0.0,1.0
wrong_fragment,79906.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
urgent,79906.0,0.0,0.016,0.0,0.0,0.0,0.0,3.0
hot,79906.0,0.195,2.121,0.0,0.0,0.0,0.0,77.0


In [18]:
data_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration_real,11212.0,40.784,1109.877,0.0,0.0,0.0,0.0,54451.0
protocol_type,11212.0,1.395,0.781,1.0,1.0,1.0,1.0,3.0
service,11212.0,21.727,12.426,1.0,5.0,29.0,29.0,66.0
flag,11212.0,1.246,1.03,1.0,1.0,1.0,1.0,12.0
src_bytes,11212.0,2192.08,79178.063,0.0,46.0,223.0,302.0,6291668.0
dst_bytes,11212.0,3608.861,29238.013,0.0,50.0,426.0,2239.0,1345927.0
land,11212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wrong_fragment,11212.0,0.011,0.176,0.0,0.0,0.0,0.0,3.0
urgent,11212.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hot,11212.0,0.045,1.171,0.0,0.0,0.0,0.0,101.0


In [19]:
# Checking the balance of the traning dataset interms of class distrbution
data_training["class"].value_counts(normalize=True)

0   0.843
1   0.157
Name: class, dtype: float64

In [20]:
# Checking the balance of the testing dataset interms of class distrbution
data_test["class"].value_counts(normalize=True)

0   0.866
1   0.134
Name: class, dtype: float64

In [21]:
data_training = data_training.drop(['wrong_fragment', 'num_outbound_cmds'], axis=1)
data_test = data_test.drop(['wrong_fragment', 'num_outbound_cmds'], axis=1)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

# Load your training and testing datasets
# Replace 'training_data.csv' and 'testing_data.csv' with your actual data files
train_data = data_training
test_data = data_test

# Assuming your features are in columns 'feature1', 'feature2', ..., and target in 'target'
X_train = train_data.drop(columns=['class'])
y_train = train_data['class']
X_test = test_data.drop(columns=['class'])
y_test = test_data['class']

# Create an SVM classifier
svm_classifier = SVC(kernel='linear')

# Apply 5-fold cross-validation on training data
cross_val_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5)

# Train the SVM model on the entire training data
svm_classifier.fit(X_train, y_train)

# Use the trained SVM model to make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Print cross-validation scores and predictions
print("Cross-Validation Scores:", cross_val_scores)
print("Predicted Labels on Testing Data:", y_pred)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import time

# Load your training and testing datasets
# Replace 'training_data.csv' and 'testing_data.csv' with your actual data files
train_data = data_training
test_data = data_test

# Assuming your features are in columns 'feature1', 'feature2', ..., and target in 'target'
X_train = train_data.drop(columns=['class'])
y_train = train_data['class']
X_test = test_data.drop(columns=['class'])
y_test = test_data['class']

# Create an SVM classifier
svm_classifier = SVC(kernel='linear')
  
# Measure the time taken to run the process
start_time = time.time()

# Apply 5-fold cross-validation on training data
cross_val_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5)

# Train the SVM model on the entire training data
svm_classifier.fit(X_train, y_train)

# Use the trained SVM model to make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Calculate classification report
report = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1'])

# Calculate ROC curve and AUC
y_scores = svm_classifier.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

# Print cross-validation scores and classification report
print("Cross-Validation Scores:", cross_val_scores)
print("Classification Report:\n", report)
print("ROC AUC:", roc_auc)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Calculate the time taken
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken, "seconds")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import time

# Load your training and testing datasets
# Replace 'training_data.csv' and 'testing_data.csv' with your actual data files
train_data = data_training
test_data = data_test

# Assuming your features are in columns 'feature1', 'feature2', ..., and target in 'target'
X_train = train_data.drop(columns=['class'])
y_train = train_data['class']
X_test = test_data.drop(columns=['class'])
y_test = test_data['class']

# Create an SVM classifier
svm_classifier = SVC(kernel='linear')
  
# Measure the time taken to run the process
start_time = time.time()

# Apply 5-fold cross-validation on training data
cross_val_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5)

# Train the SVM model on the entire training data
svm_classifier.fit(X_train, y_train)

# Use the trained SVM model to make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Calculate ROC curve and AUC
y_scores = svm_classifier.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = roc_auc_score(y_test, y_scores)

# Print cross-validation scores and evaluation metrics
print("Cross-Validation Scores:", cross_val_scores)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC:", roc_auc)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# Calculate the time taken
end_time = time.time()
time_taken = end_time - start_time
print("Time taken:", time_taken, "seconds")
