In [1]:
import os
import sklearn
import numpy as np
import pandas as pd
import sys, traceback
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier 
from sklearn.neighbors import KNeighborsClassifier

In [2]:

data = pd.read_csv("./data/vector_allocation3_1_Com_FINAL_WITHOUT_DUPLICATE_NEW_SORTED_COMBINED.csv")

col_names = data.columns.to_list()

print("Columns Names: ", col_names)

print("\nSample data:")

data.head()


Columns Names:  ['URLID', 'stypeID', 'r0', 'r1', 'r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15', 'r16', 'r17', 'r18', 'r19', 'r20', 'r21', 'r22', 'r23', 'r24', 'r25', 'r26', 'r27', 'r28', 'r29', 'r30', 'r31', 'r32', 'r33', 'r34', 'r35', 'r36', 'r37', 'r38', 'r39', 'r40', 'r41', 'r42', 'r43', 'r44', 'r45', 'r46', 'r47', 'r48', 'r49', 'r50', 'r51', 'r52', 'r53', 'r54', 'r55', 'r56', 'r57', 'r58', 'r59', 'r60', 'r61', 'r62', 'r63', 'r64', 'r65', 'r66', 'r67', 'r68', 'r69', 'r70', 'r71', 'r72', 'r73', 'r74', 'r75', 'r76', 'r77', 'r78', 'r79', 'r80', 'r81', 'r82', 'r83', 'r84', 'r85', 'r86', 'r87', 'r88', 'r89', 'r90', 'r91', 'r92', 'r93', 'r94', 'r95', 'r96', 'r97', 'r98', 'r99', 'r100', 'r101', 'r102', 'r103', 'r104', 'r105', 'r106', 'r107', 'r108', 'r109', 'r110', 'r111', 'attack_status_value', 'vectors_class_output', 'attack_vector_class', 'attack_vector_type']

Sample data:


Unnamed: 0,URLID,stypeID,r0,r1,r2,r3,r4,r5,r6,r7,...,r106,r107,r108,r109,r110,r111,attack_status_value,vectors_class_output,attack_vector_class,attack_vector_type
0,1,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,No,0,BENIGN
1,2,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,No,0,BENIGN
2,3,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,No,0,BENIGN
3,4,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,No,0,BENIGN
4,5,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,No,0,BENIGN


In [3]:
# DISPLAY DATATYPWE FOR EACH COLUMJS

data.dtypes

URLID                    int64
stypeID                  int64
r0                       int64
r1                       int64
r2                       int64
                         ...  
r111                     int64
attack_status_value      int64
vectors_class_output    object
attack_vector_class      int64
attack_vector_type      object
Length: 118, dtype: object

### CHECK FOR MISSING VALUES

In [4]:
data.isnull().any().sum()

# No missing values on dataset

0

In [5]:
data.attack_vector_class.describe()

count    102186.000000
mean          8.829830
std           4.719239
min           0.000000
25%           6.000000
50%           8.000000
75%          12.000000
max          18.000000
Name: attack_vector_class, dtype: float64

In [163]:
# Create our X (all the fearures except Y {feature matrix})
X = data.drop(["URLID","stypeID", "attack_status_value", "attack_vector_type", "vectors_class_output", "attack_vector_class"], axis=1)

Y = data["attack_status_value"]

## FEATURE SELECTION

In [164]:
import joblib

try:
    selections = joblib.load("selection_RFE.array")
    
except FileNotFoundError:
    
    model = RandomForestClassifier() #Using the default hyper params

    rfe = RFE(model)

    rfe = rfe.fit(X, Y)

    selections = rfe.support_
    
    joblib.dump(selections, "selection_RFE.array")
    


    
# Using Boolean Mask Technique to get accepted features

selections = np.array(selections)

joblib.dump(selections, "selection_RFE.array")

selected_X = X.columns[selections]


selected_X = data[selected_X].drop("attack_status_value", axis=1)

selected_X

IndexError: boolean index did not match indexed array along dimension 0; dimension is 112 but corresponding boolean dimension is 113

In [81]:
# 1. LogisticRegression
# 2. RandomForest Classiffier
# 3. SVM(Support Vector Machines)
# 4. XGBoost Classifier
# 5. KNeighborsClassifier

## DEVELOPING THE MODEL

In [165]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=0)

### LOGISTIC REGERESSION

In [166]:
from sklearn.linear_model import LogisticRegression



logreg = LogisticRegression(max_iter=200000, C=10)

logreg.fit(x_train, y_train)

In [167]:
print("Logistic Regression Model: {:.5f}".format(accuracy_score(y_test, logreg.predict(x_test))))

Logistic Regression Model: 0.99830


### RANDOM FOREST

In [168]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier()

rf.fit(x_train, y_train)

In [169]:
print("Random Forest Accuracy: {:.5f}".format(accuracy_score(y_test, rf.predict(x_test))))

Random Forest Accuracy: 0.99843


### SUPPORT VECTOR MACHINES

In [155]:
from sklearn.svm import SVC


svc = SVC(C=10)

svc.fit(x_train, y_train)

In [156]:
print("Support Vector Accuracy: {:.5f}".format(accuracy_score(y_test, svc.predict(x_test))))

Support Vector Accuracy: 0.99837


### XGBoost CLASSIFIER

In [157]:
from xgboost import XGBClassifier

xgb = XGBClassifier()

xgb.fit(x_train, y_train)

In [158]:
print("Xgboost Accuracy: {:.5f}".format(accuracy_score(y_test, xgb.predict(x_test))))

Xgboost Accuracy: 0.99863


### DECISION TREES

In [159]:
from sklearn.tree import DecisionTreeClassifier


dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(x_train, y_train)

In [160]:
print("Decisiion Tree Accuracy: {:.5f}".format(accuracy_score(y_test, dt_classifier.predict(x_test))))

Decisiion Tree Accuracy: 0.99843


Let us try using AdaBoost on the decision tree clasifier, to see if there is an improvmement on the model

In [128]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=150, random_state=42)


adaboost_classifier.fit(x_train, y_train)


In [129]:
print("Adabooost Accuracy on Decision Trees: {:.5f}".format(accuracy_score(y_test, adaboost_classifier.predict(x_test))))

Adabooost Accuracy on Decision Trees: 0.99837


### KNEAREST NEIGHBOURS

In [130]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=150)

knn.fit(x_train, y_train)

In [131]:
print("KNeighbors Classifier Accuracy: {:.7f}".format(accuracy_score(y_test, knn.predict(x_test))))

KNeighbors Classifier Accuracy: 0.9932150


### USING MLP (Multi-Layer Perceptron)

In [42]:
from sklearn.neural_network import MLPClassifier


# Constructing MLP estimator class, specifyig relu as the actication function

mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=500)

mlp_classifier.fit(x_train, y_train)

In [43]:
print("MLP Classifier Accuracy: {:.3f}".format(accuracy_score(y_test, mlp_classifier.predict(x_test))))

MLP Classifier Accuracy: 0.813


### Lets use Grid Search for hyper parameter tuning,  to try out different models with different params 

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

In [48]:
# Create the parameter grid for each classifier
svc_param_grid = {'svc__C': [1, 10, 100, 1000], 'svc__kernel': ['linear', 'rbf']}

rf_param_grid = {'randomforestclassifier__n_estimators': [100, 200, 300], 'randomforestclassifier__max_depth': [None, 5, 10]}

lr_param_grid = {'logisticregression__C': [20, 30, 40], 'logisticregression__penalty': ['l2']}

knn_param_grid = {'kneighborsclassifier__n_neighbors': [100, 150, 200], 'kneighborsclassifier__weights': ['uniform', 'distance']}


# Create the classifiers
svc_classifier = SVC()
rf_classifier = RandomForestClassifier()
lr_classifier = LogisticRegression(max_iter=2000, solver='lbfgs')
knn_classifier = KNeighborsClassifier()



model_params = {
    "svm": {
        "model": svc_classifier,
        "params": svc_param_grid
    },
    
    "random_forest": {
        "model": rf_classifier,
        "params": rf_param_grid
    },
    
    "logistic_regression": {
        "model": lr_classifier,
        "params": lr_param_grid
    },
    
    "knn": {
        "model": knn_classifier,
        "params": knn_param_grid
    }
}

    
    
scores = []
best_estimators = {}

for algo, mp in model_params.items():
    pipe = make_pipeline(mp["model"])
    
    clf = GridSearchCV(pipe, mp["params"], cv=5, return_train_score=False)
    
    clf.fit(x_train, y_train)
    
    scores.append({
        "model":algo,
        "best_score": clf.best_score_,
        "best_params": clf.best_params_
    })
    
    best_estimators[algo] = clf.best_estimator_
    
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[{'model': 'svm',
  'best_score': 0.8114204188717693,
  'best_params': {'svc__C': 10, 'svc__kernel': 'rbf'}},
 {'model': 'random_forest',
  'best_score': 0.8091084368990389,
  'best_params': {'randomforestclassifier__max_depth': None,
   'randomforestclassifier__n_estimators': 300}},
 {'model': 'logistic_regression',
  'best_score': 0.789401552489533,
  'best_params': {'logisticregression__C': 40,
   'logisticregression__penalty': 'l2'}},
 {'model': 'knn',
  'best_score': 0.8052061839953926,
  'best_params': {'kneighborsclassifier__n_neighbors': 100,
   'kneighborsclassifier__weights': 'distance'}}]

In [51]:
rfc_best = best_estimators["random_forest"]

In [52]:
rfc_best.fit(x_train, y_train)

In [53]:
print("Randomn Forest Hyper Tuned Accuracy: {:.3f}".format(accuracy_score(y_test, rfc_best.predict(x_test))))

Randomn Forest Hyper Tuned Accuracy: 0.816


In [55]:
svm_best = best_estimators["svm"]

svm_best.fit(x_train, y_train)

In [56]:
print("SVC Hyper Tuned Accuracy: {:.3f}".format(accuracy_score(y_test, svm_best.predict(x_test))))

SVC Hyper Tuned Accuracy: 0.815


In [57]:
knn_best = best_estimators["knn"]

knn_best.fit(x_train, y_train)

In [58]:
print("KNN Hyper Tuned Accuracy: {:.3f}".format(accuracy_score(y_test, knn_best.predict(x_test))))

KNN Hyper Tuned Accuracy: 0.813


In [60]:
logreg_best = best_estimators["logistic_regression"]

logreg_best.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
print("Logistic Reg Tuned Accuracy: {:.3f}".format(accuracy_score(y_test, logreg_best.predict(x_test))))

Logistic Reg Tuned Accuracy: 0.791


## SAVING THE MODEL

In [None]:
import joblib


