In [1]:
import imblearn
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import itertools
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [2]:
#==========our plot and panda settings because the dataset is large===================#
pd.set_option('display.max_columns', None)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

#===================Loading and understanding the dataset better=============================#
train = pd.read_csv(r'C:\Users\hp\Desktop\Train_data.csv')
test = pd.read_csv(r'C:\Users\hp\Desktop\Test_data.csv')
#print(test.head(5))
#print(train.head(5))
#train.describe()
#test.describe()
#print("Training data has {} rows & {} columns".format(train.shape[0],train.shape[1]))
#print("Test data has {} rows & {} columns".format(test.shape[0],test.shape[1]))
#print(train['num_outbound_cmds'].value_counts())
#print(test['num_outbound_cmds'].value_counts())
train.drop(['num_outbound_cmds'], axis=1, inplace=True)
test.drop(['num_outbound_cmds'], axis=1, inplace=True)

#=====scaling the numerical attribute, scaling is used to bring data points far from each other closer#
#To do this i'm extracting the numerical attributes and making the mean and variance 0#
scaler = StandardScaler()
cols = train.select_dtypes(include=['float64','int64']).columns
sc_train = scaler.fit_transform(train.select_dtypes(include=['float64','int64']))
sc_test = scaler.fit_transform(test.select_dtypes(include=['float64','int64']))
# Returning it to the data frame
sc_traindf = pd.DataFrame(sc_train, columns = cols)
sc_testdf = pd.DataFrame(sc_test, columns = cols)

encoder = LabelEncoder()

# extract categorical attributes from both training and test sets 
cattrain = train.select_dtypes(include=['object']).copy()
cattest = test.select_dtypes(include=['object']).copy()

# encode the categorical attributes
traincat = cattrain.apply(encoder.fit_transform)
testcat = cattest.apply(encoder.fit_transform)

# separate target column from encoded data 
enctrain = traincat.drop(['class'], axis=1)
cat_Ytrain = traincat[['class']].copy()

train_x = pd.concat([sc_traindf,enctrain],axis=1)
train_y = train['class']
#print(train_x.shape)

test_df = pd.concat([sc_testdf,testcat],axis=1)
#print(test_df.shape)

#===========For our feature selection, we'll be using a package known as ensemble===========#
# we'll be using a Random forest classifier for it

# fit random forest classifier on the training set
rfc = RandomForestClassifier()
rfc.fit(train_x, train_y);
# extract important features
score = np.round(rfc.feature_importances_,3)
importances = pd.DataFrame({'feature':train_x.columns,'importance':score})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
# plot importances
plt.rcParams['figure.figsize'] = (11, 4)
#importances.plot.bar();
# The graph i just plotted ranks the most important features of the dataset by getting the scores of each
   # feature and ranking them in a descending order(from highest to lowest)
    #from the graph you can see that source_bytes is the most important feature of the dataset
    
# now we have to use the RFE(Recursive feature elimination) model to fit a model and remove the weakest
  #features cos they will have little or no effect on the model
    
# create the RFE model and select 15 attributes
rfe = RFE(rfc, n_features_to_select=15)
rfe = rfe.fit(train_x, train_y)

# summarize the selection of the attributes
feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), train_x.columns)]
selected_features = [v for i, v in feature_map if i==True]

#selected_features

# Basically the above block of code tells the RFE model to pick just the 15 most important features
#These are the features we will use to train our model

# now let me split my data into training and testing and also fit in my models

X_train,X_test,Y_train,Y_test = train_test_split(train_x,train_y,train_size=0.70, random_state=2)

# Train Decision Tree Model
DTC_Classifier = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier.fit(X_train, Y_train)

#Train SVM model
SVM_Classifier = SVC(random_state=0)
SVM_Classifier.fit(X_train, Y_train)

# Now i want to evaluate the models with popular performance metrics

models = []
models.append(('Decision Tree Classifier', DTC_Classifier))
models.append(('SVM Classifier', SVM_Classifier))

# i am using 4 well known performance metrics to evaluate the performance of my model
# score, accuracy, confusion matrix, classification
for i, v in models:
    scores = cross_val_score(v, X_train, Y_train, cv=10)
    accuracy = metrics.accuracy_score(Y_train, v.predict(X_train))
    confusion_matrix = metrics.confusion_matrix(Y_train, v.predict(X_train))
    classification = metrics.classification_report(Y_train, v.predict(X_train))
    print()
    print('============================== {} Model Evaluation =============================='.format(i))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()
    # why the code is this bulky is because i want to achieve a better result when i train my models
    # and in order to achieve that i have to entirely manipulate my data and remove unwanted features
    # that will affect the accuracy of my model
    # and as you can see decision tree algorithm has an average score of 100%
    # and the confusion matrix tells us that the model didn't make even one wrong prediction
    
    
    
    # copyright PROTEK
    



Cross Validation Mean Score:
 0.9960869883971739

Model Accuracy:
 1.0

Confusion matrix:
 [[8245    0]
 [   0 9389]]

Classification report:
               precision    recall  f1-score   support

     anomaly       1.00      1.00      1.00      8245
      normal       1.00      1.00      1.00      9389

    accuracy                           1.00     17634
   macro avg       1.00      1.00      1.00     17634
weighted avg       1.00      1.00      1.00     17634




Cross Validation Mean Score:
 0.9621754109093059

Model Accuracy:
 0.9628558466598617

Confusion matrix:
 [[7703  542]
 [ 113 9276]]

Classification report:
               precision    recall  f1-score   support

     anomaly       0.99      0.93      0.96      8245
      normal       0.94      0.99      0.97      9389

    accuracy                           0.96     17634
   macro avg       0.97      0.96      0.96     17634
weighted avg       0.96      0.96      0.96     17634


