In [1]:
import warnings

import numpy as np
import pandas as pd

# Installed before the scikit-learn imports so DeprecationWarnings raised while
# those modules load are filtered.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, zero_one_loss
from sklearn.tree import DecisionTreeClassifier

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The old
# location is kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the data set; the file has no header row, so columns are labelled 0..N-1.
raw_data = pd.read_csv('datasets/kddcup.data_10_percent.gz', header=None)

# Integer-encode the categorical columns ("protocol", "service", "flag",
# "attack_type"). The unique values of each column are kept so the integer
# codes can be translated back to their original values later.
protocol_codes, protocols = pd.factorize(raw_data[1])
service_codes, services = pd.factorize(raw_data[2])
flag_codes, flags = pd.factorize(raw_data[3])
attack_codes, attacks = pd.factorize(raw_data[41])

raw_data[1] = protocol_codes
raw_data[2] = service_codes
raw_data[3] = flag_codes
raw_data[41] = attack_codes

In [3]:
# Column-wise split: everything except the last column is a feature,
# the last column is the label.
features = raw_data.iloc[:, :-1]
# A slice (rather than a scalar index) keeps `labels` a one-column DataFrame.
labels = raw_data.iloc[:, -1:]

In [4]:
# Flatten the one-column label frame into a 1-D ('horizontal') array of class codes.
# np.asarray accepts both the DataFrame and an already-flattened ndarray, so this
# cell can be re-run without failing on a missing `.values` attribute.
labels = np.asarray(labels).ravel()

In [5]:
# Separate data into a training set and a test set.
df = pd.DataFrame(features)
# Note: train_size + test_size < 1.0 means we are subsampling.
# Use small fractions for slow classifiers such as KNN, Radius, SVC, ...
# random_state pins the shuffle so the split, and every metric computed from it,
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df, labels, train_size=0.8, test_size=0.2, random_state=3
)
print("X_train, y_train:", X_train.shape, y_train.shape)
print("X_test, y_test:", X_test.shape, y_test.shape)

X_train, y_train: (395216, 41) (395216,)
X_test, y_test: (98805, 41) (98805,)


In [6]:
# Model selection: keep exactly one `clf = ...` assignment active and leave the
# alternatives commented out. The actual fitting happens in the next cell.
print("Training model")
clf = RandomForestClassifier(
    n_estimators=102,
    random_state=3,
    n_jobs=-1,
)
# Further RandomForestClassifier options left disabled by the author:
#   max_features=0.8, min_samples_leaf=3, n_estimators=500, min_samples_split=3, random_state=10, verbose=1
# Alternative model:
#clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, presort=False)

Training model


In [7]:
%%time
# Fit the selected classifier on the training split. The value returned by
# fit() is kept as `model`, which is what gets written to disk afterwards.
model = clf.fit(X_train, y_train)

Wall time: 13.3 s


In [8]:
%%time
from sklearn.externals import joblib
# save the model to disk
filename = 'finalized_model.sav'
joblib.dump(model, filename)

Wall time: 101 ms


In [9]:
%%time
# load the model from disk
# NOTE(security): joblib persistence is pickle-based, so loading a file can
# execute arbitrary code. Only load model files from a trusted source.
trained_model = joblib.load(filename)

Wall time: 120 ms


In [10]:
# Score of the reloaded model on the *training* split (X_train, y_train).
# This is not held-out performance; the test split is evaluated further below.
print("Score: ", trained_model.score(X_train, y_train))

Score:  0.9999949394761346


In [11]:
%%time
# Predicting
print("Predicting")
y_pred = clf.predict(X_test)

Predicting
Wall time: 1.71 s


In [12]:
print("Computing performance metrics")
# zero-one loss of the predictions on the test split
error = zero_one_loss(y_true=y_test, y_pred=y_pred)
# confusion matrix over the integer class codes (actual vs. predicted)
results = confusion_matrix(y_true=y_test, y_pred=y_pred)

Computing performance metrics


In [13]:
from sklearn.metrics import classification_report
# Quick check of the flattened label array's shape.
labels.shape
# The integer label codes are mapped back to attack names (via `target_names`)
# before the classification report is printed further below.

(494021,)

In [14]:
# Map each integer class code to its attack name. pd.factorize assigns code i to
# the i-th entry of its uniques (`attacks`), so enumerating `attacks` gives the
# mapping directly. This avoids sorting the full label array and cannot become
# misaligned by extra values in `labels` (e.g. a -1 missing-value sentinel).
target_names = dict(enumerate(attacks))

In [15]:
# Translate the integer class codes back into attack names so the report and
# the confusion matrix are labelled with readable class names.
reversefactor = target_names
decode_labels = np.vectorize(reversefactor.get)
yy_test = decode_labels(y_test)
yy_pred = decode_labels(y_pred)

print("============================= Printing Classification Report ==========================")
print(classification_report(yy_test, yy_pred))

# # Making the Confusion Matrix
print("============================= Confusion Matrix ===========================")
pd_cm = pd.crosstab(yy_test, yy_pred, rownames=['Actual'], colnames=['Predicted'])
# Display the matrix under its banner; previously the header was printed but
# the table itself was never shown.
pd_cm

                  precision    recall  f1-score   support

           back.       1.00      1.00      1.00       446
buffer_overflow.       1.00      0.67      0.80         9
      ftp_write.       1.00      0.50      0.67         2
   guess_passwd.       1.00      1.00      1.00        12
           imap.       1.00      0.50      0.67         2
        ipsweep.       1.00      0.99      1.00       265
           land.       1.00      1.00      1.00         4
     loadmodule.       0.00      0.00      0.00         1
        neptune.       1.00      1.00      1.00     21431
           nmap.       1.00      0.98      0.99        42
         normal.       1.00      1.00      1.00     19402
            pod.       1.00      1.00      1.00        45
      portsweep.       1.00      1.00      1.00       210
        rootkit.       0.00      0.00      0.00         4
          satan.       1.00      0.99      1.00       329
          smurf.       1.00      1.00      1.00     56218
       teardr

  'precision', 'predicted', average, warn_for)


In [16]:
import os

# Export the confusion matrix. to_csv does not create missing directories, so
# make sure the target folder exists before writing.
# NOTE(review): the file name contains the spelling "Matricx"; it is left
# unchanged in case downstream consumers expect this exact path.
output_path = 'output/RF_Confusion_Matricx_Output_10_Percent.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pd_cm.to_csv(output_path, index_label='attacks|attacks')