In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
filepath = "KaggleImbalanced.csv"
df = pd.read_csv(filepath)

In [3]:
feats = [x for x in df.columns if x != 'ProtocolName']
X = df[feats]
Y = df['ProtocolName']

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

In [6]:
params = { 'n_neighbors': [3, 5, 7, 11, 21],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'] }
gs = GridSearchCV(KNeighborsClassifier(), params, cv = 10, n_jobs = -1)

In [12]:
%%time
gs.fit(X_train, y_train)

In [13]:
%%time
gs.best_score_

In [14]:
%%time
gs.best_params_

In [6]:
neigh = KNeighborsClassifier(n_neighbors = 3, metric='manhattan', weights='distance')

In [7]:
%%time
neigh.fit(X_train, y_train)

Wall time: 1min 14s


KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')

In [8]:
%%time
neigh.score(X_test, y_test)

Wall time: 7min 20s


0.9561002290905344

In [10]:
%%time
y_pred = neigh.predict(X_test)
y_pred

Wall time: 7min 26s


array(['AMAZON', 'NETFLIX', 'WHATSAPP', ..., 'HTTP_DOWNLOAD', 'FACEBOOK',
       'DROPBOX'], dtype=object)

In [12]:
import time

In [18]:
class_list = df['ProtocolName'].unique()

In [19]:
model_output = {}
for label in class_list:
    model_output.setdefault(label, [])
    myDataFrame = df[df['ProtocolName']==label]
    samples = len(myDataFrame)
    myFeats = [x for x in myDataFrame.columns if x != 'ProtocolName']
    X_features = myDataFrame[myFeats]
    X_features = scaler.fit_transform(X_features)
    myLabel = myDataFrame['ProtocolName']
    tic = time.time()
#     my_predict = np.argmax(model.predict(X_features), axis=-1)
    predicted_class = neigh.predict(X_features)
    toc = time.time()
#     confidence_score = np.max(neigh.predict(X_features))
#     predicted_class = encoder.inverse_transform(my_predict)
    time_taken = toc-tic
    my_acc = accuracy_score(myDataFrame['ProtocolName'], predicted_class) 
    model_output[label].append(predicted_class)
    model_output[label].append(time_taken)
    model_output[label].append(samples)
    model_output[label].append(my_acc)
#     model_output[label].append(confidence_score)
    

In [22]:
with  open("Evaluation2.txt", 'w+') as f:
    for label in model_output.keys():
        f.write(label +"\t" + str(round(model_output[label][1], 2)) + "\t" + str(model_output[label][2]) + "\t" + str(round(model_output[label][3]*100, 2)) + "\n")
f.close()

In [23]:
print(model_output['GOOGLE_MAPS'])

[array(['APPLE', 'NETFLIX', 'GOOGLE_MAPS', ..., 'WIKIPEDIA',
       'APPLE_ICLOUD', 'GOOGLE_MAPS'], dtype=object), 159.41592741012573, 10000, 0.1845]


In [24]:
for label in model_output.keys():
    myDataFrame = df[df['ProtocolName']==label]
    myFeats = [x for x in myDataFrame.columns if x != 'ProtocolName']
    X_features = myDataFrame[myFeats]
    X_features = scaler.fit_transform(X_features)
    myLabel = myDataFrame['ProtocolName']
#     confidence_score = np.max(neigh.predict(X_features))
    score = neigh.score(X_features, myLabel)
#     my_acc = accuracy_score(myDataFrame['ProtocolName'], predicted_class) 
    model_output[label].append(score)

In [25]:
print(model_output['GOOGLE_MAPS'])

[array(['APPLE', 'NETFLIX', 'GOOGLE_MAPS', ..., 'WIKIPEDIA',
       'APPLE_ICLOUD', 'GOOGLE_MAPS'], dtype=object), 159.41592741012573, 10000, 0.1845, 0.1845]


In [28]:
# print(classification_report(y_test, y_pred))


In [23]:
print(confusion_matrix(y_test, y_pred))

[[2931    0    0 ...    1    0    0]
 [   0 2567   90 ...    3    0    1]
 [   0   22 2835 ...    2    0    0]
 ...
 [   0    7    6 ... 3015    0    0]
 [   0    3    3 ...    0 2736    0]
 [   0    1    0 ...    0    0 2881]]


In [27]:
#These are features extracted from RandomForest classifier
important_features = ['Flow.Duration', 'Total.Length.of.Fwd.Packets', 'Fwd.Packet.Length.Max',
       'Fwd.Packet.Length.Min', 'Fwd.Packet.Length.Mean',
       'Fwd.Packet.Length.Std', 'Flow.Bytes.s', 'Flow.Packets.s',
       'Flow.IAT.Mean', 'Flow.IAT.Std', 'Flow.IAT.Max', 'Flow.IAT.Min',
       'Fwd.IAT.Total', 'Fwd.IAT.Mean', 'Fwd.IAT.Std', 'Fwd.IAT.Max',
       'Bwd.IAT.Total', 'Bwd.IAT.Max', 'Fwd.Header.Length', 'Fwd.Packets.s',
       'Bwd.Packets.s', 'Min.Packet.Length', 'Max.Packet.Length',
       'Packet.Length.Mean', 'Packet.Length.Variance', 'ACK.Flag.Count',
       'Average.Packet.Size', 'Avg.Fwd.Segment.Size', 'Fwd.Header.Length.1',
       'Subflow.Fwd.Bytes', 'Init_Win_bytes_forward',
       'Init_Win_bytes_backward', 'min_seg_size_forward', 'L7Protocol']

In [26]:
# important_features

In [30]:
X = df[important_features]
Y = df['ProtocolName']

In [31]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 42)

In [32]:
%%time
neigh = KNeighborsClassifier(n_neighbors = 3, metric='manhattan', weights='distance')
neigh.fit(X_train, y_train)

Wall time: 43.2 s


KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance')

In [33]:
%%time
neigh.score(X_test, y_test)

Wall time: 1min 15s


0.9664925367218651

In [34]:
%%time
y_pred = neigh.predict(X_test)

Wall time: 1min 15s


In [27]:
# print(classification_report(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred))

In [None]:
cf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(34,12)) 
sns.heatmap(cf_matrix,annot=True, ax=ax, fmt='d', annot_kws={"size": 12})
plt.savefig("RF_cf_with_important_features.png")