In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

%matplotlib inline

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [4]:
# Locations of the two raw dataset parts and of the pre-cleaned export
# that is reloaded later in the notebook.
paths = dict(
    part1='../input/simargl2021-network-intrusion-detection-dataset/dataset-part1.csv',
    part2='../input/simargl2021-network-intrusion-detection-dataset/dataset-part2.csv',
    initial_features='../input/clean-simargl/initial_features.csv',
)

In [None]:
# Load both raw parts and stack them into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# part keeps its own 0-based index and the combined frame (and the CSV later
# exported from it) carries duplicate index labels.
df = pd.concat(
    [
        pd.read_csv(paths['part1']),
        pd.read_csv(paths['part2']),
    ],
    ignore_index=True,
)

### EXPLORACIÓN INICIAL

In [None]:
# (rows, columns) of the combined raw frame.
df.shape

In [None]:
# Column names, dtypes and memory usage of the raw frame.
df.info()

In [None]:
## Look at the first entries to see what kind of data we have
df.head(50)

In [None]:
# Preview of the column with values equal to ',' replaced by 0.
# The result is only displayed; it is not assigned back, so df is unchanged.
df.DST_TO_SRC_SECOND_BYTES.replace(',', 0)

In [None]:
# Frequency of each distinct FIREWALL_EVENT value.
df.FIREWALL_EVENT.value_counts()

In [None]:
# Frequency of each distinct FLOW_ACTIVE_TIMEOUT value.
df.FLOW_ACTIVE_TIMEOUT.value_counts()

In [None]:
# True when no FLOW_ID value is repeated across rows.
df.FLOW_ID.is_unique

In [None]:
# Frequency of each distinct FLOW_INACTIVE_TIMEOUT value.
df.FLOW_INACTIVE_TIMEOUT.value_counts()

In [None]:
# Frequency of each distinct FRAME_LENGTH value.
df.FRAME_LENGTH.value_counts()

In [None]:
# Frequency of each distinct MIN_IP_PKT_LEN value.
df.MIN_IP_PKT_LEN.value_counts()

In [None]:
# Frequency of each distinct MAX_IP_PKT_LEN value.
df.MAX_IP_PKT_LEN.value_counts()

In [None]:
# Percentage of rows whose OOORDER_IN_PKTS value is exactly 0.
# A vectorised comparison replaces the Python-level loop over every row;
# the count (and therefore the printed figure) is the same.
zeros = int((df['OOORDER_IN_PKTS'] == 0).sum())

print("Porcentaje de valores 0 en columna: %.2f%%" % (100 * zeros / len(df)))

In [None]:
# Percentage of rows whose OOORDER_OUT_PKTS value is exactly 0.
# A vectorised comparison replaces the Python-level loop over every row;
# the count (and therefore the printed figure) is the same.
zeros = int((df['OOORDER_OUT_PKTS'] == 0).sum())

print("Porcentaje de valores 0 en columna: %.2f%%" % (100 * zeros / len(df)))

In [None]:
# Frequency of each distinct SAMPLING_INTERVAL value.
df.SAMPLING_INTERVAL.value_counts()

In [None]:
# True when no TOTAL_FLOWS_EXP value is repeated across rows.
df.TOTAL_FLOWS_EXP.is_unique

In [None]:
# Frequency of each distinct BIFLOW_DIRECTION value.
df.BIFLOW_DIRECTION.value_counts()

In [None]:
# Columns removed from the raw frame before modelling (original order kept).
TO_DELETE = [
    'FLOW_ID', 'BIFLOW_DIRECTION', 'FIREWALL_EVENT',
    'FLOW_ACTIVE_TIMEOUT', 'FLOW_INACTIVE_TIMEOUT',
    'FRAME_LENGTH', 'MAX_IP_PKT_LEN', 'MIN_IP_PKT_LEN',
    'PROTOCOL_MAP', 'SAMPLING_INTERVAL', 'TOTAL_FLOWS_EXP',
    'OOORDER_OUT_PKTS', 'OOORDER_IN_PKTS',
    'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
]

In [None]:
# New frame without the columns listed in TO_DELETE; df itself is untouched.
clean_df = df.drop(columns=TO_DELETE)

In [None]:
# Columns that remain after the drop.
clean_df.columns

In [None]:
# Export the reduced frame. The index is written as the first CSV column
# (to_csv default), which is why it is read back with index_col=0 later on.
# NOTE(review): this relative path differs from paths['initial_features'] —
# confirm the exported file is the one uploaded to that location.
clean_df.to_csv('datasets/initial_features.csv')

In [None]:
## Optional narrower dtypes, in case memory usage needs to be reduced.
## Intended for the dtype= argument of pd.read_csv (left commented out there).
## NOTE(review): int16 tops out at 32767 — confirm FLOW_DURATION_MILLISECONDS
## never exceeds that before enabling this mapping.
dtype = {
    'PROTOCOL': 'int16',
    'DIRECTION': 'int16',
    'FLOW_DURATION_MILLISECONDS': 'int16',
    'IN_PKTS': 'int32',
    'OUT_PKTS': 'int32',
    'L4_DST_PORT': 'int32',
    'L4_SRC_PORT': 'int32',
    'RETRANSMITTED_IN_PKTS': 'int32',
    # The comma after this entry was missing, which made the cell a SyntaxError.
    'RETRANSMITTED_OUT_PKTS': 'int32',
    'RETRANSMITTED_IN_BYTES': 'int32',
    'RETRANSMITTED_OUT_BYTES': 'int32',
}

In [5]:
## IF THE WHOLE NOTEBOOK IS BEING RUN, SKIP THIS CELL
## It reloads the previously exported frame from disk into clean_df.

clean_df = pd.read_csv(
    paths['initial_features'], 
    # The first CSV column holds the saved index.
    index_col=0,
    # Memory-saving dtype mapping, currently disabled:
    #dtype=dtype
)

In [6]:
# Column names, dtypes and memory usage of the cleaned frame.
clean_df.info()

In [7]:
# First rows of the cleaned frame.
clean_df.head()

In [8]:
# Remove six more columns before modelling.
# Not idempotent: re-running this cell raises KeyError because the columns
# are already gone from clean_df.
clean_df = clean_df.drop(columns=[
    'DST_TO_SRC_SECOND_BYTES',
    'SRC_TO_DST_SECOND_BYTES',
    'FLOW_START_SEC',
    'FLOW_END_SEC',
    'FLOW_DURATION_MICROSECONDS',
    'L7_PROTO_NAME',
])

In [9]:
# First rows after the additional columns were removed.
clean_df.head()

In [10]:
## Author's observation: when the flow direction is == 1, the flow is very
## likely to be normal. This shows the number of rows per LABEL for DIRECTION == 1.
clean_df[clean_df['DIRECTION'] == 1].groupby(['LABEL']).size()

In [11]:
# The LABEL column is the prediction target.
target = clean_df['LABEL']

In [12]:
# Every remaining column except LABEL is used as a model input.
features = clean_df.drop(columns='LABEL')

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Fraction of rows placed on the "test" side of the split that follows;
# that side is what the notebook keeps as its working sample.
test_ratio = 0.25

In [15]:
# Keep only the "test" side of a random split as the working sample; the
# larger "train" side is discarded.
# random_state pins the shuffle so the same sample is drawn on every run
# (the split was unseeded before, so every downstream number changed per run).
_, X_sample, _, y_sample = train_test_split(
    features, target, test_size=test_ratio, random_state=0
)

In [16]:
# Report what share of the cleaned dataset the working sample represents.
print(
    "Se trabajará con una muestra del %.0f%% de los datos originales"
    % (len(X_sample) / len(clean_df) * 100)
)

In [17]:
## Check the class balance of the sample; it may be good enough to train
## the models without over- or under-sampling.
y_sample.value_counts()

In [18]:
## Integer encoding for the target column ('LABEL'): each class name maps to
## its position in this list.
encoding = {
    label: code
    for code, label in enumerate([
        'Normal flow',
        'SYN Scan - aggressive',
        'Denial of Service R-U-Dead-Yet',
        'Denial of Service Slowloris',
    ])
}

In [19]:
# Replace each class name with its integer code; an unknown label raises KeyError.
# Not idempotent: running this cell twice fails, because the values are then
# integers and no longer keys of `encoding`.
y_sample = y_sample.apply(lambda label: encoding[label])

In [20]:
# Class counts again, now keyed by the integer codes.
y_sample.value_counts()

In [21]:
# Rebind the full-size objects to None — presumably so their memory can be
# reclaimed now that only the sample is needed.
target = None
features = None
clean_df = None

### UNDERSAMPLING

In [22]:
from imblearn.under_sampling import RandomUnderSampler

In [23]:
# Randomly under-sample the sample so the classes are balanced.
# random_state=0 makes the selection of rows repeatable.
sampler = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = sampler.fit_resample(X_sample, y_sample)

In [24]:
# Class counts after under-sampling.
y_undersampled.value_counts()

In [25]:
# Total number of rows kept after under-sampling.
len(y_undersampled)

### SCALING

In [26]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [27]:
# Remember the column labels so they can be reattached after scaling.
df_columns = X_undersampled.columns

In [28]:
# Standardise every feature column (MinMaxScaler is the imported alternative).
# NOTE(review): the scaler is fitted on all under-sampled rows, i.e. before
# the train/validation/test split — consider fitting on the training split
# only so validation/test statistics do not leak into the scaling.
scaler = StandardScaler()

# fit_transform returns a plain array, so rewrap it with the original column labels.
df_scaled = pd.DataFrame(
    scaler.fit_transform(X_undersampled),
    columns=df_columns,
)

In [29]:
# First rows of the scaled feature frame.
df_scaled.head()

### MODELOS

In [30]:
# 55% of the rows go to training; the remaining 45% are split 70/30 into
# validation (~31.5% of all rows) and test (~13.5% of all rows).
# random_state pins both shuffles so the reported metrics are reproducible
# (the splits were unseeded before).
X_train, X_rem, y_train, y_rem = train_test_split(
    df_scaled, y_undersampled, train_size=0.55, random_state=0
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.30, random_state=0
)

In [31]:
from sklearn import metrics, model_selection, tree
from scikitplot.metrics import plot_roc

In [None]:
def manual_grid_search(x_train, y_train, x_val, y_val, depths, sample_splits,
                       random_state=None):
    """Search decision-tree hyper-parameters by validation accuracy.

    For every ``max_depth`` in ``depths`` one ``DecisionTreeClassifier`` is
    fitted per ``min_samples_split`` in ``sample_splits``; the split value with
    the highest accuracy on ``(x_val, y_val)`` is kept for that depth.

    Parameters
    ----------
    x_train, y_train
        Data each candidate tree is fitted on.
    x_val, y_val
        Data each candidate tree is scored on.
    depths : iterable of int
        Candidate ``max_depth`` values.
    sample_splits : iterable of int
        Candidate ``min_samples_split`` values.
    random_state : int or None, optional
        Forwarded to every tree so repeated searches give the same scores.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    list of [depth, best_min_samples_split, best_accuracy]
        One entry per depth, in iteration order. The last two items stay 0
        when no candidate scores above 0 (e.g. ``sample_splits`` is empty).
    """
    # Materialise once: a one-shot iterator would otherwise be exhausted after
    # the first depth and the remaining depths would silently report 0.
    sample_splits = list(sample_splits)
    accuracy_scores = []

    for depth in depths:
        optimal_sample_split = 0
        sample_accuracy = 0

        for sample_split in sample_splits:
            tree_model = tree.DecisionTreeClassifier(
                max_depth=depth,
                min_samples_split=sample_split,
                random_state=random_state,
            )
            tree_model.fit(x_train, y_train)

            accuracy = metrics.accuracy_score(y_val, tree_model.predict(x_val))

            # Strict '>' keeps the earliest candidate when accuracies tie.
            if accuracy > sample_accuracy:
                sample_accuracy = accuracy
                optimal_sample_split = sample_split

        accuracy_scores.append([depth, optimal_sample_split, sample_accuracy])

    return accuracy_scores

In [None]:
def graph_scores(accuracy_scores):
    """Plot the best accuracy recorded for each tree depth.

    Parameters
    ----------
    accuracy_scores : iterable of sequences
        Each item is read as ``[depth, _, accuracy]`` (the shape returned by
        ``manual_grid_search``); item 0 is the x value and item 2, rounded to
        three decimals, is the y value and the point's annotation.
    """
    # Materialise once so a one-shot iterable can be read for both axes.
    accuracy_scores = list(accuracy_scores)
    depths = [entry[0] for entry in accuracy_scores]
    scores = [round(entry[2], 3) for entry in accuracy_scores]

    fig, ax = plt.subplots(1, 1, figsize=(15, 5))
    ax.plot(depths, scores, '-o', label='accuracy')

    # Capture the auto-scaled y-limits and pin them below — presumably so the
    # later artists cannot rescale the axis.
    ylim = ax.get_ylim()

    ax.set_title('Precisión dependiendo de la profundidad del árbol', fontsize=16)
    ax.set_xlabel('Profundidad del árbol', fontsize=14)
    # Label spelling fixed ('Presición' -> 'Precisión') to match the title.
    ax.set_ylabel('Precisión', fontsize=14)

    ax.set_ylim(ylim)
    ax.set_xticks(depths)
    ax.grid()

    # Write each score just above its marker.
    for depth, score in zip(depths, scores):
        ax.annotate(str(score), xy=(depth, score), ha='center', va='bottom')

### DECISION TREE

In [None]:
# Hyper-parameters for the decision tree: max_depth and min_samples_split.
depth, sample_split = 3, 40

In [None]:
# Decision tree configured with the hyper-parameters chosen above.
tree_model = tree.DecisionTreeClassifier(max_depth=depth, min_samples_split=sample_split)

# Fit on the training split (the fitted estimator is the cell's output).
tree_model.fit(X_train, y_train)

In [None]:
# Predicted class codes for the test split.
tree_pred = tree_model.predict(X_test)

In [None]:
# Fraction of test rows the decision tree classified correctly.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=tree_pred)
accuracy

In [None]:
# Confusion matrix of the decision tree on the test split.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=tree_pred)
confusion_matrix

In [None]:
# Per-class text report for the decision tree on the test split.
print(metrics.classification_report(y_test, tree_pred))

In [None]:
# ROC curves built from the tree's predicted class probabilities on the test split.
plot_roc(y_test, tree_model.predict_proba(X_test))
plt.show()

In [None]:
# Score depths 1..20 against min_samples_split 10, 20, ..., 90 on the validation split.
accuracy_scores = manual_grid_search(
    X_train,
    y_train,
    X_valid,
    y_valid,
    depths=range(1, 21),
    sample_splits=range(10, 100, 10),
)

In [None]:
# Plot the best validation accuracy found for each depth.
graph_scores(accuracy_scores)

### SVM

In [None]:
from sklearn import svm

In [None]:
# RBF-kernel SVM. probability=True enables predict_proba (used for the ROC
# plot); verbose=True prints the solver's progress while fitting.
svm_model = svm.SVC(kernel="rbf", probability=True, verbose=True)

In [None]:
# Fit on the training split, like the other models in this notebook.
# The previous X_min / y_min are never defined, so the cell raised NameError
# on a fresh kernel.
svm_model.fit(X_train, y_train)

In [None]:
# Predicted class codes for the test split.
svm_pred = svm_model.predict(X_test)

In [None]:
# Fraction of test rows the SVM classified correctly.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=svm_pred)
accuracy

In [None]:
# Confusion matrix of the SVM on the test split.
# Uses svm_pred, the name assigned by the prediction cell; the previous
# `svm_preds` does not exist and raised NameError.
confusion_matrix = metrics.confusion_matrix(y_test, svm_pred)
confusion_matrix

In [None]:
# Per-class text report for the SVM on the test split.
print(metrics.classification_report(y_test, svm_pred))

In [None]:
# ROC curves built from the SVM's predicted class probabilities on the test split.
plot_roc(y_test, svm_model.predict_proba(X_test))
plt.show()

### KNN

In [32]:
from sklearn.neighbors import KNeighborsClassifier

In [33]:
# k-NN over the 500 nearest neighbours, with votes weighted by distance and
# all CPU cores used for the neighbour search.
knn = KNeighborsClassifier(
    n_neighbors=500,
    weights='distance',
    n_jobs=-1,
)

In [34]:
# Fit on the training split (the fitted estimator is the cell's output).
knn.fit(X_train, y_train)

In [35]:
# Predicted class codes for the test split.
knn_pred = knn.predict(X_test)

In [36]:
# Fraction of test rows k-NN classified correctly.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=knn_pred)
accuracy

In [37]:
# Confusion matrix of k-NN on the test split.
confusion_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=knn_pred)
confusion_matrix

In [38]:
# Per-class text report for k-NN on the test split.
print(metrics.classification_report(y_test, knn_pred))

In [39]:
# ROC curves built from k-NN's predicted class probabilities on the test split.
plot_roc(y_test, knn.predict_proba(X_test))
plt.show()

### LGBM

In [40]:
import lightgbm as lgb

In [41]:
# LightGBM classifier with a multiclass objective and a fixed seed.
lgb_model = lgb.LGBMClassifier(
    objective='multiclass',
    random_state=42,
)

In [42]:
# Fit on the training split while evaluating on the validation split.
# NOTE(review): the `verbose` argument of fit() was dropped in newer LightGBM
# releases in favour of callbacks — confirm against the installed version.
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=20,
    eval_metric='logloss',
)

In [45]:
# Accuracy of the LightGBM model on the test split, to four decimals.
print('Testing accuracy {:.4f}'.format(lgb_model.score(X_test,y_test)))

In [47]:
# Confusion matrix of the LightGBM model on the test split.
confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=lgb_model.predict(X_test),
)
confusion_matrix

In [48]:
# Per-class text report for the LightGBM model on the test split.
print(metrics.classification_report(y_test, lgb_model.predict(X_test)))

In [46]:
# ROC curves built from LightGBM's predicted class probabilities on the test split.
plot_roc(y_test, lgb_model.predict_proba(X_test))
plt.show()