In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import sklearn
from sklearn import metrics, model_selection, tree
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_csv('datafinal.csv')

In [3]:
df.shape

(12207873, 17)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,FLOW_DURATION_MILLISECONDS,FLOW_END_MILLISECONDS,FLOW_START_MILLISECONDS,IN_BYTES,L4_DST_PORT,L4_SRC_PORT,OUT_BYTES,RETRANSMITTED_OUT_BYTES,TCP_WIN_MAX_IN,TCP_WIN_MAX_OUT,TCP_WIN_MIN_IN,TCP_WIN_MIN_OUT,TCP_WIN_MSS_IN,TCP_WIN_MSS_OUT,TOTAL_FLOWS_EXP,LABEL
0,0,0,1616660040010,1616660040010,44,60541,22351,40,0,1024,0,1024,0,1460,0,2293398,0
1,1,0,1616660040068,1616660040068,68,0,0,0,0,0,0,0,0,0,0,2293400,0
2,2,44,1616660040114,1616660040070,189,24650,443,104,0,122,1634,122,1634,0,0,2293404,0
3,3,0,1616660040122,1616660040122,52,8728,61075,0,0,8192,0,8192,0,1440,0,2293407,0
4,4,1,1616660040184,1616660040183,189,443,39762,40,0,502,0,502,0,0,0,2293409,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12207873 entries, 0 to 12207872
Data columns (total 17 columns):
 #   Column                      Dtype
---  ------                      -----
 0   Unnamed: 0                  int64
 1   FLOW_DURATION_MILLISECONDS  int64
 2   FLOW_END_MILLISECONDS       int64
 3   FLOW_START_MILLISECONDS     int64
 4   IN_BYTES                    int64
 5   L4_DST_PORT                 int64
 6   L4_SRC_PORT                 int64
 7   OUT_BYTES                   int64
 8   RETRANSMITTED_OUT_BYTES     int64
 9   TCP_WIN_MAX_IN              int64
 10  TCP_WIN_MAX_OUT             int64
 11  TCP_WIN_MIN_IN              int64
 12  TCP_WIN_MIN_OUT             int64
 13  TCP_WIN_MSS_IN              int64
 14  TCP_WIN_MSS_OUT             int64
 15  TOTAL_FLOWS_EXP             int64
 16  LABEL                       int64
dtypes: int64(17)
memory usage: 1.5 GB


In [6]:
df['LABEL'] = df['LABEL'].astype('category')

In [7]:
df.dtypes

Unnamed: 0                       int64
FLOW_DURATION_MILLISECONDS       int64
FLOW_END_MILLISECONDS            int64
FLOW_START_MILLISECONDS          int64
IN_BYTES                         int64
L4_DST_PORT                      int64
L4_SRC_PORT                      int64
OUT_BYTES                        int64
RETRANSMITTED_OUT_BYTES          int64
TCP_WIN_MAX_IN                   int64
TCP_WIN_MAX_OUT                  int64
TCP_WIN_MIN_IN                   int64
TCP_WIN_MIN_OUT                  int64
TCP_WIN_MSS_IN                   int64
TCP_WIN_MSS_OUT                  int64
TOTAL_FLOWS_EXP                  int64
LABEL                         category
dtype: object

In [8]:
samples = []
df = sklearn.utils.shuffle(df)
df_split = np.array_split(df, 10)

for chunk in df_split:
    chunk = chunk.sample(frac =.12, random_state = 33, replace=True)
        
    samples.append(chunk)

In [9]:
df_sample = pd.concat(samples)

In [10]:
sm = SMOTE(random_state=2, n_jobs=8)
X = df_sample.drop(['LABEL'], axis=1)
y = df_sample['LABEL']

print("Before, counts of label '3': {}".format(sum(y == 3)))
print("Before, counts of label '2': {}".format(sum(y == 2)))
print("Before, counts of label '1': {}".format(sum(y == 1)))
print("Before, counts of label '0': {} \n".format(sum(y == 0)))

X_balanced, y_balanced = sm.fit_resample(X, y.ravel())

print("After, counts of label '3': {}".format(sum(y_balanced == 3)))
print("After, counts of label '2': {}".format(sum(y_balanced == 2)))
print("After, counts of label '1': {}".format(sum(y_balanced == 1)))
print("After, counts of label '0': {}".format(sum(y_balanced == 0)))

Before, counts of label '3': 104363
Before, counts of label '2': 272403
Before, counts of label '1': 299791
Before, counts of label '0': 788386 

After, counts of label '3': 788386
After, counts of label '2': 788386
After, counts of label '1': 788386
After, counts of label '0': 788386


In [11]:
X_train, X_test_total, y_train, y_test_total = model_selection.train_test_split(X, y, test_size=0.45, random_state=31)

X_test, X_validation, y_test, y_validation = model_selection.train_test_split(X_test_total, y_test_total, test_size=0.33, random_state=31)

In [12]:
scaler = MinMaxScaler()

In [13]:
X_train = scaler.fit_transform(X_train)

In [14]:
X_test = scaler.transform(X_test)

In [15]:
X_validation = scaler.transform(X_validation)

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=7)

In [18]:
clf.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=7)

In [19]:
clf.score(X_test, y_test)

0.8862456982430719

In [21]:
pred = clf.predict(X_test)

In [22]:
print(metrics.accuracy_score(y_test, pred))
print('Matriz de confusion', metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred, target_names=['0', '1', '2', '3']))

0.8862456982430719
Matriz de confusion [[221865    143   3906  12076]
 [    20  90568      0     58]
 [  4750      0  47742  29290]
 [     0      0      0  31262]]
              precision    recall  f1-score   support

           0       0.98      0.93      0.96    237990
           1       1.00      1.00      1.00     90646
           2       0.92      0.58      0.72     81782
           3       0.43      1.00      0.60     31262

    accuracy                           0.89    441680
   macro avg       0.83      0.88      0.82    441680
weighted avg       0.93      0.89      0.89    441680



In [24]:
# metrics.plot_roc_curve(clf, X_test, y_test)
# plt.show()

In [26]:
pd.DataFrame(X_train).to_csv("x_train_smote_oversample.csv")
pd.DataFrame(y_train).to_csv("y_train_smote_oversample.csv")

In [28]:
pd.DataFrame(X_test).to_csv("x_test_smote_oversample.csv")
pd.DataFrame(y_test).to_csv("y_test_smote_oversample.csv")

In [29]:
pd.DataFrame(X_validation).to_csv("x_validation_smote_oversample.csv")
pd.DataFrame(y_validation).to_csv("y_validation_smote_oversample.csv")

In [30]:
import joblib
joblib.dump(scaler,'escalador.pkl')

['escalador.pkl']

In [31]:
joblib.dump(clf, 'modelo1.h5')

['modelo1.h5']