## Cargar Dataset

In [73]:
import multiprocessing
import warnings
import pandas as pd
import numpy as np
from datetime import datetime

# Models
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Feature Selection
from copy import deepcopy
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import accuracy_score

In [74]:
datasets = ['0_CIC-IDS-2017', '1_UNSW-NB15', '2_NF-UNSW-NB15-v2', '3_CSE-CIC-IDS2018', '4_NSL-KDD']

# Funciones Utilitarias

## Dataset IO

In [75]:
# def execute_feature_selection(dataset):
#     warnings.filterwarnings("ignore")
dataset = datasets[4]

def load_dataset(dataset):
    folder = f'./small_datasets_clean/{dataset}'
    X = pd.read_csv(f'{folder}/X.csv')
    y = pd.read_csv(f'{folder}/Y.csv')
    return X,y

def print_to_results(msg, type = 'a'):
    print(msg)
    folder = 'results_feature_selection'
    file = open(f'./{folder}/{dataset}.txt', type)
    file.write(f'{msg}\n\n')
    file.close()

## Seleccion de Modelos

### Seleccion de features

In [76]:
def select_features(model, params):
    grid = fit_grid(model, params)
    best_selector = grid.best_estimator_['select']
    selected_features = X_train.columns[best_selector.get_support()]
    save_results(grid, selected_features)

### Grid Search

In [77]:
def fit_grid(model, params):
    grid = GridSearchCV(model, params, error_score=0)
    grid.fit(X_train, y_train)    
    return grid

### Guardar resultados

In [78]:
def save_results(grid, selected_features = None):
    train_accuracy = round(grid.best_score_, 6)
    fit_time = round(grid.cv_results_['mean_fit_time'].mean(), 3)
    score_time = round(grid.cv_results_['mean_score_time'].mean(), 3)
    test_accuracy = round(grid.best_estimator_.score(X_test, y_test), 6)

    result =   (f"Best model: {grid.best_estimator_}\n"
                f"Train Accuracy: {train_accuracy}\n"
                f"Average Time to Fit (s): {fit_time}\n"
                f"Average Time to Score (s): {score_time}\n"
                f"Test Accuracy: {test_accuracy}")
                
    if(selected_features is not None):
        result =   (f"{result}"
                    f"\nNumber of features: {len(selected_features)}\n"
                    f"{selected_features}")
                
    print_to_results(result)

## Lectura del Dataset

In [79]:
print_to_results(f'Starting Feature Selection at {datetime.now()}', 'w')
X, y = load_dataset(dataset)

header = (f'Dataset: {dataset}\n'
          f'X shape: {X.shape}\n'  
          f'y shape: {y.shape}\n' 
          f'y proportions: \n{y.value_counts(normalize=True)}\n')

print_to_results(header)



Starting Feature Selection at 2022-10-13 13:42:01.969371
Dataset: 4_NSL-KDD
X shape: (1999, 41)
y shape: (1999, 1)
y proportions: 
class
0        0.51926
1        0.48074
dtype: float64



### Preprocesamiento del dataset

#### Eliminar columnas constantes

In [80]:
variance_filter = VarianceThreshold(threshold=0)
Xcols = X.columns
X = variance_filter.fit_transform(X)
Xcols = Xcols[variance_filter.get_support()]
X = pd.DataFrame(X, columns=Xcols)

#### Reshape Y

In [81]:
y = y.values.ravel()

#### Train Test Split

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

## Modelos Base

In [83]:
print_to_results("Base models")

Base models


### Entrenamiento de modelos base

In [84]:
lr = LogisticRegression()
knn = KNeighborsClassifier()
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

lr_params = {'classifier__C':[0.1, 1, 10]}
knn_params = {'classifier__n_neighbors': [1, 3, 5]}
tree_params = {'classifier__max_depth': [11, 13, 15]}
forest_params = {'classifier__n_estimators': [50, 100], 'classifier__max_depth': [1, 3]}

In [85]:
pipe_lr = Pipeline([('classifier', lr)])
grid_lr = fit_grid(pipe_lr, lr_params)
save_results(grid_lr)

Best model: Pipeline(steps=[('classifier', LogisticRegression(C=10))])
Train Accuracy: 0.955969
Average Time to Fit (s): 0.015
Average Time to Score (s): 0.001
Test Accuracy: 0.966


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [86]:
pipe_knn = Pipeline([('classifier', knn)])
grid_knn = fit_grid(pipe_knn, knn_params)
save_results(grid_knn)

Best model: Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=1))])
Train Accuracy: 0.973309
Average Time to Fit (s): 0.002
Average Time to Score (s): 0.015
Test Accuracy: 0.98


In [87]:
pipe_tree = Pipeline([('classifier', tree)])
grid_tree = fit_grid(pipe_tree, tree_params)
save_results(grid_tree)

Best model: Pipeline(steps=[('classifier', DecisionTreeClassifier(max_depth=11))])
Train Accuracy: 0.978653
Average Time to Fit (s): 0.005
Average Time to Score (s): 0.001
Test Accuracy: 0.982


In [88]:
pipe_forest = Pipeline([('classifier', forest)])
grid_forest = fit_grid(pipe_forest, forest_params)
save_results(grid_forest)

Best model: Pipeline(steps=[('classifier',
                 RandomForestClassifier(max_depth=3, n_estimators=50))])
Train Accuracy: 0.964647
Average Time to Fit (s): 0.068
Average Time to Score (s): 0.006
Test Accuracy: 0.96


### Eleccion del mejor modelo base

In [89]:
classifiers = [lr, knn, tree, forest]
params = [lr_params, knn_params, tree_params, forest_params]
best_scores = [grid_lr.best_score_, grid_knn.best_score_, grid_tree.best_score_, grid_forest.best_score_]


best_index = np.argmax(best_scores)
classifier = classifiers[best_index]
classifier_params = params[best_index]
print_to_results(f"Best base model: {classifier}")

Best base model: DecisionTreeClassifier()


# Feature Selection

## Filters

In [90]:
print_to_results("Filters")

Filters


### Correlacion

In [91]:
def correlation(X,y):
    y = y.reshape((y.size,-1))
    np_data = np.concatenate([X,y], axis=1)
    pd_data = pd.DataFrame(np_data)    
    corr = pd_data.corr().abs().iloc[-1]
    corr = corr[:-1]
    return np.array(corr)

In [92]:
print_to_results("Correlacion")

corr_pipe = Pipeline([('select', SelectKBest(correlation)), 
                      ('classifier', classifier)])

corr_pipe_params = deepcopy(classifier_params)

corr_pipe_params.update({'select__k':[10,15,20]})

select_features(corr_pipe, corr_pipe_params)

Correlacion
Best model: Pipeline(steps=[('select',
                 SelectKBest(k=20,
                             score_func=<function correlation at 0x00000206429B1700>)),
                ('classifier', DecisionTreeClassifier(max_depth=11))])
Train Accuracy: 0.975984
Average Time to Fit (s): 0.007
Average Time to Score (s): 0.001
Test Accuracy: 0.97
Number of features: 20
Index(['protocol_type', 'service', 'flag', 'dst_bytes', 'logged_in', 'count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')


### P - value

In [93]:
print_to_results("P-value")

p_value_pipe = Pipeline([('select', SelectKBest(f_classif)), 
                         ('classifier', classifier)])

p_value_pipe_params = deepcopy(classifier_params)

p_value_pipe_params.update({'select__k':[10,15,20]})

select_features(p_value_pipe, p_value_pipe_params)  

P-value


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Best model: Pipeline(steps=[('select', SelectKBest(k=20)),
                ('classifier', DecisionTreeClassifier(max_depth=13))])
Train Accuracy: 0.976653
Average Time to Fit (s): 0.004
Average Time to Score (s): 0.001
Test Accuracy: 0.976
Number of features: 20
Index(['protocol_type', 'service', 'flag', 'dst_bytes', 'logged_in', 'count',
       'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
       'same_srv_rate', 'diff_srv_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')


  f = msb / msw
  f = msb / msw


## Wrappers

In [94]:
print_to_results("Wrappers")

Wrappers


### Decision Tree

In [95]:
print_to_results("Decision Tree")

tree_pipe = Pipeline([('select', SelectFromModel(DecisionTreeClassifier())), 
                      ('classifier', classifier)])

tree_pipe_params = deepcopy(classifier_params)

tree_pipe_params.update({
              'select__max_features': [20],
              'select__estimator__max_depth': [None, 1, 3, 5]
              })

select_features(tree_pipe, tree_pipe_params)  


Decision Tree
Best model: Pipeline(steps=[('select',
                 SelectFromModel(estimator=DecisionTreeClassifier(),
                                 max_features=20)),
                ('classifier', DecisionTreeClassifier(max_depth=15))])
Train Accuracy: 0.978651
Average Time to Fit (s): 0.005
Average Time to Score (s): 0.001
Test Accuracy: 0.976
Number of features: 5
Index(['service', 'dst_bytes', 'count', 'dst_host_same_src_port_rate',
       'dst_host_srv_serror_rate'],
      dtype='object')


### Logistic Regression

In [96]:
print_to_results("Logistic Regression")

logistic_pipe = Pipeline([('select', SelectFromModel(LogisticRegression())), 
                          ('classifier', classifier)])

logistic_pipe_params = deepcopy(classifier_params)

logistic_pipe_params.update({
              'select__max_features': [20]
              })
              
select_features(logistic_pipe, logistic_pipe_params) 

Logistic Regression
Best model: Pipeline(steps=[('select',
                 SelectFromModel(estimator=LogisticRegression(),
                                 max_features=20)),
                ('classifier', DecisionTreeClassifier(max_depth=11))])
Train Accuracy: 0.965975
Average Time to Fit (s): 0.015
Average Time to Score (s): 0.001
Test Accuracy: 0.968
Number of features: 15
Index(['protocol_type', 'wrong_fragment', 'hot', 'is_guest_login', 'count',
       'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate',
       'dst_host_srv_count', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_srv_rerror_rate'],
      dtype='object')


### SVC

In [97]:
print_to_results("SVC")

svc_pipe = Pipeline([('select', SelectFromModel(LinearSVC())), 
                     ('classifier', classifier)])

svc_pipe_params = deepcopy(classifier_params)

svc_pipe_params.update({
                'select__max_features': [20],
                'select__estimator__dual': [True, False]
              })

select_features(svc_pipe, svc_pipe_params) 

SVC
Best model: Pipeline(steps=[('select',
                 SelectFromModel(estimator=LinearSVC(), max_features=20)),
                ('classifier', DecisionTreeClassifier(max_depth=11))])
Train Accuracy: 0.972651
Average Time to Fit (s): 0.007
Average Time to Score (s): 0.001
Test Accuracy: 0.972
Number of features: 20
Index(['protocol_type', 'flag', 'wrong_fragment', 'hot', 'num_compromised',
       'num_root', 'num_access_files', 'is_guest_login', 'count',
       'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')


In [98]:
print_to_results(f'Finishing Feature Selection at {datetime.now()}')

Finishing Feature Selection at 2022-10-13 13:42:06.490744


In [99]:
# if __name__ == "__main__":
#     pool = multiprocessing.Pool(processes=5)
#     pool.map(execute_feature_selection, datasets)