# Solucion 3 - Deep Learning

Con esta solución queremos probar solamente que tan bien se ajustan algunos modelos de redes neuronales utilizando el estado del arte en deep learning utilizando TensorFlow.

## Cargamos librerías a usar y los datos

In [1]:
%matplotlib inline

from __future__ import print_function
from time import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

import tensorflow as tf
# from tensorflow.contrib.tensor_forest.python import tensor_forest
# Ignoramos todos los GPUs, debido a que tf random forest no se beneficia de ello.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
# !export TF_CPP_MIN_LOG_LEVEL=2

from IPython.display import display, HTML
# plt.style.use('seaborn-white')

In [4]:
# Cargamos los datos completos para seleccionar mas variables
ataques_train = pd.read_csv('data/kddcup.data_clean.csv', sep=',', decimal='.')
print("Cantidad de observaciones %i con %i variables: " %(ataques_train.shape[0],ataques_train.shape[1]))
feature_cols = ['same_srv_rate', 'flag_SF', 'dst_host_same_srv_rate', 'service_private',
       'dst_host_srv_serror_rate', 'service_http', 'logged_in',
       'dst_host_srv_count', 'count', 'srv_serror_rate', 'flag_S0',
       'dst_host_serror_rate', 'dst_host_count', 'rerror_rate', 'serror_rate',
       'dst_host_rerror_rate', 'src_bytes', 'srv_rerror_rate',
       'dst_host_same_src_port_rate', 'dst_host_srv_rerror_rate',
       'protocol_type_udp', 'service_ecr_i', 'flag_REJ', 'service_pop_3',
       'protocol_type_tcp', 'diff_srv_rate', 'hot', 'dst_host_diff_srv_rate',
       'service_telnet', 'service_domain_u', 'wrong_fragment',
       'dst_host_srv_diff_host_rate', 'num_compromised', 'service_smtp',
       'srv_count', 'dst_bytes', 'srv_diff_host_rate', 'service_ftp_data',
       'duration', 'service_ftp', 'attack_category']

ataques_train = ataques_train[feature_cols]
# Cargamos los datos de validación del 10% de la competencia 
ataques_10prec_test = pd.read_csv('data/data_10per_test_preprocessed.csv', sep=',', decimal='.')
ataques_test = ataques_10prec_test[feature_cols]

print("Cantidad de observaciones %i con %i variables (Entrenamiento) " %(ataques_train.shape[0],ataques_train.shape[1]))
print("Cantidad de observaciones %i con %i variables (Validación) " %(ataques_test.shape[0],ataques_test.shape[1]))

# Balanceamos los datos
df_normal = ataques_train[ataques_train.attack_category=='normal']
df_dos = ataques_train[ataques_train.attack_category=='dos']
df_probe = ataques_train[ataques_train.attack_category=='probe']
df_r2l = ataques_train[ataques_train.attack_category=='r2l']
df_u2r = ataques_train[ataques_train.attack_category=='u2r']
#df_unknown = ataques_train[ataques_train.attack_category=='unknown']

# Remuestreo tomando solo un conjnto de datos menor en las clases de mayor frecuencia
df_normal_downsampled = resample(df_normal, replace=False, n_samples=200000, random_state=123)
df_dos_downsampled = resample(df_dos, replace=False, n_samples=100000, random_state=123)
# Combinar las clases con los nuevos datos remuestreados
ataques_train = pd.concat([df_normal_downsampled, df_dos_downsampled, df_probe, df_r2l, df_u2r])

# Eliminamos los datos "unknown" de la muestra de validación, los cuales no tenemos como entrenar
ataques_test = ataques_test[ataques_test.attack_category!='unknown']

# Mostrar las cantidades de los nuevos datos
print("Balanceo de datos: ")
print(ataques_train.attack_category.value_counts())
print(ataques_test.attack_category.value_counts())

# Para trabajar con Tensorflow la mayoria de procesos o algoritmos requieren que las etiquetas a predecir sean numericas 
# Como se requiere que sean numericas las variables para ser computadas procedemos con
# ataques_train["attack_category"] = pd.factorize(ataques_train["attack_category"])
ataques_train["attack_category"] = ataques_train["attack_category"].map({"normal":0,"dos":1,"probe":2, "r2l":3, "u2r":4})
ataques_test["attack_category"] = ataques_test["attack_category"].map({"normal":0,"dos":1,"probe":2, "r2l":3, "u2r":4})

# Definimos los datos en entrenamiento y validación
X = ataques_train.drop(['attack_category'], axis=1)
y = ataques_train.attack_category.copy()
X_test_40var = ataques_test.drop(['attack_category'], axis=1)
y_test_40var = ataques_test.attack_category.copy()

# Definimos un dataset de entrenamiento y pruebas
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=35)

Cantidad de observaciones 1074992 con 122 variables: 
Cantidad de observaciones 1074992 con 41 variables (Entrenamiento) 
Cantidad de observaciones 311029 con 41 variables (Validación) 
Balanceo de datos: 
normal    200000
dos       100000
probe      13860
r2l          999
u2r           52
Name: attack_category, dtype: int64
dos       223298
normal     60593
r2l         5993
probe       2377
u2r           39
Name: attack_category, dtype: int64


## Tensor Flow


Construct, fit and evaluate the classifier

DNNClassifier expects following arguments :

    feature_columns : Feature columns map the data to the model. We can either use raw features from the training dataset or any derived features from them. See here for more information.

    hidden_units : List containing number of hidden units in each layer. All layers would be fully connected.

    n_classes : Number of classes

Optionally we can also set the optimizer, dropout and activation functions. Default activation function is ReLu. If we set a model directory then it'd save the model graph, parameters etc. See the documentation for reading up on DNNClassifier


In [5]:
columns = X_train.columns.tolist()
columns

['same_srv_rate',
 'flag_SF',
 'dst_host_same_srv_rate',
 'service_private',
 'dst_host_srv_serror_rate',
 'service_http',
 'logged_in',
 'dst_host_srv_count',
 'count',
 'srv_serror_rate',
 'flag_S0',
 'dst_host_serror_rate',
 'dst_host_count',
 'rerror_rate',
 'serror_rate',
 'dst_host_rerror_rate',
 'src_bytes',
 'srv_rerror_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_rerror_rate',
 'protocol_type_udp',
 'service_ecr_i',
 'flag_REJ',
 'service_pop_3',
 'protocol_type_tcp',
 'diff_srv_rate',
 'hot',
 'dst_host_diff_srv_rate',
 'service_telnet',
 'service_domain_u',
 'wrong_fragment',
 'dst_host_srv_diff_host_rate',
 'num_compromised',
 'service_smtp',
 'srv_count',
 'dst_bytes',
 'srv_diff_host_rate',
 'service_ftp_data',
 'duration',
 'service_ftp']

In [6]:
# Create the Feature Columns
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)
X_test_40var = X_test_40var.astype(np.float32)
# All of the features in our training dataset are real valued and continuous.
feature_columns = [tf.contrib.layers.real_valued_column(k) for k in columns]


In [7]:
# Make input function
def input_fn(df,labels):
    feature_cols = {k:tf.constant(df[k].values,shape = [df[k].size,1]) for k in columns}
    label = tf.constant(labels.values, shape = [labels.size,1])
    return feature_cols,label

### Construct the Classifer

Here the DNNClassifier is constructed with the feature_columns where the number of hidden units in each layer is 40,20,40 respectively. The number of classes is also specified.

In [50]:
classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,hidden_units=[40,20,40],n_classes = 5)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12051afd0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/var/folders/gn/6xmfkgxd30jggx1gdklnnk080000gn/T/tmp_0g93v1i'}


### Fit the classifier

We pass the *input_fn* into the fit method and set the number of steps.

In [51]:
classifier.fit(input_fn=lambda: input_fn(X_train,y_train),steps = 1000)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/gn/6xmfkgxd30jggx1gdklnnk080000gn/T/tmp_0g93v1i/model.ckpt.
INFO:tensorflow:loss = 3850.36, step = 1
INFO:tensorflow:global_step/sec: 2.24076
INFO:tensorflow:loss = 0.350254, step = 101 (44.625 sec)
INFO:tensorflow:global_step/sec: 2.23914
INFO:tensorflow:loss = 0.14062, step = 201 (44.659 sec)
INFO:tensorflow:global_step/sec: 2.28229
INFO:tensorflow:loss = 0.108211, step = 301 (43.817 sec)
INFO:tensorflow:global_step/sec: 2.28687
INFO:tensorflow:loss = 0.126694, step = 401 (43.727 sec)
INFO:tensorflow:global_step/sec: 2.29578
INFO:tensorflow:loss = 0.105622, step = 501 (43.557 sec)
INFO:tensorflow:global_step/sec: 2.31451
INFO:tensorflow:loss = 0.0955609, step = 601 (43.206 sec)
INFO:tensorflow:global_step/sec: 2.26322
INFO:tensorflow:loss = 0.0892744, step = 701 (44.185 sec)
INFO:tensorflow:global_step/sec: 2.27005
INFO:tensorflow:loss = 0.0849647, step = 801 (44.052 sec)
INFO:tenso

DNNClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._MultiClassHead object at 0x12051aa20>, 'hidden_units': [40, 20, 40], 'feature_columns': (_RealValuedColumn(column_name='same_srv_rate', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='flag_SF', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='dst_host_same_srv_rate', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='service_private', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='dst_host_srv_serror_rate', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='service_http', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _RealValuedColumn(column_name='logged_in', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), _R

### Evaluate the Classifier

Evaluate method returns some statistics like accuracy, auc after being called on the test data

In [52]:
ev = classifier.evaluate(input_fn=lambda: input_fn(X_test,y_test),steps=1)

INFO:tensorflow:Starting evaluation at 2017-11-28-00:50:38
INFO:tensorflow:Restoring parameters from /var/folders/gn/6xmfkgxd30jggx1gdklnnk080000gn/T/tmp_0g93v1i/model.ckpt-1000
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2017-11-28-00:50:39
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.978582, global_step = 1000, loss = 0.0839374


In [53]:
print(ev)

{'loss': 0.083937362, 'accuracy': 0.97858155, 'global_step': 1000}


### Generate predictions :

In [54]:
def input_predict(df):
    feature_cols = {k:tf.constant(df[k].values,shape = [df[k].size,1]) for k in columns}
    return feature_cols

In [55]:
pred = classifier.predict_classes(input_fn=lambda: input_predict(X_test))

INFO:tensorflow:Restoring parameters from /var/folders/gn/6xmfkgxd30jggx1gdklnnk080000gn/T/tmp_0g93v1i/model.ckpt-1000


In [56]:
pred
#print(list(pred))

<generator object DNNClassifier.predict_classes.<locals>.<genexpr> at 0x157b48938>

In [57]:
pred_class = classifier.predict_classes(input_fn=lambda: input_predict(X_test_40var))

y_pred = list(pred_class)
precision = accuracy_score(y_pred, y_test_40var)
print("Precisión NN: %.2f%%" %(precision*100))
conf_mat = confusion_matrix(y_pred, y_test_40var)
print(conf_mat)


INFO:tensorflow:Restoring parameters from /var/folders/gn/6xmfkgxd30jggx1gdklnnk080000gn/T/tmp_0g93v1i/model.ckpt-1000
Precisión NN: 94.84%
[[ 59620   7541    190   5979     39]
 [   802 215734    334      1      0]
 [   171     23   1853     13      0]
 [     0      0      0      0      0]
 [     0      0      0      0      0]]


Se observa que tiene una precisión aceptable pero no modela bien las clases con frecuencia baja, a pesar de que los datos han sido balanceados con aterioridad.