# SVM classifier

In [1]:
import random
import re, string
%load_ext autoreload
%autoreload 2

import numpy as np
import tensorflow as tf

import os

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization



### Load Data

In [2]:
current_dir = os.getcwd()
miner_train_dir = os.path.join(current_dir, 'wast_samples/miner_samples')
benign_train_dir = os.path.join(current_dir, 'wast_samples/benign_samples')

batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'wast_samples',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

Found 40 files belonging to 3 classes.
Using 32 files for training.


### Printing out examples

In [3]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(3):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

Label 0 corresponds to benign
Label 1 corresponds to miner
Review b'(module\n  (type (;0;) (func (param i32 i32 i32)))\n  (type (;1;) (func (param i32)))\n  (type (;2;) (func (result i32)))\n  (type (;3;) (func (param i32 i32) (result i32)))\n  (type (;4;) (func (param i32 i32 i32) (result i32)))\n  (type (;5;) (func (param i32) (result i32)))\n  (type (;6;) (func (param i32 i32)))\n  (type (;7;) (func (param i32 i32 i32 i32)))\n  (type (;8;) (func (param i32 i32 i64)))\n  (import "env" "memory" (memory (;0;) 256 256))\n  (import "env" "table" (table (;0;) 8 8 funcref))\n  (import "env" "tableBase" (global (;0;) i32))\n  (import "env" "DYNAMICTOP_PTR" (global (;1;) i32))\n  (import "env" "STACKTOP" (global (;2;) i32))\n  (import "env" "abort" (func (;0;) (type 1)))\n  (import "env" "enlargeMemory" (func (;1;) (type 2)))\n  (import "env" "getTotalMemory" (func (;2;) (type 2)))\n  (import "env" "abortOnCannotGrowMemory" (func (;3;) (type 2)))\n  (import "env" "___setErrNo" (func (;4;) (t

### Validation

In [4]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'wast_samples',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)


Found 40 files belonging to 3 classes.
Using 8 files for validation.


### Vectorization:

In [5]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')


max_features = 1024
sequence_length = 2000

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [6]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

#### Retrieve a batch (of 32 reviews and labels) from the dataset:

In [7]:
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'(module\n  (type (;0;) (func (param i32 i32 i32)))\n  (type (;1;) (func (param i32 i32 i32) (result i32)))\n  (type (;2;) (func (param i32)))\n  (type (;3;) (func (result i32)))\n  (type (;4;) (func (param i32 i32) (result i32)))\n  (type (;5;) (func (param i32) (result i32)))\n  (type (;6;) (func (param i32 i32 i32 i32)))\n  (type (;7;) (func (param i32 i32)))\n  (type (;8;) (func (param i32 f64 i32 i32 i32 i32) (result i32)))\n  (type (;9;) (func (param i32 i32 i32 i32) (result i32)))\n  (type (;10;) (func (param i32 i32 i32 i32 i32) (result f64)))\n  (type (;11;) (func (param i32 i32 i32 i32 i32) (result i32)))\n  (type (;12;) (func (param i32 i32 i32 i32) (result f64)))\n  (type (;13;) (func (param i32 i32) (result f64)))\n  (type (;14;) (func (param f64 f64) (result f64)))\n  (type (;15;) (func (param f64 i32) (result f64)))\n  (type (;16;) (func (param i32 i32 i32 i32 i32)))\n  (import "env" "memory" (memory (;0;) 1024 1024))\n  (import "env" "table" (table (;0

Vocabulary test

In [8]:
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))
for i in random.sample(range(max_features), k=10):
    print(f"{i} ---> {vectorize_layer.get_vocabulary()[i]}")

Vocabulary size: 1024
404 ---> 2004413935125273122
78 ---> 34
87 ---> local
796 ---> 10552
338 ---> 100
931 ---> 5688
591 ---> 2097368
807 ---> 9952
510 ---> l74
637 ---> 2052912941


### Prepare Test data

In [9]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'wast_samples/test',
    batch_size=batch_size)


Found 2 files belonging to 2 classes.


In [10]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

### Perfomance tuning

In [11]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

### Model Creation

In [13]:
embedding_dim = 16

model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          16400     
_________________________________________________________________
dropout (Dropout)            (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 17        
Total params: 16,417
Trainable params: 16,417
Non-trainable params: 0
_________________________________________________________________


### optimizer and a loss function


In [12]:

X = np.zeros([157, 128])
Y = np.zeros([157], dtype=np.int32)
example_id = np.array(['%d' % i for i in range(len(Y))])

x_column_name = 'x'
example_id_column_name = 'example_id'

train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={x_column_name: X, example_id_column_name: example_id},
    y=Y,
    num_epochs=None,
    shuffle=True)

svm = tf.contrib.learn.SVM(
    example_id_column=example_id_column_name,
    feature_columns=(tf.contrib.layers.real_valued_column(
        column_name=x_column_name, dimension=128),),
    l2_regularization=0.1)

svm.fit(input_fn=train_input_fn, steps=10)


AttributeError: module 'tensorflow_estimator.python.estimator.api._v2.estimator' has no attribute 'inputs'