# CNN Prototyping

In [23]:
import keras.layers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# from analytics.PhishX.Modeling.build-convolutions import process_urls

plt.style.use('ggplot')
plt.rc('patch', force_edgecolor=True,edgecolor='white')
plt.rc('hist', bins='auto')

In [24]:
data = pd.read_csv('phishing_site_urls.csv')
data.shape

(549346, 2)

In [25]:
data['Label'] = data['Label'].apply(lambda x: 1 if x == 'bad' else 0)
data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,1
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,1
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,1
3,mail.printakid.com/www.online.americanexpress....,1
4,thewhiskeydregs.com/wp-content/themes/widescre...,1


In [26]:
signal_idx = list(range(0, 400))
good_idx = list(range(18231, 18331))

print(f'Positive signal {len(signal_idx)}\n'
      f'No Signal {len(signal_idx)}')

Positive signal 400
No Signal 400


In [27]:
agg_list = [*signal_idx , *good_idx]
len(agg_list)

500

In [28]:
positive_signal = [data.iloc[signal_idx]]
len(positive_signal)

1

In [29]:
no_signal = [data.iloc[good_idx]]
len(no_signal)

1

In [30]:
input = np.hstack([positive_signal, no_signal])
input = input.reshape(500, 2)
input.shape

(500, 2)

In [31]:
df = pd.DataFrame(input)
df.reset_index(inplace=True)
df.rename(columns={0:'URL', 1:'Label'}, inplace=True)
url_df = df.drop(columns=['Label'])
label_df = df.drop(columns=['URL'])
df_index = df['index']

In [32]:
# Create character list
char_idx_1 = list(map(chr, range(33, 65)))
char_idx_2 = list(map(chr, range(91, 127)))
char_idx = char_idx_1 + char_idx_2

In [33]:
def process_urls(vals_in):

    vals = vals_in.set_index('index')
    conv_3d = np.array([])

    for i in vals.index:
        data = vals.iloc[i]

        zeroed = np.array([])
        if len(data['URL']) <= 256:
            dums = pd.get_dummies(np.array(list(data['URL'])))
            fill_n_rows = 256 - len(data['URL'])
            zeroed = np.zeros((fill_n_rows, len(char_idx)))
        else:
            dums = pd.get_dummies(np.array((list(data['URL']))))
            dums = dums.head(256)

        conv = pd.DataFrame(dums, columns=char_idx)
        conv = np.where(pd.isna(conv), 0, conv)
        if len(zeroed):
            conv = np.vstack((conv, zeroed))


        conv = conv.T
        conv = conv.reshape((-1, 68, 256))

        if len(conv_3d):
            conv_3d = np.vstack((conv_3d, conv))
        else:
            conv_3d = conv

    return conv_3d

In [34]:
url_length = list(range(0, 256))
print(len(agg_list))
print(len(url_length))

500
256


In [35]:
train = process_urls(url_df)
train.shape

(500, 68, 256)

In [36]:
labels = label_df['Label'].to_numpy()  # .reshape(-1, 1)
labels = np.array(labels)[:, np.newaxis]
# labels = labels.reshape(500, -1)
labels.shape

(500, 1)

In [37]:
train = np.asarray(train).astype(np.float32)
labels = np.asarray(labels).astype(np.float32)

In [38]:
X_test = np.vstack([train[:80, :, :], train[-20:, :, :]])
y_test = np.vstack([labels[:80, :], labels[-20:, :]])
print(f'X data for testing: {X_test.shape}\n'
      f'y data fot testing: {y_test.shape}')

X data for testing: (100, 68, 256)
y data fot testing: (100, 1)


In [39]:
X_train = train[:-20, :, :]
X_train = X_train[80:, :, :]
y_train = labels[:-20, :]
y_train = y_train[80:, :]
print(f'X data for training: {X_train.shape}\n'
      f'y data fot training: {y_train.shape}')

X data for training: (400, 68, 256)
y data fot training: (400, 1)


In [40]:
X_train = X_train.reshape(*X_train.shape, -1)
X_test = X_test.reshape(*X_test.shape, -1)

print(f'Train X: {X_train.shape}')
print(f'Test X: {X_test.shape}')

Train X: (400, 68, 256, 1)
Test X: (100, 68, 256, 1)


In [41]:
# y_train = y_train.reshape(1, *y_train.shape)
# y_test = y_test.reshape(1, *y_test.shape)

print(f'Train y: {y_train.shape}')
print(f'Test y: {y_test.shape}')

Train y: (400, 1)
Test y: (100, 1)


In [52]:
# LSTM and CNN for sequence classification in the IMDB dataset
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Conv1D
from keras.layers import Conv2D
from keras.layers import ConvLSTM1D
from keras.layers import MaxPooling1D
from keras.layers import MaxPooling2D
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import RepeatVector
from keras.preprocessing import sequence
from keras import metrics
# fix random seed for reproducibility
tf.random.set_seed(7)

In [53]:
# truncate and pad input sequences
# max_review_length = 500

# X_train = sequence.pad_sequences(X_train, maxlen=256)
# X_test = sequence.pad_sequences(X_test, maxlen=256)

top_words = 500
# create the model
embedding_vecor_length =68  # 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_shape=(68,256)))
model.add(Conv2D(filters=1, kernel_size=1, padding='same', activation='relu'))
# X_train = X_train.reshape(68, 256, 1)
model.add(MaxPooling2D(pool_size=2))
model.add(Flatten())
model.add(RepeatVector(1))
model.add(tf.compat.v1.keras.layers.CuDNNLSTM(68, input_shape=(1, 1028)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[metrics.Precision(), metrics.Recall()])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 68, 256, 68)       34000     
                                                                 
 conv2d_2 (Conv2D)           (None, 1, 256, 68)        69        
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 1, 128, 34)       0         
 2D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 4352)              0         
                                                                 
 repeat_vector_2 (RepeatVect  (None, 1, 4352)          0         
 or)                                                             
                                                                 
 cu_dnnlstm_2 (CuDNNLSTM)    (None, 68)               

In [54]:
model.fit(X_train, y_train, epochs=3, batch_size=1)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 98.77%


In [55]:
val_x = X_test
val_y = y_test

In [56]:
scores

[0.12849897146224976, 0.9876543283462524, 0.9523809552192688]

In [63]:
%%time
# Final evaluation of the model
scores = model.evaluate(val_x, val_y, verbose=0)
print(f'Shape of Training Batch: {X_train.shape}\nTotal record count: {X_train[0].shape[0]:,}')
print(f'Shape of Validation Batch: {val_x.shape}\nRecords being predicted on: \n  {val_x.shape[0]:,}')
print(f"Accuracies:\n  Model 1: {scores[0] * 100:0.4f}%\t Model 2:  {scores[1] * 100:0.4f}%\t Model 3:  {scores[2] * 100:0.4f}%\nTime:")

Shape of Training Batch: (400, 68, 256, 1)
Total record count: 68
Shape of Validation Batch: (100, 68, 256, 1)
Records being predicted on: 
  100
Accuracies:
  Model 1: 12.8499%	 Model 2:  98.7654%	 Model 3:  95.2381%
Time:
CPU times: user 88.2 ms, sys: 9.05 ms, total: 97.3 ms
Wall time: 70.7 ms


In [74]:
%%time
predictions = model.predict(val_x)
pred_y = np.array([e for e in np.round(predictions.clip(0, 1).astype(float))])
real_score = sum([val == pred for val, pred in zip(val_y.tolist(), pred_y.tolist())]) / val_x.shape[0]
print(f'Validation using unseen data:\nRecords being predicted on: \n {val_x[0].shape[0]:,}\nReal prediction score: {real_score * 100:0.2f}%\nTime:')

Validation using unseen data:
Records being predicted on: 
 68
Real prediction score: 95.00%
Time:
CPU times: user 65.8 ms, sys: 4.11 ms, total: 69.9 ms
Wall time: 60.2 ms
