In [4]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Bidirectional, GlobalMaxPool1D, Dot, Activation, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
data = pd.read_csv('datasetClean.csv')

# Preprocessing
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Sentence'])
sequences = tokenizer.texts_to_sequences(data['Sentence'])
max_len = max([len(x) for x in sequences])
X = pad_sequences(sequences, maxlen=max_len)

# X = ["halo", "tes", "<script>"]
# y = [0, 0, 1]

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Type'])

# Split the data into 80% training+validation and 20% testing
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the 80% training+validation set into 60% training and 20% validation
# X_train_val+y_train_val itu 80% maka jika diambil 25%, 25%*80% jadinya 20%
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Building the Model
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

# Input layer
input_layer = Input(shape=(max_len,))
# Embedding layer
embedding_layer = Embedding(vocab_size, embedding_dim, input_length=max_len)(input_layer)
# Bidirectional LSTM layer
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(embedding_layer)

# Attention mechanism
attention = Dot(axes=[2, 2])([lstm_layer, lstm_layer])
attention = Activation('softmax')(attention)
context = Dot(axes=[2, 1])([attention, lstm_layer])
context = Concatenate()([context, lstm_layer])

# Global Max Pooling
x = GlobalMaxPool1D()(context)
# [[1,1],[1,2]]...
# [1,1,,1,2]
# Dense layers
x = Dense(64, activation='relu')(x)
output_layer = Dense(1, activation='sigmoid')(x)

# Create model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile and Train the Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy*100:.2f}%')




Epoch 1/10
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 1s/step - accuracy: 0.9506 - loss: 0.1475 - val_accuracy: 0.9992 - val_loss: 0.0018
Epoch 2/10
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 1s/step - accuracy: 1.0000 - loss: 6.5867e-05 - val_accuracy: 0.9994 - val_loss: 0.0027
Epoch 3/10
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 986ms/step - accuracy: 1.0000 - loss: 1.5452e-05 - val_accuracy: 0.9994 - val_loss: 0.0028
Epoch 4/10
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 948ms/step - accuracy: 1.0000 - loss: 7.3543e-06 - val_accuracy: 0.9994 - val_loss: 0.0029
Epoch 5/10
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 940ms/step - accuracy: 1.0000 - loss: 4.6510e-06 - val_accuracy: 0.9994 - val_loss: 0.0030
Epoch 6/10
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 929ms/step - accuracy: 1.0000 - loss: 2.4585e-06 - val_accuracy: 0.9994 - val_los

In [5]:
model.save('my_model.keras')

In [8]:
!mkdir -p saved_model
model.export('saved_model/my_model')

INFO:tensorflow:Assets written to: saved_model/my_model/assets


INFO:tensorflow:Assets written to: saved_model/my_model/assets


Saved artifact at 'saved_model/my_model'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 800), dtype=tf.float32, name='keras_tensor_16')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  127175408831376: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175408830224: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175408829840: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434534160: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434534352: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434533968: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434532816: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434534544: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434532048: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434026832: TensorSpec(shape=(), dtype=tf.resource, name=None)
  127175434532240

In [9]:
model.save('my_model.h5')



In [15]:
import pickle

# Save the tokenizer
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label encoder
with open('label_encoder.pkl', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [10]:
testXSS = [
                '<script>alert(\'xss\')</script><script><script>',
                'hellomo',
                'https://store.bentley.com/en/shop/search?term=%22%3E%3Cdetails%20open%20ontoggle=prompt(1337)%3ExxLouisLouisLouis',
                'ghfdhgdhjgd',
                'uid%3D19%26list_page%3D%22%3E%3Cscript%3Ealert%28document.cookie%29%3B%3C/script%3E',
                '&template=en_search_error&postalCode=\\\';alert(0)//',
                '&where=%3Cscript%3Ealert%28%27xss%27%29%3C%2Fscript%3E&loctypes=1003%2C1001%2C1000%2C1%2C9%2C5%2C11%2C13%2C19%2C20&from=hdr_localsearch',
                'http://mydata.com/sad/sd/qwd/qwde/qwe/?sessionid=12',
                'http://mydata.com?id=script',
                '&\';}},{scope:\'email,user_about_me,user_hometown,user_interests,user_likes,user_status,user_website,user_birthday,publish_stream,publish_actions,offline_access\'});}alert(0);b=function(response){c=({a:{//',
                'http://myurl.com?<script',
                'http://mydata.com?script=script',
                'composite_search=1&keyword="/><script>alert("Xss:Vijayendra")</script>',
                'http://mysite.com?srtalert',
                'script',
                'alert',
                'Search=%22%3E\'%3E%3CSCRIPT%20SRC=http://br.zone-h.org/testes/xss.js%3E%3C/SCRIPT%3E?',
                'id=15%3Cscript%3Ealert%28document.cookie%29%3C/script%3E',
                'composite_search=1&keyword="/><script>alert("Xss:Vijayendra")</script>',
                'id=123&href=abdc<a<script>alert(1)',
                '<<<<<<>>>>></>,><><>',
                'alert()alert()',
                'alertalert',
                '?url=http://localhost:8888/notebooks/Documents/MachineLearning/Practical%20Machine%20Learning',
                '<script<script',
                '<scriptalert',
                'httphttphttp',
                'https://disqus.com/?ref_noscript',
                'I am a string',
                '<img src="javascript:alert(1)/>"',
                'HelloWorld!',
                'http://mysite.com?<script>',
                '<input type="text" value=`` <div/onmouseover=\'alert(471)\'>X</div>',
                '<img \x47src=x onerror="javascript:alert(324)">',
                '<a href="\xE2\x80\x87javascript:javascript:alert(183)" id="fuzzelement1">test</a>',
                '<body onscroll=javascript:alert(288)><br><br><br><br><br><br>...<br><br><br><br><br><br><br><br><br><br>...<br><br><br><br><br><br><br><br><br><br>...<br><br><br><br><br><br><br><br><br><br>...<br><br><br><br><br><br><br><br><br><br>...<br><br><br><br><input autofocus>',
                '<meta charset="mac-farsi">¼script¾javascript:alert(379)¼/script¾',
                '<HTML xmlns:xss><?import namespace=(493)s" implementation="%(htc)s"><xss:xss>XSS</xss:xss></HTML>""","XML namespace."),("""<XML ID=(494)s"><I><B>&lt;IMG SRC="javas<!-- -->cript:javascript:alert(420)"&gt;</B></I></XML><SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN>'
            ]

In [14]:
# Preprocess the new data
new_sequences = tokenizer.texts_to_sequences(testXSS)
new_X = pad_sequences(new_sequences, maxlen=max_len)

# Predict using the trained model
predictions = model.predict(new_X)

# Since this is a binary classification, convert probabilities to class labels
predicted_labels = (predictions > 0.5).astype(int)

# Decode labels (optional, if you want to see 'Malicious' or 'Not Malicious')
predicted_class_labels = label_encoder.inverse_transform(predicted_labels.flatten())

# Print the predictions
for sentence, label in zip(testXSS, predicted_class_labels):
    print(f"Sentence: {sentence}\nPredicted label: {label}\n")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
Sentence: <script>alert('xss')</script><script><script>
Predicted label: Malicious

Sentence: hellomo
Predicted label: Benign

Sentence: https://store.bentley.com/en/shop/search?term=%22%3E%3Cdetails%20open%20ontoggle=prompt(1337)%3ExxLouisLouisLouis
Predicted label: Malicious

Sentence: ghfdhgdhjgd
Predicted label: Benign

Sentence: uid%3D19%26list_page%3D%22%3E%3Cscript%3Ealert%28document.cookie%29%3B%3C/script%3E
Predicted label: Malicious

Sentence: &template=en_search_error&postalCode=\';alert(0)//
Predicted label: Malicious

Sentence: &where=%3Cscript%3Ealert%28%27xss%27%29%3C%2Fscript%3E&loctypes=1003%2C1001%2C1000%2C1%2C9%2C5%2C11%2C13%2C19%2C20&from=hdr_localsearch
Predicted label: Malicious

Sentence: http://mydata.com/sad/sd/qwd/qwde/qwe/?sessionid=12
Predicted label: Malicious

Sentence: http://mydata.com?id=script
Predicted label: Benign

Sentence: &';}},{scope:'email,user_about_me,user_hometown,user_