In [2]:
import re
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus of (markup, label) pairs: label 1 marks an XSS payload,
# label 0 marks harmless HTML.
data = [
    ("<script>alert('XSS');</script>", 1),
    ("<div>Hello, World!</div>", 0),
    ("<img src=x onerror=alert('XSS')>", 1),
    ("<p>This is a paragraph.</p>", 0),
]

# Unzip the pairs into parallel collections.
texts = tuple(markup for markup, _ in data)
labels = np.array([flag for _, flag in data])

# Character-level encoding: fit the vocabulary on the corpus, then turn each
# text into a list of integer ids.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Right-pad every encoded text to the length of the longest one.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# One slot more than the number of known characters; the downstream code
# treats id 0 as padding rather than as a character.
vocab_size = 1 + len(tokenizer.word_index)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    labels,
    test_size=0.2,
    random_state=42,
)


In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Assemble the network one layer at a time: character embedding, two stacked
# LSTMs with dropout in between, and a single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_length))
model.add(LSTM(128, return_sequences=True))  # emit every timestep for the next LSTM
model.add(Dropout(0.5))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the 0/1 labels; accuracy is reported per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training split, carving 10% of it off for validation.
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=10,
)


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5000 - loss: 0.6920 - val_accuracy: 1.0000 - val_loss: 0.6812
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.6815 - val_accuracy: 1.0000 - val_loss: 0.6717
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.6711 - val_accuracy: 1.0000 - val_loss: 0.6584
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 1.0000 - loss: 0.6470 - val_accuracy: 1.0000 - val_loss: 0.6367
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.6171 - val_accuracy: 1.0000 - val_loss: 0.6035
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 1.0000 - loss: 0.5783 - val_accuracy: 1.0000 - val_loss: 0.5516
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

<keras.src.callbacks.history.History at 0x321296730>

In [4]:
def generate_regex_patterns(model, tokenizer, sample_texts, threshold=0.5, max_len=None):
    """Return the tokenizer-decoded text of every sample the model flags.

    All samples are encoded, padded and scored in a single ``model.predict``
    call instead of one call per sample.

    Args:
        model: Object whose ``predict`` method accepts the padded batch and
            returns one row per sample; the first element of each row is
            used as that sample's score.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from token id to token.
        sample_texts: Iterable of strings to score.
        threshold: A sample is kept only when its score is strictly greater
            than this value.
        max_len: Length the encoded samples are padded to. When omitted, the
            module-level ``max_length`` is used.

    Returns:
        list[str]: One string per flagged sample, in input order, rebuilt
        from the sample's token ids via ``tokenizer.index_word`` (id 0, the
        padding filler, is skipped). The strings are not regex-escaped.
    """
    texts = list(sample_texts)
    if not texts:
        # Nothing to score; avoid calling predict on an empty batch.
        return []

    if max_len is None:
        max_len = max_length

    # Encode and pad the whole batch once, then score it in one predict call.
    batch = pad_sequences(
        tokenizer.texts_to_sequences(texts), maxlen=max_len, padding='post')
    scores = model.predict(batch)

    patterns = []
    for token_ids, score in zip(batch, scores):
        if score[0] > threshold:
            # Decode the padded ids back to text, dropping the padding zeros.
            patterns.append(
                ''.join(tokenizer.index_word[i] for i in token_ids if i > 0))
    return patterns

# Score two known XSS payloads and keep the ones the model flags.
sample_texts = [
    "<script>alert('XSS');</script>",
    "<img src=x onerror=alert('XSS')>",
]
patterns = generate_regex_patterns(
    model,
    tokenizer,
    sample_texts,
)

# Escape regex metacharacters so each pattern matches its text literally.
regex_patterns = list(map(re.escape, patterns))
print(regex_patterns)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
["<script>alert\\('xss'\\);</script>", "<img\\ src=x\\ onerror=alert\\('xss'\\)>"]


In [6]:
from flask import Flask, request, jsonify
from keras.preprocessing.sequence import pad_sequences
import re

app = Flask(__name__)

# Load the trained model, the fitted tokenizer, and the padding length here,
# before the server starts. The request handler below reads `max_length` as a
# module-level name, so it has to be defined at this level.
# model = ...
# tokenizer = ...
# max_length = ...

@app.route('/generate_regex_patterns', methods=['POST'])
def generate_regex_patterns():
    """Score the posted texts and return regex-escaped patterns for flagged ones.

    The model, tokenizer and ``max_length`` are read from module scope: a JSON
    body can only carry plain data, so live model/tokenizer objects cannot be
    supplied by the client. Any ``model`` or ``tokenizer`` keys in the payload
    are ignored.

    Request JSON:
        sample_texts (list[str], required): texts to score.
        threshold (number, optional): a text is kept only when its score is
            strictly greater than this value. Defaults to 0.5.

    Returns:
        200 with ``{"regex_patterns": [...]}`` on success, or 400 with
        ``{"error": "..."}`` when the payload is missing or malformed.

    NOTE: this view has the same name as the helper function defined in an
    earlier cell, so running this cell rebinds that name to the view.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    sample_texts = payload.get('sample_texts')
    if not isinstance(sample_texts, list) or not all(
            isinstance(text, str) for text in sample_texts):
        return jsonify({'error': "'sample_texts' must be a list of strings."}), 400

    threshold = payload.get('threshold', 0.5)
    # bool is a subclass of int, so reject it explicitly.
    if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
        return jsonify({'error': "'threshold' must be a number."}), 400

    if not sample_texts:
        # Nothing to score; avoid calling predict on an empty batch.
        return jsonify({'regex_patterns': []})

    # Encode and pad the whole batch once, then score it in one predict call.
    batch = pad_sequences(
        tokenizer.texts_to_sequences(sample_texts),
        maxlen=max_length,
        padding='post',
    )
    scores = model.predict(batch)

    patterns = []
    for token_ids, score in zip(batch, scores):
        if score[0] > threshold:
            # Decode the padded ids back to text, dropping the padding zeros.
            patterns.append(
                ''.join(tokenizer.index_word[i] for i in token_ids if i > 0))

    # Escape regex metacharacters so each pattern matches its text literally.
    regex_patterns = [re.escape(pattern) for pattern in patterns]

    return jsonify({'regex_patterns': regex_patterns})

if __name__ == '__main__':
    # Start Flask's built-in server, listening on all addresses at port 285.
    bind_options = dict(host='0.0.0.0', port=285)
    app.run(**bind_options)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:285
 * Running on http://10.32.36.248:285
[33mPress CTRL+C to quit[0m
