In [1]:
import pandas as pd
import os
from pyts.approximation import SymbolicAggregateApproximation
import numpy as np
from sksequitur import Grammar, Parser, Production, Mark
import nltk
from nltk import CFG
from tqdm import tqdm
import re
import itertools

In [308]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Generate grammar with the Sequitur algorithm

In [309]:
# Read the items table and keep only the rows whose `count` is greater than 2.
frequent_items = pd.read_csv('frequent_items.csv').loc[
    lambda items: items['count'] > 2
]

In [310]:
# Number of rows that survived the `count > 2` filter.
len(frequent_items)

48516

In [311]:
# Feed every retained item into a single sksequitur parser.
parser = Parser()
for sequence in frequent_items['item']:
    # A fresh Mark() goes in ahead of each item.
    # NOTE(review): presumably this acts as a separator so that no rule can
    # span two different items - confirm against the sksequitur docs.
    parser.feed([Mark()])
    parser.feed(sequence)


In [312]:
# Build the sksequitur Grammar object from the tree accumulated by the parser.
grammar = Grammar(parser.tree)

In [313]:
def remove_adjacent_letters(input_string):
    """Delete every pair of identical adjacent lowercase letters.

    The string is scanned once, left to right. Each non-overlapping run of
    exactly two equal characters in ``a``-``z`` is removed; the result is not
    re-scanned, so ``"aaa"`` becomes ``"a"`` and ``"abba"`` becomes ``"aa"``.
    Uppercase letters and all other characters are left untouched.

    Args:
        input_string: Text to clean.

    Returns:
        The text with the doubled lowercase letters stripped out.
    """
    doubled_letter = re.compile(r'([a-z])\1')
    return doubled_letter.sub('', input_string)

In [314]:
# Dump the textual form of the grammar, followed by a newline, to disk.
with open('grammar_start.txt', 'w') as grammar_file:
    grammar_file.write(str(grammar) + '\n')

In [315]:
def truncate_after_three_consecutive_spaces(input_file, output_file):
    """Copy a text file, cutting each line right after its first triple space.

    For every line of ``input_file``: if it contains three consecutive
    spaces, everything after the first such run is dropped and the line is
    terminated with a newline (the three spaces themselves are kept).
    Lines without a triple space are copied unchanged.

    Args:
        input_file: Path of the file to read.
        output_file: Path of the file to write (overwritten if present).
    """
    marker = '   '
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # partition() splits at the first occurrence; `found` is empty
            # when the marker does not appear in the line at all.
            head, found, _ = line.partition(marker)
            outfile.write((head + found + '\n') if found else line)

def encapsulate_lowercase(file_path, output_file_path):
    """Copy a text file, wrapping every lowercase character in single quotes.

    Each character for which ``str.islower()`` is true is written as
    ``'<char>'``; every other character (digits, uppercase, punctuation,
    whitespace, newlines) is copied as is. Adjacent lowercase characters are
    quoted individually, e.g. ``ab`` becomes ``'a''b'``.

    Args:
        file_path: Path of the file to read.
        output_file_path: Path of the file to write (overwritten if present).
    """
    def quote_if_lower(char):
        return f"'{char}'" if char.islower() else char

    with open(file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
        outfile.writelines(
            ''.join(map(quote_if_lower, line)) for line in infile
        )


# Files used by the two-step clean-up of the dumped grammar:
# raw dump -> truncated lines -> lowercase characters quoted.
input_file_path = 'grammar_start.txt'
middle_path = 'grammar_middle.txt'
output_file_path = 'grammar.txt'

# Step 1: cut every line right after its first run of three spaces.
truncate_after_three_consecutive_spaces(input_file_path, middle_path)

# Step 2: wrap each lowercase character in single quotes; the result is the
# file later read by CFG.fromstring.
encapsulate_lowercase(middle_path, output_file_path)

## Convert the grammar to an NLTK grammar

In [316]:
# Parse the cleaned-up grammar file into an NLTK context-free grammar.
# From here on `grammar` refers to the NLTK CFG, no longer to the sksequitur
# Grammar object built earlier.
with open('grammar.txt', 'r') as grammar_file:
    grammar = CFG.fromstring(grammar_file.read())

In [317]:
# Display the grammar's repr as the cell output.
grammar

<Grammar with 52393 productions>

In [318]:
# Print every production whose left-hand side is the nonterminal `1`.
target_lhs = nltk.grammar.Nonterminal('1')
productions = grammar.productions()
for rule in productions:
    if rule.lhs() == target_lhs:
        print(rule)

1 -> 'a' 'a'


In [319]:
# Build an NLTK chart parser for the grammar. This rebinds the global name
# `parser` (earlier bound to the sksequitur Parser); check_sentence reads
# this global, so this cell must run before check_sentence is called.
parser = nltk.ChartParser(grammar)

In [320]:
def check_sentence(sentence):
    """Report whether the global ``parser`` can parse ``sentence``.

    Args:
        sentence: The token sequence handed to ``parser.parse``.

    Returns:
        0 if ``parser.parse`` yields at least one parse, 1 if it yields none
        (``StopIteration``) or raises ``ValueError``.
        NOTE(review): the ValueError presumably comes from NLTK rejecting a
        token that the grammar does not cover - confirm.
    """
    try:
        # Only the existence of a first parse matters; its value is discarded.
        next(parser.parse(sentence))
    except (StopIteration, ValueError):
        return 1
    else:
        return 0

# Example usage: run check_sentence on one hand-written symbol string.
# A result of 0 means a parse was found, 1 means none was.
sentence_to_check = "aaaaabba"
result = check_sentence(sentence_to_check)

print("Result:", result)

Result: 0


## TEST

In [2]:
# Load the evaluation CSV (path relative to the notebook's working directory)
# and preview its first rows.
test = pd.read_csv(os.path.join("data","eshopper","200.csv"))
test.head()

Unnamed: 0,Latency,CategoriesControllerGetcategory,ItemsControllerFindfeaturesitemrandom,ItemsControllerFinditemrandom,ItemsControllerFinditemsrandombyidproduct,ProductsControllerFindproduct,ProductsControllerFindproductrandom,GatewayGet,anomaly
0,384,5,3,4,43,3,3,130,0
1,431,3,2,4,50,3,3,112,0
2,372,4,3,3,52,3,3,114,0
3,404,3,3,4,60,3,4,135,0
4,362,4,3,4,53,3,3,123,0


In [322]:
# SymbolicAggregateApproximation is already imported in the notebook's first
# cell, so it is not re-imported here.

# Everything except the label column is discretised.
# NOTE(review): the preview of `test` shows an 'Unnamed: 0' column, which is
# still present after dropping only 'anomaly' and is therefore fed to SAX
# like any other column - confirm that this is intended.
sax_input = test.drop(columns=['anomaly'])

# SAX is fitted on the transposed frame and the result is transposed back, so
# `data_sax` has one row per row of `test`.
sax = SymbolicAggregateApproximation(n_bins=10)
data_sax = sax.fit_transform(sax_input.T).T

  warn("Some quantiles are equal. The number of bins will "


In [323]:
# Wrap the SAX output in a DataFrame so per-row results can be attached to it.
d = pd.DataFrame(data_sax)

In [324]:
# Concatenate the symbols of each SAX row into a single string per row.
d['string'] = [''.join(str(symbol) for symbol in row) for row in data_sax]

In [325]:
# Run check_sentence on every row's string; the 0/1 it returns is stored as
# that row's `result`.
d['result'] = d['string'].map(check_sentence)

In [326]:
# Copy the label column over from the test frame. Pandas aligns the values on
# the index - presumably both frames carry the same default index; verify if
# either frame is ever filtered or re-indexed.
d['anomaly'] = test['anomaly']

In [3]:
# Replace `d` with previously saved results read from disk.
# NOTE(review): no visible cell in this notebook writes 'results.csv', so a
# fresh Restart & Run All depends on a file produced elsewhere - confirm where
# it comes from, or add the corresponding d.to_csv(...) step.
d = pd.read_csv('results.csv')

In [5]:
# Fold label value 2 into label 1, so that `anomaly` only distinguishes 1
# from everything that was not 1 or 2.
d['anomaly'] = d['anomaly'].mask(d['anomaly'] == 2, 1)

In [9]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# `result` is used as the predicted label and `anomaly` as the reference label.
# NOTE(review): `d` was reloaded from results.csv; presumably `result` holds
# the 0/1 flags produced by check_sentence - confirm.
y_pred = d['result']
y_test = d['anomaly']

# Compute every metric first ...
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# The ROC AUC is computed from the same predicted labels as the other metrics.
roc_auc = roc_auc_score(y_test, y_pred)

# ... then report them in a fixed order.
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)
print("ROC AUC Score:", roc_auc)


Accuracy: 0.9215707853009418
Precision: 0.7565350308612117
Recall: 0.8963572591183145
F1 Score: 0.8205321917233275
Confusion Matrix:
[[161878  12583]
 [  4521  39100]]
ROC AUC Score: 0.9121161284844185
