In [1]:
import pandas
from libs import *

In [2]:
def open_file(filename):
    """
    Read a text file as UTF-8 and return everything in it as one string.

    Any failure is reported with print() rather than raised, and the function
    then returns None: a missing file gets its own "not found" message, every
    other exception is reported together with its text.
    """
    result = None
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            data = handle.read()
        # Only reached when both the read and the close succeeded.
        result = data
    except FileNotFoundError:
        print(f"Error: File not found at {filename}")
    except Exception as err:
        print(f"Error reading file {filename}: {err}")
    return result

# Raw contents of the "safe" text file. open_file returns None when the file
# cannot be read, so sf may be None here.
sf = open_file("datasets/safe.txt")

In [3]:
import re


def split_sentences(text):
    """
    Break `text` into cleaned sentences, one per line.

    A sentence boundary is a '.' followed by whitespace. Occurrences of '...'
    are masked while splitting so they do not create a boundary. Each
    resulting line then has every '.' removed, the standalone word 'br'
    removed, and whitespace runs collapsed to a single space.

    A line is kept only if it has at least three whitespace-separated words
    and is longer than five characters.

    Returns the kept lines joined with newline characters (an empty string
    when nothing qualifies).
    """
    def tidy(raw):
        # Same order as always: trim, drop dots, drop whole-word 'br',
        # squeeze whitespace, trim again (removing 'br' can leave edge spaces).
        no_dots = raw.strip().replace('.', '')
        no_br = re.sub(r'\bbr\b', '', no_dots)
        return re.sub(r'\s+', ' ', no_br).strip()

    # Mask ellipses, break after sentence-ending dots, then unmask.
    masked = text.replace('...', '<ELLIPSIS>')
    broken = re.sub(r'\.(\s+)', r'.\n', masked)
    unmasked = broken.replace('<ELLIPSIS>', '...')

    candidates = (tidy(piece) for piece in unmasked.split('\n'))
    kept = [s for s in candidates if len(s.split()) > 2 and len(s) > 5]
    return '\n'.join(kept)

# Clean the raw text into one sentence per line.
processed_text = split_sentences(sf)

# Save the cleaned sentences. The encoding is spelled out so the output matches
# the UTF-8 input; without it open() falls back to the platform's locale
# encoding, which can fail or mis-encode non-ASCII characters.
with open('datasets/safe_processed.txt', 'w', encoding='utf-8') as f:
    f.write(processed_text)

In [4]:
# The two CSV-backed datasets.
xss = pandas.read_csv("csv_files/xss-v2.csv")
sql = pandas.read_csv("csv_files/sql-v2.csv")

# The cleaned text becomes a one-column frame: one row per line of processed_text.
safe = pandas.DataFrame({"text": processed_text.split("\n")})

In [5]:
# Preview the first five rows of the XSS frame as loaded.
xss.head(5)

Unnamed: 0.1,Unnamed: 0,Sentence,Label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1
2,2,"\t </span> <span class=""reference-text"">Steeri...",0
3,3,"\t </span> <span class=""reference-text""><cite ...",0
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0


In [6]:
# Tag every row of each frame with a constant source name in a new lowercase
# 'label' column (distinct from the 'Label' column already present in xss).
# NOTE(review): the xss preview shows 'Label' holding both 0 and 1, and the
# Label-0 rows look like ordinary HTML. Presumably 1 marks real attack samples;
# if so, this tags benign rows as "xss" too. Confirm whether the frame should
# be filtered on Label first (and whether sql has the same issue).
xss["label"] = "xss"
sql["label"] = "sql"
safe["label"] = "safe"

In [7]:
# Give every frame the same 'text' column name: 'Sentence' is renamed in xss
# and 'Query' in sql (rename ignores a mapping key that is not present).
# `safe` is created with a 'text' column, so it needs no rename.
xss = xss.rename(columns={'Sentence': 'text'})
sql = sql.rename(columns={'Query': 'text'})

In [8]:
# Preview again: 'Sentence' should now be 'text', with the new 'label' column added.
xss.head(5)

Unnamed: 0.1,Unnamed: 0,text,Label,label
0,0,"<li><a href=""/wiki/File:Socrates.png"" class=""i...",0,xss
1,1,"<tt onmouseover=""alert(1)"">test</tt>",1,xss
2,2,"\t </span> <span class=""reference-text"">Steeri...",0,xss
3,3,"\t </span> <span class=""reference-text""><cite ...",0,xss
4,4,"\t </span>. <a href=""/wiki/Digital_object_iden...",0,xss


In [9]:
# Try the tokenizer on a small XSS payload; it returns a list of token names.
# sql_tokenizer is not defined in this notebook — presumably it is provided by
# `from libs import *`.
sql_tokenizer('<tt onmouseover="alert(1)">test</tt>')

['LT',
 'EN_WORD',
 'EN_WORD',
 'EQ',
 'DQUT',
 'EN_WORD',
 'LPRN',
 'EN_WORD',
 'RPRN',
 'DQUT',
 'GT',
 'EN_WORD',
 'LT',
 'SLSH',
 'EN_WORD',
 'GT']

In [12]:
def custom_tokenizer(query):
    """Tokenize `query` with sql_tokenizer and return the tokens joined by single spaces."""
    tokens = sql_tokenizer(query)
    return " ".join(tokens)


print(custom_tokenizer('<script> <tt onmouseover="alert(1)">test</tt>'))

LT SCRIPT GT LT EN_WORD EN_WORD EQ DQUT EN_WORD LPRN EN_WORD RPRN DQUT GT EN_WORD LT SLSH EN_WORD GT


In [13]:
# Replace the raw text in every frame with its space-joined token string.
# The 'text' column is overwritten in place, so running this cell a second
# time would tokenize the already-tokenized strings; reload the frames before
# re-running.
xss['text'] = xss['text'].apply(custom_tokenizer)
sql['text'] = sql['text'].apply(custom_tokenizer)
safe['text'] = safe['text'].apply(custom_tokenizer)

In [14]:
# Preview after tokenization: 'text' now holds token strings.
xss.head(5)

Unnamed: 0.1,Unnamed: 0,text,Label,label
0,0,LT EN_WORD GT LT A EN_WORD EQ DQUT SLSH EN_WOR...,0,xss
1,1,LT EN_WORD EN_WORD EQ DQUT EN_WORD LPRN EN_WOR...,1,xss
2,2,LT SLSH EN_WORD GT LT EN_WORD CLASS EQ DQUT EN...,0,xss
3,3,LT SLSH EN_WORD GT LT EN_WORD CLASS EQ DQUT EN...,0,xss
4,4,LT SLSH EN_WORD GT DOT LT A EN_WORD EQ DQUT SL...,0,xss


In [9]:
# with open('keys/keywords.txt', 'r') as f:
#     keys = {line.strip().upper() for line in f}


# # Load replacements
# replacements = {}
# with open('keys/replace.txt', 'r') as f:
#     for line in f:
#         key, value = line.strip().split("==>")
#         replacements[key.strip()] = value.strip()

# def replace_symbol(word):
#     try:
#         int(word)
#         return "INT"
#     except ValueError:
#         word_list = []
#         for char in word:
#             replacement = replacements.get(char)
#             word_list.append(f" {replacement} " if replacement else char)
#         return "".join(word_list).strip().replace("  ", " ")
        
# def sql_tokenizer(query):
#     query = query.lower()
#     tokenized = ' '.join(map(replace_symbol, query.split()))
#     split = tokenized.split()
#     # print("Missing from keys:", {w.upper() for w in split if w.upper() not in keys})


#     for i, word in enumerate(split):
#         upper = word.upper()
#         if upper in keys:
#             split[i] = upper
#         elif upper in replacements.values():
#             split[i] = upper  # Keep symbolic tokens like LT, GT, STAR
#         else:
#             split[i] = "EN_WORD"

#     return ' '.join(split)


# sql_tokenizer("this is <Script>/int  TEXT select from ' * my name is hema and this is my car")

In [18]:
# Row counts of the three frames, one per line (xss, sql, safe).
print(len(xss), len(sql), len(safe), sep="\n")

13686
30919
292466


In [39]:
# Cap on rows taken from each attack frame.
max_num = 15000

# head() keeps the first rows in their existing order; it is not a random
# sample, and a frame shorter than the cap is returned whole.
xss_n, sql_n = xss.head(max_num), sql.head(max_num)

# The safe class gets a larger cap: max_num plus int(max_num / 1.3) extra rows.
safe_n = safe.head(max_num + int(max_num / 1.3))

In [40]:
# Row counts after capping, one per line (xss_n, sql_n, safe_n).
print(len(xss_n), len(sql_n), len(safe_n), sep="\n")

13686
15000
26538


In [41]:
dfs = []  # not referenced by the cells visible here

# Double brackets: indexing with a list of column names returns a DataFrame
# holding just those columns, so every piece has the same two-column layout.
df_list = [frame[['text', 'label']] for frame in (xss_n, sql_n, safe_n)]

In [42]:
# Stack the three pieces row-wise and renumber the index from 0.
combined = pandas.concat(df_list, axis=0, ignore_index=True)

In [43]:
# Write the combined dataset. index=True is the to_csv default, stated here on
# purpose: the row index is written as an unnamed first column, which shows up
# as 'Unnamed: 0' when the file is read back without index_col.
# NOTE(review): switch to index=False if no consumer relies on that column.
combined.to_csv("csv_files/dt-v12.csv", index=True)