# Notebook for Converting Sentences to Suggested SPL

# Stage 0 - download models and import libraries

In [10]:
!pip install simplet5 -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# The ESCU dataset is stored under /srv/notebooks/data/ESCU
# The walk below lists every file in that directory so the CSV path used later can be confirmed

import os
# Print the full path of every file found under the ESCU data directory.
escu_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/srv/notebooks/data/ESCU')
    for name in names
)
for file_path in escu_file_paths:
    print(file_path)

/srv/notebooks/data/ESCU/ESCU_Dataset.csv


In [12]:
# Read only the two columns this notebook uses: the free-text description and
# the search string paired with it. The file is decoded as latin-1.
escu_csv_path = "/srv/notebooks/data/ESCU/ESCU_Dataset.csv"
df = pd.read_csv(
    escu_csv_path,
    usecols=['description', 'qualifiedSearch'],
    encoding='latin-1',
)

In [13]:
# Preview the first five description/search pairs as loaded.
df.head(n=5)

Unnamed: 0,description,qualifiedSearch
0,Maintains a list of Authentication app values ...,"tstats summariesonly=true min(""_time"") as ""fi..."
1,Detects user and computer account deletion,"from datamodel:""Change"".""Account_Management"" ..."
2,Maintains a list of users that have authentica...,tstats `summariesonly` count from datamodel=A...
3,Detects excessive number of failed login attem...,"from datamodel:""Authentication"".""Authenticati..."
4,Detects an excessive number of failed login at...,tstats `summariesonly` values(Authentication....


In [14]:
# simpleT5 expects exactly two columns, "source_text" (model input) and
# "target_text" (expected output), so map the ESCU column names onto them.
simplet5_column_names = {"description": "source_text", "qualifiedSearch": "target_text"}
df = df.rename(columns=simplet5_column_names).loc[:, ['source_text', 'target_text']]

In [15]:
# Check the first five rows after the columns were renamed.
df.head(n=5)

Unnamed: 0,source_text,target_text
0,Maintains a list of Authentication app values ...,"tstats summariesonly=true min(""_time"") as ""fi..."
1,Detects user and computer account deletion,"from datamodel:""Change"".""Account_Management"" ..."
2,Maintains a list of users that have authentica...,tstats `summariesonly` count from datamodel=A...
3,Detects excessive number of failed login attem...,"from datamodel:""Authentication"".""Authenticati..."
4,Detects an excessive number of failed login at...,tstats `summariesonly` values(Authentication....


In [16]:
# T5 expects a task-related prefix on every input; this notebook treats the
# sentence-to-SPL mapping as a summarization task and uses "summarize: ".
# The prefix is added only to rows that do not already start with it, so
# re-running this cell cannot produce "summarize: summarize: ..." inputs.
# Missing descriptions stay missing (na=False keeps them out of the
# "already prefixed" mask, and prefix + NaN is still NaN).
TASK_PREFIX = "summarize: "
has_prefix = df['source_text'].str.startswith(TASK_PREFIX, na=False)
df['source_text'] = df['source_text'].where(has_prefix, TASK_PREFIX + df['source_text'])
df

Unnamed: 0,source_text,target_text
0,summarize: Maintains a list of Authentication ...,"tstats summariesonly=true min(""_time"") as ""fi..."
1,summarize: Detects user and computer account d...,"from datamodel:""Change"".""Account_Management"" ..."
2,summarize: Maintains a list of users that have...,tstats `summariesonly` count from datamodel=A...
3,summarize: Detects excessive number of failed ...,"from datamodel:""Authentication"".""Authenticati..."
4,summarize: Detects an excessive number of fail...,tstats `summariesonly` values(Authentication....
...,...,...
2029,summarize: Update domain lookup table every 30...,makeresults | `virustotal_domain_list` | map ...
2030,summarize: Update file lookup table every 30 m...,makeresults | `virustotal_file_list` | map se...
2031,summarize: Update IP lookup table every 30 min...,makeresults | `virustotal_ip_list` | map sear...
2032,summarize: Update URL lookup table every 30 mi...,makeresults | `virustotal_url_list` | map sea...


In [17]:
# Hold out 30% of the pairs for evaluation. random_state pins the shuffle so
# that Restart & Run All yields the same train/test rows on every run.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
train_df.shape, test_df.shape

((1423, 2), (611, 2))

In [18]:
from simplet5 import SimpleT5

# Create the simpleT5 wrapper and load the pretrained "t5-base" model into it.
model = SimpleT5()
model.from_pretrained(model_name="t5-base", model_type="t5")

In [10]:
# Fine-tune on CPU for five epochs, evaluating on the first 100 held-out rows.
# The [:5000] cap on the training rows only takes effect when the training
# split holds more than 5000 rows.
fit_settings = dict(
    source_max_token_len=128,
    target_max_token_len=70,
    batch_size=8,
    max_epochs=5,
    use_gpu=False,
)
model.train(train_df=train_df[:5000], eval_df=test_df[:100], **fit_settings)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Missing logger folder: /srv/lightning_logs

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [19]:
! ( cd outputs; ls )

simplet5-epoch-0-train-loss-2.2409-val-loss-1.3553
simplet5-epoch-1-train-loss-1.4562-val-loss-1.1223
simplet5-epoch-2-train-loss-1.2253-val-loss-1.0013
simplet5-epoch-3-train-loss-1.0682-val-loss-0.9388
simplet5-epoch-4-train-loss-0.9567-val-loss-0.8704


In [20]:
# Load the fine-tuned model from the local output folder for inference.
# Checkpoint folder names embed the train/val loss of the run that wrote them,
# so the name recorded here only exists for that particular run. When it is
# missing, fall back to the most recently modified "simplet5-epoch-*" entry
# under outputs/ instead of failing on a stale path.
checkpoint_dir = "outputs/simplet5-epoch-4-train-loss-0.9567-val-loss-0.8704"
if not os.path.exists(checkpoint_dir):
    saved_checkpoints = [
        os.path.join("outputs", entry)
        for entry in (os.listdir("outputs") if os.path.isdir("outputs") else [])
        if entry.startswith("simplet5-epoch-")
    ]
    if not saved_checkpoints:
        raise FileNotFoundError(
            "No simplet5-epoch-* checkpoint found under outputs/; run the training cell first."
        )
    checkpoint_dir = max(saved_checkpoints, key=os.path.getmtime)
model.load_model("t5", checkpoint_dir, use_gpu=False)

In [21]:
# Ask the model for a suggested SPL search. The prompt carries the same
# "summarize: " prefix that was added to the training inputs.
task_prefix = "summarize: "
description = "Monitor network data for uncommon data flows. Processes utilizing the network that do not normally have network communication or have never been seen before are suspicious."
text_to_spl = task_prefix + description
model.predict(text_to_spl)

['tstats security_content_summariesonly count min(_time) as firstTime max(_time) as lastTime from datamodel=Network_Traffic where All_Traffic.dest_network_transactions.process_name = "All_']