# CLX Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

In [1]:
import cudf
import s3fs
from os import path

from clx.analytics.cybert import Cybert

# CyBERT

## Model

In [7]:
# Remote S3 prefixes the CyBERT examples download from.
CLX_S3_BASE_PATH = "rapidsai-data/cyber/clx"
HF_S3_BASE_PATH = "models.huggingface.co/bert/raykallen/cybert_apache_parser"

# Local directories the downloaded artifacts are written to.
MODEL_DIR = "../models/CyBERT"
DATA_DIR = "../data"

# File names, shared between the remote keys and the local copies.
CONFIG_FILENAME = "config.json"
MODEL_FILENAME = "pytorch_model.bin"
APACHE_SAMPLE_CSV = "apache_sample_1k.csv"

In [15]:
# (remote S3 key, local destination) for every artifact the CyBERT examples need.
required_files = [
    (f'{HF_S3_BASE_PATH}/{MODEL_FILENAME}', f'{MODEL_DIR}/{MODEL_FILENAME}'),
    (f'{HF_S3_BASE_PATH}/{CONFIG_FILENAME}', f'{MODEL_DIR}/{CONFIG_FILENAME}'),
    (f'{CLX_S3_BASE_PATH}/{APACHE_SAMPLE_CSV}', f'{DATA_DIR}/{APACHE_SAMPLE_CSV}'),
]

# Fetch only what is missing. The existence check is made against the same
# local path the file is written to; checking the bare CSV file name (relative
# to the working directory) would trigger a fresh download on every run.
for remote_path, local_path in required_files:
    if not path.exists(local_path):
        # Anonymous access: the buckets are read without credentials.
        fs = s3fs.S3FileSystem(anon=True)
        fs.get(remote_path, local_path)

#### clx.analytics.cybert.Cybert.load_model()

In [13]:
# Create the CyBERT wrapper, then load the downloaded model file and its config.
cybert = Cybert()
cybert.load_model(f'{MODEL_DIR}/{MODEL_FILENAME}', f'{MODEL_DIR}/{CONFIG_FILENAME}')

#### clx.analytics.cybert.Cybert.inference()

In [17]:
# Read the sample logs and run inference on the "raw" column; inference()
# returns two frames, unpacked here as parsed fields and confidences.
logs_df = cudf.read_csv(f'{DATA_DIR}/{APACHE_SAMPLE_CSV}')
raw_logs = logs_df["raw"]
parsed_df, confidence_df = cybert.inference(raw_logs)

In [19]:
parsed_df.head()

Unnamed: 0,time_received,error_level,error_message,remote_host,other,request_method,request_url,request_http_ver,status,response_bytes_clf,request_header_user_agent,request_header_referer,X
0,[Sun Dec 04 20:22:49 2005],[notice],workerEnv.init () ok/etc/httpd/conf/workers2 .,,,,,,,,,,
1,[01/Sep/2019:03:28:00 +0200],,,193.106.31.130,---,POST,/administrator/index.php,HTTP/1.0,200.0,4481.0,Mozilla/4.0 (compatible.MSIE...; Windows NT...),,
2,[29/Sep/2019:19:41:25 +0200],,,100.1.14.108,---,GET,/components/com.users/dispacher.php,HTTP/1.1,404.0,240.0,python-requests/2.22.0,,
3,[06/Nov/2019:03:15:15 +0100],,,13.84.43.203,---,GET,//administrator/index.php,HTTP/1.1,200.0,4270.0,Mozilla/5.0 (Windows NT 10.0.Win64.x64.rv:65.0...,,
4,[18/Feb/2016:12:38:21 +0100],,,90.188.40.9,---,GET,/administrator/,HTTP/1.1,200.0,4263.0,Mozilla/5.0 (Windows NT.. 1) AppleWebKit/537.3...,,


In [20]:
confidence_df.head()

Unnamed: 0,time_received,error_level,error_message,remote_host,other,request_method,request_url,request_http_ver,status,response_bytes_clf,request_header_user_agent,request_header_referer
0,0.999948,0.99959,0.999615,,,,,,,,,
1,0.999973,,,0.999966,0.999914,0.999903,0.999774,0.999946,0.999914,0.999932,0.999905,
2,0.999973,,,0.999964,0.999892,0.999912,0.999836,0.999945,0.99992,0.999927,0.999888,
3,0.999973,,,0.999963,0.999904,0.999903,0.999735,0.999945,0.9999,0.999925,0.99991,
4,0.999974,,,0.999956,0.999904,0.999911,0.999841,0.999944,0.999892,0.999928,0.999872,


#### clx.analytics.cybert.Cybert.preprocess()

In [23]:
# Read the sample logs and run only the preprocessing step on the "raw" column.
logs_df = cudf.read_csv(f'{DATA_DIR}/{APACHE_SAMPLE_CSV}')
raw_logs = logs_df["raw"]
input_ids, attention_masks, meta = cybert.preprocess(raw_logs)

In [25]:
input_ids

tensor([[  164,  3477, 13063,  ...,     0,     0,     0],
        [21781,   119,  9920,  ...,     0,     0,     0],
        [ 1620,   119,   122,  ...,     0,     0,     0],
        ...,
        [ 1620,   119,   122,  ...,     0,     0,     0],
        [21801,   119,  1851,  ...,     0,     0,     0],
        [ 1620,   119,   122,  ...,     0,     0,     0]], device='cuda:0')

In [27]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

In [28]:
meta

array([[  0,   0,  36],
       [  1,   0,  69],
       [  2,   0,  61],
       ...,
       [997,   0,  78],
       [998,   0,  48],
       [999,   0,  74]], dtype=uint32)

# DGA Detector

## Model

In [29]:
import os
import wget
import time
import cudf
import torch
import shutil
import zipfile
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, average_precision_score
from clx.analytics.detector_dataset import DetectorDataset
from clx.analytics.dga_detector import DGADetector
from cuml.preprocessing.model_selection import train_test_split

In [37]:
# Feed descriptors consumed by download_file(): where to fetch each data set,
# how it is compressed, and the local directory it is stored in.
dga = dict(
    source="DGA",
    url="https://data.netlab.360.com/feeds/dga/dga.txt",
    compression=None,
    storage_path="../data/dga_feed",
)
benign = dict(
    source="Benign",
    url="http://s3.amazonaws.com/alexa-static/top-1m.csv.zip",
    compression="zip",
    storage_path="../data/top-1m",
)

In [35]:
def unpack(compression_type, filepath, output_dir):
    """Extract a downloaded archive into ``output_dir`` and delete the archive.

    Only ``'zip'`` is handled. Any other ``compression_type`` (including
    ``None``) leaves ``filepath`` untouched.
    """
    if compression_type != 'zip':
        return

    archive = zipfile.ZipFile(filepath, 'r')
    try:
        archive.extractall(output_dir)
    finally:
        archive.close()
    # The archive is only removed once extraction has succeeded.
    os.remove(filepath)

In [55]:
def download_file(f):
    """Download (and, if needed, unpack) one data feed described by ``f``.

    ``f`` is a dict with the keys ``source``, ``url``, ``compression`` and
    ``storage_path``. The download is skipped when a file named after the last
    URL segment already exists inside ``storage_path``.
    """
    output_dir = f['storage_path']
    filepath = f'{output_dir}/{f["url"].split("/")[-1]}'

    if not os.path.exists(filepath):
        os.makedirs(output_dir, exist_ok=True)
        print(f'Downloading {f["url"]}...')
        filepath = wget.download(f['url'], out=output_dir)

        print(f'Unpacking {filepath}')
        # Read the compression type from the feed passed in as `f`; referring
        # to any other name here raises NameError after the download finishes.
        # NOTE(review): unpack() deletes zip archives after extraction, so a
        # zipped feed never satisfies the existence check above and is fetched
        # again on the next call -- confirm whether that is intended.
        unpack(f['compression'], filepath, output_dir)
    print(f'{f["source"]} data is stored to location {output_dir}')

In [56]:
download_file(dga)
download_file(benign)

DGA data is stored to location ../data/dga_feed
Benign data is stored to location ../data/top-1m


In [65]:
def load_input_data(dga, benign):
    """Read both domain feeds and return them as one labelled frame.

    Every file under each feed's ``storage_path`` is read, keeping only the
    ``domain`` column. DGA rows get ``type`` 0 and benign rows get ``type`` 1.
    The result holds the benign rows first, with a fresh index.
    """
    dga_glob = f"{dga['storage_path']}/*"
    benign_glob = f"{benign['storage_path']}/*"

    dga_df = cudf.read_csv(
        dga_glob,
        names=['generator', 'domain', 'dt_from', 'dt_to'],
        usecols=['domain'],
        # presumably skips the feed's leading comment lines -- TODO confirm
        skiprows=18,
        delimiter='\t',
    )
    dga_df['type'] = 0

    benign_df = cudf.read_csv(
        benign_glob,
        names=["line_num", "domain"],
        usecols=['domain'],
    )
    benign_df['type'] = 1

    return cudf.concat([benign_df, dga_df], ignore_index=True)

def create_df(domain_df, type_series):
    """Combine a domain column and its labels into a two-column frame.

    Both inputs have their index reset (old index dropped) before being stored
    as the ``domain`` and ``type`` columns of a new frame.
    """
    domains = domain_df['domain'].reset_index(drop=True)
    labels = type_series.reset_index(drop=True)

    combined = cudf.DataFrame()
    combined['domain'] = domains
    combined['type'] = labels
    return combined

def create_dir(dir_path):
    """Create ``dir_path`` (with any missing parents) unless it already exists.

    Progress messages are printed either way.
    """
    print("Verify if directory `%s` is already exists." % (dir_path))
    if os.path.exists(dir_path):
        return

    for message in (
        "Directory `%s` does not exist.",
        "Creating directory `%s` to store trained models.",
    ):
        print(message % (dir_path))
    os.makedirs(dir_path)
        
def cleanup_cache():
    """Release cached GPU memory by calling ``torch.cuda.empty_cache()``."""
    torch.cuda.empty_cache()

In [63]:
# Merge the DGA and benign feeds into one labelled frame.
input_df = load_input_data(dga, benign)

# Split on the 'type' label column, using 70% of the rows for training.
domain_train, domain_test, type_train, type_test = train_test_split(
    input_df, 'type', train_size=0.7
)

# Re-assemble each split into a frame with 'domain' and 'type' columns.
train_df = create_df(domain_train, type_train)
test_df = create_df(domain_test, type_test)

#### clx.analytics.dga_detector.DGADetector.init_model()

In [64]:
# Hyper-parameters for the DGA detector: LR goes to the DGADetector
# constructor, the rest to init_model().
LR = 0.001
N_LAYERS = 3
HIDDEN_SIZE = 100
CHAR_VOCAB = 128
N_DOMAIN_TYPE = 2

In [64]:
# Collect the model settings in one place, then initialise the detector.
model_params = dict(
    n_layers=N_LAYERS,
    char_vocab=CHAR_VOCAB,
    hidden_size=HIDDEN_SIZE,
    n_domain_type=N_DOMAIN_TYPE,
)

dd = DGADetector(lr=LR)
dd.init_model(**model_params)

#### clx.analytics.dga_detector.DGADetector.train_model()

In [66]:
# Wrap both splits in DetectorDataset using the same batch size.
batch_size = 10000
train_dataset, test_dataset = (
    DetectorDataset(frame, batch_size) for frame in (train_df, test_df)
)

In [67]:
def train_and_eval(dd, train_dataset, test_dataset, epoch, model_dir):
    """Train ``dd`` for ``epoch`` epochs and keep only the most accurate checkpoint.

    After every epoch the model is evaluated on ``test_dataset``. Whenever the
    accuracy beats the best value seen so far, the model is saved under
    ``model_dir`` and the previously saved checkpoint is deleted.

    Returns the path of the best checkpoint, or ``""`` if no epoch produced an
    accuracy above 0 (including ``epoch <= 0``).
    """
    print("Initiating model training")
    create_dir(model_dir)
    max_accuracy = 0
    prev_model_file_path = ""
    for i in range(1, epoch + 1):
        print("---------")
        print("Epoch: %s" % (i))
        print("---------")
        dd.train_model(train_dataset)
        accuracy = dd.evaluate_model(test_dataset)
        if accuracy > max_accuracy:
            timestamp = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
            output_filepath = model_dir + "/" + "rnn_classifier_{}.pth".format(timestamp)
            dd.save_model(output_filepath)
            max_accuracy = accuracy
            # The timestamp only has one-second resolution, so two improving
            # epochs that finish within the same second share a file name. In
            # that case the save above already overwrote the old checkpoint;
            # removing it would delete the model that was just written.
            if prev_model_file_path and prev_model_file_path != output_filepath:
                os.remove(prev_model_file_path)
            prev_model_file_path = output_filepath

    print("Model with highest accuracy (%s) is stored to location %s" % (max_accuracy, prev_model_file_path))
    return prev_model_file_path

In [69]:
%%time
epoch = 30
model_dir='../models/DGA_Detector'
model_filepath = train_and_eval(dd, train_dataset, test_dataset, epoch, model_dir)
cleanup_cache()

Initiating model training
Verify if directory `../models/DGA_Detector` is already exists.
Directory `../models/DGA_Detector` does not exist.
Creating directory `../models/DGA_Detector` to store trained models.
---------
Epoch: 1
---------
Test set: Accuracy: 505863/576772 (0.8770588724834076)

---------
Epoch: 2
---------
Test set: Accuracy: 552714/576772 (0.9582885438266767)

---------
Epoch: 3
---------
Test set: Accuracy: 557149/576772 (0.965977890743656)

---------
Epoch: 4
---------
Test set: Accuracy: 559021/576772 (0.9692235406711838)

---------
Epoch: 5
---------
Test set: Accuracy: 561376/576772 (0.9733066098909101)

---------
Epoch: 6
---------
Test set: Accuracy: 557225/576772 (0.9661096585825941)

---------
Epoch: 7
---------
Test set: Accuracy: 562147/576772 (0.9746433599411899)

---------
Epoch: 8
---------
Test set: Accuracy: 565640/576772 (0.9806994791702788)

---------
Epoch: 9
---------
Test set: Accuracy: 567595/576772 (0.9840890334482256)

---------
Epoch: 10
------

#### clx.analytics.dga_detector.DGADetector.evaluate_model()

In [70]:
accuracy = dd.evaluate_model(test_dataset)

Test set: Accuracy: 569612/576772 (0.9875860825421484)



#### clx.analytics.dga_detector.DGADetector.predict()

In [71]:
# Load the best checkpoint returned by train_and_eval() into a fresh detector.
# (Substitute another checkpoint path here to score a different model.)
dd = DGADetector()
dd.load_model(model_filepath)

# Predict partition by partition, collecting predictions and true labels on
# the host so they can be scored with scikit-learn.
pred_results = []
true_results = []
for partition in test_dataset.partitioned_dfs:
    pred_results.append(list(dd.predict(partition['domain']).values_host))
    true_results.append(list(partition['type'].values_host))
pred_results = np.concatenate(pred_results)
true_results = np.concatenate(true_results)

# Store the score under its own name: assigning it to `accuracy_score` would
# shadow the imported scikit-learn function and break this cell on re-run.
# Arguments follow scikit-learn's (y_true, y_pred) order.
model_accuracy = accuracy_score(true_results, pred_results)
print('Model accuracy: %s'%(model_accuracy))
cleanup_cache()

Model accuracy: 0.9908456027685116


# Phishing Detector

## Model

In [72]:
import cudf;
from cuml.preprocessing.model_selection import train_test_split;
from clx.analytics.phishing_detector import PhishingDetector;
import s3fs;
from os import path

In [75]:
# Local directory for the phishing data sets.
# NOTE(review): this rebinds DATA_DIR, which the CyBERT section set to
# '../data'; re-running earlier cells afterwards will see this value.
DATA_DIR = "../data/phishing"

# Remote prefix the TSV files are downloaded from.
S3_BASE_PATH = "rapidsai-data/cyber/clx"

# TSV file names, one per data set.
CLAIR_TSV = "Phishing_Dataset_Clair_Collection.tsv"
SPAM_TSV = "spam_assassin_spam_200_20021010.tsv"
EASY_HAM_TSV = "spam_assassin_easyham_200_20021010.tsv"
HARD_HAM_TSV = "spam_assassin_hardham_200_20021010.tsv"
ENRON_TSV = "enron_10000.tsv"

In [77]:
def maybe_download(f, output_dir):
    """Fetch file ``f`` from ``S3_BASE_PATH`` into ``output_dir`` if it is missing.

    Nothing is printed or downloaded when ``output_dir/f`` already exists.
    """
    local_path = f'{output_dir}/{f}'
    if path.exists(local_path):
        return

    print(f'Downloading: {f}')
    remote_path = f'{S3_BASE_PATH}/{f}'
    s3fs.S3FileSystem(anon=True).get(remote_path, local_path)
        
def read_dataset(f, data_dir):
    """Download ``f`` into ``data_dir`` if needed and read it as a frame.

    The file is read as header-less, tab-separated text with the columns
    ``label`` and ``email``.
    """
    maybe_download(f, data_dir)
    local_path = f'{data_dir}/{f}'
    return cudf.read_csv(
        local_path,
        delimiter='\t',
        header=None,
        names=['label', 'email'],
    )

In [82]:
# Load the five data sets in a fixed order, downloading any missing file.
dfclair, dfspam, dfeasyham, dfhardham, dfenron = (
    read_dataset(tsv_name, DATA_DIR)
    for tsv_name in (CLAIR_TSV, SPAM_TSV, EASY_HAM_TSV, HARD_HAM_TSV, ENRON_TSV)
)

Downloading: spam_assassin_spam_200_20021010.tsv
Downloading: spam_assassin_easyham_200_20021010.tsv
Downloading: spam_assassin_hardham_200_20021010.tsv
Downloading: enron_10000.tsv


#### clx.analytics.phishing_detector.PhishingDetector.init_model()

In [83]:
# Start from the 'bert-base-uncased' checkpoint.
base_model = 'bert-base-uncased'
phish_detect = PhishingDetector()
phish_detect.init_model(model_or_path=base_model)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=433.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### clx.analytics.phishing_detector.PhishingDetector.train_model()

In [85]:
# Stack all five data sets into one frame.
df_all = cudf.concat([dfclair, dfspam, dfeasyham, dfhardham, dfenron])

# Split on the 'label' column, using 80% of the rows for training.
X_train, X_test, y_train, y_test = train_test_split(
    df_all, 'label', train_size=0.8
)

# Train for a single epoch.
phish_detect.train_model(X_train, y_train, epochs=1)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train loss: 0.06393363227627452


Epoch: 100%|██████████| 1/1 [02:02<00:00, 122.57s/it]

Validation Accuracy: 0.9950787401574803





#### clx.analytics.phishing_detector.PhishingDetector.evaluate_model()

In [87]:
phish_detect.evaluate_model(X_test, y_test)

0.9950455806579469

#### clx.analytics.phishing_detector.PhishingDetector.save_model()

In [90]:
phish_detect.save_model('../models/phishing')

#### clx.analytics.phishing_detector.PhishingDetector.predict()

In [91]:
# Initialise a fresh detector from the directory written by save_model(),
# then predict on the held-out split.
saved_model_dir = '../models/phishing'
phish_detect_trained = PhishingDetector()
phish_detect_trained.init_model(model_or_path=saved_model_dir)

phish_detect_trained.predict(X_test)

0       0
1       0
2       1
3       0
4       0
       ..
5041    0
5042    0
5043    0
5044    1
5045    0
Length: 5046, dtype: int64

#### clx.analytics.model.rnn_classifier.RNNClassifier()	

#### clx.analytics.stats.rzscore()	

# Tokenize

#### clx.analytics.tokenizer.tokenize_df()	

#### clx.analytics.tokenizer.tokenize_file()	