# CLX Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

# Workflow

#### clx.workflow.workflow.Workflow()	Yes

In [74]:
from clx.workflow.workflow import Workflow
import cudf
import s3fs
from os import path

from clx.analytics.cybert import Cybert

class SimpleWorkflow(Workflow):        
    def workflow(self, dataframe):
        dataframe['length'] = dataframe['raw'].str.len()
        dataframe['ip'] = dataframe['raw'].str.extract(
            '([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)', expand=True)
        return dataframe

In [75]:
DATA_DIR = '../data'
APACHE_SAMPLE_CSV = 'apache_sample_1k.csv'

source = {
    "type": "fs",
    "input_format": "csv",
    "input_path": f'{DATA_DIR}/{APACHE_SAMPLE_CSV}',
    "schema": ["raw"],
    "delimiter": ",",
    "usecols": ["raw"],
    "dtype": ["str"],
    "header": 0,
}

destination = {
    "type": "fs",
    "output_format": "csv",
    "output_path": f'{DATA_DIR}/{APACHE_SAMPLE_CSV.split(".")[0]}_workflow.csv',
    "index": False
}

In [76]:
workflow = SimpleWorkflow(
    name='SimpleWorkflow'
    , source=source
    , destination=destination
)

#### clx.workflow.workflow.Workflow.run_workflow()	Yes

In [77]:
![ -e ../data/apache_sample_1k_workflow.csv ] && rm ../data/apache_sample_1k_workflow.csv

In [78]:
workflow.run_workflow()

In [79]:
!head ../data/apache_sample_1k_workflow.csv

raw,length,ip
[Sun Dec 04 20:22:49 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties,91,
"193.106.31.130 - - [01/Sep/2019:03:28:00 +0200] ""POST /administrator/index.php HTTP/1.0"" 200 4481 ""-"" ""Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"" ""-""
",159,193.106.31.130
"100.1.14.108 - - [29/Sep/2019:19:41:25 +0200] ""GET /components/com_users/dispacher.php HTTP/1.1"" 404 240 ""-"" ""python-requests/2.22.0"" ""-""
",138,100.1.14.108
"13.84.43.203 - - [06/Nov/2019:03:15:15 +0100] ""GET //administrator/index.php HTTP/1.1"" 200 4270 ""-"" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0"" ""-""
",185,13.84.43.203
"90.188.40.9 - - [18/Feb/2016:12:38:21 +0100] ""GET /administrator/ HTTP/1.1"" 200 4263 ""-"" ""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36"" ""-""
",197,90.188.40.9


#### clx.workflow.workflow.Workflow.destination()	Yes

In [80]:
workflow.destination

{'type': 'fs',
 'output_format': 'csv',
 'output_path': '../data/apache_sample_1k_workflow.csv',
 'index': False}

#### clx.workflow.workflow.Workflow.name()	Yes

In [81]:
workflow.name

'SimpleWorkflow'

#### clx.workflow.workflow.Workflow.set_destination()	Yes

In [82]:
workflow.set_destination(destination=destination)

KeyError: 'destination'

#### clx.workflow.workflow.Workflow.set_source()	Yes

In [83]:
workflow.set_source(source=source)

#### clx.workflow.workflow.Workflow.source()	Yes

In [84]:
workflow.source

{'type': 'fs',
 'input_format': 'csv',
 'input_path': '../data/apache_sample_1k.csv',
 'schema': ['raw'],
 'delimiter': ',',
 'usecols': ['raw'],
 'dtype': ['str'],
 'header': 0}

#### clx.workflow.workflow.Workflow.stop_workflow()	Yes

In [85]:
workflow.stop_workflow()

#### clx.workflow.workflow.Workflow.workflow()	Yes

In [86]:
df = cudf.read_csv(f'{DATA_DIR}/{APACHE_SAMPLE_CSV}')[['raw']]
workflow.workflow(df)

Unnamed: 0,raw,length,ip
0,[Sun Dec 04 20:22:49 2005] [notice] workerEnv....,91,
1,193.106.31.130 - - [01/Sep/2019:03:28:00 +0200...,159,193.106.31.130
2,100.1.14.108 - - [29/Sep/2019:19:41:25 +0200] ...,138,100.1.14.108
3,13.84.43.203 - - [06/Nov/2019:03:15:15 +0100] ...,185,13.84.43.203
4,"90.188.40.9 - - [18/Feb/2016:12:38:21 +0100] ""...",197,90.188.40.9
...,...,...,...
995,154.0.14.250 - - [06/Dec/2016:16:59:06 +0100] ...,227,154.0.14.250
996,62.210.33.127 - - [20/Oct/2019:15:15:40 +0200]...,339,62.210.33.127
997,100.1.14.108 - - [04/Oct/2019:12:21:10 +0200] ...,152,100.1.14.108
998,198.50.156.189 - - [01/Apr/2017:19:47:53 +0200...,110,198.50.156.189


#### clx.workflow.workflow.Workflow.benchmark()

#### clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow()	

#### clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow.interval()	

#### clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow.raw_data_col_name()	

#### clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow.threshold()	

#### clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow.window()	

#### clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow.workflow()	