In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/DTRAG')

In [3]:
!pip install tiktoken
!pip install ipdb



In [5]:
import os
import re
import copy
import time
import shutil
import argparse
import numpy as np
import pandas as pd
import preprocessing as pp
import prompt_adapter as pa
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from tqdm import tqdm,trange
from datetime import datetime
from llm_client import llm_client
from preprocessing import pipeline
from llmbt import LLMBoostingClassifier
from sklearn.model_selection import train_test_split
from typing import Tuple, Set, Optional, List, Dict, Union

In [6]:
from enum import Enum
class QueryStatus(Enum):
    UNK = 0
    OK = 1
    FER = 2 #format error
    NTR = 3 #network error

## Data Preprocessing

In [7]:
original_table = pd.read_csv('/content/drive/MyDrive/DTRAG/dataset/csic_database_preprocessed.csv')
original_table.drop(columns=['URL', 'content'], inplace=True) #for time series model analysis
original_table.head()

Unnamed: 0.1,Unnamed: 0,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,Accept-charset,language,host,cookie,content-type,connection,lenght
0,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=1F767F17239C9B670A39E9B10C3825F4,,close,
1,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=81761ACA043B0E6014CA42A4BCD06AB5,,close,
2,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=933185092E0B668B90676E0A2B0767AF,application/x-www-form-urlencoded,Connection: close,68.0
3,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=8FA18BA82C5336D03D3A8AFA3E68CBB0,,close,
4,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=7104E6C68A6BCF1423DAE990CE49FEE2,application/x-www-form-urlencoded,Connection: close,63.0


In [8]:
preprocessed_table = pipeline(
    table = original_table,
    funcs = [
        pp.rename_column,
        #pp.sample_rows, #sample rows by label
        pp.remove_unique_columns,
        pp.fill_with_average,
        pp.fill_string_nan_with_none,
        pp.onehot_table
    ],
    args = [
        {'old_col':'Unnamed: 0', 'new_col':'label'},
        #{},
        {},
        {},
        {},
        {'exclude_columns':['label']}
    ]
)

rename_column Unnamed: 0 with label
(61065, 14)
remove_unique_columns ...
(61065, 7)
fill_with_average ...
(61065, 7)
fill_string_nan_with_none ...
(61065, 7)
onehot_table ...
(61065, 13)


In [9]:
preprocessed_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61065 entries, 0 to 61064
Data columns (total 13 columns):
 #   Column                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                         --------------  -----  
 0   label                                                                                                          61065 non-null  object 
 1   lenght                                                                                                         61065 non-null  float64
 2   Method_is_GET                                                                                                  61065 non-null  bool   
 3   Method_is_POST                                                                                                 61065 non-null  bool   
 4   Method_is_PUT                                 

### split train and test dataset

In [10]:
train_table, test_table = train_test_split(preprocessed_table, test_size=0.9)
print(len(train_table), len(test_table))

6106 54959


#### mapping label to int

In [11]:
def is_numeric(column):
    if column.dtype == bool:
        return False
    else:
        return pd.api.types.is_numeric_dtype(column)

#return range and type
def get_feature_info(table):
    feature_range = {} #feature name : range
    feature_type = {} #1 is number 0 is category
    for col in table.columns:
        if is_numeric(table[col]):
            feature_range[col] = (table[col].min(), table[col].max()) #number to max and min
            feature_type[col] = 1
        else:
            feature_range[col] = table[col].unique().tolist() #to category
            feature_type[col] = 0
    return feature_range,feature_type

## split train and test

In [12]:
target_names = ['Normal', 'Anomalous'] #!!!!!
train_x = train_table.drop(columns=['label']).to_numpy()
train_y = train_table['label'].apply(lambda x: target_names.index(x)).to_numpy()
test_x = test_table.drop(columns=['label']).to_numpy()
test_y = test_table['label'].apply(lambda x: target_names.index(x)).to_numpy()

## Data Feature

In [13]:
feature_names = preprocessed_table.drop(columns=['label']).columns.tolist()
feature_ranges,feature_types = get_feature_info(preprocessed_table.drop(columns=['label']))

## Description
#### include desc, role and feature detail mapping

In [14]:
DESC = """
The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. \
It can be used for the testing of web attack protection systems. It was developed at the "Information Security Institute" \
of CSIC (Spanish Research National Council). This dataset contains two type label: Normal or Anomalous.
"""
ROLE = 'expert of cyber security'

### Tools

In [15]:
def shuffle_arrays(x, y):
    # Ensure x and y are numpy arrays
    x = np.array(x)
    y = np.array(y)
    # Check that both arrays have the same length
    if len(x) != len(y):
        raise ValueError("Arrays must have the same length.")
    # Generate a permutation of indices based on the length of the arrays
    permutation = np.random.permutation(len(x))
    # Apply the permutation to both arrays
    x_shuffled = x[permutation]
    y_shuffled = y[permutation]
    return x_shuffled.tolist(), y_shuffled.tolist()

## Build the Tree

In [16]:
TOPK_RECALL = 4
TEMPERATURE = 0.4
SELECTION_KEYS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

### for reference

In [17]:
from base_tree import DecisionTreeClassifier, Node
from visual import print_tree

In [None]:
# Create a Decision Tree classifier
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    random_state=42,
    max_depth=6,
    feature_name=feature_names,
    class_name=target_names
)
# Fit the classifier to the training data
dtc.fit(train_x, train_y)
print(dtc.score(train_x, train_y), dtc.score(test_x, test_y))

0.6703242712086472 0.6634218235411853


In [None]:
print_tree(
    tree=dtc,
    feature_names=feature_names,
    class_names=target_names,
    is_classifier=True,
    figsize = (16, 12),
    fontsize = 10,
    dpi=250
)

In [38]:


'''
options_dicts: [{}, {}, {}]
which contains like that:
option={
    'feature':'Al Aluminum',
    'val_range':(0.1, 0.5),
    'det':'<',
    'val':0.25,
    'condition':{'possibility':97.3, 'result':'building_windows_non_float_processed'}
}
or
option={
    'feature':'Al Aluminum',
    'val_range':(0.1, 0.5),
    'val':0.25
}
'''
def make_selection(llm_cli, desc:str, role:str, options_dicts:List[Dict], selection_keys:str, premise=None):
    index2options = {}
    index2keys,keys2index = {},{}
    option_list = []
    selection_keys = [selection_keys[i] for i in range(len(options_dicts))] #cut off
    for index,(op_dict, key) in enumerate(zip(options_dicts, selection_keys)):
        optext = pa._get_option(**op_dict)
        option_list.append(optext)
        index2options[index] = optext #for reverse
        index2keys[index] = key
        keys2index[key] = index
    #shuffle
    option_list,selection_keys = shuffle_arrays(option_list, selection_keys)
    #to text
    option_tag, option_text = pa.options2text(option_list, selection_keys)
    selection_prompt = pa.get_selection_prompt(
        desc=desc.strip(),
        role=role.strip(),
        options=option_text,
        premise=premise
    )
    print('\n###################### SELECTION ######################')
    print(selection_prompt)
    status,response,num_toks,repeat = llm_cli.generate(question=pa.selection_with_detail_shots + selection_prompt, temperature=TEMPERATURE)
    print(response)
    if status:
        status,answer = pa.get_selection_result(selection_keys, response)
        if status:
            key, reason = answer[0], answer[-1]
            return QueryStatus.OK, (keys2index[key], reason)
        return QueryStatus.FER, None
    else:
        return QueryStatus.NTR, None

'''
see make_selection
'''
def make_explanation(llm_cli, desc:str, role:str, opt_selected:str, premise=None):
    explanation_prompt = pa.get_explanation_prompt(
        desc=desc.strip(),
        role=role.strip(),
        query=opt_selected,
        premise=premise
    )
    print('\n###################### EXPLANATION ######################')
    print(explanation_prompt)
    status,response,num_toks,repeat = llm_cli.generate(question=pa.explanation_with_detail_shots + explanation_prompt, temperature=TEMPERATURE)
    print(response)
    if status:
        status,answer = pa.get_explanation_result(response)
        if status:
            return QueryStatus.OK, answer
        return QueryStatus.FER, None
    else:
        return QueryStatus.NTR, None

In [42]:
'''
splits,explanation,selection = \
    split_call(candidate_feature, candidate_theshold, candidate_gain,candidate_indice)
best_feature,best_theshold,best_gain,best_split = splits[0],splits[1],splits[2],splits[3]
input:
    all of them are sorted by gain (smaller, better)
    candidate_feature: <class 'tuple'> #list of features index
    candidate_theshold: <class 'tuple'> #list of features value
    candidate_gain: <class 'tuple'> #list of maximun gain of each feature
    candidate_indice: <class 'tuple'> #[feature len, 2, [split length]]
    Y: [N, n_class]
'''
def split_call(candidate_feature, candidate_theshold, candidate_gain, candidate_indice, Y, premise=None):
    global feature_names
    global feature_ranges
    global feature_types
    global target_names
    global llm_cli
    options = []
    for index in range(TOPK_RECALL):
        feature_name_ = feature_names[candidate_feature[index]]
        feature_range_ = feature_ranges[feature_name_]
        split_index = candidate_indice[index]
        possibility_left = np.sum(Y[split_index[0]], axis=0)/np.sum(Y[split_index[0]])
        possibility_right = np.sum(Y[split_index[-1]], axis=0)/np.sum(Y[split_index[-1]])
        det, val, possibility, result = None, None, None, None
        if np.max(possibility_left) > np.max(possibility_right):
            if feature_types[feature_name_]==1:
                det = '<'  #number or category
                val = candidate_theshold[index]
            else:
                det = '' #number or category
                val = False
            possibility = np.round(np.max(possibility_left * 100.0), 2)
            result = target_names[np.argmax(possibility_left)]
        else:
            if feature_types[feature_name_]==1:
                det = '>'  #number or category
                val = candidate_theshold[index]
            else:
                det = '' #number or category
                val = True
            possibility = np.round(np.max(possibility_right * 100.0), 2)
            result = target_names[np.argmax(possibility_right)]
        opt = {
                'feature': feature_name_,
                'val_range': feature_range_,
                'det': det,
                'val': val,
                'condition':{'possibility':possibility, 'result':result}
        }
        options.append(opt)
    status,res = make_selection(llm_cli, DESC, ROLE, options, str(SELECTION_KEYS), premise) #get selection
    if status == QueryStatus.OK: #selection succeed
        index_selected, reason_selected = res[0], res[-1]
        status,res = make_explanation(llm_cli, DESC, ROLE, options[index_selected], premise)
        explanation = res
        if status == QueryStatus.OK: #explanation succeed
            #return splits,explanation,selection
            return (candidate_feature[index_selected],
                    candidate_theshold[index_selected],
                    candidate_gain[index_selected],
                    candidate_indice[index_selected]), \
                    reason_selected, \
                    explanation
    raise Exception("QueryStatus is not OK")

In [None]:
import imp
import prompt_adapter
imp.reload(prompt_adapter)
import prompt_adapter as pa

In [None]:
import imp
import llmbt
imp.reload(llmbt)
from llmbt import LLMBoostingClassifier, Node

In [43]:
llm_cli = llm_client(
    url='https://api.xiaoai.plus/v1',
    api_key='sk-dzfmQUflukYgY0mH44EeAe26E0F240DbB6EaD38898Cc405c',
    models='gpt-4o-mini',
    max_tokens=4096,
    debug=False
)
# Create a Decision Tree classifier
lbc = LLMBoostingClassifier(
    criterion='gini',
    splitter='llm',
    split_call=split_call,
    random_state=42,
    max_depth=3,
    feature_name=feature_names,
    class_name=target_names
)
# Fit the classifier to the training data
lbc.fit(train_x, train_y)


###################### SELECTION ######################
Question: The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. It can be used for the testing of web attack protection systems. It was developed at the "Information Security Institute" of CSIC (Spanish Research National Council). This dataset contains two type label: Normal or Anomalous. Assume you are a expert of cyber securitygiven a group of instances X, which options is the best for further classification?
A. connection_is_Connection: close, category of [False, True], if this feature of X is False, X have a 65.59% chance to be Normal.
B. content-type_is_application/x-www-form-urlencoded, category of [False, True], if this feature of X is False, X have a 65.59% chance to be Normal.
C. content-type_is_None, category of [True, False], if this feature of X is True, X have a 65.59% chance to be Normal.
D. Method_is_GET, category of [True, False], if this feature of X is True, X have a 65.59% chanc

  possibility_left = np.sum(Y[split_index[0]], axis=0)/np.sum(Y[split_index[0]])
  possibility_right = np.sum(Y[split_index[-1]], axis=0)/np.sum(Y[split_index[-1]])



###################### SELECTION ######################
Question: The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. It can be used for the testing of web attack protection systems. It was developed at the "Information Security Institute" of CSIC (Spanish Research National Council). This dataset contains two type label: Normal or Anomalous. Given a group of instances X satisfy that:In the context of web requests, the `Method_is_GET` feature indicates whether the request method is a GET request, which is commonly used for retrieving data from a server without making any changes. GET requests are typically associated with normal web browsing activities, as they do not alter server state and are less likely to be part of malicious activities compared to methods like POST, which can modify data. Therefore, when the `Method_is_GET` feature is True, it suggests that the request is likely part of standard web traffic, leading to a 65.59% probability that t

## Tree Expansion

In [45]:
C_ucb = 1
def UCDT(fi, n_all, n_i):
    return fi + C_ucb * (n_all/(n_i+1))^0.5

#split data by value v
#f: feature index v: value
#return count(< v) and count(> v)
def _split_data(x, f_index, v):
    x_left,x_right = 0,0
    for index in range(len(x)):
        if x[index][f_index] < v:
            x_left += 1
        else:
            x_right += 1
    return x_left, x_right

def get_median_value(x, feature_index, values):
    v_set = set(values)
    v_selected = np.nan
    v_distance = sys.maxsize
    for v_ in v_set:
        lc,rc = _split_data(x, feature_index, v_)
        if abs(lc-rc) < v_distance:
            v_selected = v_
            v_distance = abs(lc-rc)
    return v_selected

#only feature selection
def get_topk_feature_by_ucdt(feature_name, tree, topk):
    feature_importance_ = tree.feature_importances_
    assert len(feature_name) == len(feature_importance_)
    feature_index = list(range(len(feature_name)))
    feature_count = {name:0 for name in feature_name} #feature_name: count
    all_nodes = tree._get_nodes()
    for node in all_nodes:
        feature_count[feature_name[node['feature']]] += 1
    feature_value = [UCDT(feature_importance_[fn], len(all_nodes), feature_count[fn]) for fn in feature_name]
    sorted_values = sorted(zip(feature_value, feature_name, feature_index))
    sorted_name,sorted_index = [item[1] for item in sorted_values],[item[2] for item in sorted_values]
    return sorted_name[:topk], sorted_index[:topk]

def get_topk_value_of_feature(x, feature_name, feature_index, feature_range):
    assert len(feature_name) == len(feature_index)
    values = [] #values for corresponding feature
    for _,(index,fn) in enumerate(zip(feature_index, feature_name)):
        values.append(get_median_value(x, index, feature_range[fn]))
    return values

'''
category ['positive', 'negative', 'netural']
'''
def make_prediction(llm_cli, desc, role, feature, fea_range, det, value, categories:List, premise:Union[str, None]):
    index2category = {} #index 0,1,2...
    index2keys,keys2index = {},{}
    category_list = copy.deepcopy(categories)
    selection_keys = [selection_keys[i] for i in range(len(categories))] #cut off
    for index, (cat, key) in enumerate(zip(category_list, selection_keys)):
        index2category[index] = cat #for reverse
        index2keys[index] = key #temparary not use
        keys2index[key] = index
    #shuffle
    category_list,selection_keys = shuffle_arrays(category_list, selection_keys)
    category_tag, category_text = pa.categories2text(category_list, selection_keys)
    predict_prompt = pa.get_predition_prompt(
        desc=desc.strip(),
        role=role.strip(),
        feature=feature,
        val_range=fea_range,
        value=value,
        det=det,
        category=category_text,
        premise=premise
    )
    print('\n###################### PREDICTION ######################')
    print(predict_prompt)
    status,response,num_toks,repeat = llm_cli.generate(question=pa.predition_shot + predict_prompt, temperature=TEMPERATURE)
    print(response)
    if status:
        status, (key, _) = pa.get_predition_result(response)
        if status:
            return QueryStatus.OK, keys2index[key]
        return QueryStatus.FER, None
    else:
        return QueryStatus.NTR, None

#change a leaf node to branch node
def leaf2branch(leaf, feature, threshold, left_child, right_child, n_sample=None):
    leaf.content['is_leave'] = False
    leaf.content['feature'] = feature
    leaf.content['threshold'] = threshold
    leaf.content['impurity'] = -1 #unknown
    leaf.content['value'] = -1 #unknown
    leaf.content['output_label'] = -1 #unknown
    leaf.content['n_sample'] = n_sample
    leaf.left_child = left_child
    leaf.right_child = right_child
    return leaf

UCDT_TOPK = 4
MIN_SAMPLES_SPLIT = 10
def tree_expansion(unlabeled_x, feature_name, feature_range, feature_type, tree):
    global feature_names
    global feature_ranges
    global feature_types
    global target_names
    global llm_cli
    update_flag = True
    while(update_flag):
        update_flag = False
        leaves = tree._get_leave() #bfs
        X_leaves = tree.apply(unlabeled_x) #leaves if of each x
        leaf2x = {} #node tag : x of node by []
        for index,leaf_id in enumerate(X_leaves): #build leaf2x
            if leaf_id not in leaf2x:
                leaf2x[leaf_id] = []
            leaf2x[leaf_id].append(unlabeled_x[index])
        #for selection
        for leaf in leaves:
            options = []
            x = leaf2x[leaf.tag] #x instances of leaf
            if len(x) < MIN_SAMPLES_SPLIT:
                continue #Not consider to split
            f_ucdt,index_ucdt = get_topk_feature_by_ucdt(feature_name, feature_range, UCDT_TOPK) #UCDT***
            v_ucdt = get_topk_value_of_feature(x, f_ucdt, index_ucdt, feature_ranges)
            for index,(f_,v_) in enumerate(zip(f_ucdt, v_ucdt)):
                opt = {
                    'feature': f_ucdt,
                    'val_range': feature_ranges[f_ucdt],
                    'val': v_ucdt,
                }
                options.append(opt)
            status,res = make_selection(llm_cli, DESC, ROLE, options, str(SELECTION_KEYS), premise)
            if status == QueryStatus.OK: #selection succeed
                index_selected, reason_selected = res[0], res[-1] #index of selected feature, value
                opt_selected = options[index_selected]
                f_selected,v_selected = opt_selected['feature'], opt_selected['val'] #LLM SELECTED
                range_selected = opt_selected['val_range']
                status,explanation = make_explanation(llm_cli, DESC, ROLE, opt_selected, premise)
                if status == QueryStatus.OK: #explanation succeed
                    status_left,res_left = make_prediction(
                            llm_cli = llm_cli,
                            desc = DESC,
                            role = ROLE,
                            feature = f_selected,
                            fea_range = range_selected,
                            det = '<' if feature_types[f_selected] == 1 else None,
                            value = str(v_selected),
                            categories = target_names,
                            premise = explanation
                        ) #use explanation as premise
                    status_right,res_right = make_prediction(
                            llm_cli = llm_cli,
                            desc = DESC,
                            role = role,
                            feature = f_selected,
                            fea_range = range_selected,
                            det = '>' if feature_types[f_selected] == 1 else None,
                            value = str(v_selected) if feature_types[f_selected] == 1 else ('Not' + str(v_selected)),
                            categories = target_names,
                            premise = explanation
                        ) #use explanation as premise
                    if status_left == QueryStatus.OK and status_right == QueryStatus.OK:
                        if res_left != res_right: # Unconsistent Prediction !!!
                            predict_left, predict_right = target_names[res_left], target_names[res_right]
                            left_node = Node(content={'is_leave':True, 'output_label':predict_left})
                            right_node = Node(content={'is_leave':True, 'output_label':predict_right})
                            #def leaf2branch(leaf, feature, threshold, left_child, right_child, n_sample=None):
                            tree.leaf2branch(
                                leaf = leaf,
                                feature = feature_names.index(f_selected),
                                threshold = v_selected,
                                left_child = left_node,
                                right_child = right_node,
                                n_sample = len(x)
                            )
                            update_flag = True

In [None]:
categories = ['positive', 'netural', 'negative']
keys = 'ABCDEFGHJIKLMN'
selection,selection_text = categories2text(categories, keys)

In [None]:
desc='This is a Glass Identification Data Set from UCI. It contains 10 attributes with unit measurement expect RI that weight \
percent in corresponding oxide... '
role='doctor'
premise='Ba < 0.335, which indicates that the glass has a low barium content, suggesting it is less likely to be a type of glass \
that requires a high density or high refractive index, such as certain types of optical glass, and ...'
print(get_predition_prompt(desc=desc, role=role, category=selection_text, premise=premise))

Question: This is a Glass Identification Data Set from UCI. It contains 10 attributes with unit measurement expect RI that weight percent in corresponding oxide...  Given a group of instances X satisfy that:Ba < 0.335, which indicates that the glass has a low barium content, suggesting it is less likely to be a type of glass that requires a high density or high refractive index, such as certain types of optical glass, and ... Assume you are a doctor, which options is the best for further classification of X?
A. positive
B. netural
C. negative
Your selection:

