In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.insert(0, "/content/drive/MyDrive/TreeRAG")

In [3]:
!pip install -q ipdb
!pip install -q tiktoken

In [4]:
import os
import re
import copy
import time
import random
import shutil
import argparse
import numpy as np
import pandas as pd
import preprocessing as pp
import prompt_adapter as pa
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from tqdm import tqdm,trange
from datetime import datetime
from llm_client import llm_client
from preprocessing import pipeline
from llmbt import LLMBoostingClassifier
from sklearn.model_selection import train_test_split
from typing import Tuple, Set, Optional, List, Dict, Union

In [5]:
from enum import Enum
class QueryStatus(Enum):
    UNK = 0
    OK = 1
    FER = 2 #format error
    NTR = 3 #network error

## Data Processing

In [6]:
original_table = pd.read_csv('/content/drive/MyDrive/TreeRAG/dataset/csic_database_preprocessed.csv')
original_table.head()

Unnamed: 0,label,Method,User-Agent,Pragma,Cache-Control,Accept,Accept-encoding,Accept-charset,language,host,cookie,content-type,connection,lenght
0,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=1F767F17239C9B670A39E9B10C3825F4,,close,
1,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=81761ACA043B0E6014CA42A4BCD06AB5,,close,
2,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=933185092E0B668B90676E0A2B0767AF,application/x-www-form-urlencoded,Connection: close,68.0
3,Normal,GET,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=8FA18BA82C5336D03D3A8AFA3E68CBB0,,close,
4,Normal,POST,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=7104E6C68A6BCF1423DAE990CE49FEE2,application/x-www-form-urlencoded,Connection: close,63.0


In [7]:
preprocessed_table = pipeline(
    table = original_table,
    funcs = [
        pp.rename_column,
        #pp.sample_rows, #sample rows by label
        pp.remove_unique_columns,
        pp.fill_with_average,
        pp.fill_string_nan_with_none,
        pp.onehot_table
    ],
    args = [
        {'old_col':'Unnamed: 0', 'new_col':'label'},
        #{},
        {'exclude_columns':['label', 'lenght']},
        {},
        {},
        {'prefix_sep':'=', 'suffix_sep':'?', 'exclude_columns':['label']}
    ]
)

rename_column Unnamed: 0 with label
(61065, 14)
remove_unique_columns ...
(61065, 7)
fill_with_average ...
(61065, 7)
fill_string_nan_with_none ...
(61065, 7)
onehot_table ...
(61065, 13)


In [8]:
preprocessed_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61065 entries, 0 to 61064
Data columns (total 13 columns):
 #   Column                                                                                                       Non-Null Count  Dtype  
---  ------                                                                                                       --------------  -----  
 0   label                                                                                                        61065 non-null  object 
 1   lenght                                                                                                       61065 non-null  float64
 2   Method=GET?                                                                                                  61065 non-null  int64  
 3   Method=POST?                                                                                                 61065 non-null  int64  
 4   Method=PUT?                                               

### split train and test dataset

In [9]:
train_table, test_table = train_test_split(preprocessed_table, test_size=0.999)
print(len(train_table), len(test_table))

61 61004


### mapping label to int

In [10]:
def is_numeric(column):
    if column.dtype == bool:
        return False
    else:
        if set(column.unique()) == {0, 1}: #0,1 for bool
            return False
        else:
            return pd.api.types.is_numeric_dtype(column)

#return range and type
def get_feature_info(table):
    feature_range = {} #feature name : range
    feature_type = {} #1 is number 0 is category
    for col in table.columns:
        if is_numeric(table[col]):
            feature_range[col] = (table[col].min(), table[col].max()) #number to max and min
            feature_type[col] = 1
        else:
            feature_range[col] = table[col].unique().tolist() #to category
            feature_type[col] = 0
    return feature_range,feature_type

In [11]:
target_names = ['Normal', 'Anomalous'] #!!!!!
train_x = train_table.drop(columns=['label']).to_numpy()
train_y = train_table['label'].apply(lambda x: target_names.index(x)).to_numpy()
test_x = test_table.drop(columns=['label']).to_numpy()
test_y = test_table['label'].apply(lambda x: target_names.index(x)).to_numpy()
feature_names = preprocessed_table.drop(columns=['label']).columns.tolist()
feature_ranges,feature_types = get_feature_info(preprocessed_table.drop(columns=['label']))

In [12]:
feature_types

{'lenght': 1,
 'Method=GET?': 0,
 'Method=POST?': 0,
 'Method=PUT?': 0,
 'Accept=None?': 0,
 'Accept=text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5?': 0,
 'host=localhost:8080?': 0,
 'host=localhost:9090?': 0,
 'content-type=None?': 0,
 'content-type=application/x-www-form-urlencoded?': 0,
 'connection=Connection: close?': 0,
 'connection=close?': 0}

## Description

In [13]:
DESC = """
The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. \
It can be used for the testing of web attack protection systems. It was developed at the "Information Security Institute" \
of CSIC (Spanish Research National Council). This dataset contains two type label: Normal or Anomalous.
"""
ROLE = 'expert of cyber security'

## Build the Tree

#### super parameters

In [14]:
TOPK_RECALL = 5
TEMPERATURE = 0.5
SELECTION_KEYS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

#### Tools

In [15]:
def sort_lists_by_order(order_list, x, y):
    # Create a dictionary mapping elements from order_list to their respective indices
    order_map = {element: index for index, element in enumerate(order_list)}
    # Define a custom sorting key using the order_map
    custom_key = lambda pair: order_map[pair[0]]
    # Sort both x and y based on the custom key
    sorted_data = sorted(zip(x, y), key=custom_key)
    # Unpack the sorted data back into separate lists
    sorted_x, sorted_y = zip(*sorted_data)
    return list(sorted_x), list(sorted_y)

### Comparsion with the baseline

In [16]:
import imp
import base_tree
imp.reload(base_tree)
from base_tree import DecisionTreeClassifier

In [17]:
# Create a Decision Tree classifier
dtc = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    random_state=42,
    max_depth=3,
    min_samples_split = 3,
    feature_name=feature_names,
    class_name=target_names
)
# Fit the classifier to the training data
dtc.fit(train_x, train_y)
print(dtc.score(train_x, train_y), dtc.score(test_x, test_y))

0.6229508196721312 0.6045505212772933


In [18]:
for fn,fi in zip(feature_names, dtc.feature_importances_):
    if fi > 0:
        print(fn, fi)

lenght 1.0


In [19]:
dtc.export_text(feature_name=feature_names)

  : lenght < 54.0 gini : 0.489 prob: [0.574 0.426]
* | : 4 label: 0 prob:[1. 0.] gini : 0.0
  | : lenght < 253.0 gini : 0.496 prob: [0.544 0.456]
* | | : 52 label: 0 prob:[0.577 0.423] gini : 0.488
* | | : 5 label: 1 prob:[0.2 0.8] gini : 0.32


## Our Methods

In [34]:
#return option and reverse option
def get_option(is_number, feature, val, val_range, condition=None, condition_=None):
    opt,opt_ = {},{}
    opt['feature'] = feature
    opt_['feature'] = feature
    opt['val_range'] = val_range
    opt_['val_range'] = val_range
    if is_number:
        opt['val'] = val
        opt_['val'] = val
        if condition and condition_:
            opt['det'] = '<'
            opt_['det'] = '>'
            opt['condition'] = condition #multiclass for soft decision
            opt_['condition'] = condition_
    else:
        opt['val'] = val
        opt_['val'] = 'Not ' + str(val),
        if condition and condition_:
            opt['det'] = None
            opt_['det'] = None
            opt['condition'] = condition #multiclass for soft decision
            opt_['condition'] = condition_
    return opt,opt_

'''
options_dicts: [{}, {}, {}]
which contains like that:
option={
    'feature':'Al Aluminum',
    'val_range':(0.1, 0.5),
    'det':'<',
    'val':0.25,
    'condition':{'possibility':97.3, 'result':'building_windows_non_float_processed'}
}
or
option={
    'feature':'Al Aluminum',
    'val_range':(0.1, 0.5),
    'val':0.25
}
'''
def make_selection(llm_cli, desc:str, role:str, options_dicts:List[Dict], selection_keys:str, premise=None):
    index2options = {}
    index2keys,keys2index = {},{}
    option_list = []
    selection_keys_ = [selection_keys[i] for i in range(len(options_dicts))] #cut off
    random.shuffle(selection_keys_)
    for index,(op_dict, key) in enumerate(zip(options_dicts, selection_keys_)):
        optext = pa._get_option(**op_dict)
        option_list.append(optext)
        index2options[index] = optext #for reverse
        index2keys[index] = key
        keys2index[key] = index
    #sort by original
    selection_keys_,option_list  = sort_lists_by_order(
        [selection_keys[i] for i in range(len(options_dicts))],
        selection_keys_,
        option_list
    )
    option_tag, option_text = pa.options2text(option_list, selection_keys_)
    selection_prompt = pa.get_selection_prompt(
        desc=desc.strip(),
        role=role.strip(),
        options=option_text,
        premise=premise
    )
    print('\n###################### SELECTION ######################')
    print(selection_prompt)
    status,response,num_toks,repeat = llm_cli.generate(question=pa.selection_with_detail_shots + selection_prompt, temperature=TEMPERATURE)
    print(response)
    if status:
        status,answer = pa.get_selection_result(selection_keys, response)
        if status:
            key, reason = answer[0], answer[-1]
            return QueryStatus.OK, (keys2index[key], reason)
        return QueryStatus.FER, None
    else:
        return QueryStatus.NTR, None

'''
see make_selection
'''
def make_explanation(llm_cli, desc:str, role:str, opt_selected:str, premise=None):
    explanation_prompt = pa.get_explanation_prompt(
        desc=desc.strip(),
        role=role.strip(),
        query=opt_selected,
        premise=premise
    )
    print('\n###################### EXPLANATION ######################')
    print(explanation_prompt)
    status,response,num_toks,repeat = llm_cli.generate(question=pa.explanation_with_detail_shots + explanation_prompt, temperature=TEMPERATURE)
    print(response)
    if status:
        status,answer = pa.get_explanation_result(response)
        if status:
            return QueryStatus.OK, answer
        return QueryStatus.FER, None
    else:
        return QueryStatus.NTR, None

In [21]:
'''
splits,explanation,selection = \
    split_call(candidate_feature, candidate_theshold, candidate_gain,candidate_indice)
best_feature,best_theshold,best_gain,best_split = splits[0],splits[1],splits[2],splits[3]
input:
    all of them are sorted by gain (smaller, better)
    candidate_feature: <class 'tuple'> #list of features index
    candidate_theshold: <class 'tuple'> #list of features value
    candidate_gain: <class 'tuple'> #list of maximun gain of each feature
    candidate_indice: <class 'tuple'> #[feature len, 2, [split length]]
    Y: [N, n_class]
'''
def split_call(candidate_feature, candidate_theshold, candidate_gain, candidate_indice, Y, premise=None):
    global feature_names
    global feature_ranges
    global feature_types
    global target_names
    global llm_cli
    option_selected,options,options_ = [],[],[] #options, reverse options
    for index in range(TOPK_RECALL):
        feature_name = feature_names[candidate_feature[index]]
        feature_range = feature_ranges[feature_name]
        split_index = candidate_indice[index]
        distribution = np.sum(Y[split_index[0]], axis=0)/np.sum(Y[split_index[0]])
        distribution_ = np.sum(Y[split_index[-1]], axis=0)/np.sum(Y[split_index[-1]])
        val, possibility, result, possibility_, result_ = None, None, None, None, None
        val = candidate_theshold[index]
        possibility = np.max(distribution)
        possibility_ = np.max(distribution_)
        #possibility validation
        result = target_names[np.argmax(distribution)]
        result_ = target_names[np.argmax(distribution_)]
        condition = {'possibility':possibility, 'result':result, 'multiclass':False} if possibility > 0 and possibility <= 1 else None
        condition_ = {'possibility':possibility_, 'result':result_, 'multiclass':False} if possibility_ > 0 and possibility_ <= 1 else None
        # import ipdb
        # ipdb.set_trace(context=10)
        opt,opt_ = get_option(
            is_number = feature_types[feature_name]==1,
            feature = feature_name, #text
            val = val,
            val_range = feature_ranges[feature_name],
            condition = condition,
            condition_ = condition_
        )
        options.append(opt)
        options_.append(opt_)
        if possibility > possibility_: #use max possibility for option of selection
            option_selected.append(opt)
        else:
            option_selected.append(opt_)

    status,res = make_selection(llm_cli, DESC, ROLE, option_selected, str(SELECTION_KEYS), premise) #get selection
    if status == QueryStatus.OK: #selection succeed
        index_selected, reason_selected = res[0], res[-1]
        status,res = make_explanation(llm_cli, DESC, ROLE, options[index_selected], premise)
        status_,res_ = make_explanation(llm_cli, DESC, ROLE, options_[index_selected], premise) #for reverse
        if status == QueryStatus.OK and status_ == QueryStatus.OK: #explanation succeed
            #return splits,explanation,selection
            explanation = (res, res_) #left, right
            return (candidate_feature[index_selected],
                    candidate_theshold[index_selected],
                    candidate_gain[index_selected],
                    candidate_indice[index_selected]), \
                    reason_selected, \
                    explanation
    raise Exception("QueryStatus is not OK")


### Start our method

In [22]:
import imp
import llmbt
imp.reload(llmbt)
from llmbt import LLMBoostingClassifier, Node

In [23]:
import imp
import prompt_adapter
imp.reload(prompt_adapter)
import prompt_adapter as pa

In [24]:
llm_cli = llm_client(
    url='https://api.xiaoai.plus/v1',
    api_key='sk-dzfmQUflukYgY0mH44EeAe26E0F240DbB6EaD38898Cc405c',
    models='gpt-4o-mini',
    max_tokens=4096,
    debug=False
)
# Create a Decision Tree classifier
lbc = LLMBoostingClassifier(
    criterion='gini',
    splitter='llm',
    split_call=split_call,
    random_state=42,
    max_depth=3,
    feature_name=feature_names,
    class_name=target_names
)
# Fit the classifier to the training data
lbc.fit(train_x, train_y)


###################### SELECTION ######################
Question: The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. It can be used for the testing of web attack protection systems. It was developed at the "Information Security Institute" of CSIC (Spanish Research National Council). This dataset contains two type label: Normal or Anomalous. Assume you are a expert of cyber securitygiven a group of instances X, which options is the best for further classification?
A. content-type=application/x-www-form-urlencoded?, category of [0, 1], X can be classified by considering which category they are of.
B. content-type=None?, category of [1, 0], X can be classified by considering which category they are of.
C. connection=Connection: close?, category of [0, 1], X can be classified by considering which category they are of.
D. Method=POST?, category of [0, 1], X can be classified by considering which category they are of.
E. lenght, range from 4.0 to 836.0, i

  distribution = np.sum(Y[split_index[0]], axis=0)/np.sum(Y[split_index[0]])
  distribution_ = np.sum(Y[split_index[-1]], axis=0)/np.sum(Y[split_index[-1]])


{B}  
In the context of web requests, the `content-type` header is crucial for determining how the server should interpret the data being sent. The `application/x-www-form-urlencoded` content type is commonly used for form submissions in web applications. Given that this content type is often associated with normal web traffic, it can be a strong indicator for classification. 

Option A, C, and E involve host or connection types, which may not provide a clear distinction between normal and anomalous requests. Option D is not a valid comparison as it suggests comparing a length to -1, which doesn't make sense in this context.

Thus, option B (content-type=application/x-www-form-urlencoded) is the best choice for further classification of the instances in this dataset, as it is likely to indicate normal web traffic patterns.

###################### EXPLANATION ######################
Question: The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. It can be

In [25]:
lbc.export_text(feature_name=feature_names)

  : lenght < 54.0 gini : 0.489 prob: [0.574 0.426]
* | : 4 label: 0 prob:[1. 0.] gini : 0.0
  | : lenght < 253.0 gini : 0.496 prob: [0.544 0.456]
* | | : 52 label: 0 prob:[0.577 0.423] gini : 0.488
* | | : 5 label: 1 prob:[0.2 0.8] gini : 0.32


In [26]:
print(lbc.score(train_x, train_y), lbc.score(test_x, test_y))

0.6229508196721312 0.6045505212772933


## Tree Explanation

In [27]:
C_ucb = 1
def UCDT(fi, n_all, n_i):
    return fi + C_ucb * (n_all/(n_i+1))**0.5

#split data by value v
#f: feature index v: value
#return count(< v) and count(> v)
def _split_data(x, f_index, v):
    x_left,x_right = 0,0
    for index in range(len(x)):
        if x[index][f_index] < v:
            x_left += 1
        else:
            x_right += 1
    return x_left, x_right

def get_median_value(x, feature_index, values):
    v_set = set(values)
    v_selected = np.nan
    v_distance = sys.maxsize
    for v_ in v_set:
        lc,rc = _split_data(x, feature_index, v_)
        if abs(lc-rc) < v_distance:
            v_selected = v_
            v_distance = abs(lc-rc)
    return v_selected

#only feature selection
def get_topk_feature_by_ucdt(feature_name, tree, topk):
    feature_importance_ = tree.feature_importances_
    assert len(feature_name) == len(feature_importance_)
    feature_index = list(range(len(feature_name)))
    feature_count = {name:0 for name in feature_name} #feature_name: count
    all_nodes = tree._get_nodes()
    for node in all_nodes:
        feature_count[feature_name[node.content['feature']]] += 1
    feature_value = [UCDT(feature_importance_[index], len(all_nodes), feature_count[fn]) for (index,fn) in enumerate(feature_name)]
    sorted_values = sorted(zip(feature_value, feature_name, feature_index))
    sorted_name,sorted_index = [item[1] for item in sorted_values],[item[2] for item in sorted_values]
    return sorted_name[:topk], sorted_index[:topk]

def get_topk_value_of_feature(x, feature_name, feature_index, feature_range):
    assert len(feature_name) == len(feature_index)
    values = [] #values for corresponding feature
    for _,(index,fn) in enumerate(zip(feature_index, feature_name)):
        values.append(get_median_value(x, index, feature_range[fn]))
    return values

def shuffle_lists(x, y):
    zipped = list(zip(x, y))
    random.shuffle(zipped)
    new_x, new_y = zip(*zipped)
    return list(new_x), list(new_y)

'''
category ['positive', 'negative', 'netural']
'''
def make_prediction(llm_cli, desc, role, categories:List[str], selection_keys:str, premise:Union[str, None], condition:Dict):
    index2category = {} #index 0,1,2...
    index2keys,keys2index = {},{}
    category_list = copy.deepcopy(categories)
    selection_keys_ = [selection_keys[i] for i in range(len(categories))] #cut off
    random.shuffle(selection_keys_)
    for index, (cat, key) in enumerate(zip(category_list, selection_keys_)):
        index2category[index] = cat #for reverse
        index2keys[index] = key #temparary not use
        keys2index[key] = index
    #sort by original
    selection_keys_,category_list  = sort_lists_by_order(
        [selection_keys[i] for i in range(len(categories))], #original
        selection_keys_,
        category_list
    )
    category_tag, category_text = pa.categories2text(category_list, selection_keys)
    predict_prompt = pa.get_predition_prompt(
        desc=desc.strip(),
        role=role.strip(),
        category=category_text,
        premise=premise,
        condition=condition
    )
    print('\n###################### PREDICTION ######################')
    print(predict_prompt)
    status,response,num_toks,repeat = llm_cli.generate(question=pa.predition_shot + predict_prompt, temperature=TEMPERATURE)
    print(response)
    if status:
        status, (key, _) = pa.get_predition_result(selection_keys, response) #return status, (prediction, explanation)
        if status:
            return QueryStatus.OK, keys2index[key]
        return QueryStatus.FER, None
    else:
        return QueryStatus.NTR, None

#### implement of tree expansion

In [28]:
def tree_expansion(tree, llm_cli, unlabeled_x, feature_name, feature_range, feature_type, desc:str, role:str,
        selection_key:str, min_sample_split=100, expand_depth=2, ucdt_topk=4):
    update_flag = True
    last_depth = tree.get_depth()
    while(update_flag):
        update_flag = False
        leaves = tree._get_leave() #bfs
        X_leaves = tree.apply(unlabeled_x) #leaves if of each x

        print('expand ' + str(len(leaves)) + ' leaves!!!')
        print('\n')

        leaf2x = {} #node tag : x of node by []
        for index,leaf_id in enumerate(X_leaves): #build leaf2x
            if leaf_id not in leaf2x:
                leaf2x[leaf_id] = []
            leaf2x[leaf_id].append(unlabeled_x[index])
        #conditions to quit (1)
        if tree.get_depth() - last_depth > expand_depth:
            break
        for index,leaf in enumerate(leaves):
            print('enumerating the ' + str(index+1) + ' leaf !!!')

            options,options_ = [],[]
            option_selected = []
            x = leaf2x[leaf.tag] #x instances of leaf

            #conditions to quit (2)
            if len(x) < min_sample_split:
                continue #Not consider to split
            #UCDT***
            f_ucdt,index_ucdt = get_topk_feature_by_ucdt(feature_name, tree, ucdt_topk)
            v_ucdt = get_topk_value_of_feature(x, f_ucdt, index_ucdt, feature_range)

            for index,(f_,v_) in enumerate(zip(f_ucdt, v_ucdt)):

                opt,opt_ = get_option(
                    is_number = (feature_type[f_] == 1),
                    feature = f_,
                    val = v_,
                    val_range = feature_range[f_]
                )
                options.append(opt)
                options_.append(opt_)
                if random.randint(0, 1) == 1: #all selection is fair
                    option_selected.append(opt)
                else:
                    option_selected.append(opt_)

            status,res = make_selection(llm_cli, desc, role, option_selected, selection_key, leaf.explanation)

            if status == QueryStatus.OK: #selection succeed
                index_selected, reason_selected = res[0], res[-1] #index of selected feature, value
                opt_selected = option_selected[index_selected]
                f_selected,v_selected = opt_selected['feature'], opt_selected['val'] #LLM SELECTED
                range_selected = opt_selected['val_range']

                status,res = make_prediction(
                        llm_cli = llm_cli,
                        desc = desc,
                        role = role,
                        categories = target_names,
                        selection_keys = selection_key,
                        premise = leaf.explanation,
                        condition = options[index_selected]
                    ) #use explanation as premise
                status_,res_= make_prediction(
                        llm_cli = llm_cli,
                        desc = desc,
                        role = role,
                        categories = target_names,
                        selection_keys = selection_key,
                        premise = leaf.explanation,
                        condition = options_[index_selected]
                    ) #use explanation as premise

                if status == QueryStatus.OK and status_ == QueryStatus.OK: #predict left and right
                    if res != res_:
                        predict_left, predict_right = target_names[res], target_names[res_]
                        opt, opt_ = get_option(
                            is_number = (feature_type[f_selected] == 1),
                            feature = f_selected,
                            val = v_selected,
                            val_range = range_selected,
                            condition = {'possibility':0.9, 'result':predict_left, 'multiclass':True},
                            condition_ = {'possibility':0.9, 'result':predict_right, 'multiclass':True}
                        )
                        status,res = make_explanation(llm_cli, desc, ROLE, opt, leaf.explanation) #for left
                        status_,res_ = make_explanation(llm_cli, desc, ROLE, opt_, leaf.explanation) #for right

                        #ipdb.set_trace(context=10)
                        if status == QueryStatus.OK and status_ == QueryStatus.OK: #explanation succeed
                            explanation = (res, res_) #left, right
                            left_node = Node(position='left', content={'is_leave':True, 'output_label':predict_left})
                            right_node = Node(position='right', content={'is_leave':True, 'output_label':predict_right})
                            #def leaf2branch(leaf, feature, threshold, left_child, right_child, n_sample=None):
                            tree.leaf2branch(
                                leaf = leaf,
                                feature = feature_name.index(f_selected),
                                threshold = v_selected,
                                left_child = left_node,
                                right_child = right_node,
                                selection = reason_selected,
                                explanation = explanation,
                                n_sample = len(x)
                            )
                            update_flag = True
                    else:
                        print('Unconsistent Prediction !!!') # Unconsistent Prediction !!! ******* #skip
                        print(target_names[res], target_names[res_])

In [35]:
#def tree_expansion(tree, unlabeled_x, feature_name, feature_range, feature_type, min_sample_split=100, ucdt_topk=4):
lbc_copy = copy.deepcopy(lbc)
tree_expansion(
    tree=lbc_copy,
    llm_cli=llm_cli,
    unlabeled_x=test_x,
    feature_name=feature_names,
    feature_range=feature_ranges,
    feature_type=feature_types,
    desc=DESC,
    role=ROLE,
    selection_key=str(SELECTION_KEYS),
    min_sample_split=200,
    expand_depth=2,
    ucdt_topk=4
)

expand 3 leaves!!!


enumerating the 1 leaf !!!
0 <class 'int'>
('Not 0',)
Not 0
1 <class 'int'>
('Not 1',)
Not 1
1 <class 'int'>
('Not 1',)
Not 1
0 <class 'int'>
('Not 0',)
Not 0

###################### SELECTION ######################
Question: The HTTP dataset CSIC 2010 contains thousands of web requests automatically generated. It can be used for the testing of web attack protection systems. It was developed at the "Information Security Institute" of CSIC (Spanish Research National Council). This dataset contains two type label: Normal or Anomalous. Given a group of instances X satisfy that: In the context of the HTTP dataset CSIC 2010, a request length of less than 54.0 indicates a high likelihood (100%) of being classified as Normal. This is because typical web requests, especially those for standard resources like images, scripts, or small HTML files, tend to have shorter lengths. Anomalous requests, often associated with attacks or malicious activities, typically exhibit longer

KeyboardInterrupt: 