In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from base_tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier as SKDecisionTreeClassifier
from sklearn.tree import plot_tree
pd.set_option('display.max_colwidth', 10)
pd.set_option("display.max_columns", None)

### Testing for KDDCup99 data (the CSIC-Web-2010 variant is kept commented out)

In [None]:
# Load the KDDCup99 dataset from CSV (the CSIC loading code is kept commented out)
# data_table = pd.read_csv('./dataset/csic_database.csv')
# data_table = data_table.drop(columns=['classification'])
data_table = pd.read_csv('./dataset/KDDCup99.csv')
data_table.head()

In [None]:
# For KDD data: number of rows per distinct value of the 'label' column
data_table['label'].value_counts()

In [None]:
#For KDD
def sample_rows(table, column, value, n, random_state=42):
    """Return ``n`` randomly chosen rows of ``table`` whose ``column`` equals ``value``.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame to sample from; it is not modified.
    column : hashable
        Label of the column to filter on.
    value : object
        Rows are kept when ``table[column] == value``.
    n : int
        Number of rows to draw (without replacement).
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. The default of 42 reproduces
        the previously hard-coded seed.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.

    Raises
    ------
    ValueError
        If fewer than ``n`` rows match, since sampling is without replacement.
    """
    matching_rows = table.loc[table[column] == value]
    if n > len(matching_rows):
        # DataFrame.sample would raise ValueError as well; this message names
        # the filter that came up short.
        raise ValueError(
            f"requested {n} rows with {column}={value!r} "
            f"but only {len(matching_rows)} are available"
        )
    return matching_rows.sample(n=n, random_state=random_state)

In [None]:
# Draw 1000 rows for each of the four selected labels, then stack them into
# the working table.
rows1, rows2, rows3, rows4 = (
    sample_rows(data_table, 'label', label_value, 1000)
    for label_value in ('normal', 'back', 'satan', 'warezclient')
)
data_table = pd.concat([rows1, rows2, rows3, rows4])

In [None]:
# Summary statistics for every column, numeric and non-numeric alike
data_table.describe(include='all')

In [None]:
# Drop columns that cannot help a split: constant columns (a single distinct
# value) and identifier-like columns (every row distinct).
def remove_unique_columns(df):
    """Return a copy of ``df`` without constant or all-distinct columns.

    A column is kept when its number of distinct values (NaN counts as a
    value) is greater than 1 and smaller than the number of rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the kept columns, in their original
        order. Returning a copy rather than a column slice lets callers assign
        into the result without a SettingWithCopyWarning.
    """
    n_rows = len(df)
    valid_cols = []
    for col in df.columns:
        # nunique(dropna=False) equals len(unique()) and is computed once.
        n_distinct = df[col].nunique(dropna=False)
        if 1 < n_distinct < n_rows:
            valid_cols.append(col)
    return df[valid_cols].copy()

In [None]:
# Keep only the columns that are neither constant nor all-distinct
data_table_ = remove_unique_columns(data_table)

In [None]:
# (rows, columns) remaining after the column filter
data_table_.shape

In [None]:
# # for CSIC
# def get_number_from_string(x):
#     if type(x) is float or type(x) is int:
#         return x
#     else:
#         return float(x.split(': ')[-1])
# #get the number in a string
# data_table_['lenght'] = data_table_['lenght'].apply(get_number_from_string)

In [None]:
def fill_with_average(df):
    """Return a copy of ``df`` with NaNs in numeric columns replaced by the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the filling happens on a copy so
        re-running a cell never changes an already displayed frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every numeric column has its missing values
        replaced by that column's mean (computed ignoring NaN). Non-numeric
        columns are returned unchanged.
    """
    filled = df.copy()
    for column in filled.columns:
        series = filled[column]
        # Only touch numeric columns that actually contain missing values.
        if pd.api.types.is_numeric_dtype(series) and series.isna().any():
            filled[column] = series.fillna(series.mean())
    return filled

def fill_string_nan_with_none(df):
    """Return a copy of ``df`` with NaNs in text columns replaced by the string 'None'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the filling happens on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which missing values of object-dtype columns and of
        pandas string-dtype columns are replaced by the literal ``'None'``.
        Other columns are returned unchanged.
    """
    filled = df.copy()
    for column in filled.columns:
        series = filled[column]
        # Object dtype is the classic string container; StringDtype covers
        # columns stored with the dedicated pandas string dtype.
        is_text = (
            pd.api.types.is_object_dtype(series)
            or isinstance(series.dtype, pd.StringDtype)
        )
        if is_text:
            filled[column] = series.fillna('None')
    return filled

In [None]:
# Fill NaN: numeric columns get their column mean, object columns get the string 'None'
data_table_ = fill_with_average(data_table_)
data_table_ = fill_string_nan_with_none(data_table_)

In [None]:
def onehot_table(df, exclude_columns=None):
    """One-hot encode the object-dtype columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    exclude_columns : collection of column labels, optional
        Object-dtype columns listed here are left as they are (for example
        the target column).

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns in their original order, followed by the
        indicator columns, which are named ``<column>_is_<value>``.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    # Keep every object column unless the caller explicitly excluded it.
    encode_columns = [
        name for name in object_columns
        if not (exclude_columns and name in exclude_columns)
    ]
    indicators = pd.get_dummies(df[encode_columns], drop_first=False, prefix_sep='_is_')
    passthrough = df.drop(columns=encode_columns)
    return pd.concat([passthrough, indicators], axis=1)

In [None]:
# One-hot encode the object columns, excluding the target column
# (and, for CSIC, the time-series text columns)
#For CSIC dataset
#data_table_ = onehot_table(data_table_, exclude_columns=['Unnamed: 0', 'content', 'URL'])
#For Breast dataset
#data_table_ = onehot_table(data_table_, exclude_columns=['Status'])
#For company bankruptcy dataset
#data_table_ = onehot_table(data_table_, exclude_columns=['Bankrupt?'])
#For KDD dataset (active): 'label' is the target column and stays un-encoded
data_table_ = onehot_table(data_table_, exclude_columns=['label'])

In [None]:
# Column dtypes and non-null counts after encoding
data_table_.info()

### split train and test

In [None]:
# 50/50 train/test split; the fixed seed makes the split (and every metric
# reported from it) reproducible across kernel restarts.
train_table, test_table = train_test_split(data_table_, test_size=0.5, random_state=42)

### split the time series column

In [None]:
## for CSIC DATASET
# train_series,test_series = train_table[['content', 'URL']],test_table[['content', 'URL']]
# train_x = train_table.drop(columns=['Unnamed: 0', 'content', 'URL']).to_numpy()
# train_y = train_table['Unnamed: 0'].apply(lambda x: 1 if x=='Anomalous' else 0).to_numpy()
# test_x = test_table.drop(columns=['Unnamed: 0', 'content', 'URL']).to_numpy()
# test_y = test_table['Unnamed: 0'].apply(lambda x: 1 if x=='Anomalous' else 0).to_numpy()
## for breast dataset
# train_x = train_table.drop(columns=['Status']).to_numpy()
# train_y = train_table['Status'].apply(lambda x: 1 if x=='Alive' else 0).to_numpy()
# test_x = test_table.drop(columns=['Status']).to_numpy()
# test_y = test_table['Status'].apply(lambda x: 1 if x=='Alive' else 0).to_numpy()
# ## for company bankruptcy
# train_x = train_table.drop(columns=['Bankrupt?']).to_numpy()
# train_y = train_table['Bankrupt?'].to_numpy()
# test_x = test_table.drop(columns=['Bankrupt?']).to_numpy()
# test_y = test_table['Bankrupt?'].to_numpy()
## for glass
# mapping = {1:0, 2:1, 3:2, 5:3, 6:4, 7:5}
# train_x = train_table.drop(columns=['Type']).to_numpy()
# train_y = train_table['Type'].apply(lambda x: mapping[x]).to_numpy()
# test_x = test_table.drop(columns=['Type']).to_numpy()
# test_y = test_table['Type'].apply(lambda x: mapping[x]).to_numpy()
##for KDD
# Integer class id for each sampled label; the trailing comments give the attack category.
mapping = {
    'normal':0, 
    'back':1, #dos
    'satan':2, #probe
    'warezclient':3 #r2l
}
# Features: every column except 'label'; targets: 'label' translated through mapping.
# A label missing from mapping raises KeyError inside the lambda.
train_x = train_table.drop(columns=['label']).to_numpy()
train_y = train_table['label'].apply(lambda x: mapping[x]).to_numpy()
test_x = test_table.drop(columns=['label']).to_numpy()
test_y = test_table['label'].apply(lambda x: mapping[x]).to_numpy()

In [None]:
# Re-import base_tree so that edits to the module are picked up without
# restarting the kernel. `imp` was removed in Python 3.12; importlib.reload
# is the supported replacement.
import importlib
import base_tree
importlib.reload(base_tree)
from base_tree import DecisionTreeClassifier

In [None]:
# Summary statistics of data_table (before column filtering and encoding), all columns
data_table.describe(include='all')

In [None]:
# Largest value in the 'src_bytes' column
data_table['src_bytes'].max()

In [None]:
# Create a Decision Tree classifier
# DecisionTreeClassifier here is the project's base_tree class; scikit-learn's
# is imported under the alias SKDecisionTreeClassifier.
# Feature names are the table's columns minus the target; class names follow
# the key order of mapping, i.e. the order of the integer ids.
feature_names = train_table.drop(columns=['label']).columns
class_names = list(mapping.keys())
clf = DecisionTreeClassifier(
    criterion='gini', 
    random_state=42, 
    build_method='bfs',
    max_depth=7, 
    feature_name=feature_names,
    class_name=class_names
)
# Fit the classifier to the training data
clf.fit(train_x, train_y)
# Print the accuracy of the model
accuracy = clf.score(test_x, test_y)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Render the fitted KDD tree with the project's visual.print_tree helper
from visual import print_tree
print_tree(
    tree=clf,
    feature_names=feature_names,
    class_names=class_names,
    is_classifier=True,
    figsize = (16, 12),
    fontsize = 10,
    dpi=200
)

In [None]:
# Inspect the private _n_features attribute of the fitted classifier
# (presumably the number of input features — confirm in base_tree)
clf._n_features

In [None]:
# The notebook's import cell only brings in load_iris, so load_breast_cancer
# is imported here; without it this cell raises NameError on a fresh kernel.
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
# Fixed seed so the 50/50 split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data.data, data.target, test_size=0.5, random_state=42
)

In [None]:
# Summary statistics of the feature matrix, with columns labelled by feature name
pd.DataFrame(data['data'], columns=data.feature_names).describe(include='all')

In [None]:
# Look up the name of the feature at index 7
data.feature_names[7]

In [None]:
# Class names of the loaded dataset
data.target_names

In [None]:
# Re-import base_tree so that edits to the module are picked up without
# restarting the kernel. `imp` was removed in Python 3.12; importlib.reload
# is the supported replacement.
import importlib
import base_tree
importlib.reload(base_tree)
from base_tree import DecisionTreeClassifier

In [None]:
# Create a Decision Tree classifier
# DecisionTreeClassifier here is the project's base_tree class; build_method
# is not passed, so the class's own default applies.
clf = DecisionTreeClassifier(
    criterion='gini', 
    random_state=42, 
    max_depth=6,
    feature_name=data.feature_names,
    class_name=data.target_names
)
# Fit the classifier to the training data
clf.fit(train_x, train_y)
# Print the accuracy of the model
accuracy = clf.score(test_x, test_y)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# _, ax = plt.subplots(figsize = (16, 16), dpi=200)
# plot_tree(
#     clf,
#     feature_names=data.feature_names,
#     class_names=data.target_names,
#     filled=True,
#     ax = ax
# )

In [None]:
from visual import print_tree

In [None]:
# Render the fitted tree using the dataset's own feature and class names
print_tree(
    tree=clf,
    feature_names=data.feature_names,
    class_names=data.target_names,
    is_classifier=True,
    figsize = (16, 12),
    fontsize = 10,
    dpi=200
)

In [68]:
# Re-import prompt_adapter so that edits to the module are picked up without
# restarting the kernel. `imp` was removed in Python 3.12; importlib.reload
# is the supported replacement.
import importlib
import prompt_adapter
importlib.reload(prompt_adapter)
from prompt_adapter import _get_condition,_get_query,get_options
from prompt_adapter import get_explanation_prompt, get_explanation_result, get_selection_prompt, get_selection_result

In [51]:
# Call _get_query with a sample feature / range / threshold condition and
# print whatever it returns
query = _get_query(
    feature='Al Aluminum',
    val_range=(0.1, 0.5),
    det='<',
    val=0.25,
    condition={'possibility':97.3, 'result':'building_windows_non_float_processed'}
)
print(query)

Al Aluminum, range from 0.1 to 0.5, if it < 0.25, why these instances have 97.3% possibility to be building_windows_non_float_processed? 


In [57]:
# Inputs for an explanation prompt: dataset description, persona, the split
# being asked about, and the reasoning accumulated so far (premise).
desc='This is a Glass Identification Data Set from UCI. It contains 10 attributes with unit measurement expect RI that weight \
percent in corresponding oxide. The response is glass type containing building_windows_float_processed, \
building_windows_non_float_processed, vehicle_windows_float_processed, containers, tableware and headlamps.'
# NOTE(review): role is 'doctor' although desc describes a glass dataset — confirm this is intended
role='doctor'
# Same keys as the keyword arguments the notebook passes to _get_query
query={
    'feature':'Al Aluminum',
    'val_range':(0.1, 0.5),
    'det':'<',
    'val':0.25,
    'condition':{'possibility':97.3, 'result':'building_windows_non_float_processed'}
}
premise='Ba < 0.335, which indicates that the glass has a low barium content, suggesting it is less likely to be a type of glass \
that requires a high density or high refractive index, such as certain types of optical glass, and Mg > 2.78, which suggests \
that the glass may be designed for improved chemical durability and resistance to weathering. Magnesium is not typically present \
in high amounts in standard soda-lime glass, which is commonly used in windows. Those glass of above attributes are \
likely to be non-float processed.'
# Print the prompt text that get_explanation_prompt returns for these inputs
print(get_explanation_prompt(
    desc=desc,
    role=role,
    query=query,
    premise=premise
))


Question: The Iris flower data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper The use of multiple measurements in taxonomic problems. It is sometimes called Anderson's Iris data set because Edgar Anderson collected the data to quantify the morphologic variation of Iris flowers of three related species. The data set consists of 50 samples from each of three species of Iris (Iris Setosa, Iris virginica, and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Assume you are an botanist, please explain that petal width, range from 0.1 to 2.5 (cm), if it is less than 0.8, why these instances are most likely to be setosa ?
Your explanation:
Iris Setosa is typically distinguished by its smaller petal width, usually less than 0.8 cm, compared to Iris Versicolor and Iris Virginica. This characteristic likely results from evolutionary adaptations to

In [61]:
# Hand-written sample model response used to exercise get_explanation_result
response = 'Iris Setosa is typically distinguished by its smaller petal width, usually less than 0.8 cm, \
compared to Iris Versicolor and Iris Virginica. This characteristic likely results from evolutionary adaptations \
to specific ecological niches or pollination strategies. As such, a petal width below 0.8 cm is a strong indicator of \
Iris Setosa, aiding botanists in species classification.'
# The return value is unpacked into two items, named status and res here
status,res = get_explanation_result(response)
print(status, res)

True Iris Setosa is typically distinguished by its smaller petal width, usually less than 0.8 cm, compared to Iris Versicolor and Iris Virginica. This characteristic likely results from evolutionary adaptations to specific ecological niches or pollination strategies. As such, a petal width below 0.8 cm is a strong indicator of Iris Setosa, aiding botanists in species classification.


In [69]:
# Candidate split descriptions to be turned into lettered options
opts = [
    'RI refractive index, range from 1.51 to 1.53, if it is less than 1.517, if has 73% possibility to be building_windows_non_float_processed.',
    'Ca Calcium, range from 5.4 to 16.2, if it is greater than 8.235, it has 50% possibility to be building_windows_float_processed.',
    'Al Aluminum, range from 0.29 to 3.5, if it is greater than 1.42, it has 70% possibility to be building_windows_non_float_processed.',
    'K Potassium, range from 0.0 to 6.21, if it is less than 0.03, it has 70% possibility to be tableware.'
]
# NOTE(review): the prompt_adapter import cell brings in `get_options`, not
# `_get_options` — confirm which name the module exports; on a fresh kernel
# this call may raise NameError.
# The return value is unpacked into two items, named selection_dict and selection_text here
selection_dict,selection_text = _get_options(opts)
print(selection_dict)
print(selection_text)

{'A': 'RI refractive index, range from 1.51 to 1.53, if it is less than 1.517, if has 73% possibility to be building_windows_non_float_processed.', 'B': 'Ca Calcium, range from 5.4 to 16.2, if it is greater than 8.235, it has 50% possibility to be building_windows_float_processed.', 'C': 'Al Aluminum, range from 0.29 to 3.5, if it is greater than 1.42, it has 70% possibility to be building_windows_non_float_processed.', 'D': 'K Potassium, range from 0.0 to 6.21, if it is less than 0.03, it has 70% possibility to be tableware.'}
A. RI refractive index, range from 1.51 to 1.53, if it is less than 1.517, if has 73% possibility to be building_windows_non_float_processed.
B. Ca Calcium, range from 5.4 to 16.2, if it is greater than 8.235, it has 50% possibility to be building_windows_float_processed.
C. Al Aluminum, range from 0.29 to 3.5, if it is greater than 1.42, it has 70% possibility to be building_windows_non_float_processed.
D. K Potassium, range from 0.0 to 6.21, if it is less than

In [None]:
# Inputs for a selection prompt: dataset description, persona and the
# reasoning accumulated so far (premise). `query` is defined but not passed
# to get_selection_prompt in this cell.
desc='This is a Glass Identification Data Set from UCI. It contains 10 attributes with unit measurement expect RI that weight \
percent in corresponding oxide. The response is glass type containing building_windows_float_processed, \
building_windows_non_float_processed, vehicle_windows_float_processed, containers, tableware and headlamps.'
# NOTE(review): role is 'doctor' although desc describes a glass dataset — confirm this is intended
role='doctor'
query={
    'feature':'Al Aluminum',
    'val_range':(0.1, 0.5),
    'det':'<',
    'val':0.25,
    'condition':{'possibility':97.3, 'result':'building_windows_non_float_processed'}
}
premise='Ba < 0.335, which indicates that the glass has a low barium content, suggesting it is less likely to be a type of glass \
that requires a high density or high refractive index, such as certain types of optical glass, and Mg > 2.78, which suggests \
that the glass may be designed for improved chemical durability and resistance to weathering. Magnesium is not typically present \
in high amounts in standard soda-lime glass, which is commonly used in windows. Those glass of above attributes are \
likely to be non-float processed.'
# `opts` comes from an earlier cell.
# NOTE(review): elsewhere in this notebook _get_options(opts) is unpacked into
# two values, so a 2-item result is passed as `options` here — confirm that
# get_selection_prompt expects that, and that `_get_options` is the exported name.
print(get_selection_prompt(
    desc=desc,
    role=role,
    options=_get_options(opts),
    premise=premise
))