# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

# ID3 Algorithm

## Importing libraries

In [1]:
from collections import Counter
from typing import Tuple, Any

import pandas as pd
import numpy as np
import os

from pandas import Series, DataFrame

## Loading the dataset

In [2]:
def _load_folder(folder):
    """Read every file in ``folder`` with ``pd.read_csv`` and join them side by side.

    Parameters
    ----------
    folder : str
        Directory whose entries are all passed to ``pd.read_csv``.

    Returns
    -------
    (list, pd.DataFrame)
        The sorted directory listing and the combined frame. An empty folder
        yields an empty ``DataFrame``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the column
    # order of the combined frame reproducible across machines and runs.
    files = sorted(os.listdir(folder))
    frames = [pd.read_csv(os.path.join(folder, name)) for name in files]
    # Concatenate once instead of growing the frame inside the loop.
    # pd.concat raises on an empty list, so fall back to an empty frame.
    # NOTE(review): axis=1 places the files next to each other as extra columns.
    # If the files are row-wise shards of one table, axis=0 (with
    # ignore_index=True) is what is wanted -- confirm against the data layout.
    combined = pd.concat(frames, axis=1) if frames else pd.DataFrame()
    return files, combined


# Load test data
test_folder = 'data/test/'
test_files, test_df = _load_folder(test_folder)

# Load train data
train_folder = 'data/train/'
train_files, train_df = _load_folder(train_folder)

### ID3 Implementation

In [3]:
class Node:
    """A single node of a binary decision tree.

    A node created with ``value`` acts as a leaf; otherwise it describes a
    split (``feature``, ``threshold``) with a ``left`` and a ``right`` child.
    All attributes default to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description: which feature is tested and the cut-off it is compared to.
        self.feature, self.threshold = feature, threshold
        # Sub-trees on either side of the split.
        self.left, self.right = left, right
        # Value stored at a leaf node.
        self.value = value

In [4]:
def entropy(y):
    """Calculate the Shannon entropy (in bits) of a label array.

    Parameters
    ----------
    y : array-like
        Class labels. Any label type that ``np.unique`` can handle is accepted
        (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each label
        present in ``y``; ``0.0`` for an empty input.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        # No samples means no uncertainty; also avoids a 0/0 division below.
        return 0.0

    # np.unique counts only labels that actually occur, so every p is > 0 and,
    # unlike np.bincount, labels need not be non-negative integers.
    _, counts = np.unique(labels, return_counts=True)
    ps = counts / counts.sum()
    # "+ 0.0" normalises the -0.0 produced for a single-class input to 0.0.
    return float(-np.sum(ps * np.log2(ps))) + 0.0


In [5]:
def information_gain(y, X_column, threshold):
    """Entropy reduction obtained by splitting ``y`` on ``X_column <= threshold``.

    Samples whose feature value is at most ``threshold`` go to the left child,
    all others to the right child. Returns 0 when either child would be empty,
    otherwise the parent entropy minus the size-weighted entropy of the children.
    """
    entropy_before = entropy(y)

    # Partition the labels according to the threshold test
    goes_left = X_column <= threshold
    y_left, y_right = y[goes_left], y[~goes_left]
    count_left, count_right = len(y_left), len(y_right)

    # A split that leaves one side empty separates nothing
    if count_left == 0 or count_right == 0:
        return 0

    # Size-weighted average of the two child entropies
    total = len(y)
    entropy_after = (count_left / total) * entropy(y_left) + (count_right / total) * entropy(y_right)

    return entropy_before - entropy_after

In [6]:
def find_best_split(X, y):
    """Find the (feature, threshold) pair with the highest information gain.

    Parameters
    ----------
    X : np.ndarray
        2-D array indexed as ``X[:, feature]``.
    y : np.ndarray
        Labels, one per row of ``X``.

    Returns
    -------
    (int, scalar) or (None, None)
        Column index and threshold of the best split ``X[:, feature] <= threshold``.
        ``(None, None)`` is returned when no feature has at least two distinct
        values, i.e. when no split can separate the samples.
    """
    best_gain = -1
    best_feature = None
    best_threshold = None

    for feature in range(X.shape[1]):
        column = X[:, feature]
        # np.unique returns the values sorted. Using the largest value as the
        # threshold sends every sample to the left child, which is never a real
        # split, so it is skipped. Every remaining threshold leaves at least one
        # sample on each side.
        for threshold in np.unique(column)[:-1]:
            gain = information_gain(y, column, threshold)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold

In [7]:
def _majority_leaf(y):
    """Return a leaf ``Node`` predicting the most frequent label in ``y``."""
    return Node(value=Counter(y).most_common(1)[0][0])


def build_tree(X, y, feature_names, max_depth=10, min_samples_split=2, min_impurity_decrease=0.0, depth=0):
    """Recursively build the decision tree with pre-pruning.

    Parameters
    ----------
    X : array-like
        2-D feature matrix; converted with ``np.asarray`` so a DataFrame is accepted.
    y : array-like
        Labels, one per row of ``X``. Must not be empty (the majority vote of a
        leaf has nothing to count).
    feature_names : sequence
        ``feature_names[i]`` is stored in the node that splits on column ``i``.
    max_depth : int
        A node at this depth becomes a leaf.
    min_samples_split : int
        A node with fewer samples becomes a leaf.
    min_impurity_decrease : float
        A split whose information gain is below this value is replaced by a leaf.
    depth : int
        Depth of the current node; callers normally leave it at 0.

    Returns
    -------
    Node
        Root of the (sub)tree.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    # Stopping criteria
    if depth >= max_depth or n_samples < min_samples_split or n_classes == 1:
        return _majority_leaf(y)

    # Find best split
    best_feature, best_threshold = find_best_split(X, y)

    # No usable split was found
    if best_feature is None:
        return _majority_leaf(y)

    left_mask = X[:, best_feature] <= best_threshold
    right_mask = ~left_mask

    # A split that puts every sample on one side (e.g. all features constant
    # while the labels differ) would recurse into an empty child, where the
    # majority vote fails with IndexError. Stop here instead.
    if not left_mask.any() or not right_mask.any():
        return _majority_leaf(y)

    # Prune if the impurity decrease is less than the threshold
    impurity_decrease = information_gain(y, X[:, best_feature], best_threshold)
    if impurity_decrease < min_impurity_decrease:
        return _majority_leaf(y)

    # Create child splits
    left = build_tree(X[left_mask], y[left_mask], feature_names, max_depth, min_samples_split, min_impurity_decrease, depth + 1)
    right = build_tree(X[right_mask], y[right_mask], feature_names, max_depth, min_samples_split, min_impurity_decrease, depth + 1)

    return Node(feature=feature_names[best_feature],
                threshold=best_threshold,
                left=left,
                right=right)


In [8]:
def predict_sample(node, x):
    """Walk the tree from ``node`` and return the prediction for one sample.

    At each internal node the sample's entry ``x[node.feature]`` is compared
    with ``node.threshold``: values less than or equal to it follow the left
    child, everything else the right child. The first node whose ``value`` is
    not ``None`` supplies the result.
    """
    current = node
    while current.value is None:
        goes_left = x[current.feature] <= current.threshold
        current = current.left if goes_left else current.right
    return current.value



In [9]:
def predict(tree, X, feature_names=None):
    """Predict multiple samples.

    Each sample is looked up with ``sample[node.feature]``, so the samples must
    be indexable by whatever identifiers were stored in the tree.

    Parameters
    ----------
    tree : Node
        Root of a fitted tree.
    X : pd.DataFrame or iterable of samples
        * DataFrame: rows are predicted, keyed by column label (or by
          ``feature_names`` if given). Iterating a DataFrame directly would
          yield its column labels rather than its rows.
        * any other iterable: each element is one sample. It is passed through
          unchanged unless ``feature_names`` is given.
    feature_names : sequence, optional
        Names of the columns of ``X`` in order. When given, every row is turned
        into a ``{name: value}`` mapping so a tree that stores feature names can
        be applied to a plain 2-D array.

    Returns
    -------
    np.ndarray
        One prediction per sample.
    """
    if isinstance(X, pd.DataFrame):
        if feature_names is None:
            feature_names = X.columns.tolist()
        X = X.to_numpy()

    if feature_names is not None:
        samples = [dict(zip(feature_names, row)) for row in X]
    else:
        samples = X

    return np.array([predict_sample(tree, sample) for sample in samples])


In [10]:
# Text rendering of the tree (for visualization)
def print_tree(node, depth=0):
    """Print the subtree rooted at ``node``, indenting two spaces per level.

    A node with a value prints one ``Predict: ...`` line. Any other node prints
    its split test followed by its left and then its right subtree, one level
    deeper.
    """
    indent = '  ' * depth
    if node.value is None:
        print(indent + f'Feature {node.feature} <= {node.threshold}')
        for child in (node.left, node.right):
            print_tree(child, depth + 1)
    else:
        print(indent + f'Predict: {node.value}')

In [11]:
# Pre-processing the data
# Use a reproducible 20% sample of the loaded data as the training set
train_set = train_df.sample(frac=0.2, random_state=42)

# Splitting the data into X and y
X = train_set.drop('attack_cat', axis=1)
y = train_set['attack_cat']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Impute missing values
# For numerical columns, we will impute missing values with the mean
for col in numerical_cols:
    mean_value = X[col].mean()
    X[col] = X[col].fillna(mean_value)

# For categorical columns, we will impute missing values with the mode (most frequent value)
for col in categorical_cols:
    modes = X[col].mode()
    # mode() is empty when the whole column is missing; there is nothing to
    # impute with, so the column is left as it is instead of failing on [0].
    if not modes.empty:
        mode_value = modes[0]
        X[col] = X[col].fillna(mode_value)

# Encode categorical variables with one-hot encoding
X = pd.get_dummies(X)

# Scaling the numerical columns (standardization: zero mean, unit variance)
for col in numerical_cols:
    col_mean = X[col].mean()
    col_std = X[col].std()
    if col_std > 0:
        X[col] = (X[col] - col_mean) / col_std
    else:
        # Constant (or single-row) column: dividing by a zero/NaN standard
        # deviation would turn the whole column into NaN, so only center it.
        X[col] = X[col] - col_mean

# Feature names must be taken AFTER one-hot encoding: get_dummies replaces each
# categorical column with one column per category, so the names of the raw
# columns no longer line up with the column positions of X.
feature_names = X.columns.tolist()

# Encode the target variable as integer class codes (position of the label's
# dummy column).
# NOTE(review): a missing 'attack_cat' produces an all-zero dummy row, which
# argmax maps to class 0 -- confirm the target has no missing values.
y = pd.get_dummies(y)
y = y.values.argmax(axis=1)
y = np.array(y, dtype=int)


In [14]:
# Create a small dataset using NumPy
np.random.seed(42)  # For reproducibility
X = np.random.rand(10, 3)  # 10 samples, 3 features
y = np.random.randint(0, 2, size=10)  # Binary target variable

# Names of the feature columns, in the same order as the columns of X
feature_cols = ['feature1', 'feature2', 'feature3']

# Convert to pandas DataFrame
df = pd.DataFrame(X, columns=feature_cols)
df['target'] = y

# Display the DataFrame
print("DataFrame:")
print(df)

# Basic operations
# Calculate mean of each feature
feature_means = df[feature_cols].mean()
print("\nFeature Means:")
print(feature_means)

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing Values:")
print(missing_values)

# Encode target variable
df['target_encoded'] = df['target'].astype(int)

# Display the updated DataFrame
print("\nUpdated DataFrame with Encoded Target:")
print(df)

# Pass only the feature names: df.columns also contains 'target' and
# 'target_encoded', which are not columns of X.
decision_tree = build_tree(X, y, feature_cols)

# Print the tree structure
print_tree(decision_tree)

DataFrame:
   feature1  feature2  feature3  target
0  0.374540  0.950714  0.731994       0
1  0.598658  0.156019  0.155995       0
2  0.058084  0.866176  0.601115       0
3  0.708073  0.020584  0.969910       0
4  0.832443  0.212339  0.181825       0
5  0.183405  0.304242  0.524756       1
6  0.431945  0.291229  0.611853       1
7  0.139494  0.292145  0.366362       0
8  0.456070  0.785176  0.199674       1
9  0.514234  0.592415  0.046450       1

Feature Means:
feature1    0.429695
feature2    0.447104
feature3    0.438993
dtype: float64

Missing Values:
feature1    0
feature2    0
feature3    0
target      0
dtype: int64

Updated DataFrame with Encoded Target:
   feature1  feature2  feature3  target  target_encoded
0  0.374540  0.950714  0.731994       0               0
1  0.598658  0.156019  0.155995       0               0
2  0.058084  0.866176  0.601115       0               0
3  0.708073  0.020584  0.969910       0               0
4  0.832443  0.212339  0.181825       0          

In [81]:

# Print the tree structure
print_tree(decision_tree)

# Make predictions
# X is a NumPy array here (it has no .values), and the tree stores feature
# names, so each sample must be addressable by name: use one
# {feature name: value} record per row of the toy DataFrame.
samples = df[['feature1', 'feature2', 'feature3']].to_dict('records')
y_pred = predict(decision_tree, samples)

# Calculate accuracy
accuracy = np.mean(y_pred == y)
print(f"Accuracy: {accuracy:.4f}")


Predicted class: None
