In [None]:
import csv
import math
from collections import Counter
from sklearn.model_selection import KFold

# Load and preprocess the data from CSV
file_path = 'Book1.csv'
data = []

# newline='' lets the csv module handle line endings (and quoted embedded
# newlines) itself, as the csv documentation recommends.
with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file)
    headers = next(reader, None)  # Header row; None when the file is empty
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; skipping them avoids
            # empty records that would break later row[-1] lookups.
            continue
        processed_row = []
        for value in row:
            if value.isdigit():
                # Plain non-negative integer such as "42"
                processed_row.append(int(value))
            elif value.count('.') == 1 and value.replace('.', '').isdigit():
                # Decimal such as "3.14". Requiring exactly one dot keeps
                # dotted strings like "1.2.3" from being parsed as numbers.
                processed_row.append(float(value))
            else:
                # Anything else is kept as the original string
                processed_row.append(value)
        data.append(processed_row)

# Gini index calculation
def gini_index(data):
    """Compute the Gini impurity of the class labels in ``data``.

    The class label of each row is its last element. The result is
    ``1 - sum(p ** 2)`` over the proportion ``p`` of every distinct label.
    With no rows the sum is empty, so 1.0 is returned.
    """
    n_rows = len(data)
    label_tally = Counter(record[-1] for record in data)
    return 1.0 - sum((tally / n_rows) ** 2 for tally in label_tally.values())

# Attribute Gini index calculation
def attribute_gini_index(data, attribute_index):
    """Return the weighted Gini impurity of splitting ``data`` on one attribute.

    Rows are grouped by their value at ``attribute_index``; each group's
    impurity (from ``gini_index``) is weighted by the share of rows it holds.
    Returns 0.0 when ``data`` is empty.
    """
    n_rows = len(data)

    # Bucket the rows by the value they carry for the chosen attribute.
    partitions = {}
    for record in data:
        partitions.setdefault(record[attribute_index], []).append(record)

    weighted_impurity = 0.0
    for group in partitions.values():
        share = len(group) / n_rows
        weighted_impurity += share * gini_index(group)
    return weighted_impurity

# Build decision tree
def build_decision_tree(data, attributes):
    """Rank attributes by weighted Gini index and build one node per attribute.

    Args:
        data: list of rows; the last element of each row is the class label.
        attributes: iterable of column indices to consider. It is not
            modified (a private copy is consumed instead).

    Returns:
        dict mapping ``'Attribute_<index>'`` to ``{value: outcome}``, with
        nodes inserted from lowest to highest weighted Gini index. ``outcome``
        is the class label when every row with that value shares one label,
        otherwise the placeholder string ``"Further Splitting Required"``.

    Side effects:
        Prints the Gini index of every remaining attribute in each round and
        the attribute selected in that round.
    """
    # Work on a copy so the caller's attribute list is left intact.
    remaining = list(attributes)
    # `data` never changes between rounds, so each score is computed once.
    gini_by_attribute = {
        attribute: attribute_gini_index(data, attribute) for attribute in remaining
    }
    tree = {}

    while remaining:
        best_attribute_gini = float('inf')
        best_attribute = None

        for attribute in remaining:
            gini = gini_by_attribute[attribute]
            print(f"Gini Index for Attribute {attribute}: {gini:.3f}")
            # Strict '<' keeps the earliest attribute on ties.
            if gini < best_attribute_gini:
                best_attribute_gini = gini
                best_attribute = attribute

        if best_attribute is None:
            break

        print(f"\nSelected Attribute {best_attribute} with Weighted Gini Index = {best_attribute_gini:.3f}")
        print("-"*100)
        remaining.remove(best_attribute)

        # Group the rows by attribute value in a single pass instead of
        # rescanning the whole data set once per distinct value.
        rows_by_value = {}
        for row in data:
            rows_by_value.setdefault(row[best_attribute], []).append(row)

        branches = {}
        for value, subset in rows_by_value.items():
            class_labels = [row[-1] for row in subset]
            if len(set(class_labels)) == 1:
                branches[value] = class_labels[0]
            else:
                branches[value] = "Further Splitting Required"
        tree[f'Attribute_{best_attribute}'] = branches

    return tree

# Prediction function
def predict(tree, sample):
    """Predict the class label of ``sample`` from ``tree``.

    Args:
        tree: dict mapping ``'Attribute_<index>'`` to ``{value: outcome}``.
            An outcome is a class label, a nested tree (dict), or the
            placeholder string ``"Further Splitting Required"``.
        sample: row indexable by attribute index.

    Returns:
        The label from the first node (in insertion order) that resolves the
        sample, or None when no node does.
    """
    for node, branches in tree.items():
        attribute = int(node.split('_')[1])
        value = sample[attribute]
        if value not in branches:
            # Value never seen for this attribute; try the next node.
            continue
        result = branches[value]
        if isinstance(result, dict):
            return predict(result, sample)
        if result == "Further Splitting Required":
            # The placeholder marks an impure branch and is not a class
            # label, so fall through to the next attribute instead of
            # returning it as a prediction.
            continue
        return result
    return None

# Perform cross-validation
kf = KFold(n_splits=5)
fold = 1
accuracies = []

for train_idx, test_idx in kf.split(data):
    # Materialise the rows selected for this fold.
    train_data = [data[i] for i in train_idx]
    test_data = [data[i] for i in test_idx]

    print(f"\n--- Fold {fold} ---")
    # Every column except the last (the class label) is a candidate attribute.
    attributes = list(range(len(data[0]) - 1))
    tree = build_decision_tree(train_data, attributes)

    # Count the test rows whose prediction equals the actual label.
    correct_predictions = sum(
        1 for sample in test_data if predict(tree, sample) == sample[-1]
    )

    accuracy = correct_predictions / len(test_data)
    accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.3f}")
    print("*"*100)

    fold += 1

# Calculate and print overall confidence
mean_accuracy = sum(accuracies) / len(accuracies)
print(f"\nOverall Confidence (Mean Accuracy): {mean_accuracy:.3f}")

FileNotFoundError: [Errno 2] No such file or directory: 'Book1.csv'

In [None]:
# Import necessary libraries
import pandas as pd

# Step 1: Upload the CSV file
from google.colab import files

print("Please upload your CSV file:")
uploaded = files.upload()

# The first uploaded entry is taken as the input file.
input_file = list(uploaded)[0]
output_file = 'DT_processed.csv'

# Read the uploaded CSV file.
df = pd.read_csv(input_file)

# Reduce each URL to its host part: strip a leading http:// or https://,
# then keep only the text before the first '/'.
url_without_scheme = df['input(url)'].str.replace(r'^https?://', '', regex=True)
df['input(url)'] = url_without_scheme.str.split('/').str[0]

# Build "<host> (<probe column value>)" as a single combined attribute.
probe_column = 'probe_asn (probe_network_name)'
df['combined'] = df['input(url)'] + ' (' + df[probe_column] + ')'

# The probe column is now part of 'combined', so drop the original.
df = df.drop(columns=[probe_column])

# Keep only the required columns, in this order.
columns_to_keep = [
    'combined',
    'measurement_start_time(UTC)',
    'resolver_asn (resolver_ip)',
    'resolver_network_name',
    'blocking_type',
]
df = df[columns_to_keep]

# Save the processed data and show a preview.
df.to_csv(output_file, index=False)
print(df.head())

print(f"Processed file saved as {output_file}")

Please upload your CSV file:


Saving Book1csv.csv to Book1csv.csv
                                            combined  \
0      www.weedy.be (AS44020 (Modern Solutions LTD))   
1   www.xvideos.com (AS44020 (Modern Solutions LTD))   
2      xhamster.com (AS44020 (Modern Solutions LTD))   
3  xn--80aaifmgl1achx.xn--p1ai (AS44020 (Modern S...   
4     znakomstva.ru (AS44020 (Modern Solutions LTD))   

  measurement_start_time(UTC) resolver_asn (resolver_ip)  \
0            01-10-2022 00:00    AS15169 (74.125.46.133)   
1            01-10-2022 00:00    AS15169 (74.125.46.133)   
2            01-10-2022 00:00    AS15169 (74.125.46.133)   
3            01-10-2022 00:00    AS15169 (74.125.46.133)   
4            01-10-2022 00:00    AS15169 (74.125.46.133)   

  resolver_network_name blocking_type  
0            Google LLC           dns  
1            Google LLC         FALSE  
2            Google LLC         FALSE  
3            Google LLC           dns  
4            Google LLC         FALSE  
Processed file saved as DT

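2 classes: decision tree on the processed data with `blocking_type` collapsed to two labels (`false` / `true`)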

In [None]:
import csv
import math
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.model_selection import KFold

# Load and preprocess the data from CSV
file_path = 'DT_processed.csv'
MAX_ROWS = 40000  # Cap on the number of rows used, for quicker processing
data = []
data_ = []

# newline='' lets the csv module handle line endings (and quoted embedded
# newlines) itself, as the csv documentation recommends.
with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file)
    headers = next(reader, None)  # Header row; None when the file is empty
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; skipping them avoids
            # empty records that would break later row[-1] lookups.
            continue
        processed_row = []
        label_index = len(row) - 1  # Last column is assumed to be the class label
        for i, value in enumerate(row):
            if value.isdigit():
                processed_row.append(int(value))  # Convert to integer
            elif value.count('.') == 1 and value.replace('.', '').isdigit():
                # Exactly one dot is required: values such as "1.2.3" or a
                # bare IP address would otherwise make float() raise.
                processed_row.append(float(value))  # Convert to float
            elif i == label_index:
                # Collapse the class label to two classes: 'false' stays
                # 'false', every other non-numeric value becomes 'true'.
                is_false = value.strip().lower() == 'false'
                processed_row.append('false' if is_false else 'true')
            else:
                processed_row.append(value.strip().lower())  # Normalize other values
        data_.append(processed_row)

# Limiting the dataset size for quicker processing
data = data_[:MAX_ROWS]

# Function for Gini index calculation
def gini_index(data):
    """Return the Gini impurity of ``data``, whose rows end with a class label.

    Computed as one minus the sum of squared label proportions. An empty
    ``data`` yields 1.0 because there is nothing to subtract.
    """
    size = len(data)
    occurrences_by_label = Counter([row[-1] for row in data])
    squared_shares = (
        (occurrences / size) ** 2 for occurrences in occurrences_by_label.values()
    )
    return 1.0 - sum(squared_shares)

# Function for Attribute Gini index calculation
def attribute_gini_index(data, attribute_index):
    """Weighted Gini impurity obtained by splitting ``data`` on one column.

    Each distinct value at ``attribute_index`` forms a group; the impurity of
    every group (via ``gini_index``) is weighted by the group's share of the
    rows and the weighted values are added up. Empty ``data`` gives 0.0.
    """
    row_count = len(data)

    # value -> rows carrying that value, in first-seen order
    groups = {}
    for row in data:
        key = row[attribute_index]
        bucket = groups.get(key)
        if bucket is None:
            bucket = groups[key] = []
        bucket.append(row)

    total = 0.0
    for members in groups.values():
        weight = len(members) / row_count
        total += weight * gini_index(members)
    return total

# Function to build decision tree
def build_decision_tree(data, attributes):
    """Rank attributes by weighted Gini index and build one node per attribute.

    Args:
        data: list of rows; the last element of each row is the class label.
        attributes: iterable of column indices to consider. It is not
            modified (a private copy is consumed instead).

    Returns:
        dict mapping ``'Attribute_<index>'`` to ``{value: outcome}``, with
        nodes inserted from lowest to highest weighted Gini index. ``outcome``
        is the class label when every row with that value shares one label,
        otherwise the list of rows having that value.

    Side effects:
        Prints the Gini index of every remaining attribute in each round and
        the attribute selected in that round.
    """
    # Work on a copy so the caller's attribute list is left intact.
    remaining = list(attributes)
    # `data` never changes between rounds, so each score is computed once.
    gini_by_attribute = {
        attribute: attribute_gini_index(data, attribute) for attribute in remaining
    }
    tree = {}

    while remaining:
        best_attribute_gini = float('inf')
        best_attribute = None

        for attribute in remaining:
            gini = gini_by_attribute[attribute]
            print(f"Gini Index for Attribute {attribute}: {gini:.3f}")
            # Strict '<' keeps the earliest attribute on ties.
            if gini < best_attribute_gini:
                best_attribute_gini = gini
                best_attribute = attribute

        if best_attribute is None:
            break

        print(f"\nSelected Attribute {best_attribute} with Weighted Gini Index = {best_attribute_gini:.3f}")
        print("-" * 100)
        remaining.remove(best_attribute)

        # Group the rows by attribute value in a single pass instead of
        # rescanning the whole data set once per distinct value.
        rows_by_value = {}
        for row in data:
            rows_by_value.setdefault(row[best_attribute], []).append(row)

        branches = {}
        for value, subset in rows_by_value.items():
            class_labels = [row[-1] for row in subset]
            if len(set(class_labels)) == 1:
                branches[value] = class_labels[0]
            else:
                # Impure branch: keep the rows so predict() can take a vote.
                branches[value] = subset
        tree[f'Attribute_{best_attribute}'] = branches

    return tree

# Prediction function for test data
def predict(tree, sample):
    """Predict the class label of ``sample`` from ``tree``.

    Args:
        tree: dict mapping ``'Attribute_<index>'`` to ``{value: outcome}``.
            An outcome is a class label, a nested tree (dict), or a list of
            training rows whose last element is the class label.
        sample: row indexable by attribute index.

    Returns:
        The label from the first node (in insertion order) that knows the
        sample's value; for a list outcome, the most frequent label among its
        rows (the label seen first wins ties). ``"unknown"`` when no node
        matches.
    """
    for node, branches in tree.items():
        attribute = int(node.split('_')[1])
        value = sample[attribute]
        if value not in branches:
            # Value never seen for this attribute; try the next node.
            continue
        result = branches[value]
        if isinstance(result, dict):
            return predict(result, sample)
        if isinstance(result, list):
            if not result:
                continue  # No rows to vote with; try the next node.
            # One counting pass; Counter keeps first-seen order, so ties are
            # resolved deterministically rather than by set iteration order.
            return Counter(row[-1] for row in result).most_common(1)[0][0]
        return result
    return "unknown"

# Perform cross-validation to find the accuracy between test and training data
# KFold does not shuffle by default, so each fold is a contiguous slice of `data`.
# NOTE(review): confirm that is intended if the rows are ordered (e.g. by time).
kf = KFold(n_splits=5)
fold = 1
accuracies = []
all_actual = []
all_predicted = []
class_labels = set(row[-1] for row in data)  # Unique class labels

for train_index, test_index in kf.split(data):
    train_data = [data[i] for i in train_index]
    test_data = [data[i] for i in test_index]

    print(f"\n--- Fold {fold} ---")
    # Every column except the last (the class label) is a candidate attribute.
    attributes = list(range(len(data[0]) - 1))
    tree = build_decision_tree(train_data, attributes)

    correct_predictions = 0
    fold_actual = []
    fold_predicted = []

    for sample in test_data:
        prediction = predict(tree, sample)
        fold_actual.append(sample[-1])
        fold_predicted.append(prediction)
        if prediction == sample[-1]:
            correct_predictions += 1

    all_actual.extend(fold_actual)
    all_predicted.extend(fold_predicted)

    accuracy = correct_predictions / len(test_data)
    accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.3f}")
    print("*" * 100)
    fold += 1

# Build confusion matrix to understand the accuracy
confusion_matrix = defaultdict(lambda: defaultdict(int))
for actual, predicted in zip(all_actual, all_predicted):
    confusion_matrix[actual][predicted] += 1

# Rows are the actual labels. Columns are the same labels followed by any
# predicted value that is not a real class (predict() can return "unknown"),
# so those predictions are shown instead of being dropped from the table.
row_labels = sorted(class_labels)
column_labels = row_labels + sorted(set(all_predicted) - class_labels)

print("\nConfusion Matrix:")
print(f"{'':>15}", end="")
for predicted_label in column_labels:
    print(f"{predicted_label:>15}", end="")
print()

for actual_label in row_labels:
    print(f"{actual_label:>15}", end="")
    for predicted_label in column_labels:
        print(f"{confusion_matrix[actual_label][predicted_label]:>15}", end="")
    print()

# Calculating overall confidence
mean_accuracy = sum(accuracies) / len(accuracies)
print(f"\nOverall Confidence (Mean Accuracy): {mean_accuracy:.3f}")



--- Fold 1 ---
Gini Index for Attribute 0: 0.010
Gini Index for Attribute 1: 0.262
Gini Index for Attribute 2: 0.264
Gini Index for Attribute 3: 0.270

Selected Attribute 0 with Weighted Gini Index = 0.010
----------------------------------------------------------------------------------------------------
Gini Index for Attribute 1: 0.262
Gini Index for Attribute 2: 0.264
Gini Index for Attribute 3: 0.270

Selected Attribute 1 with Weighted Gini Index = 0.262
----------------------------------------------------------------------------------------------------
Gini Index for Attribute 2: 0.264
Gini Index for Attribute 3: 0.270

Selected Attribute 2 with Weighted Gini Index = 0.264
----------------------------------------------------------------------------------------------------
Gini Index for Attribute 3: 0.270

Selected Attribute 3 with Weighted Gini Index = 0.270
----------------------------------------------------------------------------------------------------
Fold 1 Accuracy: 0.8

4 classes: the same decision tree, keeping the original (lower-cased) `blocking_type` values as class labels

In [None]:
import csv
import math
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.model_selection import KFold

# Load and preprocess the data from CSV
file_path = 'DT_processed.csv'
data = []

# newline='' lets the csv module handle line endings (and quoted embedded
# newlines) itself, as the csv documentation recommends.
with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file)
    headers = next(reader, None)  # Header row; None when the file is empty
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; skipping them avoids
            # empty records that would break later row[-1] lookups.
            continue
        processed_row = []
        for value in row:
            if value.isdigit():
                processed_row.append(int(value))
            elif value.count('.') == 1 and value.replace('.', '').isdigit():
                # Decimal such as "3.14". Requiring exactly one dot keeps
                # dotted strings like "1.2.3" from being parsed as numbers.
                processed_row.append(float(value))
            else:
                # Normalize text, including the class labels in the last column
                processed_row.append(value.strip().lower())
        data.append(processed_row)

# Gini index calculation
def gini_index(data):
    """Gini impurity of the labels stored in the last column of ``data``.

    Equals ``1 - sum(share ** 2)`` over the share of rows per distinct label;
    for empty ``data`` no share exists and the result is 1.0.
    """
    # Tally how many rows carry each label, in first-seen order.
    tally = Counter()
    for row in data:
        tally[row[-1]] += 1

    total = len(data)
    return 1.0 - sum((hits / total) ** 2 for hits in tally.values())

# Attribute Gini index calculation
def attribute_gini_index(data, attribute_index):
    """Return the size-weighted Gini impurity of a split on ``attribute_index``.

    Rows sharing the same value in that column form one subset; the result is
    the sum over subsets of ``(subset size / total rows) * gini_index(subset)``.
    Returns 0.0 for empty ``data``.
    """
    # Collect the rows of every distinct attribute value.
    subsets_by_value = {}
    for entry in data:
        try:
            subsets_by_value[entry[attribute_index]].append(entry)
        except KeyError:
            subsets_by_value[entry[attribute_index]] = [entry]

    n_total = len(data)
    result = 0.0
    for rows in subsets_by_value.values():
        fraction = len(rows) / n_total
        result += fraction * gini_index(rows)

    return result

# Build decision tree
def build_decision_tree(data, attributes):
    """Rank attributes by weighted Gini index and build one node per attribute.

    Args:
        data: list of rows; the last element of each row is the class label.
        attributes: iterable of column indices to consider. It is not
            modified (a private copy is consumed instead).

    Returns:
        dict mapping ``'Attribute_<index>'`` to ``{value: outcome}``, with
        nodes inserted from lowest to highest weighted Gini index. ``outcome``
        is the class label when every row with that value shares one label,
        otherwise the list of rows having that value.

    Side effects:
        Prints the Gini index of every remaining attribute in each round and
        the attribute selected in that round.
    """
    # Work on a copy so the caller's attribute list is left intact.
    remaining = list(attributes)
    # `data` never changes between rounds, so each score is computed once.
    gini_by_attribute = {
        attribute: attribute_gini_index(data, attribute) for attribute in remaining
    }
    tree = {}

    while remaining:
        best_attribute_gini = float('inf')
        best_attribute = None

        for attribute in remaining:
            gini = gini_by_attribute[attribute]
            print(f"Gini Index for Attribute {attribute}: {gini:.3f}")
            # Strict '<' keeps the earliest attribute on ties.
            if gini < best_attribute_gini:
                best_attribute_gini = gini
                best_attribute = attribute

        if best_attribute is None:
            break

        print(f"\nSelected Attribute {best_attribute} with Weighted Gini Index = {best_attribute_gini:.3f}")
        print("-" * 100)
        remaining.remove(best_attribute)

        # Group the rows by attribute value in a single pass instead of
        # rescanning the whole data set once per distinct value.
        rows_by_value = {}
        for row in data:
            rows_by_value.setdefault(row[best_attribute], []).append(row)

        branches = {}
        for value, subset in rows_by_value.items():
            class_labels = [row[-1] for row in subset]
            if len(set(class_labels)) == 1:
                branches[value] = class_labels[0]
            else:
                branches[value] = subset  # Retain the rows so predict() can take a vote
        tree[f'Attribute_{best_attribute}'] = branches

    return tree

# Prediction function
def predict(tree, sample):
    """Predict the class label of ``sample`` from ``tree``.

    Args:
        tree: dict mapping ``'Attribute_<index>'`` to ``{value: outcome}``.
            An outcome is a class label, a nested tree (dict), or a list of
            training rows whose last element is the class label.
        sample: row indexable by attribute index.

    Returns:
        The label from the first node (in insertion order) that knows the
        sample's value; for a list outcome, the most frequent label among its
        rows (the label seen first wins ties). ``"unknown"`` when no node
        matches.
    """
    for node, branches in tree.items():
        attribute = int(node.split('_')[1])
        value = sample[attribute]
        if value not in branches:
            # Value never seen for this attribute; try the next node.
            continue
        result = branches[value]
        if isinstance(result, dict):
            return predict(result, sample)
        if isinstance(result, list):
            if not result:
                continue  # No rows to vote with; try the next node.
            # One counting pass; Counter keeps first-seen order, so ties are
            # resolved deterministically rather than by set iteration order.
            return Counter(row[-1] for row in result).most_common(1)[0][0]
        return result
    return "unknown"

# Perform cross-validation
# KFold does not shuffle by default, so each fold is a contiguous slice of `data`.
# NOTE(review): confirm that is intended if the rows are ordered (e.g. by time).
kf = KFold(n_splits=5)
fold = 1
accuracies = []
all_actual = []
all_predicted = []
class_labels = set(row[-1] for row in data)  # Unique class labels

for train_index, test_index in kf.split(data):
    train_data = [data[i] for i in train_index]
    test_data = [data[i] for i in test_index]

    print(f"\n--- Fold {fold} ---")
    # Every column except the last (the class label) is a candidate attribute.
    attributes = list(range(len(data[0]) - 1))
    tree = build_decision_tree(train_data, attributes)

    correct_predictions = 0
    fold_actual = []
    fold_predicted = []

    for sample in test_data:
        prediction = predict(tree, sample)
        fold_actual.append(sample[-1])
        fold_predicted.append(prediction)
        if prediction == sample[-1]:
            correct_predictions += 1

    all_actual.extend(fold_actual)
    all_predicted.extend(fold_predicted)

    accuracy = correct_predictions / len(test_data)
    accuracies.append(accuracy)
    print(f"Fold {fold} Accuracy: {accuracy:.3f}")
    print("*" * 100)
    fold += 1

# Build confusion matrix
confusion_matrix = defaultdict(lambda: defaultdict(int))
for actual, predicted in zip(all_actual, all_predicted):
    confusion_matrix[actual][predicted] += 1

# Rows are the actual labels. Columns are the same labels followed by any
# predicted value that is not a real class (predict() can return "unknown"),
# so those predictions are shown instead of being dropped from the table.
row_labels = sorted(class_labels)
column_labels = row_labels + sorted(set(all_predicted) - class_labels)

# Print confusion matrix
print("\nConfusion Matrix:")
print(f"{'':>15}", end="")
for predicted_label in column_labels:
    print(f"{predicted_label:>15}", end="")
print()

for actual_label in row_labels:
    print(f"{actual_label:>15}", end="")
    for predicted_label in column_labels:
        print(f"{confusion_matrix[actual_label][predicted_label]:>15}", end="")
    print()

# Calculate and print overall confidence
mean_accuracy = sum(accuracies) / len(accuracies)
print(f"\nOverall Confidence (Mean Accuracy): {mean_accuracy:.3f}")



--- Fold 1 ---
Gini Index for Attribute 0: 0.000
Gini Index for Attribute 1: 0.319
Gini Index for Attribute 2: 0.263
Gini Index for Attribute 3: 0.263

Selected Attribute 0 with Weighted Gini Index = 0.000
----------------------------------------------------------------------------------------------------
Gini Index for Attribute 1: 0.319
Gini Index for Attribute 2: 0.263
Gini Index for Attribute 3: 0.263

Selected Attribute 2 with Weighted Gini Index = 0.263
----------------------------------------------------------------------------------------------------
Gini Index for Attribute 1: 0.319
Gini Index for Attribute 3: 0.263

Selected Attribute 3 with Weighted Gini Index = 0.263
----------------------------------------------------------------------------------------------------
Gini Index for Attribute 1: 0.319

Selected Attribute 1 with Weighted Gini Index = 0.319
----------------------------------------------------------------------------------------------------
Fold 1 Accuracy: 0.7

In [None]:
# Converting the confusion matrix to a numpy array for visualization
# `class_labels` is a set, so fix one explicit (sorted) order and use it for
# the matrix rows, the matrix columns and both tick-label lists. This keeps
# the plot consistent with the printed confusion matrix and stable across runs.
ordered_labels = sorted(class_labels)
matrix = np.array([
    [confusion_matrix[actual][predicted] for predicted in ordered_labels]
    for actual in ordered_labels
])

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix Visualization')
plt.colorbar()

# Add labels to the heatmap
tick_marks = np.arange(len(ordered_labels))
plt.xticks(tick_marks, ordered_labels, rotation=45)
plt.yticks(tick_marks, ordered_labels)

# Adding text annotations; cells above half of the largest count get white
# text so the number stays readable on the darker background.
threshold = matrix.max() / 2.0
for i, row in enumerate(matrix):
    for j, val in enumerate(row):
        plt.text(j, i, f"{val}", ha="center", va="center",
                 color="white" if val > threshold else "black")

plt.ylabel('Actual Labels')
plt.xlabel('Predicted Labels')
plt.tight_layout()

# Display the visualization
plt.show()