# PART 1 -Decision Tree classification

In [15]:
import pandas as pd
import numpy as np
import time

In [16]:
import numpy as np

class Node:
    def __init__(self, data, target):
        self.data = data
        self.target = target
        self.left = None
        self.right = None
        self.feature_index = None
        self.threshold = None
        self.value = None

def entropy(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Adding a small epsilon to avoid log(0)

def information_gain(y, y_left, y_right):
    H_parent = entropy(y)
    H_left = entropy(y_left)
    H_right = entropy(y_right)

    p_left = len(y_left) / len(y)
    p_right = len(y_right) / len(y)

    IG = H_parent - (p_left * H_left + p_right * H_right)
    return IG

def split_dataset(X, y, feature_index, threshold):
    left_mask = X[:, feature_index] <= threshold
    right_mask = ~left_mask
    return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

def find_best_split(X, y):
    m, n = X.shape
    if m <= 1:
        return None, None, None, None

    num_classes = len(set(y))
    best_ig = 0
#     print(best_ig)
    best_feature_index = None
    best_threshold = None

    for feature_index in range(n):
        
        thresholds = sorted(set(X[:, feature_index]))
        for i in range(1, len(thresholds)):
            threshold = (thresholds[i - 1] + thresholds[i]) / 2
            X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)

            if len(y_left) == 0 or len(y_right) == 0:
                continue

            ig = information_gain(y, y_left, y_right)
            
            if ig > best_ig:
#                 print(ig)
                best_ig = ig
                best_feature_index = feature_index
                best_threshold = threshold
#     print(best_feature_index,best_threshold)
    return best_feature_index, best_threshold

def build_tree(X, y, depth=0, max_depth=33):
#     print(f"-->{depth}")
    if depth == max_depth or len(set(y)) == 1:
        leaf = Node(data=None, target=y)
        leaf.value = max(set(y), key=list(y).count)
        return leaf

    feature_index, threshold = find_best_split(X, y)
#     print(threshold)
    if feature_index is None:
        leaf = Node(data=None, target=y)
        leaf.value = max(set(y), key=list(y).count)
        return leaf
    X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)
    node = Node(data=(feature_index, threshold), target=None)
    node.left = build_tree(X_left, y_left, depth + 1, max_depth)
    node.right = build_tree(X_right, y_right, depth + 1, max_depth)

    return node


In [17]:
def predict(tree, x):
    if tree.value is not None:
        # If the current node is a leaf node, return its predicted value
        return tree.value
    else:
        # If the current node is not a leaf node, traverse to the appropriate child
        feature_index, threshold = tree.data
        if x[feature_index] <= threshold:
            return predict(tree.left, x)
        else:
            return predict(tree.right, x)


In [18]:
def smallest_prime_divisor(threshold, number):
    if threshold >= number:
        return 
    current_number = max(threshold + 1, 2)  # Ensure we start from at least 2
    while current_number < number:
        if  number % current_number == 0:
            return current_number
        current_number += 1
    return

In [19]:
def cross_validation(X,y):
        
        indices = np.arange(len(y))

        np.random.shuffle(indices)

        #compute the fold size
        fold_size =  int(len(y)/smallest_prime_divisor(10,len(y)))

        #determime folds
        folds = [indices[i:i+smallest_prime_divisor(10,len(y))] for i in range(0, len(y), smallest_prime_divisor(10,len(y)))]
        return folds,fold_size

In [20]:
def setconfusion_matrix(predicted_label,true_label):
    
        confusion_matrix = np.zeros((2, 2))
        for i in range(len(predicted_label)):
            if predicted_label[i] == 1 and true_label[i] == 1:
                confusion_matrix[1, 1] += 1  # True Positive
            elif predicted_label[i] == 1 and true_label[i] == 0:
                confusion_matrix[1, 0] += 1  # False Positive
            elif predicted_label[i] == 0 and true_label[i] == 1:
                confusion_matrix[0, 1] += 1  # False Negative
            elif predicted_label[i] == 0 and true_label[i] == 0:
                confusion_matrix[0, 0] += 1  # True Negative
#         print(confusion_matrix)
        return confusion_matrix
def compute_matrix(confusion_matrix):
    accuracy = (confusion_matrix[0, 0] + confusion_matrix[1, 1]) / np.sum(confusion_matrix)
    precision = confusion_matrix[1, 1] / (confusion_matrix[1, 0] + confusion_matrix[1, 1])
    recall = confusion_matrix[1, 1] / (confusion_matrix[0, 1] + confusion_matrix[1, 1])
    return accuracy,precision,recall
def calculate_average_metrics(accuracy_scores,precision_scores,recall_scores):
    # Calculate the average metrics
    average_accuracy = np.mean(accuracy_scores)
    average_precision = np.mean(precision_scores)
    average_recall = np.mean(recall_scores)
    
    return average_accuracy,average_precision,average_recall

In [21]:
dataset = pd.read_csv("trial.csv")
# dataset_ = pd.read_csv("hour.csv")
df_numeric = dataset.apply(pd.to_numeric, errors='coerce')

# Replace NaN values with 0 
dataset = df_numeric.fillna(0)
# Extract features (X) by selecting all columns except the last one (labels)
X = dataset.iloc[:, :-1].values

# Extract labels (y) by selecting the last column
y = dataset.iloc[:, -1].values

In [22]:
folds,fold_size=cross_validation(X,y)

In [23]:
recall_scores=[]

precision_scores=[]

accuracy_scores=[]
start_time = time.time()
for i in range(fold_size):
    
    test_data = folds[i]

    train_data = np.concatenate([f for j, f in enumerate(folds) if j != i])
    
    X_train,y_train=X[train_data],y[train_data]
    
    dt_model=build_tree(X_train,y_train,0,3)
    
    X_test,y_test=X[test_data],y[test_data]
    
    predicitons=[predict(dt_model, sample) for sample in X_test]
    
    predictions=np.array(predicitons)

    confusion_matrix=setconfusion_matrix(predictions,y_test)
    print(f"Confusion matrix of the folds {i+1}")
    print(confusion_matrix)
    accuracy,precision,recall=compute_matrix(confusion_matrix)
    print(f"\naccuracy={accuracy},precision={precision},recall={recall} of the folds {i+1}\n")
    accuracy_scores.append(accuracy)
    
    precision_scores.append(precision)
    
    recall_scores.append(recall)

average_accuracy,average_precision,average_recall=calculate_average_metrics(accuracy_scores,precision_scores,recall_scores)
end_time = time.time()

# Calculate the elapsed time
totTime = end_time - start_time   
print(f"\naverage of the accuracy={average_accuracy},precision={average_precision},recall={average_recall} \n")
print(f"Total run time is {totTime}")

Confusion matrix of the folds 1
[[35.  0.]
 [ 0. 62.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 1

Confusion matrix of the folds 2
[[33.  0.]
 [ 0. 64.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 2

Confusion matrix of the folds 3
[[37.  0.]
 [ 0. 60.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 3

Confusion matrix of the folds 4
[[30.  0.]
 [ 0. 67.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 4

Confusion matrix of the folds 5
[[51.  0.]
 [ 0. 46.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 5

Confusion matrix of the folds 6
[[33.  0.]
 [ 0. 64.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 6

Confusion matrix of the folds 7
[[27.  0.]
 [ 0. 70.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 7

Confusion matrix of the folds 8
[[44.  0.]
 [ 0. 53.]]

accuracy=1.0,precision=1.0,recall=1.0 of the folds 8


average of the accuracy=1.0,precision=1.0,recall=1.0 

Total run time is 2.650033712387085


# RESULT OF THE DECISION TREE BASE ON THE MODEL CLASSIFICATION

--> I have implemented decision tree for the classification. Result of the this model is very well.Therefore I couldnt trust result
. Then I have check attribute and threshold which are vest information gain. next I convert some data point to TP and FN. Model could find them and result have been change with these change. 

# PART 2 -Decision Tree REGRESSION

In [24]:
def sum_squared_residuals(y):
    if len(y) == 0:
        return 0
    mean = np.mean(y)
    return np.sum((y - mean) ** 2)

def total_squared_residuals(y_left, y_right):
    return sum_squared_residuals(y_left) + sum_squared_residuals(y_right)
def find_best_split_reg(X, y):
    m, n = X.shape
    if m <= 1:
        return None, None, None, None

    best_mse = float('inf')
    best_feature_index = None
    best_threshold = None

    for feature_index in range(n):
        thresholds = sorted(set(X[:, feature_index]))
        for i in range(1, len(thresholds)):
            threshold = (thresholds[i - 1] + thresholds[i]) / 2
            X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)

            if len(y_left) == 0 or len(y_right) == 0:
                continue

            mse = total_squared_residuals(y_left, y_right)

            if mse < best_mse:
                best_mse = mse
                best_feature_index = feature_index
                best_threshold = threshold

    return best_feature_index, best_threshold

def build_tree_reg(X, y, depth=0, max_depth=None):
    if depth == max_depth or len(set(y)) == 4:
        leaf = Node(data=None, target=y)
        leaf.value = np.mean(y)
        return leaf

    feature_index, threshold = find_best_split_reg(X, y)

    if feature_index is None:
        leaf = Node(data=None, target=y)
        leaf.value = np.mean(y)
        return leaf

    X_left, y_left, X_right, y_right = split_dataset(X, y, feature_index, threshold)

    node = Node(data=(feature_index, threshold), target=None)
    node.left = build_tree(X_left, y_left, depth + 1, max_depth)
    node.right = build_tree(X_right, y_right, depth + 1, max_depth)

    return node

In [25]:
dataset_ = pd.read_csv("hour.csv")
df_numeric_ = dataset_.apply(pd.to_numeric, errors='coerce')

# Replace NaN values with 0 
dataset_ = df_numeric_.fillna(0)
# Extract features (X) by selecting all columns except the last one (labels)
X_ = dataset_.iloc[:, :-1].values

# Extract labels (y) by selecting the last column
y_ = dataset_.iloc[:, -1].values


In [26]:
folds_,fold_size_=cross_validation(X_,y_)

In [27]:
start_time_ = time.time()
mse_scores=[]
for i in range(fold_size_):
    start_time_1 = time.time()
    test_data = folds_[i]

    train_data = np.concatenate([f for j, f in enumerate(folds_) if j != i])
    
    X_train,y_train=X_[train_data],y_[train_data]
    
    dt_model=build_tree_reg(X_train,y_train,0,2)
    
    X_test,y_test=X_[test_data],y_[test_data]

    predicitons=[predict(dt_model, sample) for sample in X_test]  
    predictions=np.array(predicitons)
    mse = np.mean((predictions - y_test) ** 2)
    print(f"TEST FOLD {i+1}")
    print(f"MSE(Mean Squared Error) = {mse}")
    mse_scores.append(mse)
    end_time_1 = time.time()
    totTime_1 = end_time_1 - start_time_1   
    print(f"Run time of Fold {i+1}")
    print(totTime_1)
    
    
end_time_ = time.time()

# Calculate the elapsed time
totTime_ = end_time - start_time   
average_mse = np.mean(mse_scores)
print(f"Average MSE of DECISION TREE = {average_mse}")
print(f"Total run time is {totTime}")

TEST FOLD 1
MSE(Mean Squared Error) = 10007.973070947695
Run time of Fold 1
41.49535632133484
TEST FOLD 2
MSE(Mean Squared Error) = 10263.105126877266
Run time of Fold 2
38.48461723327637
TEST FOLD 3
MSE(Mean Squared Error) = 7625.818228896945
Run time of Fold 3
38.46780252456665
TEST FOLD 4
MSE(Mean Squared Error) = 8595.917659243914
Run time of Fold 4
36.7067186832428
TEST FOLD 5
MSE(Mean Squared Error) = 9013.859658208183
Run time of Fold 5
39.23066735267639
TEST FOLD 6
MSE(Mean Squared Error) = 10004.28586224754
Run time of Fold 6
36.5459418296814
TEST FOLD 7
MSE(Mean Squared Error) = 8155.599689280166
Run time of Fold 7
36.57511830329895
TEST FOLD 8
MSE(Mean Squared Error) = 9409.042982910409
Run time of Fold 8
36.52086901664734
TEST FOLD 9
MSE(Mean Squared Error) = 8733.885551527706
Run time of Fold 9
36.35563111305237
Average MSE of DECISION TREE = 9089.943092237758
Total run time is 2.650033712387085


# RESULT OF THE DECISION TREE BASE ON THE MODEL REGRESSION

In this model I Have implemented decision tree for the regression problem.I aim to find to best split and because of that I have used SSR(Sum Squared Residuals). I have used TSE(Total Squared Error) but its results are worst than SSR. Therefore I have chosen SSR in my model.My dataset it too big for regression. So That my run time is too bad.

# PART 3 - Implementation Random Forest

In [28]:
from sklearn.metrics import accuracy_score


# Function for bootstrapping
from sklearn.ensemble import RandomForestClassifier

def bootstrap_sample(dataset, labels,X_test, num_samples=None, num_features=None):
    
    if num_samples is None:
        num_samples = len(dataset)

    if num_features is None:
        num_features = int(len(dataset[0])*0.5)
    
    indices = np.random.choice(len(dataset), size=num_samples, replace=True)
    bootstrap_dataset = dataset[indices]
    bootstrap_labels = labels[indices]

    selected_features = np.random.choice(dataset.shape[1], num_features, replace=False)
      
    bootstrap_dataset = bootstrap_dataset[:, selected_features]
    X_test=X_test[:, selected_features]
    return X_test,bootstrap_dataset, bootstrap_labels

# Function for bagging
def build_rdf(X, y,X_test,test, num_models):
    all_predictions = []
    for _ in range(num_models):
        test1,X_sampled, y_sampled = bootstrap_sample(X,y,X_test)
        model = build_tree(X_sampled,y_sampled,0,33)
        predictions_for_tree = [predict(model, sample) for sample in test1]
        all_predictions.append(predictions_for_tree)
        
    result_array = np.apply_along_axis(custom_mode, axis=0, arr=all_predictions)

    confusion_matrix=setconfusion_matrix(result_array,test)
    
    accuracy,precision,recall=compute_matrix(confusion_matrix)
    return accuracy,precision,recall

def build_rdf_reg(X, y,X_test,test, num_models):
    all_predictions = []
    for _ in range(num_models):
        test1,X_sampled, y_sampled = bootstrap_sample(X,y,X_test)
        model = build_tree_reg(X_sampled,y_sampled,0,3)
        predictions_for_tree = [predict(model, sample) for sample in test1]
        all_predictions.append(predictions_for_tree)
    
    return all_predictions

def custom_mode(arr):
    unique_elements, counts = np.unique(arr, return_counts=True)
    most_frequent_index = np.argmax(counts)
    return unique_elements[most_frequent_index]

#     result_array = np.apply_along_axis(custom_mode, axis=0, arr=all_predictions_array)

def predict_rdf(rdf, X, options):
    all_predictions = []

    # Iterate through each tree and get predictions
    for tree in rdf:
        predictions_for_tree = [predict(tree, sample) for sample in X]
        all_predictions.append(predictions_for_tree)
    return all_predictions

def weights_ofTree(models,X,y):
    
    accuricies=[]
    for tree in models:
        #find predictions of the each tree
        predictions_for_tree = [predict(tree, sample) for sample in X]
            
        #set confusion matrix of the each tree    
        confusion_matrix=setconfusion_matrix(predictions_for_tree,y)
        
        accuracy = (confusion_matrix[0, 0] + confusion_matrix[1, 1]) / np.sum(confusion_matrix)
        
        accuricies.append(accuracy)
        
    return np.array(accuricies) / sum(accuricies)
        
        

In [31]:

recall_scores=[]

precision_scores=[]

accuracy_scores=[]

start_time = time.time()
for i in range(fold_size):
    
    test_data = folds[i]

    train_data = np.concatenate([f for j, f in enumerate(folds) if j != i])
    
    X_train,y_train=X[train_data],y[train_data]
    
    X_test,y_test=X[test_data],y[test_data]
    
    accuracy,precision,recall=build_rdf(X_train,y_train,X_test,y_test,10)
    
    print(f"\naccuracy={accuracy},precision={precision},recall={recall} of the folds {i+1}\n")
    accuracy_scores.append(accuracy)
    
    precision_scores.append(precision)
    
    recall_scores.append(recall)

average_accuracy,average_precision,average_recall=calculate_average_metrics(accuracy_scores,precision_scores,recall_scores)
end_time = time.time()

# Calculate the elapsed time
totTime = end_time - start_time   
print(f"\naverage of the accuracy={average_accuracy},precision={average_precision},recall={average_recall} \n")
print(f"Total run time is {totTime}")
    
end_time = time.time()

# Calculate the elapsed time
totTime = end_time - start_time   
# print(f"\naverage of the accuracy={average} \n")
print(f"Total run time is {totTime}")


accuracy=1.0,precision=1.0,recall=1.0 of the folds 1


accuracy=1.0,precision=1.0,recall=1.0 of the folds 2


accuracy=0.979381443298969,precision=1.0,recall=0.9666666666666667 of the folds 3


accuracy=1.0,precision=1.0,recall=1.0 of the folds 4


accuracy=0.9896907216494846,precision=1.0,recall=0.9782608695652174 of the folds 5


accuracy=1.0,precision=1.0,recall=1.0 of the folds 6


accuracy=0.9587628865979382,precision=1.0,recall=0.9428571428571428 of the folds 7


accuracy=1.0,precision=1.0,recall=1.0 of the folds 8


average of the accuracy=0.990979381443299,precision=1.0,recall=0.9859730848861283 

Total run time is 17.504173278808594
Total run time is 17.504173278808594


# Random Forest Classificaton problem Result

In this essemble model, firstly I obtained new dataset by bootstrapping method. Then for each dataset I have use random data distrubiiton . I use half of the total attribution  and create 100 tree for random forest. at the essemble method I have obtain better accuracy and recall when I compare it the single decision tree

# PART 4 - Implementation Random Forest Regression

In [32]:

from sklearn.metrics import mean_squared_error


start_time_ = time.time()
mse_scores=[]
for i in range(fold_size_):
    start_time_1 = time.time()
    test_data = folds_[i]

    train_data = np.concatenate([f for j, f in enumerate(folds_) if j != i])
    
    X_train,y_train=X_[train_data],y_[train_data]
    
    X_test,y_test=X_[test_data],y_[test_data]
    
    predicitons=build_rdf_reg(X_train,y_train,X_test,y_test,100)
    
    predictions = np.array(predicitons)

    
    
    aggregated_predictions = [sum(pred) / len(pred) for pred in zip(*predictions)]
    
    mse = mean_squared_error(y_test, aggregated_predictions)
    mse_scores.append(mse)
    print(f"MSE of folds {i+1} : {mse}")
    end_time_1 = time.time()
    totTime_1 = end_time_1 - start_time_1   
    print(f"Run time of Fold {i+1}")
    print(totTime_1)
    
    
end_time_ = time.time()

# Calculate the elapsed time
totTime_ = end_time - start_time   
average_mse = np.mean(mse_scores)
print(f"Average MSE of DECISION TREE = {average_mse}")
print(f"Total run time is {totTime}")


MSE of folds 1 : 7469.992687985499
Run time of Fold 1
1171.2392075061798
MSE of folds 2 : 7818.110715898499
Run time of Fold 2
1256.55770277977
MSE of folds 3 : 7773.846053599171
Run time of Fold 3
2397.3146402835846
MSE of folds 4 : 9053.616383635423
Run time of Fold 4
2944.1461493968964
MSE of folds 5 : 7814.925433661316
Run time of Fold 5
30545.2113802433
MSE of folds 6 : 10495.882805644744
Run time of Fold 6
1096.4526777267456
MSE of folds 7 : 8889.403350595545
Run time of Fold 7
1224.321694612503
MSE of folds 8 : 7440.391503158985
Run time of Fold 8
1387.5264151096344
MSE of folds 9 : 6314.653501398239
Run time of Fold 9
1304.5333626270294
Average MSE of DECISION TREE = 8118.980270619712
Total run time is 17.504173278808594


# RESULT OF REGRESSIN BASED ON RANDOM FOREST

In this model I have implemented Random forest for regression. But it is to hard to create decision tree for regression. furthermore when we use random forest for regression it's run time will bee too bad. I dont choose random forest fpr regression