In [None]:
import numpy as np
import pandas as pd
import scipy as si
import scipy.io as sio
from math import log2

from collections import Counter
from scipy.stats import mode
from sklearn.base import clone

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Seed NumPy's legacy global RNG so the np.random.choice calls made by the
# tree and forest code in this notebook are repeatable on a fresh kernel.
np.random.seed(0)

In [None]:
def results_to_csv(y_test, filename='submission.csv'):
    """Write predictions to a CSV file with an ``Id`` index and a ``Category`` column.

    Args:
        y_test: 1-D array-like of predicted labels; values are cast to ``int``.
        filename: destination path. Defaults to ``'submission.csv'``; pass a
            different name to keep several models' predictions side by side
            instead of overwriting the same file.
    """
    y_test = np.asarray(y_test).astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1  # Ensures that the index starts at 1
    df.to_csv(filename, index_label='Id')

In [None]:
class DecisionTree:
    """Binary classification tree grown greedily by information gain.

    Every internal node tests ``x[feature] < threshold``: samples passing the
    test go to the left child, all others go to the right child. A leaf stores
    the most common training label among the samples that reached it.
    """

    class Node:
        """A tree node: a leaf when ``val`` is set, otherwise an internal split."""

        def __init__(self, feature = None, threshold = None, left = None, right = None, value = None):
            self.feature = feature      # column index tested at this node
            self.threshold = threshold  # samples with x[feature] < threshold go left
            self.left = left
            self.right = right
            self.val = value            # predicted label; stays None on internal nodes

        def is_leaf(self):
            """Return True when this node carries a prediction."""
            return self.val is not None

    def __init__(self, depth):
        """Create an unfitted tree.

        Args:
            depth: maximum depth of the tree; 0 produces a single leaf.
        """
        self.depth = depth
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from samples ``X`` (2-D) and labels ``y`` (1-D).

        Returns:
            ``self``, so the call can be chained.

        Raises:
            ValueError: if ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("DecisionTree.fit needs at least one training sample")
        self.tree = self.grow(X, y, 0)
        return self

    def predict(self, X):
        """Return an array holding one predicted label per row of ``X``."""
        collect = [self.traverse(row, self.tree) for row in np.asarray(X)]
        return np.array(collect)

    def common_label(self, y):
        """Return the most frequent label in ``y``."""
        counter = Counter(y)
        most_common = counter.most_common(1)[0][0]
        return most_common

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label distribution in ``y``."""
        num_of_ys = Counter(y)
        v = 0
        for label in num_of_ys:
            p = num_of_ys[label] / len(y)
            v -= p * log2(p)
        return v

    def grow(self, X, y, depth):
        """Recursively build and return the subtree for ``X``/``y`` at ``depth``."""
        # Apply the stopping rule first so no split search is spent on a node
        # that is going to become a leaf anyway.
        if depth >= self.depth or len(set(y)) == 1:
            return self.Node(value=self.common_label(y))

        columns = X.shape[1]
        # Every feature is still examined; the shuffle only decides which
        # feature wins when several splits tie on gain.
        randcolumns = np.random.choice(columns, columns, replace=False)
        optimal_feature, optimal_feature_threshold = self.optimal_split(X, y, randcolumns)

        if optimal_feature is None:
            # No threshold separates the samples (e.g. all features constant).
            return self.Node(value=self.common_label(y))

        left_index, right_index = self.split(X[:, optimal_feature], optimal_feature_threshold)
        left = self.grow(X[left_index, :], y[left_index], depth + 1)
        right = self.grow(X[right_index, :], y[right_index], depth + 1)
        return self.Node(optimal_feature, optimal_feature_threshold, left, right)

    def optimal_split(self, X, y, given_index):
        """Find the split with the highest information gain.

        Args:
            X: 2-D sample array.
            y: labels aligned with the rows of ``X``.
            given_index: column indices to consider, in visiting order.

        Returns:
            ``(feature, threshold)`` of the best split, or ``(None, None)``
            when every candidate leaves one side empty.
        """
        best_gain = float("-inf")
        best_feature, best_threshold = None, None
        parent_entropy = self.entropy(y)  # identical for every candidate, so compute once

        for feature in given_index:
            X_column = X[:, feature]
            for threshold in np.unique(X_column):
                left, right = self.split(X_column, threshold)
                # A split with an empty side separates nothing and would make
                # grow() recurse on an empty node.
                if len(left) == 0 or len(right) == 0:
                    continue
                gain = self._weighted_gain(y, left, right, parent_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def traverse(self, x, node):
        """Follow sample ``x`` down from ``node`` and return the label of the leaf reached."""
        if node.is_leaf():
            return node.val
        if x[node.feature] < node.threshold:
            return self.traverse(x, node.left)
        return self.traverse(x, node.right)

    def split(self, X, threshold):
        """Partition a 1-D feature column around ``threshold``.

        Returns:
            ``(left_index, right_index)``: positions where ``X < threshold``
            and positions where ``X >= threshold``.
        """
        left_index = np.argwhere(X < threshold).flatten()
        right_index = np.argwhere(X >= threshold).flatten()
        return left_index, right_index

    def info(self, y, X_column, threshold):
        """Return the information gain of splitting ``y`` by ``X_column < threshold``.

        Returns 0 when the split leaves either side empty.
        """
        left, right = self.split(X_column, threshold)
        if len(left) == 0 or len(right) == 0:
            return 0
        return self._weighted_gain(y, left, right, self.entropy(y))

    def _weighted_gain(self, y, left, right, parent_entropy):
        """Return ``parent_entropy`` minus the size-weighted entropy of the two children."""
        n = len(y)
        n_l, n_r = len(left), len(right)
        e_l, e_r = self.entropy(y[left]), self.entropy(y[right])
        after_entropy = (n_l / n) * e_l + (n_r / n) * e_r
        return parent_entropy - after_entropy

In [None]:
class RandomForest:
    """Bagged ensemble of ``DecisionTree`` classifiers combined by majority vote."""

    def __init__(self, depth=0, n_trees=10):
        """Create an unfitted forest.

        Args:
            depth: maximum depth handed to every ``DecisionTree``. With the
                default of 0 each tree is a single leaf, so pass a positive
                depth for a useful model.
            n_trees: number of trees to train (previously fixed at 10).
        """
        self.trees = []
        self.depth = depth
        self.n_trees = n_trees

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``X``/``y``.

        Any trees from an earlier ``fit`` are discarded.

        Returns:
            ``self``, so the call can be chained.
        """
        self.trees = []
        for _ in range(self.n_trees):
            tree = DecisionTree(self.depth)
            squeaky_boots_X, squeaky_boots_y = self.squeakyBoots(X, y)

            tree.fit(squeaky_boots_X, squeaky_boots_y)

            self.trees.append(tree)
        return self

    def squeakyBoots(self, X, y):
        """Return a bootstrap sample: as many rows as ``X`` has, drawn with replacement."""
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True)
        return X[idxs], y[idxs]

    def predict(self, X):
        """Return the majority-vote label of the trained trees for every row of ``X``.

        Raises:
            ValueError: if the forest has not been fitted yet.
        """
        if not self.trees:
            raise ValueError("RandomForest.predict called before fit")

        # Stack per-tree predictions, then swap axes so each row holds one sample's votes.
        votes = np.array([tree.predict(X) for tree in self.trees])
        votes = np.swapaxes(votes, 0, 1)

        most = mode(votes, axis=1)[0]
        return most.flatten()

In [None]:
# Load the Titanic training data and keep the four features the models use.
titanic_data = pd.read_csv('datasets/titanic/titanic_training.csv')

wordy_features = ['age', 'fare', 'embarked', 'sex']
target_to_guess = 'survived'
X = titanic_data[wordy_features]
y = titanic_data[target_to_guess].values

# Hold out 20% of the rows for validation; random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Numeric columns: fill missing values with the column median.
number_features = ['age', 'fare']
simple_Imputer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
specific_features = ['embarked', 'sex']
pipeline_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', simple_Imputer, number_features),
    ('cat', pipeline_transformer, specific_features),
])

# Author's accuracy notes by depth (presumably validation accuracy):
# 6: 0.801980198019802, 7: 0.8217821782178217
model = DecisionTree(7)

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Fit on the training split only. Fitting on all of X would let the imputers
# and encoder see the validation rows that later cells score against.
pipeline.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt

# Sweep the maximum tree depth and record validation accuracy for each value.
depths = range(1, 40)
accuracies = []

X_train_np = preprocessor.transform(X_train)
y_train_np = y_train
X_val_np = preprocessor.transform(X_val)
y_val_np = y_val

for depth in depths:
    # Build a fresh tree per depth; refitting the shared `model` would train
    # the same fixed-depth tree on every iteration and give a flat curve.
    depth_tree = DecisionTree(depth)
    depth_tree.fit(X_train_np, y_train_np)

    y_pred = depth_tree.predict(X_val_np)

    accuracy = accuracy_score(y_val_np, y_pred)
    accuracies.append(accuracy)

plt.figure(figsize=(5, 5))
plt.plot(depths, accuracies)
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.show()

In [None]:
# Titanic prediction: score a single decision tree and a random forest on the
# training and validation splits.
X_train_np = preprocessor.transform(X_train)
y_train_np = y_train
X_val_np = preprocessor.transform(X_val)
y_val_np = y_val

model.fit(X_train_np, y_train_np)

y_pred = model.predict(X_train_np)
print("Titanic Decision Tree Training Accuracy: " , accuracy_score(y_train_np, y_pred))

y_pred = model.predict(X_val_np)
print("Titanic Decision Tree Validation Accuracy: " , accuracy_score(y_val_np, y_pred))

# Author's note from an earlier run (presumably at depth 7): 0.8507992895204263
rand_forest = RandomForest(1)
rand_forest.fit(X_train_np, y_train_np)

y_pred = rand_forest.predict(X_train_np)
print("Titanic Random Forest Traning Accuracy: " , accuracy_score(y_train_np, y_pred))

# This score is on the validation split, so it is labelled as validation
# (the label used to repeat the training one).
y_pred = rand_forest.predict(X_val_np)
print("Titanic Random Forest Validation Accuracy: " , accuracy_score(y_val_np, y_pred))

In [None]:
def print_tree(node, depth = 1):
    """Print the subtree rooted at ``node``, indented by one tab per level.

    Leaves print their predicted label. Internal nodes print the feature index
    and threshold, then the right subtree followed by the left subtree.

    Args:
        node: object exposing ``is_leaf()``, ``val``, ``feature``,
            ``threshold``, ``left`` and ``right``.
        depth: indentation level for this node (number of leading tabs).
    """
    indent = "\t" * depth

    if node.is_leaf():
        # A leaf carries its prediction in `val`; `feature` is unset on leaves.
        print(indent + "Leaf: ", node.val)
        return

    print(indent + "Node: ", node.feature,  " = ", node.threshold)

    if node.right is not None:
        print(indent + "Right: ")
        print_tree(node.right, depth + 1)

    if node.left is not None:
        print(indent + "Left: ")
        print_tree(node.left, depth + 1)

# Fit a shallow depth-2 tree and print its structure. X_train_np / y_train_np
# are whatever the cells above left in scope (the Titanic training split when
# the notebook is run top to bottom).
decision_tree = DecisionTree(2)
decision_tree.fit(X_train_np, y_train_np)

print_tree(decision_tree.tree)

In [None]:
spam_data = sio.loadmat('datasets/spam_data/spam_data.mat')

X = spam_data['training_data']
y = spam_data['training_labels'].reshape(-1).astype(int)

X_Kaggle_Test = spam_data['test_data']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# --- Decision tree ---
# Fit on the training split only. Fitting on all of X and then scoring on
# X_test (a subset of X) would report training accuracy under a "validation" label.
Decision_Tree = DecisionTree(6)
Decision_Tree.fit(X_train, y_train)

guesses = Decision_Tree.predict(X_test)
accuracy = accuracy_score(y_test, guesses)
print("Spam Data Decision tree Validation: ", accuracy)

guesses = Decision_Tree.predict(X_train)
accuracy = accuracy_score(y_train, guesses)
print("Spam Data Decision tree Training: ", accuracy)

# Refit on every labelled example before predicting the Kaggle test set.
Decision_Tree.fit(X, y)
results_to_csv(Decision_Tree.predict(X_Kaggle_Test))

# --- Random forest ---
# Author's note from an earlier run (presumably at depth 7): 0.8507992895204263
rand_forest = RandomForest(1)
rand_forest.fit(X_train, y_train)

guesses = rand_forest.predict(X_test)
accuracy = accuracy_score(y_test, guesses)
print("Spam Data RandomForest Validation: ", accuracy)

guesses = rand_forest.predict(X_train)
accuracy = accuracy_score(y_train, guesses)
print("Spam Data RandomForest Training: ", accuracy)

# Refit on every labelled example before predicting the Kaggle test set.
# Both results_to_csv calls in this cell write the default submission.csv,
# so the forest's predictions are the ones left on disk.
rand_forest.fit(X, y)
results_to_csv(rand_forest.predict(X_Kaggle_Test))