In [1]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

In [2]:
X, y = make_blobs(n_samples=1000, n_features=10, centers=9)

In [3]:
# plt.figure(figsize=(6, 6))
# plt.scatter(X[:, 0], X[:, 1], c=y)

In [10]:
class Node:
    
    def __init__(self, result=None, feature=None, thresh=None):
        self.result = result
        self.feature = feature
        self.thresh = thresh

class DT:
    
    def __init__(self, max_depth=6):
        self.root = None
        self.max_depth = max_depth
    
    def fit(self, X, y):
        self.root = self.generate(X, y, self.max_depth)
        self.classes = np.unique(y)
        
    
    @classmethod
    def entropy(cls, data):
        classes = np.unique(data)
        
        ent = 0
        for klass in classes:

            p = (np.array(data) == klass).mean()
            ent += (-p * np.log(p))

        return ent
    
        
        
    @classmethod
    def infogains(cls, X, y):
        gains = []

        for index in range(X.shape[1]):

            thresh = X[:, index].mean()

            lefty = y[X[:, index] < thresh]
            righty = y[X[:, index] >= thresh]

            pleft = len(lefty)/len(y)
            pright = len(righty)/len(y)  

            ig = DT.entropy(y) - pleft * DT.entropy(lefty) - pright * DT.entropy(righty)
            gains.append(ig)

        return gains
        
        
        
    def generate(self, X, y, max_depth):
        
        if max_depth == 0:
            node = Node(result=y.mean())
            return node
        
        
        
        gains = DT.infogains(X, y)
        
        maxIndex = np.argmax(gains)
        
        maxValue = gains[maxIndex]
        
        if maxValue <= 0:
            node = Node(result=y.mean())
            return node
        
        thresh = X[:, maxIndex].mean()
        leftX = X[X[:, maxIndex] < thresh]
        rightX = X[X[:, maxIndex] >= thresh]

        lefty = y[X[:, maxIndex] < thresh]
        righty = y[X[:, maxIndex] >= thresh]
        
        node = Node(feature=maxIndex, thresh=thresh)
        node.left = self.generate(leftX, lefty, max_depth-1)
        node.right = self.generate(rightX, righty, max_depth-1)
        
        return node
    
    def predictPoint(self, x):
        result = self.predictPointRec(x, self.root)
        classIndex = np.argmin(np.abs(self.classes - result))
        return self.classes[classIndex]
    
    def predict(self, X):
        results = []
        for point in X:
            results.append(self.predictPoint(point))
            
        return np.array(results)
    
    def predictPointRec(self, x, node):
        if node.result != None:
            return node.result
        
        if x[node.feature] < node.thresh:
            return self.predictPointRec(x, node.left)
        else:
            return self.predictPointRec(x, node.right)
        
    def score(self, X, y):
        yp = self.predict(X)
        return (yp==y).mean()
        
    

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [13]:
model = DT(max_depth=11)

In [14]:
model.fit(X_train, y_train)

In [15]:
model.predict(X_test[:10])

array([2, 6, 1, 8, 4, 7, 5, 0, 0, 6])

In [16]:
y_test[:10]

array([2, 6, 1, 8, 4, 7, 5, 0, 0, 6])

In [17]:
model.score(X_test, y_test)

0.9939393939393939

In [18]:
yt = y[:20]

In [19]:
clases = np.unique(yt)

In [20]:
clases

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [21]:
clases - .6

array([-0.6,  0.4,  1.4,  2.4,  3.4,  4.4,  5.4,  6.4,  7.4])

In [22]:
np.abs(clases - .6)

array([0.6, 0.4, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4])

In [23]:
np.argmin(np.abs(clases - .6))

1

In [24]:
import pandas as pd

In [25]:
df = pd.read_csv("../datasets/mnist_train_small.csv", header=None)

In [26]:
data = df.values

In [27]:
X, y = data[:, 1:], data[:, 0]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [29]:
model = DT(max_depth=10)

In [30]:
model.fit(X_train[:300], y_train[:300])

In [31]:
model.predict(X_test[:20])

array([9, 3, 9, 1, 6, 0, 4, 8, 9, 7, 5, 6, 2, 5, 0, 3, 9, 9, 6, 8])

In [32]:
y_test[:20]

array([9, 5, 2, 6, 6, 0, 0, 0, 9, 8, 9, 6, 2, 5, 0, 3, 9, 2, 6, 2])