In [80]:
import seaborn as sns
import numpy as np
import pandas as pd
import os
import pywt
import pywt.data

In [109]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import export_graphviz
import graphviz

In [81]:
from skimage.io import imread
from PIL import Image
from sklearn.decomposition import PCA

In [82]:
from google.colab import drive
drive.mount('/content/drive')

imagedir = 'drive/MyDrive/ML/images/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#  Decision Tree

## Pre Processing

In [83]:
def get_feacture(picture: np.array, cortes: int) -> np.array:
    LL = picture
    for i in range(cortes):
       LL, (LH, HL, HH) = pywt.dwt2(LL, 'haar')
    return LL.flatten()

def get_dimension(path: str) -> tuple:
    images = os.listdir(path)
    ancho_minimo = float('inf')
    alto_minimo  = float('inf')
    for image in images:
        imagen = Image.open(os.path.join(path, image))
        ancho, alto = imagen.size
        ancho_minimo = min(ancho_minimo, ancho)
        alto_minimo = min(alto_minimo, alto)
    return ancho_minimo, alto_minimo


def resize_image(path_in: str, path_out: str):
    images = os.listdir(path_in)
    ancho, alto = get_dimension(path_in)
    for image in images:
        imagen = Image.open(os.path.join(path_in, image))
        imagen_resize = imagen.resize((ancho, alto), Image.ANTIALIAS)
        imagen_resize.save(os.path.join(path_out, image))

def crop_images(path_in: str, path_out: str, margen: int = 20):
    images = os.listdir(path_in)
    for image in images:
        imagen = Image.open(os.path.join(path_in, image))

        ancho, alto = imagen.size
        izquierda = margen
        arriba = margen
        derecha = ancho - margen
        abajo = alto - margen

        imagen_crop = imagen.crop((izquierda, arriba, derecha, abajo))
        imagen_crop.save(os.path.join(path_out, image))

def get_labels(path_in: str) -> np.ndarray:
    images = os.listdir(path_in)
    labels = [image[:3] for image in images]

    return np.array(labels)

def get_feactures(path_in: str) -> np.ndarray:

    images = os.listdir(path_in)
    images_list = [get_feacture(imread(os.path.join(path_in, image)), cortes = 1) for image in images]

    return np.array(images_list)

In [84]:
def one_hot_encoding(data) -> np.ndarray:
    return pd.get_dummies(data).to_numpy()

def min_max_scaler(data: np.ndarray) -> np.ndarray:

    min_vals = np.min(data, axis=0)
    max_vals = np.max(data, axis=0)
    return (data - min_vals) / (max_vals - min_vals)

def standard_scaler(data: np.ndarray) -> np.ndarray:

    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    return (data - mean) / std

In [85]:
output_path = 'drive/MyDrive/ML/images/'
resize_image(imagedir, output_path)

  imagen_resize = imagen.resize((ancho, alto), Image.ANTIALIAS)


In [86]:
input_path = 'drive/MyDrive/ML/images/'
output_path = 'drive/MyDrive/ML/images'
margin = 20
crop_images(input_path, output_path, margin)

In [87]:
input_path = 'drive/MyDrive/ML/images'
features = get_feactures(input_path)

In [88]:
input_path = 'drive/MyDrive/ML/images'
labels = get_labels(input_path)

In [105]:
def normalize_features(features):
    scaler = MinMaxScaler()
    features_normalized = [scaler.fit_transform(feature.reshape(-1, 1)).flatten() for feature in features]
    return features_normalized

In [107]:
indep_normalized = normalize_features(features)

In [108]:
pca = PCA(n_components=831)
features = pca.fit_transform(features)

rows, columns = features.shape
print(rows)
print(columns)

print(labels.shape)

832
831
(832,)


## Decision Tree Structure

In [92]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None, is_leaf=False):

        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        self.value = value
        self.is_leaf = is_leaf

class DecisionTreeClassifier_m():
    def __init__(self):
        self.root = None

    def calculate_entropy(self, Y):
        unique_labels, counts = np.unique(Y, return_counts=True)
        probabilities = counts / len(Y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def get_best_split(self, dataset, num_samples, num_features):
        best_split = {}
        max_info_gain = -float("inf")

        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)

            print(possible_thresholds)
            float_array = np.array(possible_thresholds, dtype=float)
            threshold = np.median(float_array)
            dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
            if len(dataset_left) > 0 and len(dataset_right) > 0:
                y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]

                entropy_parent = self.calculate_entropy(y)
                entropy_left = self.calculate_entropy(left_y)
                entropy_right = self.calculate_entropy(right_y)

                info_gain = entropy_parent - (len(left_y) / num_samples) * entropy_left - (len(right_y) / num_samples) * entropy_right

                if info_gain > max_info_gain:
                    best_split["feature_index"] = feature_index
                    best_split["threshold"] = threshold
                    best_split["dataset_left"] = dataset_left
                    best_split["dataset_right"] = dataset_right
                    best_split["info_gain"] = info_gain
                    max_info_gain = info_gain

        return best_split

    def split(self, dataset, feature_index, threshold):
        feature_values = dataset[:, feature_index].astype(float)
        dataset_left = dataset[feature_values <= threshold]
        dataset_right = dataset[feature_values > threshold]
        return dataset_left, dataset_right

    def build_tree(self, dataset, curr_depth=0):
        X, Y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = np.shape(X)

        if len(np.unique(Y)) == 1 or num_features == 0:
            return Node(value=Y[0], is_leaf=True)

        best_split = self.get_best_split(dataset, num_samples, num_features)

        if best_split["info_gain"] > 0:
            left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
            right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
            return Node(
                best_split["feature_index"],
                best_split["threshold"],
                left_subtree,
                right_subtree,
                best_split["info_gain"]
            )

        leaf_value = np.argmax(np.bincount(Y))
        return Node(value=leaf_value, is_leaf=True)

    def fit(self, X, Y):
        dataset = np.concatenate((X, Y.reshape(-1, 1)), axis=1)
        self.root = self.build_tree(dataset)

    def make_prediction(self, x, tree):
        if tree.is_leaf:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val <= tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

    def predict(self, X):
        predictions = [self.make_prediction(x, self.root) for x in X]
        return predictions

### Simple Training

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.3, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

dt = DecisionTreeClassifier_m()

dt.fit(X_train, y_train)

y_valid_pred = dt.predict(X_valid)

precision = precision_score(y_valid, y_valid_pred, average='weighted')
recall = recall_score(y_valid, y_valid_pred, average='weighted')
f1 = f1_score(y_valid, y_valid_pred, average='weighted')

performance_table = pd.DataFrame({
    'Metric': ['Precision', 'Recall', 'F1-Score'],
    'Value': [precision, recall, f1]
})

# Display the performance metrics table
print("Performance Metrics:")
print(performance_table)

y_test_pred = dt.predict(X_test)
test_accuracy = np.mean(y_test_pred == y_test)
print("Test Accuracy:", test_accuracy)


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
 '-52.0673195963152' '-52.210740083865' '-9.46272045354928'
 '1.2354264996245814' '11.492775672293458' '25.500900635039187'
 '28.194589306704774' '4.372576190959851' '5.675228864778227']
['-0.06440704712575192' '-0.08490292521865753' '-1.5026066270204592'
 '-1.7517306688442613' '-1.8951048715227155' '-16.643777618240485'
 '-17.1402620764964' '-18.02409378539654' '-23.57634325298864'
 '-26.2537634419989' '-35.456011338017206' '-6.653018662321046'
 '-9.823231966739161' '1.4087633733065303' '15.241404018076892'
 '16.930447407740353' '20.400163754720374' '7.757791260910534']
['-19.260593759166888' '-2.4130723285169746' '-2.940365533560093'
 '-3.813002386505072' '-32.85434454133395' '-51.39573409358639'
 '-6.018102148605501' '0.5084920246059618' '0.6812328548250302'
 '12.516154646702537' '2.5637339662507883' '24.99615471675368'
 '25.02390519381276' '3.0854018691143787' '5.672153405669916'
 '7.6644120706342855' '8.222

### K-Fold Training

In [112]:
# Define the number of partitions (folds) and k values
partitions = 5  # Number of partitions
k_values = range(1, 10)  # Values of k to test

y_pred_vals = []
y_test_vals = []

kf = KFold(n_splits=partitions, shuffle=True, random_state=42)

# Initialize lists to store performance metrics
performance_metrics = []
recall_metrics = []
f1_metrics = []

for k in k_values:
    precision_scores = []
    recall_scores = []
    f1_scores = []

    for train_index, val_index in kf.split(features):
        X_train_fold, X_val_fold = features[train_index], features[val_index]
        y_train_fold, y_val_fold = labels[train_index], labels[val_index]

        dt = DecisionTreeClassifier_m()
        dt.fit(X_train_fold, y_train_fold)

        y_val_pred = dt.predict(X_val_fold)
        y_pred_vals.append(y_val_pred)

        y_test_vals.append(y_val_fold)

        # Calculate metrics
        precision = precision_score(y_val_fold, y_val_pred, average='weighted')
        recall = recall_score(y_val_fold, y_val_pred, average='weighted')
        f1 = f1_score(y_val_fold, y_val_pred, average='weighted')

        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Calculate the average performance metrics across folds for k
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    avg_f1 = np.mean(f1_scores)

    performance_metrics.append(avg_precision)
    recall_metrics.append(avg_recall)
    f1_metrics.append(avg_f1)


performance_table = pd.DataFrame({
    'K Value': k_values,
    'Precision': performance_metrics,
    'Recall': recall_metrics,
    'F1-Score': f1_metrics
})

# Display
print("Performance Metrics:")
print(performance_table)

# Best k value based on F1-Score and print it
best_k = k_values[np.argmax(f1_metrics)]
print("Best K Value:", best_k)

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
 '67.17349726219959' '7.311829202983365' '7.735711108069726'
 '7.804428770352993' '73.10142297302099' '78.68338594664334'
 '8.161968689241942' '80.23380256917463' '81.20984207089616'
 '85.15276539519914' '85.86579687776342' '86.92432319323851'
 '92.5607857479711' '96.28652615091782' '98.90551881927038']
['-0.21743711571107938' '-0.3109662388317828' '-0.3239385260839876'
 '-0.837105831298097' '-1.0394798982225029' '-1.0632025601312256'
 '-1.079935718272004' '-1.0864176388791769' '-1.116220077591495'
 '-1.1560658641498058' '-1.1712611006477953' '-1.7930969483073151'
 '-10.066983942056003' '-11.13388629962098' '-11.222901666018663'
 '-11.282551394071884' '-11.483138138116157' '-12.006223916276483'
 '-12.218875651156164' '-12.426905713892305' '-12.63767760452555'
 '-13.26472178334376' '-13.334161215003135' '-13.712717837220994'
 '-13.75454427628232' '-13.801251999273024' '-14.056904403999148'
 '-14.6141490045671' '-

KeyboardInterrupt: ignored

### Confusion Matrix

In [None]:
# Calcular la matriz de confusión

cm = confusion_matrix(y_test, y_test_pred)

# Mostrar la matriz de confusión como un mapa de calor (heatmap)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicciones')
plt.ylabel('Valores Verdaderos')
plt.title('Matriz de Confusión')
plt.show()

### Test Report

In [None]:
# Graficamos la precision promedio de las predicciones
k_values = range(1, 21)
# Después de calcular las métricas, fuera del bucle
plt.figure(figsize=(10, 6))  # Ajusta el tamaño de la figura

# Graficar la precisión
plt.plot(k_values, performance_metrics, label='Precisión', marker='o')

# Graficar el recall
plt.plot(k_values, recall_metrics, label='Recall', marker='x')

# Graficar el F1-score
plt.plot(k_values, f1_metrics, label='F1-score', marker='s')

# Añadir una leyenda para identificar las métricas
plt.legend()

# Etiquetas de los ejes y título
plt.xlabel('Valor de k')
plt.ylabel('Métrica de Rendimiento')
plt.title('Métricas de Rendimiento para Diferentes Valores de k')

# Mostrar la gráfica
plt.grid(True)  # Agregar una cuadrícula
plt.show()

In [None]:
y_pred = knn_multiple(x_train, y_train, x_test, best_k)
report = classification_report(y_test, y_pred)
print("Informe de clasificación para el mejor valor de k:")
print(report)