In [109]:
import numpy as np
from sklearn.datasets import load_iris
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [110]:


# Definición de la función para calcular la entropía
def entropy(y):
    unique, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy

# Definición de la función para dividir los datos
def split_data(X, y, feature_index, threshold):
    left_mask = X[:, feature_index] <= threshold
    right_mask = ~left_mask
    X_left, y_left = X[left_mask], y[left_mask]
    X_right, y_right = X[right_mask], y[right_mask]
    return X_left, y_left, X_right, y_right

# Definición de la clase del nodo del árbol de decisión
class Node:
    def __init__(self, depth=0, max_depth=None):
        self.depth = depth
        self.max_depth = max_depth
        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.value = None

    def fit(self, X, y):
        unique_classes, class_counts = np.unique(y, return_counts=True)
        self.value = unique_classes[np.argmax(class_counts)]
        if self.depth == self.max_depth or len(unique_classes) == 1: # Se pregunta las dos condiciones de paradas, sin divisiones y profundidad máxima
            return

        num_features = X.shape[1] # cantidad de Features
        best_info_gain = -1 # métrica
        
        for feature_index in range(num_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                X_left, y_left, X_right, y_right = split_data(X, y, feature_index, threshold)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                info_gain = entropy(y) - ((len(y_left) / len(y)) * entropy(y_left) + (len(y_right) / len(y)) * entropy(y_right))

                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    self.feature_index = feature_index
                    self.threshold = threshold

        if self.feature_index is not None:
            X_left, y_left, X_right, y_right = split_data(X, y, self.feature_index, self.threshold)
            self.left = Node(depth=self.depth + 1, max_depth=self.max_depth)
            self.left.fit(X_left, y_left)
            self.right = Node(depth=self.depth + 1, max_depth=self.max_depth)
            self.right.fit(X_right, y_right)

    def predict(self, X):
        if self.feature_index is None:
            return self.value
        if X[self.feature_index] <= self.threshold:
            return self.left.predict(X)
        else:
            return self.right.predict(X)



In [111]:
# Leer los csv files

df1 = pd.read_table("DataSetNotas/notas1.csv", delimiter =",")
df1 = df1[['Examen 1 25.0%', 'Examen 2 25.0%', 'Quiz1', 'Total: ']]
df1.columns = ['E1', 'E2', 'Q1', 'T']
df1 = df1.fillna(0)

df2 = pd.read_table("DataSetNotas/notas2.csv", delimiter =";")
df2 = df2[['E1 25.00%', 'E2 25.00%', 'Q1 3.00%', 'Total: ']]
df2.columns = ['E1', 'E2', 'Q1', 'T']
df2 = df2.fillna(0)

df3 = pd.read_table("DataSetNotas/notas3.csv", delimiter =",")
df3 = df3[['Examen 1 25.00%', 'Examen 2 25.00%', 'Quiz 1', 'Total: ']]
df3.columns = ['E1', 'E2', 'Q1', 'T']
df3 = df3.fillna(0)

df4 = pd.read_table("DataSetNotas/notas4.csv", delimiter =",")
df4 = df4[['Examen 1 25.00%', 'Examen 2 25%', 'Quiz 1', 'Total: ']]
df4.columns = ['E1', 'E2', 'Q1', 'T']
df4 = df4.fillna(0)

df5 = pd.read_table("DataSetNotas/notas5.csv", delimiter =";")
df5 = df5[['Examen 1 25%', 'Examen 2 25.00%', 'Q1 3%', 'Total: ']]
df5 = df5.replace(' ', 0)
df5.columns = ['E1', 'E2', 'Q1', 'T']

frames = [df1, df2, df3, df4, df5]
allData = pd.concat(frames, ignore_index=True)
allData = allData.apply(pd.to_numeric, errors='coerce')
allData.fillna(0)
allData['Pass'] = np.where(allData['T'] >= 67.5, 1, 0)
#allData = allData.mul({'E1': 0.25, 'E2': 0.25, 'Q1': 0.03, 'T': 1, 'Pass': 1})


print(allData)

        E1    E2    Q1      T  Pass
0     57.4  75.6  93.3  70.50     1
1    100.0  96.3  86.7  95.80     1
2     19.6  46.7  26.7  44.80     0
3     34.5  34.2  36.7  28.40     0
4     89.5  87.7  93.3  91.40     1
..     ...   ...   ...    ...   ...
119   45.0  95.0  35.0  67.55     1
120  100.0  75.0  80.0  83.90     1
121   10.0   0.0  35.0   7.55     0
122   15.0   0.0  10.0   4.05     0
123   60.0   0.0  85.0  20.75     0

[124 rows x 5 columns]


In [122]:
x = allData[["E1","E2","Q1"]].to_numpy()
scaler = StandardScaler()
x = scaler.fit_transform(x)
print(x)
y = allData["Pass"].to_numpy()
print(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=45)

[[-1.35227354e-01  5.70129478e-01  8.69673337e-01]
 [ 1.66590951e+00  1.32928684e+00  6.51414823e-01]
 [-1.73341922e+00 -4.89756886e-01 -1.33275349e+00]
 [-1.10344412e+00 -9.48185244e-01 -1.00205877e+00]
 [ 1.22196732e+00  1.01388813e+00  8.69673337e-01]
 [ 8.92181701e-01 -2.69711274e-01  8.69673337e-01]
 [ 6.72324620e-01  3.68421000e-01  4.29849362e-01]
 [-2.57839957e-01 -4.75087178e-01  4.29849362e-01]
 [-1.08230401e+00 -2.20244523e+00 -2.21570838e+00]
 [-7.73658495e-01  5.04115794e-01  8.69673337e-01]
 [-3.71996519e-01  2.10721645e-01  8.69673337e-01]
 [ 4.44011496e-01  9.07532749e-01  4.29849362e-01]
 [-1.26410891e+00 -8.23492730e-01 -1.33606043e+00]
 [-7.86342558e-01 -2.20244523e+00 -8.92929510e-01]
 [ 4.31327434e-01  3.97760415e-01  9.82109541e-01]
 [-7.60350626e-02 -1.34016480e-01 -1.22410817e-01]
 [-3.97364644e-01  9.66211579e-01  1.09123880e+00]
 [-2.45155895e-01  1.23760117e+00 -1.11449497e+00]
 [-7.18694225e-01 -3.21055250e-01  2.08283901e-01]
 [ 3.97503267e-01  7.49833394e-

In [117]:
# Creación y entrenamiento del árbol de decisión
max_depth = 300
tree = Node(max_depth=max_depth)
tree.fit(X_train, y_train)

# Realización de predicciones en el conjunto de prueba
y_pred = [tree.predict(x) for x in X_test]

# Evaluación del modelo
accuracy = np.mean(y_pred == y_test)
print(f"Precisión del modelo: {accuracy:.2f}")

Precisión del modelo: 0.82
