# 1. Predicting Heart Disease Using a Classification Tree

## 1.1 Data Cleaning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sns
from econml.grf import CausalForest
from sklearn.linear_model import LinearRegression

In [None]:
# Purpose of this cell: load the Cleveland heart-disease file, clean it,
# one-hot encode the categorical columns, fit a classification tree on an
# 80/20 stratified split, then print test metrics and plot the fitted tree.

# --- Load data ---
# The file has no header row, so column names are supplied here. "?" is the
# file's missing-value marker; parsing it as NaN at read time avoids a
# separate replace step.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the notebook runs on other machines.
data = pd.read_csv(
    "C:/Users/VICTOR/Documents/GitHub/Decision_Trees/R/input/processed.cleveland.data",
    header=None,
    names=['age', 'sex', 'cp', 'restbp', 'chol', 'fbs',
           'restecg', 'thalach', 'exang', 'oldpeak',
           'slope', 'ca', 'thal', 'hd'],
    na_values="?",
)

# --- Drop rows with missing values ---
data = data.dropna()
# Kept as a guard: any remaining non-numeric token raises here instead of
# silently leaving an object-typed column behind.
data = data.apply(pd.to_numeric)

# --- Binary disease indicator: 1 when hd > 0, else 0 ---
data['y'] = np.where(data['hd'] > 0, 1, 0)

# --- Categorical variables ---
categorical_vars = ["cp", "restecg", "slope", "ca", "thal", "hd"]
data = data.astype({var: 'category' for var in categorical_vars})

# --- Create dummy variables ---
# 'hd' is still encoded so that `data` keeps the same columns as before;
# its dummies are excluded from the model features below.
encoder = ce.OneHotEncoder(cols=categorical_vars, drop_invariant=True, use_cat_names=True)
data = encoder.fit_transform(data)

# --- Train / test split (stratified on the binary target) ---
train, test = train_test_split(data, test_size=0.2, random_state=123, stratify=data['y'])

# --- Classification tree model ---
# 'y' is computed directly from 'hd', so every column derived from 'hd'
# (the raw column, if present, and its one-hot dummies) must be removed
# from the features. Otherwise the tree reads the label off the inputs and
# the test metrics are meaningless.
target_derived_cols = [
    col for col in data.columns
    if col in ('y', 'hd') or str(col).startswith('hd_')
]

X_train = train.drop(columns=target_derived_cols)
y_train = train['y']
X_test = test.drop(columns=target_derived_cols)
y_test = test['y']

tree_model = DecisionTreeClassifier(random_state=123)
tree_model.fit(X_train, y_train)
predictions = tree_model.predict(X_test)

# --- Metrics ---
print("Accuracy:", accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# --- Plot the fitted tree ---
plt.figure(figsize=(20, 8))
# A plain list of names is passed rather than a pandas Index, which some
# scikit-learn releases reject during parameter validation.
plot_tree(tree_model, filled=True, feature_names=list(X_train.columns), class_names=["No HD", "HD"])
plt.show()

## 1.2 Data Analysis

# 2. Causal Forest 