In [1]:
# ---
# ✅ Q1. What is a Decision Tree, and how does it work in classification?
# A Decision Tree is a supervised machine learning algorithm used for classification and regression tasks.
# It works by splitting the dataset into branches based on feature values.
# In classification, it asks yes/no questions to predict a class label.
# Each internal node represents a decision based on a feature,
# branches represent the outcomes, and leaves represent final class labels.

# ---
# ✅ Q2. Gini Impurity and Entropy as impurity measures
# Gini Impurity:
# Measures the likelihood of an incorrect classification.
# Gini = 1 - sum(p_i^2), where p_i is the probability of class i
# Entropy:
# Measures the amount of disorder or uncertainty.
# Entropy = -sum(p_i * log2(p_i))
# Lower impurity (Gini or Entropy) means better splits in the Decision Tree.

# ---
# ✅ Q3. Pre-Pruning vs Post-Pruning
# Pre-Pruning:
# Stops tree growth early (e.g., using max_depth, min_samples_split).
# Advantage: Prevents overfitting early.
# Post-Pruning:
# Tree is fully grown then pruned back by removing low-importance nodes.
# Advantage: Can result in a simpler, more general model.

# ---
# ✅ Q4. What is Information Gain?
# Information Gain = Reduction in entropy due to a split.
# It helps select the best feature to split the data.
# Higher information gain → better feature for decision.

# ---
# ✅ Q5. Real-world applications of Decision Trees
# Applications:
# - Medical diagnosis
# - Credit scoring
# - Fraud detection
# - Marketing predictions
# Advantages:
# - Easy to interpret
# - Handles both numerical and categorical data
# Limitations:
# - Prone to overfitting
# - Can create biased trees if class is imbalanced

# ---
# ✅ Q6. Train Decision Tree Classifier with Gini (Iris Dataset)
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load Iris and hold out 20% of the samples for testing; the fixed seed keeps
# the train/test split identical across runs.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=42
)

# Seed the tree as well: scikit-learn permutes the candidate features at each
# split, so ties between equally good splits could otherwise be broken
# differently from run to run, changing the tree and its feature importances.
clf = DecisionTreeClassifier(criterion='gini', random_state=42)
clf.fit(X_train, y_train)

# score() returns the mean accuracy on the held-out samples.
accuracy = clf.score(X_test, y_test)
importances = clf.feature_importances_
print("Accuracy:", accuracy)
print("Feature Importances:", importances)

# ---
# ✅ Q7. Compare max_depth=3 vs fully grown tree
# Uses the Iris train/test split created in Q6.
# Both trees get the same seed so that the depth limit is the only difference
# between them; unseeded trees may break ties between equally good splits
# differently, which would blur the comparison.

# Unrestricted tree: grows until the stopping defaults are reached.
clf_full = DecisionTreeClassifier(random_state=42)
clf_full.fit(X_train, y_train)

# Pre-pruned tree: growth is capped at three levels.
clf_limited = DecisionTreeClassifier(max_depth=3, random_state=42)
clf_limited.fit(X_train, y_train)

# Compare mean accuracy of the two trees on the same held-out samples.
acc_full = clf_full.score(X_test, y_test)
acc_limited = clf_limited.score(X_test, y_test)
print("Full Tree Accuracy:", acc_full)
print("Max Depth=3 Accuracy:", acc_limited)

# ---
# ✅ Q8. Decision Tree Regressor on California Housing (used in place of Boston Housing)
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# The dataset loaded here is California Housing, so the variable is named for
# what it actually holds.
housing = fetch_california_housing()

# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the Iris
# split created in Q6. Anything that needs the Iris split must run before this.
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, test_size=0.2, random_state=42
)

# Seed the regressor so the fitted tree, the MSE and the feature importances
# are reproducible from run to run.
reg = DecisionTreeRegressor(random_state=42)
reg.fit(X_train, y_train)

# Evaluate on the held-out 20% of the samples.
preds = reg.predict(X_test)
mse = mean_squared_error(y_test, preds)
print("MSE:", mse)
print("Feature Importances:", reg.feature_importances_)

# ---
# ✅ Q9. Tune max_depth and min_samples_split using GridSearchCV (Iris)
from sklearn.model_selection import GridSearchCV

# Candidate values for the two pre-pruning hyperparameters (4 x 4 = 16 combinations).
params = {'max_depth': [2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5]}

# Seed the estimator so every candidate is fitted deterministically; otherwise
# the selected parameters and score can change between runs.
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42), param_grid=params, cv=5
)

# The search runs 5-fold cross-validation over the complete Iris dataset.
grid.fit(iris.data, iris.target)

# best_score_ is the mean cross-validated score of the best parameter combination.
print("Best Parameters:", grid.best_params_)
print("Best Accuracy:", grid.best_score_)

# ---
# ✅ Q10. Healthcare use-case with Decision Tree (Step-by-step)
# Step 1: Handle Missing Values
# - Use imputation (mean for numerical, mode for categorical)
# Step 2: Encode Categorical Variables
# - Use OneHotEncoder or LabelEncoder
# Step 3: Train the Model
# - Use DecisionTreeClassifier with train_test_split
# Step 4: Hyperparameter Tuning
# - Use GridSearchCV for max_depth, min_samples_split
# Step 5: Evaluate Model
# - Use accuracy, confusion matrix, classification report
# Business Value:
# - Predict diseases early → personalized treatment
# - Reduce costs & improve patient outcomes
# - Enables scalable diagnosis support tools


Accuracy: 1.0
Feature Importances: [0.03334028 0.         0.88947325 0.07718647]
Full Tree Accuracy: 1.0
Max Depth=3 Accuracy: 1.0
MSE: 0.4965269504982073
Feature Importances: [0.52788015 0.05272698 0.0524354  0.02789796 0.02975409 0.13132227
 0.09456355 0.0834196 ]
Best Parameters: {'max_depth': 3, 'min_samples_split': 4}
Best Accuracy: 0.9733333333333334
