In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

# Reaching this line means every import above completed without raising.
print("All libraries imported successfully!")

StatementMeta(cd79b5aa-c06e-4c4c-8a02-0fbc73d1aae4, 1, 6, Finished, Available, Finished)

2026-02-13 12:49:43.117025: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow Version: 2.15.0


In [None]:
# Toy dataset: ten students, one row each.
#   StudyHours    - hours studied (1 through 10)
#   PrevExamScore - score on the previous exam
#   Pass          - label, 0 = Fail for the first five rows, 1 = Pass for the last five
data = {
    'StudyHours': list(range(1, 11)),
    'PrevExamScore': [30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    'Pass': [0] * 5 + [1] * 5,
}

# Tabular view of the same records, then a quick look at contents and size
df = pd.DataFrame(data)
print("Dataset Summary:", df, sep="\n")
print(f"\nDataset shape: {df.shape}")

In [None]:
# Features (X) and Target (y)
X = df[['StudyHours', 'PrevExamScore']]  # Features
y = df['Pass']  # Target variable (0 = Fail, 1 = Pass)

# Split data into 80% training and 20% testing.
# With only 10 rows the test split holds 2 samples; stratify=y guarantees it
# contains one Fail and one Pass instead of relying on the seed to avoid a
# single-class test set (which would make the confusion matrix and
# classification report degenerate).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training data: {X_train.shape}, {y_train.shape}")
print(f"Testing data: {X_test.shape}, {y_test.shape}")
print("\nTraining set:")
print(X_train)
print("Training target:")
print(y_train.values)

In [None]:
# Fit a seeded logistic-regression classifier on the training split
# (fit() returns the estimator itself, so construction and training chain).
logreg_model = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predict the held-out samples and score them against the true labels
y_pred_logreg = logreg_model.predict(X_test)
accuracy_logreg = accuracy_score(y_true=y_test, y_pred=y_pred_logreg)

# Report: banner, accuracy, confusion matrix, per-class precision/recall/F1
print("=" * 60, "LOGISTIC REGRESSION MODEL", "=" * 60, sep="\n")
print(f"Accuracy: {accuracy_logreg:.4f}")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_logreg), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_logreg), sep="\n")

In [None]:
# Fit a seeded, depth-unrestricted decision tree on the training split
# (fit() returns the estimator itself, so construction and training chain).
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out samples and score them against the true labels
y_pred_tree = tree_model.predict(X_test)
accuracy_tree = accuracy_score(y_true=y_test, y_pred=y_pred_tree)

# Report: banner, accuracy, confusion matrix, per-class precision/recall/F1
print("=" * 60, "DECISION TREE MODEL", "=" * 60, sep="\n")
print(f"Accuracy: {accuracy_tree:.4f}")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_tree), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_tree), sep="\n")

In [None]:
# Compare model performance
print("=" * 60)
print("MODEL PERFORMANCE COMPARISON")
print("=" * 60)
print(f"\nLogistic Regression Accuracy: {accuracy_logreg:.4f}")
print(f"Decision Tree Accuracy:       {accuracy_tree:.4f}")

# The gap is an absolute difference between two accuracies, so it is reported
# in percentage points (a bare "%" would read as a relative improvement).
if accuracy_logreg > accuracy_tree:
    print(f"\n✓ Logistic Regression performs better by {(accuracy_logreg - accuracy_tree)*100:.2f} percentage points")
elif accuracy_tree > accuracy_logreg:
    print(f"\n✓ Decision Tree performs better by {(accuracy_tree - accuracy_logreg)*100:.2f} percentage points")
else:
    print("\n✓ Both models have equal accuracy")

# Create comparison visualization: accuracy bars (left) and per-sample predictions (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

models = ['Logistic\nRegression', 'Decision\nTree']
accuracies = [accuracy_logreg, accuracy_tree]
colors = ['#3498db', '#e74c3c']

axes[0].bar(models, accuracies, color=colors, alpha=0.7, edgecolor='black', linewidth=2)
axes[0].set_ylabel('Accuracy', fontsize=12)
axes[0].set_title('Model Accuracy Comparison', fontsize=13, fontweight='bold')
# Headroom above 1.0 so the value label of a perfect-accuracy bar (drawn at
# acc + 0.02) stays inside the axes instead of colliding with the title.
axes[0].set_ylim([0, 1.1])
axes[0].grid(axis='y', alpha=0.3)

# Add accuracy labels on bars
for i, acc in enumerate(accuracies):
    axes[0].text(i, acc + 0.02, f'{acc:.4f}', ha='center', fontweight='bold')

# Plot prediction results
x_range = np.arange(len(y_test))
axes[1].plot(x_range, y_test.values, 'ko-', linewidth=2, markersize=8, label='Actual', alpha=0.7)
axes[1].plot(x_range, y_pred_logreg, 's--', linewidth=2, markersize=6, label='LogReg Pred', alpha=0.7)
axes[1].plot(x_range, y_pred_tree, '^--', linewidth=2, markersize=6, label='Tree Pred', alpha=0.7)
axes[1].set_xlabel('Test Sample Index', fontsize=12)
axes[1].set_ylabel('Prediction (0=Fail, 1=Pass)', fontsize=12)
axes[1].set_title('Predictions on Test Set', fontsize=13, fontweight='bold')
# Sample indices and class labels are integers; pin the ticks so matplotlib
# does not invent fractional positions such as 0.25 or 0.5.
axes[1].set_xticks(x_range)
axes[1].set_yticks([0, 1])
axes[1].legend(fontsize=10)
axes[1].grid(alpha=0.3)
axes[1].set_ylim([-0.1, 1.1])

plt.tight_layout()
plt.show()

print("\nVisualization complete!")

In [None]:
# Render the fitted tree: one box per node, coloured by majority class,
# with human-readable feature and class labels.
plt.figure(figsize=(14, 8))
tree.plot_tree(
    tree_model,
    feature_names=['StudyHours', 'PrevExamScore'],
    class_names=['Fail', 'Pass'],
    filled=True,
    rounded=True,
    fontsize=10,
    ax=plt.gca(),  # the axes of the figure just created (what plot_tree would pick by default)
)
plt.gca().set_title(
    'Decision Tree Structure for Pass/Fail Classification',
    fontsize=14,
    fontweight='bold',
    pad=20,
)
plt.tight_layout()
plt.show()

# Size summary of the fitted tree
print(
    "\nDecision Tree Visualization Complete!",
    f"Tree Depth: {tree_model.get_depth()}",
    f"Number of Leaves: {tree_model.get_n_leaves()}",
    sep="\n",
)

In [None]:
# Tuning Decision Tree to prevent overfitting
print("=" * 60)
print("DECISION TREE TUNING FOR OVERFITTING PREVENTION")
print("=" * 60)

# Candidate depths. max_depth is selected by cross-validation on the TRAINING
# split only. The test split is still scored and plotted for reference, but it
# is never used to pick the depth: choosing a hyperparameter by test accuracy
# leaks the test set into model selection and inflates the reported score.
depths = range(1, 6)
train_accuracies = []
cv_accuracies = []
test_accuracies = []

# Stratified k-fold needs at least 2 folds and should not exceed the number of
# training samples in the rarest class.
n_folds = max(2, min(5, int(y_train.value_counts().min())))

for depth in depths:
    tuned_tree = DecisionTreeClassifier(max_depth=depth, random_state=42)

    # cross_val_score clones the estimator, so tuned_tree itself stays unfitted here
    cv_acc = cross_val_score(tuned_tree, X_train, y_train, cv=n_folds, scoring='accuracy').mean()

    tuned_tree.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, tuned_tree.predict(X_train))
    test_acc = accuracy_score(y_test, tuned_tree.predict(X_test))

    train_accuracies.append(train_acc)
    cv_accuracies.append(cv_acc)
    test_accuracies.append(test_acc)

    print(f"Depth {depth}: Train Acc = {train_acc:.4f}, CV Acc = {cv_acc:.4f}, Test Acc = {test_acc:.4f}")

# Plot tuning results
plt.figure(figsize=(11, 5))
plt.plot(depths, train_accuracies, 'o-', linewidth=2, markersize=8, label='Training Accuracy', color='#2ecc71')
plt.plot(depths, cv_accuracies, '^-', linewidth=2, markersize=8, label=f'CV Accuracy ({n_folds}-fold, train only)', color='#3498db')
plt.plot(depths, test_accuracies, 's-', linewidth=2, markersize=8, label='Testing Accuracy', color='#e74c3c')
plt.xlabel('Tree Depth', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Decision Tree Tuning: Impact of Max Depth on Model Performance', fontsize=13, fontweight='bold')
plt.xticks(depths)
plt.ylim([0, 1.05])
plt.grid(alpha=0.3)
plt.legend(fontsize=11)
plt.tight_layout()
plt.show()

# Find optimal depth: highest cross-validated accuracy; np.argmax returns the
# first maximum, so ties resolve to the shallowest (simplest) tree.
best_idx = int(np.argmax(cv_accuracies))
optimal_depth = depths[best_idx]
print(f"\n✓ Optimal max_depth: {optimal_depth} "
      f"(CV Accuracy: {cv_accuracies[best_idx]:.4f}, Test Accuracy: {test_accuracies[best_idx]:.4f})")

In [None]:
# Reflection and Analysis
print("\n" + "=" * 60)
print("REFLECTION AND ANALYSIS")
print("=" * 60)

# Report template. The {placeholders} are filled by str.format() at the bottom
# of this cell from the accuracies measured earlier in the notebook.
analysis = """
FINDINGS FROM THE PRACTICAL EXERCISE:
=====================================

1. MODEL PERFORMANCE COMPARISON
   - Logistic Regression Accuracy: {logreg_accuracy:.4f}
   - Decision Tree Accuracy:       {tree_accuracy:.4f}
   - Best Performer:               {best_model}

2. KEY OBSERVATIONS:
   
   a) Logistic Regression:
      ✓ Pros:
        • Simple and interpretable model
        • Efficient training and prediction
        • Assumes linear relationship between features and target
        • Less prone to overfitting on small datasets
        • Works well with this dataset (linear separability)
      
      ✗ Cons:
        • Limited flexibility for complex patterns
        • Cannot capture non-linear relationships
        • May underfit on complex datasets
   
   b) Decision Tree:
      ✓ Pros:
        • Can capture non-linear relationships
        • Highly interpretable (easy to understand decision paths)
        • No feature scaling required
        • Can handle both numerical and categorical data
        • Provides feature importance information
      
      ✗ Cons:
        • Prone to overfitting, especially with deeper trees
        • Training complexity O(n*log n) for sorting features
        • Small changes in data can lead to completely different trees
        • Greedy approach may not find truly optimal splits

3. WHEN TO USE EACH MODEL:
   
   Use Logistic Regression when:
   - You need a fast, interpretable model
   - Your data has linear decision boundaries
   - You have limited training data
   - You need to minimize overfitting risk
   - Features have clear linear relationships with the target
   
   Use Decision Trees when:
   - You need to capture complex, non-linear patterns
   - Interpretability of decision paths is important
   - You have sufficient data to prevent overfitting
   - You need to handle mixed feature types
   - You want automatic feature interaction discovery

4. DATA COMPLEXITY IMPACT:
   - With the simple, linearly separable dataset used here,
     Logistic Regression performs competitively
   - Decision Trees don't need feature scaling
   - The small dataset size limits tree depth before overfitting occurs
   - Feature importances were not compared in this exercise; inspect
     tree_model.feature_importances_ and logreg_model.coef_ before
     drawing conclusions about which feature matters more

5. RECOMMENDATIONS:
   - For this particular problem (pass/fail prediction):
     The data is {recommendation_choice}, so {recommendation_detail}.
   - Always use cross-validation for more robust evaluation
   - Consider ensemble methods (Random Forest, Gradient Boosting)
     that combine multiple decision trees
   - Scale features before using Logistic Regression for better results
   - Monitor for overfitting by comparing train/test accuracies

CONCLUSION:
===========
Both Logistic Regression and Decision Trees have their place in
machine learning. The choice depends on your specific problem, data
characteristics, and requirements. Start simple with Logistic Regression,
and move to more complex models like Decision Trees or ensembles if needed.
"""

# Best performer: report a tie explicitly instead of silently crediting
# Logistic Regression, so this section agrees with the comparison cell's
# "Both models have equal accuracy" branch.
if accuracy_logreg > accuracy_tree:
    best_model = "Logistic Regression"
elif accuracy_tree > accuracy_logreg:
    best_model = "Decision Tree"
else:
    best_model = "Tie (both models have equal accuracy)"

# Recommendation: on a tie the simpler model is still preferred, hence >=.
recommendation_choice = "linearly separable" if accuracy_logreg >= accuracy_tree else "complex"
recommendation_detail = ("use Logistic Regression for simplicity and efficiency" 
                        if accuracy_logreg >= accuracy_tree 
                        else "Decision Tree captures the patterns better")

print(analysis.format(
    logreg_accuracy=accuracy_logreg,
    tree_accuracy=accuracy_tree,
    best_model=best_model,
    recommendation_choice=recommendation_choice,
    recommendation_detail=recommendation_detail
))