In [1]:
import pandas as pd

# Read the pre-split training and test sets from the working directory.
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

# Training split: every column except the label is a feature.
X_train, y_train = (
    train_data.drop(columns=["Readmitted_in_30_Days"]),
    train_data["Readmitted_in_30_Days"],
)

# Test split: same feature/label separation as the training data.
X_test, y_test = (
    test_data.drop(columns=["Readmitted_in_30_Days"]),
    test_data["Readmitted_in_30_Days"],
)

# Confirm that the four objects above were created.
print("Training and testing data successfully loaded into separate objects.")

Training and testing data successfully loaded into separate objects.


In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, accuracy_score
import numpy as np

# Search space for the decision tree: 2 * 4 * 3 * 3 = 72 candidate settings.
param_grid = dict(
    criterion=["gini", "entropy"],   # split-quality measure
    max_depth=[3, 5, 10, None],      # None lets the tree grow until leaves are pure
    min_samples_split=[2, 5, 10],    # smallest node that may still be split
    min_samples_leaf=[1, 2, 4],      # smallest allowed leaf
)

# Five shuffled, stratified folds; the fixed seed keeps the splits repeatable.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Candidates are ranked by plain (unweighted) accuracy.
scorer = make_scorer(accuracy_score)

# Exhaustive search over the grid, evaluated with the folds and scorer above.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,  # run candidate fits on all available cores
)
grid_search.fit(X_train, y_train)

# Winning setting and its mean score across the CV folds.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Weighted Mean Cross-Validation Score:", grid_search.best_score_)

# best_estimator_ is the winning configuration refit on the full training set.
best_model = grid_search.best_estimator_

# Training-set metrics (an optimistic view: the model has seen these rows).
train_predictions = best_model.predict(X_train)
print(
    "\nConfusion Matrix (Training Data):\n",
    confusion_matrix(y_train, train_predictions),
)
print(
    "\nClassification Report (Training Data):\n",
    classification_report(y_train, train_predictions),
)

# Held-out test-set metrics.
test_predictions = best_model.predict(X_test)
print(
    "\nConfusion Matrix (Test Data):\n",
    confusion_matrix(y_test, test_predictions),
)
print(
    "\nClassification Report (Test Data):\n",
    classification_report(y_test, test_predictions),
)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best Weighted Mean Cross-Validation Score: 0.51375

Confusion Matrix (Training Data):
 [[327  68]
 [ 55 350]]

Classification Report (Training Data):
               precision    recall  f1-score   support

           0       0.86      0.83      0.84       395
           1       0.84      0.86      0.85       405

    accuracy                           0.85       800
   macro avg       0.85      0.85      0.85       800
weighted avg       0.85      0.85      0.85       800


Confusion Matrix (Test Data):
 [[48 51]
 [48 53]]

Classification Report (Test Data):
               precision    recall  f1-score   support

           0       0.50      0.48      0.49        99
           1       0.51      0.52      0.52       101

    accuracy                           0.51       200
   macro avg       0.50      0.50      0.50       200
weighted avg       0.50      0.51      0.50       200

In [3]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, classification_report, confusion_matrix, accuracy_score

# Standardize the features, then fit an SVM with class-balanced weights.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", SVC(random_state=42, class_weight="balanced")),
])

# Search space for the SVM step: 4 * 4 * 2 = 32 candidate settings.
# Note: gamma is only used by the rbf kernel, so for kernel="linear" the
# different gamma values produce equivalent candidates.
param_grid = dict(
    svm__C=[0.1, 1, 10, 100],          # regularization strength
    svm__gamma=[1, 0.1, 0.01, 0.001],  # kernel coefficient
    svm__kernel=["linear", "rbf"],     # kernel type
)

# Five shuffled, stratified folds; the fixed seed keeps the splits repeatable.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Candidates are ranked by plain (unweighted) accuracy.
scorer = make_scorer(accuracy_score)

# Exhaustive search over the grid, evaluated with the folds and scorer above.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,  # run candidate fits on all available cores
)
grid_search.fit(X_train, y_train)

# Winning setting and its mean score across the CV folds.
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Weighted Mean Cross-Validation Score:", grid_search.best_score_)

# best_estimator_ is the winning pipeline refit on the full training set.
best_model = grid_search.best_estimator_

# Training-set metrics (an optimistic view: the model has seen these rows).
train_predictions = best_model.predict(X_train)
print(
    "\nConfusion Matrix (Training Data):\n",
    confusion_matrix(y_train, train_predictions),
)
print(
    "\nClassification Report (Training Data):\n",
    classification_report(y_train, train_predictions),
)

# Held-out test-set metrics.
test_predictions = best_model.predict(X_test)
print(
    "\nConfusion Matrix (Test Data):\n",
    confusion_matrix(y_test, test_predictions),
)
print(
    "\nClassification Report (Test Data):\n",
    classification_report(y_test, test_predictions),
)

Best Hyperparameters: {'svm__C': 100, 'svm__gamma': 0.01, 'svm__kernel': 'rbf'}
Best Weighted Mean Cross-Validation Score: 0.51875

Confusion Matrix (Training Data):
 [[385  10]
 [  9 396]]

Classification Report (Training Data):
               precision    recall  f1-score   support

           0       0.98      0.97      0.98       395
           1       0.98      0.98      0.98       405

    accuracy                           0.98       800
   macro avg       0.98      0.98      0.98       800
weighted avg       0.98      0.98      0.98       800


Confusion Matrix (Test Data):
 [[42 57]
 [56 45]]

Classification Report (Test Data):
               precision    recall  f1-score   support

           0       0.43      0.42      0.43        99
           1       0.44      0.45      0.44       101

    accuracy                           0.43       200
   macro avg       0.43      0.43      0.43       200
weighted avg       0.43      0.43      0.43       200



In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Search space for the decision tree (2 * 3 * 3 * 3 = 54 candidates).
decision_tree_params = dict(
    criterion=["gini", "entropy"],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Search space for the SVM pipeline step (3 * 3 * 2 = 18 candidates).
svm_params = dict(
    svm__C=[0.1, 1, 10],
    svm__gamma=[0.001, 0.01, 0.1],
    svm__kernel=["linear", "rbf"],
)

# Both searches share the same five shuffled, stratified, seeded folds.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

# Tune the decision tree; fit() returns the fitted search object itself.
grid_search_decision_tree = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    decision_tree_params,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
).fit(X_train, y_train)

# The SVM is wrapped in a pipeline so the features are standardized first.
svm_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("svm", SVC(random_state=42, class_weight="balanced")),
])

grid_search_svm = GridSearchCV(
    svm_pipeline,
    svm_params,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
)

# Kept as the cell's final bare expression so the fitted search is displayed.
grid_search_svm.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svm',
                                        SVC(class_weight='balanced',
                                            random_state=42))]),
             n_jobs=-1,
             param_grid={'svm__C': [0.1, 1, 10],
                         'svm__gamma': [0.001, 0.01, 0.1],
                         'svm__kernel': ['linear', 'rbf']},
             scoring='accuracy')

In [12]:
# Pull the refit winner out of each finished grid search.
best_decision_tree, best_svm = (
    grid_search_decision_tree.best_estimator_,
    grid_search_svm.best_estimator_,
)

In [13]:
import pickle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Score both tuned models on the held-out test set.
decision_tree_predictions = best_decision_tree.predict(X_test)
svm_predictions = best_svm.predict(X_test)

# Per-model confusion matrix and per-class precision/recall/F1.
print("Decision Tree Performance on Test Set:")
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, decision_tree_predictions),
)
print(
    "\nClassification Report:\n",
    classification_report(y_test, decision_tree_predictions),
)

print("SVM Performance on Test Set:")
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, svm_predictions),
)
print(
    "\nClassification Report:\n",
    classification_report(y_test, svm_predictions),
)

# Overall test accuracy is the single number used to pick the winner.
decision_tree_accuracy = accuracy_score(y_test, decision_tree_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("\nDecision Tree Test Accuracy:", decision_tree_accuracy)
print("SVM Test Accuracy:", svm_accuracy)

# NOTE(review): the winner is chosen using test-set accuracy, so the test
# metrics reported for the chosen model are no longer an unbiased estimate.
# The tree must be strictly better to win; ties go to the SVM.
if svm_accuracy >= decision_tree_accuracy:
    best_model = best_svm
    print("\nThe SVM model is selected as the best model.")
else:
    best_model = best_decision_tree
    print("\nThe Decision Tree model is selected as the best model.")

# Persist the winner so it can be reloaded without retraining.
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

print("\nBest model saved to 'best_model.pkl'.")

Decision Tree Performance on Test Set:

Confusion Matrix:
 [[48 51]
 [48 53]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.48      0.49        99
           1       0.51      0.52      0.52       101

    accuracy                           0.51       200
   macro avg       0.50      0.50      0.50       200
weighted avg       0.50      0.51      0.50       200

SVM Performance on Test Set:

Confusion Matrix:
 [[43 56]
 [48 53]]

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.43      0.45        99
           1       0.49      0.52      0.50       101

    accuracy                           0.48       200
   macro avg       0.48      0.48      0.48       200
weighted avg       0.48      0.48      0.48       200


Decision Tree Test Accuracy: 0.505
SVM Test Accuracy: 0.48

The Decision Tree model is selected as the best model.

Best model saved to 'best_model.pkl'.

### Model Comparison: Decision Tree vs. SVM

#### 1. Overfitting/Underfitting Observations
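- **Decision Tree**:
  - **Training Accuracy**: **85%**
  - **Test Accuracy**: **51%** (mean cross-validation accuracy: **51%**)
  - The Decision Tree fits the training data far better than the test data (85% vs. 51%), which indicates **overfitting**. Its test accuracy is essentially at chance level for this nearly balanced test set (99 vs. 101 samples), so the overall performance is low.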

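- **SVM**:
  - **Training Accuracy**: **98%** (reported for the SVM tuned in `In [3]`; training accuracy was not reported for the SVM re-tuned in `In [11]`)
  - **Test Accuracy**: **43%** for the `In [3]` SVM and **48%** for the `In [11]` SVM used in the comparison above
  - The SVM model exhibits an even larger gap between training and test accuracies, suggesting significant **overfitting**. While it performs very well on the training set, it fails to generalize to unseen data.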

#### 2. Important Metrics
- **Confusion Matrices**:
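  - **Decision Tree (Test Data)**:
    - `[[48, 51], [48, 53]]` — 51 false positives and 48 false negatives. The errors are balanced across the two classes, but only about half of the predictions are correct.
  - **SVM (Test Data)**:
    - `[[43, 56], [48, 53]]` — 56 false positives and 48 false negatives for the SVM compared above (the earlier `In [3]` SVM produced `[[42, 57], [56, 45]]`). Both show poor classification with high false positive and false negative rates.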

- **Classification Reports**:
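  - **Decision Tree (Test Data)**:
    - Precision, Recall, and F1-Score are approximately **50%** for both classes, reflecting balanced but chance-level predictions.
  - **SVM (Test Data)**:
    - Precision, Recall, and F1-Score range from **43%** to **52%** (macro average **48%**) for the SVM compared above, indicating slightly worse performance than the Decision Tree. The earlier `In [3]` SVM scored about **43%** on all three metrics.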

#### 3. Conclusion
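- The **Decision Tree** is selected as the better model in this case because its test accuracy (51%) is slightly higher than the SVM's (48%) and its performance is balanced across metrics. Both models overfit the training data, and neither performs meaningfully better than chance on the test set.
- **SVM** overfits the training data and performs poorly on the test set, making it unsuitable for this problem.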

#### 4. Best Model Saved
The **Decision Tree model** has been saved as the best-performing model using the `pickle` library.


In [14]:
import pickle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Restore the model persisted by the comparison cell.
# Unpickling runs arbitrary code, so only load pickle files you trust; this
# one is the file written earlier in this notebook.
with open("best_model.pkl", "rb") as model_file:
    final_model = pickle.load(model_file)

# Predict on the held-out test features with the reloaded model.
final_predictions = final_model.predict(X_test)

# Heading and value are printed on separate lines via sep="\n".
print(
    "Confusion Matrix (Test Data):",
    confusion_matrix(y_test, final_predictions),
    sep="\n",
)

print(
    "\nClassification Report (Test Data):",
    classification_report(y_test, final_predictions),
    sep="\n",
)

# Overall accuracy, shown with two decimals.
accuracy = accuracy_score(y_test, final_predictions)
print(f"\nTest Accuracy: {accuracy:.2f}")

Confusion Matrix (Test Data):
[[48 51]
 [48 53]]

Classification Report (Test Data):
              precision    recall  f1-score   support

           0       0.50      0.48      0.49        99
           1       0.51      0.52      0.52       101

    accuracy                           0.51       200
   macro avg       0.50      0.50      0.50       200
weighted avg       0.50      0.51      0.50       200


Test Accuracy: 0.51


### Evaluation of the Final Model

#### 1. Model Performance in the Use Case
The final model was evaluated on the test dataset to predict the target variable. Based on the results:
- **Confusion Matrix**: Displays the correct and incorrect predictions.
- **Classification Report**:
  - **Precision**: Indicates the proportion of correctly identified positive cases.
  - **Recall**: Reflects the ability of the model to identify all relevant cases.
  - **F1-Score**: Balances precision and recall for a holistic view of performance.

#### 2. Observed Metrics
From the evaluation:
- **Accuracy**: The test accuracy is **51%** (0.505).
- **Precision and Recall**: Show balanced performance across classes, but specific metrics can vary (e.g., Class 0 vs. Class 1).

#### 3. Reliability and Limitations
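- The model's test metrics are consistent across both classes, but an accuracy of about 51% on a nearly balanced test set is no better than random guessing.
- **Strengths**:
  - Errors are spread evenly across the two classes rather than concentrated in one.
  - The model can be persisted and reloaded reliably: the reloaded pickle reproduces the same test metrics.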
- **Limitations**:
  - May struggle with imbalanced data if class proportions are skewed.
  - Possible overfitting/underfitting issues should be monitored in new datasets.
  - May not perform well in real-world use cases with significant noise or unseen patterns.

#### 4. Use Case Consideration
In the intended use case, the model is expected to:
- Predict outcomes with a moderate level of accuracy, suitable for applications where errors have minimal consequences.
- Be retrained periodically as more data becomes available to maintain performance.

#### Conclusion
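The model's performance on the test set is close to chance (about 51% accuracy), so it should not be relied on for predicting readmission in its current form. Better features or alternative models are needed first; after that, careful monitoring and periodic retraining are necessary to ensure reliability in production environments.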