In [33]:
#to:
#Resize every image to 150x150.
#Normalize pixel values between 0-1.
#Label images (0 = NORMAL, 1 = PNEUMONIA).
#Store everything inside NumPy arrays — ready for ML or DL models!

In [37]:
import os
import cv2
import numpy as np

def load_images_from_folder(folder_path):
    images = []
    labels = []
    for label in os.listdir(folder_path):
        label_path = os.path.join(folder_path, label)
        if os.path.isdir(label_path):
            for image_filename in os.listdir(label_path):
                image_path = os.path.join(label_path, image_filename)
                img = cv2.imread(image_path)
                if img is not None:
                    img = cv2.resize(img, (150, 150))  # Resize all images to 150x150
                    images.append(img)
                    labels.append(0 if label == 'NORMAL' else 1)  # Normal=0, Pneumonia=1
    return np.array(images), np.array(labels)


In [39]:
# Update these paths with your correct local path
train_path = r'C:\Users\Chaithanya\Downloads\archive (2)\chest_xray\train'
val_path = r'C:\Users\Chaithanya\Downloads\archive (2)\chest_xray\val'
test_path = r'C:\Users\Chaithanya\Downloads\archive (2)\chest_xray\test'

# Loading datasets
X_train, y_train = load_images_from_folder(train_path)
X_val, y_val = load_images_from_folder(val_path)
X_test, y_test = load_images_from_folder(test_path)

print("Train images:", X_train.shape)
print("Validation images:", X_val.shape)
print("Test images:", X_test.shape)


Train images: (5216, 150, 150, 3)
Validation images: (16, 150, 150, 3)
Test images: (624, 150, 150, 3)


In [None]:
# # output:
# Training images shape: (5216, 150, 150, 3) ✅ (5216 images, each 150x150 size, 3 color channels — RGB)
# Training labels shape: (5216,) ✅(One label for each image)
#Means — your image preprocessing is fully successful 



In [43]:
# ML models like SVM, Random Forest, etc. cannot directly work with 3D images like (150,150,3).
# We need numerical features to feed into ML models like SVM, KNN, Random Forest.)
# Since we already resized images to 150x150 during preprocessing, we can flatten them into a 1D vector of numbers.
# Each image = 150 × 150 × 3 = 67,500 features (pixels)
# We will flatten the 3D image into a 1D vector.

# flattening?
# Traditional ML models (SVM, RF, KNN) expect tabular data (rows × columns)
# Each row = 1 image, each column = 1 pixel value
# Deep Learning (CNNs) can work with 3D images, but ML models cannot.

In [45]:
# Flatten the images into 1D vectors
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

print("Flattened Train shape:", X_train_flat.shape)
print("Flattened Validation shape:", X_val_flat.shape)
print("Flattened Test shape:", X_test_flat.shape)


Flattened Train shape: (5216, 67500)
Flattened Validation shape: (16, 67500)
Flattened Test shape: (624, 67500)


In [None]:
# Precision: How many of the predicted positive cases were actually positive.
# Recall: How many of the actual positive cases were correctly identified.
# F1-Score: The harmonic mean of precision and recall, giving a balance between the two.
# Accuracy: The overall percentage of correctly classified images.
# support:refers to the number of actual occurrences (samples) of each class in your dataset.

In [None]:
# Train Traditional Machine Learning Models
# Training SVM with the training data.
# Predicting on the validation and test sets.
# Evaluating the performance (accuracy, precision, recall, F1-score).



In [68]:
# Importing necessary libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Flattening image data (already done in your previous steps)
X_train_flat = X_train.reshape(X_train.shape[0], -1)  # Flatten images into 1D vectors for training
X_test_flat = X_test.reshape(X_test.shape[0], -1)    # Flatten images into 1D vectors for testing
X_val_flat = X_val.reshape(X_val.shape[0], -1)       # Flatten images into 1D vectors for validation

# SVM Model training
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_flat, y_train)  # Train the model with flattened images and labels

# Make predictions
y_val_pred = svm_model.predict(X_val_flat)  # Predict labels for the validation set
y_test_pred = svm_model.predict(X_test_flat)  # Predict labels for the test set

# Function to print compact classification report
def print_compact_classification_report(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    
    print(f"{'Class':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Support':<10}")
    for class_label in [0, 1]:
        class_report = report[str(class_label)]
        print(f"{class_label:<10} {class_report['precision']:<10.2f} {class_report['recall']:<10.2f} {class_report['f1-score']:<10.2f} {int(class_report['support']):<10}")

# Validation Accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"\nValidation Accuracy: {val_accuracy:.2f}")
print("\nClassification Report (Validation):")
print_compact_classification_report(y_val, y_val_pred)

# Test Accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"\nTest Accuracy: {test_accuracy:.2f}")
print("\nClassification Report (Test):")
print_compact_classification_report(y_test, y_test_pred)



Validation Accuracy: 0.81

Classification Report (Validation):
Class      Precision  Recall     F1-Score   Support   
0          1.00       0.62       0.77       8         
1          0.73       1.00       0.84       8         

Test Accuracy: 0.75

Classification Report (Test):
Class      Precision  Recall     F1-Score   Support   
0          0.97       0.35       0.52       234       
1          0.72       0.99       0.83       390       


In [None]:
 # Support Vector Machine (SVM):
# Validation Accuracy: 81.25%
# Test Accuracy: 75.32%
# Key Insights:
# The SVM model has high precision and recall for class 1 (pneumonia), meaning it's good at identifying pneumonia cases.
# For class 0 (normal), the model shows relatively poor recall, meaning it misses many normal cases.
# Balanced performance overall, especially with a higher F1 score for class 1.

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Function to print classification report in a compact format
def print_compact_classification_report(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    print(f"{'Class':<10}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}{'Support':<10}")
    for class_label in sorted(report.keys()):
        if class_label in ['0', '1']:  # Only print actual class results, skip 'accuracy', 'macro avg', etc.
            class_report = report[class_label]
            print(f"{class_label:<10}{class_report['precision']:<10.2f}{class_report['recall']:<10.2f}{class_report['f1-score']:<10.2f}{class_report['support']:<10}")

# Initialize Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train_flat, y_train)

# Predict on validation set
y_val_pred_rf = rf_model.predict(X_val_flat)

# Evaluate the model - Validation
val_accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
print(f"\nValidation Accuracy (RF): {val_accuracy_rf:.2f}")
print("\nClassification Report (Validation - RF):")
print_compact_classification_report(y_val, y_val_pred_rf)

# Predict on test set
y_test_pred_rf = rf_model.predict(X_test_flat)

# Evaluate on test set
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print(f"\nTest Accuracy (RF): {test_accuracy_rf:.2f}")
print("\nClassification Report (Test - RF):")
print_compact_classification_report(y_test, y_test_pred_rf)



Validation Accuracy (RF): 0.62

Classification Report (Validation - RF):
Class     Precision Recall    F1-Score  Support   
0         1.00      0.25      0.40      8.0       
1         0.57      1.00      0.73      8.0       

Test Accuracy (RF): 0.76

Classification Report (Test - RF):
Class     Precision Recall    F1-Score  Support   
0         0.97      0.38      0.54      234.0     
1         0.73      0.99      0.84      390.0     


In [None]:
# Validation Accuracy: 62.5%
# Test Accuracy: 76.03%
# Performs very well in detecting pneumonia (class 1) with high recall.
# Struggles to correctly identify normal cases (class 0) — high precision but poor recall.
# Better overall test performance than SVM but with imbalanced class predictions.
# Main issue: Many normal cases are misclassified as pneumonia.

In [72]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train_flat, y_train)

# Predict on validation set
y_val_pred_knn = knn_model.predict(X_val_flat)

# Predict on test set
y_test_pred_knn = knn_model.predict(X_test_flat)

# Function to print compact classification report
def print_compact_classification_report(y_true, y_pred):
    report = classification_report(y_true, y_pred, output_dict=True)
    print(f"{'Class':<10}{'Precision':<10}{'Recall':<10}{'F1-Score':<10}{'Support':<10}")
    for class_label in sorted(report.keys()):
        if class_label in ['0', '1']:  # Only for classes 0 and 1
            class_report = report[class_label]
            print(f"{class_label:<10}{class_report['precision']:<10.2f}{class_report['recall']:<10.2f}{class_report['f1-score']:<10.2f}{int(class_report['support']):<10}")

# Validation Accuracy
val_accuracy_knn = accuracy_score(y_val, y_val_pred_knn)
print(f"Validation Accuracy (KNN): {val_accuracy_knn:.2f}")

# Classification Report for Validation
print("\nClassification Report (Validation - KNN):")
print_compact_classification_report(y_val, y_val_pred_knn)

# Test Accuracy
test_accuracy_knn = accuracy_score(y_test, y_test_pred_knn)
print(f"\nTest Accuracy (KNN): {test_accuracy_knn:.2f}")

# Classification Report for Test
print("\nClassification Report (Test - KNN):")
print_compact_classification_report(y_test, y_test_pred_knn)


Validation Accuracy (KNN): 0.50

Classification Report (Validation - KNN):
Class     Precision Recall    F1-Score  Support   
0         0.00      0.00      0.00      8         
1         0.50      1.00      0.67      8         

Test Accuracy (KNN): 0.74

Classification Report (Test - KNN):
Class     Precision Recall    F1-Score  Support   
0         0.99      0.30      0.46      234       
1         0.70      1.00      0.83      390       


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Validation Accuracy: 50%
# Test Accuracy: 74%
# Good at detecting pneumonia (Class 1) — high recall.
# Very poor at detecting normal cases (Class 0) — low precision and recall.
# Strong bias towards predicting pneumonia.
# Misclassifies many healthy patients as pneumonia.
# Overall: Decent test accuracy but imbalanced performance.