# Exploratory Data Analysis (EDA)
## COVID-19 Pneumonia Detection Dataset

This notebook performs EDA on the training dataset. It covers the class distribution, sample images from each class, basic image statistics, and the distribution of pixel values.

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from collections import Counter

### Function to Load Image Data

In [None]:
def load_image_data(data_dir, img_size=(224, 224),
                    label_names=('Normal', 'COVID', 'Pneumonia')):
    """Load every image found under ``data_dir/<label>/`` into memory.

    Args:
        data_dir: Directory that contains one sub-directory per class label.
        img_size: Value passed to ``load_img`` as ``target_size``, so every
            image is resized to a common size and the results can be stacked.
        label_names: Names of the class sub-directories to read, in order.

    Returns:
        A tuple ``(images, labels)`` of NumPy arrays. ``images`` stacks the
        arrays produced by ``img_to_array``; ``labels`` holds the class name
        of each image, in the same order.

    Raises:
        OSError: (e.g. ``FileNotFoundError``) if a class sub-directory is
            missing, as raised by ``os.listdir``.
    """
    images = []
    labels = []

    for label in label_names:
        class_dir = os.path.join(data_dir, label)
        # os.listdir returns entries in arbitrary order; sort so that the
        # sample order is reproducible between runs and machines.
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            # Hidden files (.DS_Store, .ipynb_checkpoints, ...) and nested
            # directories are not images and would make load_img fail.
            if img_name.startswith('.') or not os.path.isfile(img_path):
                continue
            img = load_img(img_path, target_size=img_size)
            images.append(img_to_array(img))
            labels.append(label)

    return np.array(images), np.array(labels)

### Load Training Data

In [None]:
# Root of the training split. load_image_data looks for one sub-folder per
# class label ('Normal', 'COVID', 'Pneumonia') directly beneath this path.
data_dir = '../data/train/'  # Update path as necessary
# X_train: all images stacked into one array (the whole split is held in memory).
# y_train: the class-name string for each image, in the same order.
X_train, y_train = load_image_data(data_dir)

### 1. Visualize Data Distribution

In [None]:
def visualize_data_distribution(labels):
    """Draw a bar chart showing how many samples carry each class label.

    Args:
        labels: Iterable of hashable class labels, one entry per sample.
    """
    counts_by_class = Counter(labels)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.bar(
        counts_by_class.keys(),
        counts_by_class.values(),
        color=['blue', 'red', 'green'],
    )
    ax.set_title('Class Distribution of COVID-19 Pneumonia Dataset')
    ax.set_xlabel('Class Labels')
    ax.set_ylabel('Number of Images')
    plt.show()

In [None]:
# Bar chart of how many training images belong to each class label;
# a quick check for class imbalance.
visualize_data_distribution(y_train)

### 2. Display Sample Images

In [None]:
def plot_sample_images(images, labels, label_names=('Normal', 'COVID', 'Pneumonia')):
    """Show one randomly chosen image per class, side by side.

    Args:
        images: Array of images indexable by sample position. The selected
            image is cast to ``uint8`` before being displayed.
        labels: Per-image class labels, aligned with ``images``.
        label_names: Classes to display, one subplot each, left to right.
            A class with no matching sample gets a titled but empty panel.
    """
    # The element-wise comparison below needs an ndarray; a plain Python list
    # compared with a string would evaluate to a single False.
    labels = np.asarray(labels)
    plt.figure(figsize=(10, 10))

    for i, label in enumerate(label_names):
        plt.subplot(1, len(label_names), i + 1)
        plt.title(label)
        plt.axis('off')

        label_indices = np.where(labels == label)[0]
        if label_indices.size == 0:
            # np.random.choice raises ValueError on an empty array, so leave
            # this panel blank rather than aborting the whole figure.
            continue

        selected_idx = np.random.choice(label_indices)
        plt.imshow(images[selected_idx].astype('uint8'))

    plt.tight_layout()
    plt.show()

In [None]:
# Display one randomly selected training image for each class, side by side.
# The selection uses NumPy's global random state, so re-running shows different samples.
plot_sample_images(X_train, y_train)

### 3. Calculate Basic Image Statistics

In [None]:
def calculate_image_stats(images):
    """Print summary statistics for an image stack and show the average image.

    Args:
        images: Array whose first axis indexes the individual images.

    Prints the image count, the per-image dimensions, the overall mean pixel
    value, and a standard-deviation figure. Note that the latter is the
    average of the per-position standard deviations taken across images, not
    the standard deviation of all pixel values pooled together.
    """
    shape = images.shape
    print(f"Total number of images: {shape[0]}")
    print(f"Image dimensions: {shape[1:]}")

    # Collapse the image axis: one mean / std value per pixel position.
    per_position_mean = np.mean(images, axis=0)
    per_position_std = np.std(images, axis=0)

    overall_mean = per_position_mean.mean()
    averaged_std = per_position_std.mean()
    print(f"Mean pixel value: {overall_mean:.2f}")
    print(f"Standard deviation of pixel values: {averaged_std:.2f}")

    # Render the per-position mean as an image.
    _, ax = plt.subplots(figsize=(6, 6))
    ax.imshow(per_position_mean.astype('uint8'))
    ax.set_title('Average Image (Mean Pixel Value)')
    ax.axis('off')
    plt.show()

In [None]:
# Print the image count, image dimensions, mean pixel value and averaged
# per-position standard deviation for the training set, then show the mean image.
calculate_image_stats(X_train)

### 4. Plot Pixel Value Distribution

In [None]:
def plot_pixel_distribution(images, num_bins=50):
    """Plot a histogram of every pixel intensity contained in ``images``.

    Args:
        images: Array-like of pixel values; all axes are pooled together.
        num_bins: Number of histogram bins.
    """
    # np.ravel returns a view when the data is contiguous, whereas
    # ndarray.flatten() always copies, which would duplicate the whole
    # dataset in memory. It also accepts non-ndarray input.
    pixel_values = np.ravel(images)

    plt.figure(figsize=(10, 6))
    plt.hist(pixel_values, bins=num_bins, color='purple', alpha=0.75)
    plt.title('Distribution of Pixel Values')
    plt.xlabel('Pixel Intensity')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Histogram of all pixel intensities in the training set (every image and
# channel pooled together), using the default of 50 bins.
plot_pixel_distribution(X_train)