<a href="https://colab.research.google.com/github/drgnhunter/bioFusionGoogleColab/blob/main/biofusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas

In [None]:
pip install numpy

In [None]:
# Starter cell: load the numeric/dataframe libraries and print every file
# found below /kaggle/input.
# NOTE(review): the path is Kaggle-specific; when that directory is absent
# os.walk yields nothing and the cell prints no paths.

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Walk the tree top-down and emit one full path per file.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
pip install torch torchvision

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Training preprocessing: resize to 224x224, randomly flip horizontally
# (augmentation), convert to a tensor and normalise each channel with the
# mean/std constants below. The 224x224 size matters for this notebook's
# MedicalCNN: its first Linear layer expects 64 * 28 * 28 features, i.e. a
# 224x224 input halved by three 2x2 poolings.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),  # augmentation: training data only
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Evaluation preprocessing: identical resize/normalisation but WITHOUT the
# random flip, so validation images are processed deterministically and the
# validation metrics are repeatable from run to run.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# ImageFolder derives the integer label of each image from its sub-folder.
# The relative paths resolve against the current working directory.
train_dataset = datasets.ImageFolder('chest_xray/train', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Validation split: no shuffling and no random augmentation.
val_dataset = datasets.ImageFolder('chest_xray/val', transform=eval_transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
# This import is only available where the google.colab package exists.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch.nn as nn

class MedicalCNN(nn.Module):
    """Three-stage convolutional classifier with a single sigmoid output.

    `features` stacks Conv(3x3, padding=1) -> ReLU -> MaxPool(2) three times
    with 16, 32 and 64 output channels. `classifier` flattens the result,
    applies Linear(64*28*28 -> 128), ReLU, Dropout(0.5), Linear(128 -> 1) and
    a Sigmoid, so the forward pass returns one value in [0, 1] per image.

    The hard-coded 64 * 28 * 28 means the network expects 3-channel 224x224
    inputs (224 halved by each of the three poolings gives 28).
    """

    def __init__(self):
        super().__init__()

        # Build the convolutional stages in a loop; layer order (and therefore
        # the state_dict key numbering) is conv, relu, pool per stage.
        stage_layers = []
        in_channels = 3
        for out_channels in (16, 32, 64):
            stage_layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2),
            ])
            in_channels = out_channels
        self.features = nn.Sequential(*stage_layers)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 28 * 28, 128),
            nn.ReLU(),
            nn.Dropout(p=0.5),  # regularisation, active only in train mode
            nn.Linear(128, 1),
            nn.Sigmoid(),  # squashes the single output into [0, 1]
        )

    def forward(self, x):
        """Run the feature extractor, then the classifier head, on batch `x`."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)


# Module-level instance used by the training and evaluation cells.
model = MedicalCNN()

In [None]:
import torch.optim as optim

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(15):
    model.train()  # enable training behaviour (e.g. dropout) for this epoch
    for images, labels in train_loader:
        # Labels arrive as a 1-D integer tensor; BCELoss needs float targets
        # with the same (batch, 1) shape as the model output.
        targets = labels.float().unsqueeze(1)

        # Clear stale gradients, then forward -> loss -> backward -> update.
        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} complete.")

In [None]:
import torch

# Destination file for the trained weights (relative to the working directory).
model_save_path = "pneumoniaDetectorModel.pth"

# Only the state_dict (parameter/buffer tensors) is written, not the module
# object itself, so loading requires re-creating the model class first.
torch.save(obj=model.state_dict(), f=model_save_path)

print(f"Model successfully saved to {model_save_path}")

In [None]:
pip install scikit-learn seaborn matplotlib

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Evaluation preprocessing: the same resize and per-channel normalisation
# constants as the training pipeline, with no random augmentation.
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Test split, read from 'chest_xray/test' relative to the working directory;
# ImageFolder takes each image's label from its sub-folder.
test_dataset = datasets.ImageFolder(root='chest_xray/test', transform=test_transform)

# Fixed order (shuffle=False) so predictions line up with dataset order.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

print(f"Test loader defined with {len(test_dataset)} images.")

In [None]:
import torch
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate the trained model on the test loader and print accuracy plus the
# per-class precision/recall/F1 report.
model.eval()  # inference behaviour (e.g. dropout disabled)
all_preds = []
all_labels = []

with torch.no_grad():  # no gradients needed for evaluation
    for images, labels in test_loader:
        outputs = model(images)
        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
        preds = (outputs > 0.5).float()

        # The model emits one value per image with a trailing singleton
        # dimension. Flatten before collecting so predictions form a flat list
        # matching the labels, instead of a list of 1-element arrays that
        # scikit-learn would receive as a column vector.
        all_preds.extend(preds.reshape(-1).tolist())
        all_labels.extend(labels.tolist())

# Overall accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
# NOTE(review): target_names assumes label 0 is Normal and 1 is Pneumonia;
# confirm against test_dataset.class_to_idx.
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['Normal', 'Pneumonia']))

In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Inference mode: layers such as Dropout switch to their evaluation behaviour.
model.eval()
y_true, y_pred = [], []

# Collect ground-truth labels and thresholded predictions batch by batch.
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        scores = model(batch_images)
        hard_preds = (scores > 0.5).float().reshape(-1)  # flatten to 1-D

        y_true += batch_labels.tolist()
        y_pred += hard_preds.tolist()

# NOTE(review): the class names below assume label 0 is Normal and label 1 is
# Pneumonia; confirm against the dataset's class_to_idx.
class_names = ['Normal', 'Pneumonia']

# 1. Accuracy and per-class precision / recall / F1
test_accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
report_text = classification_report(y_true, y_pred, target_names=class_names)
print("\nDetailed Report:\n", report_text)

# 2. Confusion matrix heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix - Pneumonia Detection Model')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Steps shared by both pipelines. The transform objects hold no per-image
# state, so the same instances are reused in each Compose.
# NOTE(review): the mean/std values match the commonly published ImageNet
# channel statistics; confirm they are the intended choice for these images.
resize_224 = transforms.Resize((224, 224))
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# 1. Training pipeline with augmentation: rotation of up to 15 degrees, a
#    random crop covering 80-100% of the area resized back to 224, random
#    horizontal flip, and brightness/contrast jitter of up to 0.2.
train_transform = transforms.Compose([
    resize_224,
    transforms.RandomRotation(degrees=15),
    transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    normalize,
])

# 2. Deterministic pipeline for the validation and test splits.
test_transform = transforms.Compose([resize_224, transforms.ToTensor(), normalize])

# Datasets: one ImageFolder per split, labels taken from the sub-folders.
train_dataset = datasets.ImageFolder(root='chest_xray/train', transform=train_transform)
val_dataset = datasets.ImageFolder(root='chest_xray/val', transform=test_transform)
test_dataset = datasets.ImageFolder(root='chest_xray/test', transform=test_transform)

# Loaders: only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Inference mode: layers such as Dropout switch to their evaluation behaviour.
model.eval()
y_true, y_pred = [], []

# Collect ground-truth labels and thresholded predictions batch by batch.
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        scores = model(batch_images)
        hard_preds = (scores > 0.5).float().reshape(-1)  # flatten to 1-D

        y_true += batch_labels.tolist()
        y_pred += hard_preds.tolist()

# NOTE(review): the class names below assume label 0 is Normal and label 1 is
# Pneumonia; confirm against the dataset's class_to_idx.
class_names = ['Normal', 'Pneumonia']

# 1. Accuracy and per-class precision / recall / F1
test_accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
report_text = classification_report(y_true, y_pred, target_names=class_names)
print("\nDetailed Report:\n", report_text)

# 2. Confusion matrix heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix - Pneumonia Detection Model')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from google.colab import files

# Open the Colab upload dialog. The result is assigned rather than left as the
# cell's last expression: upload() returns the uploaded files' contents, and
# echoing that into the cell output would store the contents of kaggle.json
# (the API credentials copied by a later cell) inside the saved notebook.
uploaded_files = files.upload()

# Show only the file names as confirmation.
print(f"Uploaded: {list(uploaded_files)}")

In [None]:
# Create ~/.kaggle in the home directory; -p makes this a no-op when the
# directory already exists.
!mkdir -p ~/.kaggle # creating .kaggle folder where the key should be placed


In [None]:
# Copy kaggle.json from the current working directory into ~/.kaggle/.
# Note: `cp` copies, so the original file stays in the working directory.
!cp kaggle.json ~/.kaggle/ # move the key to the folder


In [None]:
# Print the working directory, i.e. where the relative paths used in this
# notebook (kaggle.json, the downloaded zip, chest_xray/...) are resolved.
!pwd # checking the present working directory



In [None]:
# Restrict the copied key file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# List datasets through the Kaggle CLI; nothing below uses the output.
# NOTE(review): presumably a check that the CLI and credentials work.
!kaggle datasets list



In [None]:
# Download the paultimothymooney/chest-xray-pneumonia dataset with the Kaggle
# CLI. The next cell unzips chest-xray-pneumonia.zip from the working directory.
!kaggle datasets download -d paultimothymooney/chest-xray-pneumonia

In [None]:
!unzip -q chest-xray-pneumonia.zip -d .

In [None]:
# Starter cell: load the numeric/dataframe libraries and print every file
# found below /kaggle/input.
# NOTE(review): the path is Kaggle-specific; when that directory is absent
# os.walk yields nothing and the cell prints no paths.

import os

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Walk the tree top-down and emit one full path per file.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Training preprocessing: resize to 224x224, randomly flip horizontally
# (augmentation), convert to a tensor and normalise each channel with the
# mean/std constants below. The 224x224 size matters for this notebook's
# MedicalCNN: its first Linear layer expects 64 * 28 * 28 features, i.e. a
# 224x224 input halved by three 2x2 poolings.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),  # augmentation: training data only
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Evaluation preprocessing: identical resize/normalisation but WITHOUT the
# random flip, so validation images are processed deterministically and the
# validation metrics are repeatable from run to run.
eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# ImageFolder derives the integer label of each image from its sub-folder.
# The relative paths resolve against the current working directory.
train_dataset = datasets.ImageFolder('chest_xray/train', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Validation split: no shuffling and no random augmentation.
val_dataset = datasets.ImageFolder('chest_xray/val', transform=eval_transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
import torch.nn as nn

class MedicalCNN(nn.Module):
    """Three-stage convolutional classifier with a single sigmoid output.

    `features` stacks Conv(3x3, padding=1) -> ReLU -> MaxPool(2) three times
    with 16, 32 and 64 output channels. `classifier` flattens the result,
    applies Linear(64*28*28 -> 128), ReLU, Dropout(0.5), Linear(128 -> 1) and
    a Sigmoid, so the forward pass returns one value in [0, 1] per image.

    The hard-coded 64 * 28 * 28 means the network expects 3-channel 224x224
    inputs (224 halved by each of the three poolings gives 28).
    """

    def __init__(self):
        super().__init__()

        # Build the convolutional stages in a loop; layer order (and therefore
        # the state_dict key numbering) is conv, relu, pool per stage.
        stage_layers = []
        in_channels = 3
        for out_channels in (16, 32, 64):
            stage_layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2),
            ])
            in_channels = out_channels
        self.features = nn.Sequential(*stage_layers)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 28 * 28, 128),
            nn.ReLU(),
            nn.Dropout(p=0.5),  # regularisation, active only in train mode
            nn.Linear(128, 1),
            nn.Sigmoid(),  # squashes the single output into [0, 1]
        )

    def forward(self, x):
        """Run the feature extractor, then the classifier head, on batch `x`."""
        feature_maps = self.features(x)
        return self.classifier(feature_maps)


# Module-level instance used by the training and evaluation cells.
model = MedicalCNN()

In [None]:
import torch.optim as optim

# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(15):
    model.train()  # enable training behaviour (e.g. dropout) for this epoch
    for images, labels in train_loader:
        # Labels arrive as a 1-D integer tensor; BCELoss needs float targets
        # with the same (batch, 1) shape as the model output.
        targets = labels.float().unsqueeze(1)

        # Clear stale gradients, then forward -> loss -> backward -> update.
        optimizer.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} complete.")

In [None]:
import torch

# Destination file for the trained weights (relative to the working directory).
model_save_path = "pneumoniaDetectorModelv1.pth"

# Only the state_dict (parameter/buffer tensors) is written, not the module
# object itself, so loading requires re-creating the model class first.
torch.save(obj=model.state_dict(), f=model_save_path)

print(f"Model successfully saved to {model_save_path}")

In [None]:
pip install scikit-learn seaborn matplotlib

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Evaluation preprocessing: the same resize and per-channel normalisation
# constants as the training pipeline, with no random augmentation.
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Test split, read from 'chest_xray/test' relative to the working directory;
# ImageFolder takes each image's label from its sub-folder.
test_dataset = datasets.ImageFolder(root='chest_xray/test', transform=test_transform)

# Fixed order (shuffle=False) so predictions line up with dataset order.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

print(f"Test loader defined with {len(test_dataset)} images.")

In [None]:
import torch
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Evaluate the trained model on the test loader and print accuracy plus the
# per-class precision/recall/F1 report.
model.eval()  # inference behaviour (e.g. dropout disabled)
all_preds = []
all_labels = []

with torch.no_grad():  # no gradients needed for evaluation
    for images, labels in test_loader:
        outputs = model(images)
        # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
        preds = (outputs > 0.5).float()

        # The model emits one value per image with a trailing singleton
        # dimension. Flatten before collecting so predictions form a flat list
        # matching the labels, instead of a list of 1-element arrays that
        # scikit-learn would receive as a column vector.
        all_preds.extend(preds.reshape(-1).tolist())
        all_labels.extend(labels.tolist())

# Overall accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1.
# NOTE(review): target_names assumes label 0 is Normal and 1 is Pneumonia;
# confirm against test_dataset.class_to_idx.
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=['Normal', 'Pneumonia']))

In [None]:
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Inference mode: layers such as Dropout switch to their evaluation behaviour.
model.eval()
y_true, y_pred = [], []

# Collect ground-truth labels and thresholded predictions batch by batch.
with torch.no_grad():
    for batch_images, batch_labels in test_loader:
        scores = model(batch_images)
        hard_preds = (scores > 0.5).float().reshape(-1)  # flatten to 1-D

        y_true += batch_labels.tolist()
        y_pred += hard_preds.tolist()

# NOTE(review): the class names below assume label 0 is Normal and label 1 is
# Pneumonia; confirm against the dataset's class_to_idx.
class_names = ['Normal', 'Pneumonia']

# 1. Accuracy and per-class precision / recall / F1
test_accuracy = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
report_text = classification_report(y_true, y_pred, target_names=class_names)
print("\nDetailed Report:\n", report_text)

# 2. Confusion matrix heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title('Confusion Matrix - Pneumonia Detection Model')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Augmented training pipeline: rotation of up to 15 degrees, a random crop
# covering 80-100% of the area resized back to 224, random horizontal flip and
# brightness/contrast jitter, followed by tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(degrees=15),
    transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Deterministic test pipeline: resize, tensor conversion, same normalisation.
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Loaders are built directly on fresh ImageFolder datasets; only the training
# loader shuffles.
train_loader = DataLoader(
    dataset=datasets.ImageFolder(root='chest_xray/train', transform=train_transform),
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=datasets.ImageFolder(root='chest_xray/test', transform=test_transform),
    batch_size=32,
    shuffle=False,
)

In [None]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# NOTE(review): this cell repeats the transform/loader definitions of the
# cell directly before it; running it simply rebuilds the same objects.

# Augmented training pipeline: rotation of up to 15 degrees, a random crop
# covering 80-100% of the area resized back to 224, random horizontal flip and
# brightness/contrast jitter, followed by tensor conversion and normalisation.
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomRotation(degrees=15),
    transforms.RandomResizedCrop(size=224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Deterministic test pipeline: resize, tensor conversion, same normalisation.
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Loaders are built directly on fresh ImageFolder datasets; only the training
# loader shuffles.
train_loader = DataLoader(
    dataset=datasets.ImageFolder(root='chest_xray/train', transform=train_transform),
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=datasets.ImageFolder(root='chest_xray/test', transform=test_transform),
    batch_size=32,
    shuffle=False,
)

In [None]:
import torch.nn as nn

class MedicalCNN(nn.Module):
    def __init__(self):
        super(MedicalCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.BatchNorm2d(16), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2)
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64 * 28 * 28, 128), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 1) # Outputting logits for BCEWithLogitsLoss
        )

    def forward(self, x):
        return self.classifier(self.features(x))

model = MedicalCNN()

In [None]:
import torch.optim as optim

# Train the logit-output model with a class-weighted loss, moving data and
# model to a GPU when one is available.

# `device` was referenced below without ever being defined; pick it here and
# move the model onto it so inputs and parameters live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# pos_weight scales the loss contribution of positive (label 1) samples; a
# value below 1 down-weights them relative to label 0.
# NOTE(review): this assumes label 1 is the Pneumonia class; confirm against
# the training dataset's class_to_idx.
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([0.7], device=device))
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Halve the learning rate when the monitored loss has not improved for
# 3 consecutive epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

for epoch in range(25):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        # Targets must be float and shaped (batch, 1) like the logits.
        loss = criterion(outputs, labels.float().unsqueeze(1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # ---- validation pass ----
    # The scheduler was previously created but never stepped, so the learning
    # rate could never change. Compute the mean validation loss on the
    # notebook's existing `val_loader` and feed it to the scheduler.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            val_loss += criterion(model(images), labels.float().unsqueeze(1)).item()
    val_loss /= max(len(val_loader), 1)  # guard against an empty loader
    scheduler.step(val_loss)

    print(f"Epoch {epoch+1} | Loss: {train_loss/len(train_loader):.4f} | Val loss: {val_loss:.4f}")

In [None]:
import torch

# Destination file for the trained weights (relative to the working directory).
model_save_path = "pneumoniaDetectorModelv2.pth"

# Only the state_dict (parameter/buffer tensors) is written, not the module
# object itself, so loading requires re-creating the model class first.
torch.save(obj=model.state_dict(), f=model_save_path)

print(f"Model successfully saved to {model_save_path}")