# **DATALOADING**

In [20]:
#####################################################
################## PACKAGES #########################
#####################################################
import pandas as pd
import sys 
import base64
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import random
from sklearn.metrics import accuracy_score, roc_auc_score
from datetime import datetime, timedelta, date
from itertools import combinations
from numpy.linalg import norm
import pickle
import os
from torch.nn import Linear
import torch.nn.functional as F
import altair as alt
from altair import expr, datum
from vega_datasets import data
from geopy.geocoders import Nominatim
from tqdm import tqdm
import os
import random
import shutil
import time
import PIL

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets, models
import torch.optim as optim

from utils import *
tqdm.pandas()
DATA_PATH = './data'

In [21]:
split_total_images_folder(DATA_PATH)

Split completed successfully.


# **DATA AUGMENTATION-PREPROCESS**

In [22]:
# Set the paths to the training and validation data
train_data_dir = './data/train_images'
valid_data_dir = './data/val_images'

# Set the image size, batch size
img_size = (64, 64)
batch_size = 32

train_transform = transforms.Compose([
    transforms.RandomRotation(degrees=15),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(size=img_size, padding=2),
    transforms.ToTensor(),
])

valid_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

train_dataset = datasets.ImageFolder(train_data_dir,transform=train_transform)
valid_dataset = datasets.ImageFolder(valid_data_dir,transform=valid_transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# **CNN**

In [23]:
# Define the CNN model architecture
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(128 * (img_size[0] // 8) * (img_size[1] // 8), 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.pool1(x)
        x = self.conv2(x)
        x = nn.functional.relu(x)
        x = self.pool2(x)
        x = self.conv3(x)
        x = nn.functional.relu(x)
        x = self.pool3(x)
        x = x.view(-1, 128 * (img_size[0] // 8) * (img_size[1] // 8))
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [24]:
# Initialize the model
model = CNN()
epochs= 5

# Define the loss function, optimizer, and device
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

CNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=8192, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [25]:
# Train the model
for epoch in range(epochs):
    running_loss = 0.0
    correct = 0
    total = 0
    predictions = []
    targets = []
    model.train()  # Set the model to training mode
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels.float().unsqueeze(1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Calculate training accuracy
        predicted = outputs
        correct += (predicted == labels.float().unsqueeze(1)).sum().item()
        total += labels.size(0)

        predictions.extend(predicted.detach().cpu().numpy().flatten())
        targets.extend(labels.detach().cpu().numpy().flatten())

    # Calculate training accuracy and AUC
    train_acc = correct / total
    train_auc = roc_auc_score(targets, predictions)

    # Validate the model
    model.eval()
    valid_loss = 0.0
    valid_acc = 0.0
    predictions = []
    targets = []
    
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            valid_loss += loss.item()

            preds = outputs
            valid_acc += torch.sum(preds == labels.float().unsqueeze(1))

            predictions.extend(preds.detach().cpu().numpy().flatten())
            targets.extend(labels.detach().cpu().numpy().flatten())

    # Calculate validation AUC
    valid_auc = roc_auc_score(targets, predictions)
    
    # Print the loss, accuracy, and AUC for each epoch
    print(f'Epoch {epoch+1} - Training Loss: {running_loss/len(train_loader):.4f} - Validation Loss: {valid_loss/len(valid_loader):.4f} - Training AUC: {train_auc:.4f} - Validation AUC: {valid_auc:.4f}')

Epoch 1 - Training Loss: 0.7050 - Validation Loss: 0.6934 - Training AUC: 0.4760 - Validation AUC: 0.6977
Epoch 2 - Training Loss: 0.6936 - Validation Loss: 0.6895 - Training AUC: 0.5109 - Validation AUC: 0.7382
Epoch 3 - Training Loss: 0.6871 - Validation Loss: 0.6816 - Training AUC: 0.5957 - Validation AUC: 0.7615
Epoch 4 - Training Loss: 0.6880 - Validation Loss: 0.6871 - Training AUC: 0.5988 - Validation AUC: 0.6652
Epoch 5 - Training Loss: 0.6697 - Validation Loss: 0.6653 - Training AUC: 0.6692 - Validation AUC: 0.6571


In [26]:
predictions_df = pd.DataFrame({'prediction_score': predictions})

# Save the DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)