# 1. Business Understanding

# 2. Data Understanding

### 2.1 Import Libraries

In [None]:
# Install the splitfolders library
!pip install split-folders

# Import required libraries
import os
import shutil
import random
import zipfile
import requests

import splitfolders
import numpy as np

from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img, img_to_array

### 2.2 Download and Extract the Dataset

In [None]:
# Download the dataset archive from the Google Drive sharing link using the gdown CLI.
# NOTE(review): presumably this produces ./lung_cancer_lite.zip, which is opened below — confirm.
!gdown --fuzzy 'https://drive.google.com/file/d/1_WYeNf9kNRYflBcNeXRU6wLw3jdzqOCx/view?usp=sharing'

In [None]:
# Extract every file from the dataset archive into the current working directory.
# The context manager guarantees the ZIP handle is closed even if extraction fails part-way.
with zipfile.ZipFile('./lung_cancer_lite.zip', 'r') as zip_ref:
    zip_ref.extractall()

### 2.3 Check Dataset Size and Classes

In [None]:
# Root folder of the extracted dataset
dataset_dir = './lung_cancer_lite'

# Total number of files found anywhere below the dataset root
total_images = sum(len(filenames) for _, _, filenames in os.walk(dataset_dir))

# Class names are the immediate sub-directories of the root, in alphabetical order
classes = [
    entry
    for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
]
classes.sort()

print(f"Total Images in Dataset: {total_images} Images")
print(f"Dataset Class: {classes}")

### 2.4 Check Images Distribution

In [None]:
# Count the number of entries in each class folder.
# The folders are visited in sorted order and the plot takes both its labels and its
# heights from the same dict: os.listdir() returns names in arbitrary order, so pairing
# the counts with a separately sorted label list could attach a count to the wrong class.
class_counts = {}
for class_name in sorted(os.listdir(dataset_dir)):
    class_dir = os.path.join(dataset_dir, class_name)
    if os.path.isdir(class_dir):
        class_counts[class_name] = len(os.listdir(class_dir))

# Plot the distribution
plt.figure(figsize=(6, 4))
sns.barplot(x=list(class_counts.keys()), y=list(class_counts.values()), palette="coolwarm")
plt.title("Class Distribution in Image Dataset")
plt.xlabel("Class")
plt.ylabel("Number of Images")
plt.tight_layout()
plt.show()

### 2.5 Check Images Format

In [None]:
# Pick a random class, then a random file inside that class folder
label = random.choice(classes)
label_dir = os.path.join(dataset_dir, label)
img_path = os.path.join(label_dir, random.choice(os.listdir(label_dir)))

# Only header metadata is needed, so read it inside a context manager
# to release the underlying file handle as soon as we are done.
with Image.open(img_path) as img:
    width, height = img.size
    image_format = img.format
    mode = img.mode

# Translate the PIL mode into a friendlier description; unknown modes are shown as-is
color_type = {'RGB': 'RGB (Color)', 'L': 'Grayscale'}.get(mode, mode)

# Print image details
print(f"Image size: {width} x {height} pixels")
print(f"Image format: {image_format}")
print(f"Color type: {color_type}")

### 2.6 Image Samples for Each Class

In [None]:
# Show one example image per class, side by side.
plt.figure(figsize=(7, 5))
for i, class_name in enumerate(classes):
    class_dir = os.path.join(dataset_dir, class_name)

    # Sort the listing so the displayed sample is reproducible
    # (os.listdir() order is arbitrary).
    image_name = sorted(os.listdir(class_dir))[0]
    image_path = os.path.join(class_dir, image_name)

    # One column per class instead of a hard-coded 3-column grid,
    # so the cell keeps working if the number of classes changes.
    plt.subplot(1, len(classes), i + 1)

    # imshow copies the pixel data, so the file can be closed right after drawing
    with Image.open(image_path) as img:
        plt.imshow(img)
    plt.title(class_name)
    plt.axis('off')

plt.show()

# 3. Data Preparation

### 3.1 Data Splitting

In [None]:
# Split dataset into training (70%), validation (15%), and testing (15%).
# The files are copied (move=False), so the original dataset folder is left untouched.
split_dir = './lung_cancer_lite_split'
splitfolders.ratio('./lung_cancer_lite',
                   output=split_dir,
                   seed=1337, ratio=(.7, .15, .15),
                   group_prefix=None, move=False)

# Define directories for training, validation, and testing data
train_dir = f'{split_dir}/train'
val_dir = f'{split_dir}/val'
test_dir = f'{split_dir}/test'

### 3.2 Data Augmentation

In [None]:
def train_val_generators(TRAINING_DIR, VALIDATION_DIR, TESTING_DIR,
                         batch_size=32, target_size=(224, 224)):
    """Create the training, validation, and test image generators.

    All three generators scale pixel values by 1/255 and yield one-hot
    ('categorical') labels. Only the training generator applies random
    augmentation; the test generator is not shuffled, so its batches follow
    the order of its file listing.

    Args:
        TRAINING_DIR: Directory with one sub-folder per class for training.
        VALIDATION_DIR: Directory with one sub-folder per class for validation.
        TESTING_DIR: Directory with one sub-folder per class for testing.
        batch_size: Number of images per batch (default 32).
        target_size: (height, width) every image is resized to (default 224x224).

    Returns:
        Tuple of (train_generator, validation_generator, testing_generator).
    """
    # Settings shared by all three directory iterators
    flow_kwargs = dict(batch_size=batch_size,
                       class_mode='categorical',
                       target_size=target_size)

    # Augment and normalize training data
    train_datagen = ImageDataGenerator(rescale=1.0/255,
                                       rotation_range=15,
                                       width_shift_range=0.1,
                                       height_shift_range=0.1,
                                       shear_range=0.1,
                                       zoom_range=0.1,
                                       horizontal_flip=True,
                                       fill_mode='nearest')
    train_generator = train_datagen.flow_from_directory(directory=TRAINING_DIR,
                                                        **flow_kwargs)

    # Validation and test data are only normalized, never augmented
    eval_datagen = ImageDataGenerator(rescale=1.0/255)
    validation_generator = eval_datagen.flow_from_directory(directory=VALIDATION_DIR,
                                                            **flow_kwargs)

    # shuffle=False keeps the test batches in a fixed order
    testing_generator = eval_datagen.flow_from_directory(directory=TESTING_DIR,
                                                         shuffle=False,
                                                         **flow_kwargs)

    return train_generator, validation_generator, testing_generator

# Generate training, validation, and testing data
train_generator, validation_generator, testing_generator = train_val_generators(train_dir, val_dir, test_dir)

In [None]:
# Report the class-name -> index mapping and the size of each split
print("Class Indices:", train_generator.class_indices)
for split_name, split_generator in (("Training", train_generator),
                                    ("Validation", validation_generator),
                                    ("Test", testing_generator)):
    print(f"Total Images in {split_name} Set:", split_generator.samples)

In [None]:
# Draw one batch from the training generator and inspect its shape and first labels
sample_batch = next(train_generator)
batch_images, batch_labels = sample_batch
print("Batch Shape:", batch_images.shape)
print("First 5 Labels:", batch_labels[:5])

In [None]:
import matplotlib.pyplot as plt

# Get a batch
batch_images, batch_labels = next(train_generator)

# Show up to 5 images; the last batch of an epoch may hold fewer than 5,
# so cap the count instead of indexing past the end of the batch.
num_preview = min(5, len(batch_images))
plt.figure(figsize=(10, 5))
for i in range(num_preview):
    plt.subplot(1, num_preview, i + 1)
    plt.imshow(batch_images[i])
    plt.axis('off')
plt.show()

# 4. Modeling

### 4.1 ConvNeXt Model

In [None]:
from tensorflow.keras.applications import ConvNeXtBase

# Load the ConvNeXtBase backbone with ImageNet weights and without its classification head
base_model = ConvNeXtBase(input_shape=(224, 224, 3),
                          include_top=False,
                          weights='imagenet')

# Freeze the whole backbone so its pre-trained weights are not updated during training.
# Setting `trainable` on the model propagates to every layer it contains.
base_model.trainable = False

### 4.2 Final Model for Transfer Learning

In [None]:
inputs = tf.keras.Input(shape=(224, 224, 3))

# The data generators scale pixels by 1/255, but Keras ConvNeXt models ship with a
# built-in normalization layer and expect raw pixel values in the [0, 255] range.
# Undo the generator scaling here so the pre-trained backbone receives the input
# distribution it was trained on.
x = layers.Rescaling(255.0)(inputs)
x = base_model(x)

# Flatten the backbone's spatial feature map into a single feature vector
x = layers.Flatten()(x)

# Classifier head: two fully connected ReLU layers (256 and 512 units) followed by dropout
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.2)(x)

# Output layer with 3 units and softmax activation (one probability per class)
outputs = layers.Dense(3, activation='softmax')(x)

final_model = Model(inputs, outputs)

# Print a summary of the model architecture
final_model.summary()

### 4.3 Compiling Final Model

In [None]:
# Training configuration: categorical cross-entropy loss, AdamW optimizer, accuracy metric
loss_fn = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.AdamW(learning_rate=0.0001)

final_model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

### 4.4 Training Final Model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the full model (not just the weights) whenever validation accuracy
# reaches a new maximum, overwriting the previous best checkpoint.
model_checkpoint_callback = ModelCheckpoint(
    filepath='best_model82.keras',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=False)

In [None]:
%%time
history = final_model.fit(
    train_generator,
    epochs=55,
    batch_size=32,
    validation_data=validation_generator,
    verbose=2,
    callbacks=[model_checkpoint_callback])

In [None]:
# Pull the per-epoch curves recorded during training
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(len(acc))

# One row, two panels: accuracy on the left, loss on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

panels = (
    (ax1, acc, val_acc, 'Accuracy'),
    (ax2, loss, val_loss, 'Loss'),
)
for axis, train_curve, val_curve, metric_name in panels:
    # Training curve in red, validation curve in blue
    axis.plot(epochs, train_curve, 'r', label=f'Training {metric_name}')
    axis.plot(epochs, val_curve, 'b', label=f'Validation {metric_name}')
    axis.set_title(f'Model Training and Validation {metric_name}')
    axis.set_xlabel('Epochs')
    axis.set_ylabel(metric_name)
    axis.legend()

# Tidy the layout, write the figure to disk, then display it
plt.tight_layout()
plt.savefig('train_acc_loss_plot.png')
plt.show()

In [None]:
# Render a link to the saved checkpoint file in the notebook output.
# The FileLink object must remain the last expression of the cell to be displayed.
from IPython.display import FileLink
FileLink('best_model82.keras')

In [None]:
# final_model.save('final_model82.keras')

# 5. Evaluation

### 5.1 Evaluating Final Model with Test Set

In [None]:
# Reload the model from the checkpoint file written during training
model_saved = tf.keras.models.load_model('./best_model82.keras')

In [None]:
# Evaluate the reloaded model on the test set.
# `batch_size` is not passed because the generator already yields complete batches.
print('Evaluate model accuracy on test data')
results = model_saved.evaluate(testing_generator)
print('test loss, test acc:', results)

### 5.2 Confusion Matrix

In [None]:
# Melakukan prediksi pada data uji
predictions = model_saved.predict(testing_generator)
predicted_classes = np.argmax(predictions, axis=1)

# Mendapatkan label sebenarnya dari data uji
true_classes = testing_generator.classes
class_labels = list(testing_generator.class_indices.keys())

# Membuat confusion matrix
conf_matrix = confusion_matrix(true_classes, predicted_classes)

# Visualisasi confusion matrix
plt.figure(figsize=(7, 5))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

# Label dan judul plot
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix ConvNeXt')

# Save plots
plt.savefig('conf_matrix.png')
plt.show()

### 5.3 Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test set.
# `labels` pins one report row per known class index so the rows always match
# `target_names`, even if a class is absent from the test results.
label_ids = list(range(len(class_labels)))
class_report = classification_report(true_classes,
                                     predicted_classes,
                                     labels=label_ids,
                                     target_names=class_labels)

print("Classification Report:")
print(class_report)

In [None]:
import json

# Same report as a nested dict so it can be stored as JSON.
# `labels` pins one entry per known class index so the entries always match `target_names`.
class_report2 = classification_report(true_classes,
                                      predicted_classes,
                                      labels=list(range(len(class_labels))),
                                      target_names=class_labels,
                                      output_dict=True)

# Explicit encoding keeps the output file identical across platforms
with open("class_report_test.json", "w", encoding="utf-8") as f:
    json.dump({"classification_report": class_report2}, f)

### 5.4 Predicting Test Data

In [None]:
# Reset the generator so iteration starts from the first batch
testing_generator.reset()

# Class names ordered by class index (re-derived here so the cell is self-contained)
class_labels = list(testing_generator.class_indices.keys())

# Collect every image and one-hot label from the test generator, batch by batch
image_batches = []
label_batches = []
for batch_index in range(len(testing_generator)):
    batch_images, batch_labels = testing_generator[batch_index]
    image_batches.append(batch_images)
    label_batches.append(batch_labels)

# Join the batches into single arrays
all_test_images = np.concatenate(image_batches)
all_test_labels = np.concatenate(label_batches)

# Pick up to 3 distinct random test images (fewer if the test set is smaller than 3)
num_samples = min(3, len(all_test_images))
random_indices = np.random.choice(len(all_test_images), size=num_samples, replace=False)

# Predict on the whole test set in one call
predictions = model_saved.predict(all_test_images)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(all_test_labels, axis=1)

# Visualize the randomly selected images with their true and predicted labels
plt.figure(figsize=(10, 7))
for i, index in enumerate(random_indices):
    ax = plt.subplot(1, num_samples, i + 1)
    plt.imshow(all_test_images[index])

    true_label = class_labels[true_classes[index]]
    predicted_label = class_labels[predicted_classes[index]]

    # Highest softmax probability: the model's confidence in this single prediction.
    # It is not an accuracy figure, so the caption says "Confidence".
    confidence = np.max(predictions[index]) * 100

    plt.title(f'True Label: {true_label}\nPred Label: {predicted_label}\nConfidence: {confidence:.2f}%')
    plt.axis('off')

plt.tight_layout()
plt.savefig('test_predict.png')
plt.show()