# SetUp

In [9]:
import time
import psutil
import threading
import os
import sys
import numpy as np
from PIL import Image, ImageOps, ImageChops
import argparse
import random
import matplotlib
import matplotlib.pyplot as plt
matplotlib.use('TkAgg') 
from matplotlib import gridspec
from preprocess import preprocess, augment_image, add_gaussian_noise, apply_elastic_distortion
from collections import Counter


In [10]:
random.seed(42)
np.random.seed(42)
input_dir = "/home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/"
output_dir = "/home/meos/Documents/MapReduceNeuralNetwork/data/processed/"

# Original Data Visulization

In [11]:
def check_broken_files(data_dir):
    broken_files = []
    for root, dirs, files in os.walk(data_dir):
        for filename in files:
            if filename.endswith('.png'):
                fpath = os.path.join(root, filename)
                try:
                    im = Image.open(fpath)
                    im.verify() 
                except Exception:
                    print("Broken or invalid image:", fpath)
                    broken_files.append(fpath)
    print(broken_files)


In [12]:
check_broken_files("/home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train")
check_broken_files("/home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/test")

[]
[]


In [32]:
preprocess(input_dir, output_dir, percent=100, augment=False)

In [13]:
# Single Test
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

image_path = "/home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/0/59878.png"

image = Image.open(image_path).convert("L")  
image_array = np.array(image)

plt.imshow(image_array, cmap="gray")
plt.title("Sample Image")
plt.axis("off")
plt.show()


invalid command name "133955808007168process_stream_events"
    while executing
"133955808007168process_stream_events"
    ("after" script)
can't invoke "event" command: application has been destroyed
    while executing
"event generate $w <<ThemeChanged>>"
    (procedure "ttk::ThemeChanged" line 6)
    invoked from within
"ttk::ThemeChanged"


In [16]:
def load_data_from_text(file_path, num_samples=5):
    """
    Load data from the processed text file.
    Returns a list of tuples: (label, image_array)
    """
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            label = int(parts[0])
            pixels = np.array(list(map(float, parts[1:]))).reshape(28, 28)
            data.append((label, pixels))
            if len(data) >= num_samples:
                break
    return data

def visualize_random_image(input_dir, subset='train', num_samples=5):
    """
    Visualize randomly MNIST data from PNG images.
    
    Parameters:
    - input_dir (str): Path to the MNIST PNG data directory.
    - subset (str): 'train' or 'test'.
    - num_samples (int): Number of samples to display.
    """
    subset_dir = os.path.join(input_dir, subset)
    if not os.path.isdir(subset_dir):
        print(f"Subset directory {subset_dir} does not exist.", file=sys.stderr)
        return
    
    fig, axes = plt.subplots(1, num_samples, figsize=(num_samples * 2, 2))
    fig.suptitle(f"Original MNIST {subset.capitalize()} Images", fontsize=16)
    
    if num_samples == 1:
        axes = [axes]
    
    for i in range(num_samples):
        ax = axes[i]
        
        # Randomly select a label
        label = random.randint(0, 9)
        label_dir = os.path.join(subset_dir, str(label))
        
        if not os.path.isdir(label_dir):
            print(f"Directory {label_dir} does not exist. Skipping.", file=sys.stderr)
            ax.axis('off')
            continue
        
        # List all PNG files in the label directory
        files = [file for file in os.listdir(label_dir) if file.endswith('.png')]
        
        if not files:
            print(f"No PNG files in directory {label_dir}. Skipping.", file=sys.stderr)
            ax.axis('off')
            continue
        
        # Randomly select a file
        file = random.choice(files)
        file_path = os.path.join(label_dir, file)
        
        try:
            # Open and convert the image
            image = Image.open(file_path).convert('L')
            image = image.resize((28, 28), resample=Image.LANCZOS) 
            pixels = np.array(image)
            
            print(f"Displaying image: {file_path}")
            print(f"Pixels type: {type(pixels)}, shape: {pixels.shape}, dtype: {pixels.dtype}")
            
            if not isinstance(pixels, np.ndarray):
                raise ValueError(f"Pixels is not a numpy array for file {file_path}")
            if pixels.shape != (28, 28):
                raise ValueError(f"Pixels shape is incorrect for file {file_path}: {pixels.shape}")
            
            # Display the image
            ax.imshow(pixels, cmap='gray', vmin=0, vmax=255)
            ax.set_title(f"Label: {label}")
            ax.axis('off')
        except Exception as e:
            print(f"Error processing {file_path}: {e}", file=sys.stderr)
            ax.axis('off')
    
    plt.tight_layout()
    plt.show()

def visualize_one_image_per_class(input_dir, subset='train'):
    """
    Visualize one image for each class in the MNIST dataset.
    
    Parameters:
    - input_dir (str): Path to the MNIST PNG data directory.
    - subset (str): 'train' or 'test'.
    """
    subset_dir = os.path.join(input_dir, subset)
    if not os.path.isdir(subset_dir):
        print(f"Subset directory {subset_dir} does not exist.", file=sys.stderr)
        return
    
    num_classes = 10 
    fig, axes = plt.subplots(1, num_classes, figsize=(num_classes * 2, 2))
    fig.suptitle(f"One Image Per Class - {subset.capitalize()} Dataset", fontsize=16)
    
    for label in range(num_classes):
        ax = axes[label]
        label_dir = os.path.join(subset_dir, str(label))
        
        if not os.path.isdir(label_dir):
            print(f"Directory {label_dir} does not exist. Skipping class {label}.", file=sys.stderr)
            ax.axis('off')
            continue
        
        # List all PNG files in the label directory
        files = [file for file in os.listdir(label_dir) if file.endswith('.png')]
        
        if not files:
            print(f"No PNG files in directory {label_dir}. Skipping class {label}.", file=sys.stderr)
            ax.axis('off')
            continue
        
        # Select the first file or a random file
        file = files[0]  # First File
        file_path = os.path.join(label_dir, file)
        
        try:
            # Open and convert the image
            image = Image.open(file_path).convert('L')
            image = image.resize((28, 28), resample=Image.LANCZOS)
            pixels = np.array(image)
            
            # Show
            ax.imshow(pixels, cmap='gray', vmin=0, vmax=255)
            ax.set_title(f"Label: {label}")
            ax.axis('off')
        except Exception as e:
            print(f"Error processing {file_path}: {e}", file=sys.stderr)
            ax.axis('off')
    
    plt.tight_layout()
    plt.show()

def display_data_statistics(input_dir, subset='train', num_samples=None, show_plot=True):
    """
    Display statistics about the dataset by loading data from a text file.
    
    Parameters:
    - input_dir (str): Path to the directory containing MNIST text data.
    - subset (str): 'train' or 'test' to specify which subset to analyze.
    - num_samples (int): Optional. Limit the number of samples to load.
    """
    file_path = os.path.join(input_dir, f"mnist_{subset}.txt")
    
    if not os.path.exists(file_path):
        print(f"File {file_path} does not exist.", file=sys.stderr)
        return

    def load_data_from_text(file_path, num_samples=None):
        """
        Load data from the processed text file.
        Returns a list of tuples: (label, image_array)
        """
        data = []
        with open(file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                label = int(parts[0])
                pixels = np.array(list(map(float, parts[1:]))).reshape(28, 28)
                data.append((label, pixels))
                if num_samples and len(data) >= num_samples:
                    break
        return data

    data = load_data_from_text(file_path, num_samples)

    total_samples = len(data)
    labels = [item[0] for item in data]
    label_counts = Counter(labels)
    
    print(f"Total: {total_samples}")
    print("Class:")
    for label in sorted(label_counts.keys()):
        print(f"  Label {label}: {label_counts[label]} samples")
    
    if show_plot:
        labels = sorted(label_counts.keys())
        counts = [label_counts[label] for label in labels]
        
        plt.figure(figsize=(10, 6))
        plt.bar(labels, counts, color='skyblue')
        plt.title(f"Class Distribution in {subset.capitalize()} Dataset", fontsize=16)
        plt.xlabel("Class Labels", fontsize=14)
        plt.ylabel("Number of Samples", fontsize=14)
        plt.xticks(labels, fontsize=12)
        plt.yticks(fontsize=12)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()


def visualize_augmentations(image_path):
    """
    Visualize the different augmentations applied to an image.
    
    Parameters:
    - image_path (str): Path to the original image file.
    """
    # Load the original image
    original_image = Image.open(image_path).convert('L').resize((28, 28))
    
    augmentations = {
        "Original": lambda img: img,
        "Rotation": lambda img: img.rotate(random.uniform(-20, 20), fillcolor=0),
        "Shift": lambda img: ImageChops.offset(img, random.randint(-3, 3), random.randint(-3, 3)),
        "Scaling": lambda img: ImageOps.fit(
            img.resize(
                (int(28 * random.uniform(0.9, 1.1)), int(28 * random.uniform(0.9, 1.1))), 
                Image.LANCZOS
            ), (28, 28), Image.LANCZOS
        ),
        "Shearing": lambda img: img.transform(
            img.size, Image.AFFINE, 
            (1, random.uniform(-0.1, 0.1), 0, 0, 1, 0), Image.BICUBIC, fillcolor=0
        ),
        "Elastic Distortion": lambda img: apply_elastic_distortion(img),
        "Gaussian Noise": lambda img: add_gaussian_noise(img)
    }

    # Visualize each augmentation
    num_augmentations = len(augmentations)
    fig, axes = plt.subplots(2, num_augmentations, figsize=(num_augmentations * 2, 4))
    fig.suptitle("Visualizing Augmentations", fontsize=16)
    
    for i, (name, augment_func) in enumerate(augmentations.items()):
        # Apply augmentation twice
        for j in range(2):
            ax = axes[j, i]
            try:
                augmented_image = augment_func(original_image)
                ax.imshow(np.array(augmented_image), cmap='gray', vmin=0, vmax=255)
                if j == 0:
                    ax.set_title(name, fontsize=10)
                ax.axis('off')
            except Exception as e:
                print(f"Error applying {name} augmentation: {e}", file=sys.stderr)
                ax.axis('off')
    
    plt.tight_layout()
    plt.subplots_adjust(top=0.85)
    plt.show()

def visualize_augmentations_per_class(input_dir, subset='train'):
    """
    Visualize all augmentations applied to a sample image from each class (0-9).
    
    Parameters:
    - input_dir (str): Path to the MNIST PNG data directory.
    - subset (str): 'train' or 'test'.
    """
    subset_dir = os.path.join(input_dir, subset)
    if not os.path.isdir(subset_dir):
        print(f"Subset directory {subset_dir} does not exist.", file=sys.stderr)
        return

    # Define augmentations
    augmentations = {
        "Original": lambda img: img,
        "Rotation": lambda img: img.rotate(random.uniform(-20, 20), fillcolor=0),
        "Shift": lambda img: ImageChops.offset(img, random.randint(-3, 3), random.randint(-3, 3)),
        "Scaling": lambda img: ImageOps.fit(
            img.resize(
                (int(28 * random.uniform(0.9, 1.1)), int(28 * random.uniform(0.9, 1.1))), 
                Image.LANCZOS
            ), (28, 28), Image.LANCZOS
        ),
        "Shearing": lambda img: img.transform(
            img.size, Image.AFFINE, 
            (1, random.uniform(-0.1, 0.1), 0, 0, 1, 0), Image.BICUBIC, fillcolor=0
        ),
        "Elastic Distortion": lambda img: apply_elastic_distortion(img),
        "Gaussian Noise": lambda img: add_gaussian_noise(img)
    }

    num_classes = 10  # Hiển thị 10 lớp
    num_augmentations = len(augmentations)

    # Create a figure
    fig, axes = plt.subplots(num_classes, num_augmentations, figsize=(num_augmentations * 3, num_classes * 3))
    fig.suptitle(f"Visualizing Augmentations for Classes 0-{num_classes - 1} ({subset.capitalize()})", fontsize=16)
    
    for label in range(num_classes):
        label_dir = os.path.join(subset_dir, str(label))
        if not os.path.isdir(label_dir):
            print(f"Directory for class {label} does not exist. Skipping.", file=sys.stderr)
            continue
        
        # List all PNG files in the label directory
        files = [file for file in os.listdir(label_dir) if file.endswith('.png')]
        if not files:
            print(f"No PNG files in directory for class {label}. Skipping.", file=sys.stderr)
            continue

        # Select the first file for this label
        file_path = os.path.join(label_dir, files[0])
        try:
            original_image = Image.open(file_path).convert('L').resize((28, 28))
            
            for i, (name, augment_func) in enumerate(augmentations.items()):
                ax = axes[label, i]
                try:
                    augmented_image = augment_func(original_image)
                    ax.imshow(np.array(augmented_image), cmap='gray', vmin=0, vmax=255)
                    if label == 0:
                        ax.set_title(name, fontsize=10)
                    ax.axis('off')
                except Exception as e:
                    print(f"Error applying {name} augmentation to class {label}: {e}", file=sys.stderr)
                    ax.axis('off')
        except Exception as e:
            print(f"Error processing image for class {label}: {e}", file=sys.stderr)
    
    plt.tight_layout()
    plt.subplots_adjust(top=0.92)
    plt.show()



In [17]:
visualize_random_image(input_dir, subset='train', num_samples=10)

Displaying image: /home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/1/41723.png
Pixels type: <class 'numpy.ndarray'>, shape: (28, 28), dtype: uint8
Displaying image: /home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/4/34780.png
Pixels type: <class 'numpy.ndarray'>, shape: (28, 28), dtype: uint8
Displaying image: /home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/3/25553.png
Pixels type: <class 'numpy.ndarray'>, shape: (28, 28), dtype: uint8
Displaying image: /home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/1/49113.png
Pixels type: <class 'numpy.ndarray'>, shape: (28, 28), dtype: uint8
Displaying image: /home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/8/45116.png
Pixels type: <class 'numpy.ndarray'>, shape: (28, 28), dtype: uint8
Displaying image: /home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/train/9/10410.png
Pixels type: <class 'numpy.ndarray'>, shape: (28, 28), dtype: uint8
Displaying

In [18]:
visualize_one_image_per_class(input_dir, subset='train')

In [30]:
display_data_statistics(output_dir, subset='train')

Total: 240000
Class:
  Label 0: 23692 samples
  Label 1: 26968 samples
  Label 2: 23832 samples
  Label 3: 24524 samples
  Label 4: 23368 samples
  Label 5: 21684 samples
  Label 6: 23672 samples
  Label 7: 25060 samples
  Label 8: 23404 samples
  Label 9: 23796 samples


In [31]:
display_data_statistics(output_dir, subset='test')

Total: 40000
Class:
  Label 0: 3920 samples
  Label 1: 4540 samples
  Label 2: 4128 samples
  Label 3: 4040 samples
  Label 4: 3928 samples
  Label 5: 3568 samples
  Label 6: 3832 samples
  Label 7: 4112 samples
  Label 8: 3896 samples
  Label 9: 4036 samples


invalid command name "133954640013504process_stream_events"
    while executing
"133954640013504process_stream_events"
    ("after" script)
can't invoke "event" command: application has been destroyed
    while executing
"event generate $w <<ThemeChanged>>"
    (procedure "ttk::ThemeChanged" line 6)
    invoked from within
"ttk::ThemeChanged"


In [29]:
visualize_augmentations("/home/meos/Documents/MapReduceNeuralNetwork/data/1/mnist_png/test/9/16.png")


In [26]:
visualize_augmentations_per_class(input_dir, subset='train')


# Monitor Performance Tracker

In [24]:
def monitor_performance(func, *args, **kwargs):
    """
    Monitors the performance of a function, capturing execution time and resource usage.
    """
    process = psutil.Process(os.getpid())
    
    # Record initial CPU and memory usage
    cpu_before = process.cpu_percent(interval=None)
    mem_before = process.memory_info().rss  # in bytes
    
    # Record start time
    start_time = time.time()
    
    # Execute the function
    func(*args, **kwargs)
    
    # Record end time
    end_time = time.time()
    
    # Record final CPU and memory usage
    cpu_after = process.cpu_percent(interval=None)
    mem_after = process.memory_info().rss  # in bytes
    
    # Calculate metrics
    cpu_usage = cpu_after - cpu_before  # CPU usage percentage
    mem_usage = (mem_after - mem_before) / (1024 ** 2)  # Memory usage in MB
    elapsed_time = end_time - start_time  # Time in seconds
    
    print(f"Function '{func.__name__}' executed in {elapsed_time:.2f} seconds")
    print(f"CPU Usage: {cpu_usage:.2f}%")
    print(f"Memory Usage: {mem_usage:.2f} MB")

def monitor_resources(interval=1, cpu_usage_list=None, mem_usage_list=None):
    """
    Continuously monitors CPU and memory usage at specified intervals.    
    Parameters:
    - interval (int)
    - cpu_usage_list (list)
    - mem_usage_list (list)
    """
    process = psutil.Process(os.getpid())
    while True:
        cpu = process.cpu_percent(interval=interval)
        mem = process.memory_info().rss / (1024 ** 2)  # Convert to MB
        if cpu_usage_list is not None and mem_usage_list is not None:
            cpu_usage_list.append(cpu)
            mem_usage_list.append(mem)
        # print(f"CPU Usage: {cpu}% | Memory Usage: {mem} MB")

def monitor_performance_with_continuous_tracking(func, *args, **kwargs):
    cpu_usage_list = []
    mem_usage_list = []
    
    monitor_thread = threading.Thread(
        target=monitor_resources, 
        args=(1, cpu_usage_list, mem_usage_list), 
        daemon=True
    )
    monitor_thread.start()
    start_time = time.time()
    func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    if cpu_usage_list:
        cpu_average = sum(cpu_usage_list) / len(cpu_usage_list)
        cpu_peak = max(cpu_usage_list)
    else:
        cpu_average = 0
        cpu_peak = 0
    
    if mem_usage_list:
        mem_peak = max(mem_usage_list)
    else:
        mem_peak = 0
    
    print(f"\nFunction '{func.__name__}' executed in {elapsed_time:.2f} seconds")
    print(f"CPU Usage: Average = {cpu_average:.2f}% | Highest = {cpu_peak:.2f}%")
    print(f"Memory Usage: Highest = {mem_peak:.2f} MB")




In [23]:
monitor_performance(preprocess, input_dir, output_dir, percent=100, augment=True)

KeyboardInterrupt: 

In [25]:
monitor_performance_with_continuous_tracking(preprocess, input_dir, output_dir, percent=100, augment=True)


Function 'preprocess' executed in 206.61 seconds
CPU Usage: Average = 73.43% | Highest = 100.90%
Memory Usage: Highest = 2271.71 MB
