<!-- Team Banner -->
<div align="center">
  <h1>🌟 <strong>EVER LEARNERS</strong> 🌟</h1>
</div>


<!-- Team Banner -->
<div align="center">
  <h2><strong>TRACK 5 CODE FOR MPEG DIALOGUE COMPETITION – OPEN-ENDED DISCOVERY (PYGENIE TO IMAGE CONVERSION)</strong></h2>
</div>

<div align="center">
  <img src="https://zindi-public-release.s3.eu-west-2.amazonaws.com/uploads/competition/image/591/thumb_f7f99618-bc3d-44db-acfa-39b971d7472f.png" alt="Competition thumbnail image" style="width:50%;"/>
</div>


In [1]:
%%capture
# %%capture silences everything this cell prints (apt / pip logs).
# System packages: HTSlib headers plus zlib / zstd / lzma runtime libraries
# (presumably needed by the genie binary that pygenie wraps -- TODO confirm).
!apt-get update && apt-get install -y libhts-dev zlib1g libzstd1 liblzma5

# DO THIS ONLY ON KAGGLE IF YOU HAVE THE PYGENIE DATASET
# Copies the attached dataset into the writable working directory so that
# pip can install from it.
!cp -r /kaggle/input/pygenie/ /kaggle/working/pygenie

# CONTINUE FROM HERE
# Install pygenie from the local copy, then the PyPI dependencies used below
# (Bio for FASTQ parsing, pyfiglet for the closing banner).
!pip install /kaggle/working/pygenie --quiet
!pip install Bio pyfiglet --quiet

In [2]:
import pygenie
# NOTE(review): this wildcard import is what brings `genie_help` into scope;
# an explicit `from pygenie import genie_help` would make that visible.
from pygenie import *

# Smoke test that the install works: genie_help() is unpacked as a
# (stdout, stderr, code) triple; only stdout is displayed, the other two
# values (presumably the error stream and exit status) are ignored here.
stdout, stderr, code = genie_help()
print(stdout)

[INFO,      0.000s, App]:    ______           _
[INFO,      0.000s, App]:   / ____/__  ____  (_)__
[INFO,      0.000s, App]:  / / __/ _ \/ __ \/ / _ \
[INFO,      0.000s, App]: / /_/ /  __/ / / / /  __/
[INFO,      0.000s, App]: \____/\___/_/ /_/_/\___/
[INFO,      0.000s, App]: Command: /usr/local/lib/python3.11/dist-packages/pygenie/genie help 
[ERROR,      0.000s, App]: Usage: 
[ERROR,      0.000s, App]: genie <operation> <operation specific options> 
[ERROR,      0.000s, App]: 
[ERROR,      0.000s, App]: List of operations:
[ERROR,      0.000s, App]: help
[ERROR,      0.000s, App]: run
[ERROR,      0.000s, App]: transcode-fastq
[ERROR,      0.000s, App]: transcode-sam
[ERROR,      0.000s, App]: 
[ERROR,      0.000s, App]: To learn more about an operation, type "genie <operation> --help".



In [3]:
import os
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for parallel processing
import matplotlib.pyplot as plt
from itertools import product
from Bio import SeqIO
from pygenie import run
from PIL import Image
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm import tqdm

# ---------- helper: 5-mer counting ----------
def kmer_features_from_file(fastq_file, k=5):
    """Count every k-mer over the alphabet "ATGC" across all reads of a FASTQ file.

    Parameters
    ----------
    fastq_file : path or handle accepted by ``SeqIO.parse``
        FASTQ input to scan.
    k : int
        Length of the k-mers to count.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``4**k``. Position ``i`` holds the number of
        occurrences of the i-th k-mer in ``itertools.product("ATGC", repeat=k)``
        order. Windows containing any other character (e.g. ``N`` or
        lowercase bases) are not counted.
    """
    # Map each k-mer string to its slot in the output vector.
    slot_of = {
        ''.join(letters): position
        for position, letters in enumerate(product("ATGC", repeat=k))
    }
    totals = np.zeros(len(slot_of))

    for read in SeqIO.parse(fastq_file, "fastq"):
        bases = str(read.seq)
        # Slide a window of width k over the read; reads shorter than k
        # yield an empty range and contribute nothing.
        for start in range(len(bases) - k + 1):
            slot = slot_of.get(bases[start:start + k])
            if slot is not None:
                totals[slot] += 1
    return totals

# ---------- convert to image ----------
def counts_to_log_scaled_image(counts, k=5):
    """Turn a k-mer count vector into a square matrix of values in [0, 1].

    The counts are compressed with ``log1p`` and, unless every entry is zero,
    divided by the largest value. The vector is then folded row-major into a
    ``side x side`` matrix with ``side = int(sqrt(len(counts)))``.

    Parameters
    ----------
    counts : array-like
        1-D count vector; its length must be a perfect square for the
        reshape to succeed.
    k : int
        Unused by the computation; kept for interface compatibility.

    Returns
    -------
    numpy.ndarray
        2-D square matrix of the scaled values.
    """
    scaled = np.log1p(counts)
    peak = scaled.max()
    # Skip normalisation for an all-zero vector to avoid dividing by zero.
    if peak > 0:
        scaled = scaled / peak
    side = int(np.sqrt(len(scaled)))
    return scaled.reshape(side, side)

# ---------- save image with colormap ----------
def save_image_with_cmap(matrix, output_path, cmap='viridis', target_size=224):
    """Render a matrix with a matplotlib colormap and save it as a square image.

    The matrix is drawn without axes, written to ``output_path``, then
    re-opened with PIL and resized to exactly ``target_size x target_size``
    pixels (the tight bounding box used when saving does not guarantee an
    exact pixel size). The resized image overwrites the same file.

    Parameters
    ----------
    matrix : 2-D array-like
        Values to render.
    output_path : str
        Destination image path (written twice: render, then resize).
    cmap : str
        Matplotlib colormap name.
    target_size : int
        Edge length of the final image in pixels.

    Raises
    ------
    Whatever matplotlib or PIL raise (e.g. ``ValueError`` for an unknown
    colormap). The figure is closed in every case.
    """
    side_inches = target_size / 100
    fig, ax = plt.subplots(figsize=(side_inches, side_inches), dpi=100)
    try:
        ax.imshow(matrix, cmap=cmap, interpolation='bilinear')
        ax.axis('off')
        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure".
        fig.tight_layout(pad=0)
        fig.savefig(output_path, bbox_inches='tight', pad_inches=0, dpi=100)
    finally:
        # Always release the figure, even when rendering fails; otherwise
        # figures accumulate over a long batch run.
        plt.close(fig)

    # Resize to the exact target size using PIL; the context manager makes
    # sure the file handle is released before the file is overwritten.
    with Image.open(output_path) as rendered:
        resized = rendered.resize((target_size, target_size), Image.Resampling.LANCZOS)
    resized.save(output_path)

# ---------- parallel colormap saver ----------
def save_single_colormap(args):
    """Pool-friendly adapter around ``save_image_with_cmap``.

    ``args`` is a 5-tuple ``(img_matrix, base_name, output_dir, cmap,
    target_size)`` so the whole job fits in the single argument that
    ``Pool.map`` passes. The image is written to
    ``<output_dir>/<base_name>_<cmap>.png`` and that path is returned.
    """
    matrix, stem, out_dir, cmap_name, side = args
    destination = os.path.join(out_dir, f"{stem}_{cmap_name}.png")
    save_image_with_cmap(matrix, destination, cmap=cmap_name, target_size=side)
    return destination

# ---------- main pipeline ----------
def process_mpeg_to_images(mgb_path, output_dir, colormaps=['viridis', 'gray', 'plasma', 'inferno'], 
                           target_size=224, cleanup=True, n_jobs=None, verbose=False):
    """
    Complete pipeline: MPEG-G -> FASTQ -> 5-mer counts -> images -> cleanup

    The input file is decoded with pygenie's ``run`` into
    ``<output_dir>/temp_fastq/<base_name>.fastq``, 5-mer counts are extracted
    and log-scaled into a matrix, and one PNG per colormap is written to
    ``<output_dir>/<base_name>_<cmap>.png``.

    Parameters:
    -----------
    mgb_path : str
        Path to input .mgb file
    output_dir : str
        Directory to save images (created if missing)
    colormaps : list
        List of matplotlib colormaps to use; an empty list produces no
        images and returns ``[]``
    target_size : int
        Target image size (will be square)
    cleanup : bool
        Whether to delete the FASTQ file once it is no longer needed. The
        file is removed even when decoding or k-mer counting fails.
    n_jobs : int
        Number of parallel jobs for colormap rendering (None = all cores).
        With a single job, or when already running inside a daemonic worker
        process, the images are rendered in the current process instead of
        a pool.
    verbose : bool
        Print detailed progress messages

    Returns:
    --------
    list of str
        Paths of the saved images, in ``colormaps`` order.
    """
    # Local import: only needed to detect whether we are inside a pool worker.
    from multiprocessing import current_process

    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(mgb_path))[0]
    
    # Step 1: Convert MPEG-G to FASTQ inside a temp directory
    temp_dir = os.path.join(output_dir, 'temp_fastq')
    os.makedirs(temp_dir, exist_ok=True)
    fastq_path = os.path.join(temp_dir, f'{base_name}.fastq')
    
    try:
        run(input_file=mgb_path, output_file=fastq_path, force=True)
        
        # Step 2: Extract k-mer features (last step that reads the FASTQ)
        vec5 = kmer_features_from_file(fastq_path, k=5)
    finally:
        # Step 3: Cleanup FASTQ file (DELETE FASTQ, KEEP IMAGES).
        # Done in `finally` so a failed decode/parse does not leave the
        # intermediate file behind.
        if cleanup:
            try:
                if os.path.exists(fastq_path):
                    os.remove(fastq_path)
                    if verbose:
                        tqdm.write(f"  âœ“ Deleted FASTQ: {fastq_path}")
            except Exception as e:
                if verbose:
                    tqdm.write(f"  âœ— Warning: Could not delete {fastq_path}: {e}")
    
    # Step 4: Convert to log-scaled matrix
    img_matrix = counts_to_log_scaled_image(vec5, k=5)
    
    # Step 5: Save images with different colormaps
    args_list = [(img_matrix, base_name, output_dir, cmap, target_size) 
                 for cmap in colormaps]
    n_workers = min(n_jobs or cpu_count(), len(args_list))
    
    if n_workers <= 1 or current_process().daemon:
        # Render in-process. A Pool cannot be created from a daemonic pool
        # worker ("daemonic processes are not allowed to have children"),
        # and Pool(processes=0) is invalid when there is nothing to render.
        saved_files = [save_single_colormap(job) for job in args_list]
    else:
        with Pool(processes=n_workers) as pool:
            saved_files = pool.map(save_single_colormap, args_list)
    
    if verbose:
        for f in saved_files:
            tqdm.write(f"  âœ“ Saved: {f}")
    
    return saved_files

# ---------- parallel file processor ----------
def process_single_mgb_file(args):
    """Wrapper for parallel processing of individual MPEG-G files.

    ``args`` is a 4-tuple ``(mgb_path, output_dir, colormaps, target_size)``
    so the job fits in the single argument a Pool passes to its workers.

    Returns a triple ``(mgb_path, saved, error)``:
    - on success ``saved`` is the list of image paths and ``error`` is None;
    - on failure ``saved`` is None and ``error`` is a non-empty string of
      the form ``"<ExceptionType>: <message>"``.

    Exceptions are captured rather than raised so that one bad file does not
    abort the whole batch.
    """
    mgb_path, output_dir, colormaps, target_size = args
    try:
        saved = process_mpeg_to_images(
            mgb_path, 
            output_dir, 
            colormaps=colormaps,
            target_size=target_size,
            cleanup=True,  # DELETE FASTQ, KEEP IMAGES
            n_jobs=1,  # Each file process uses 1 core, parallelism at file level
            verbose=False  # Suppress detailed output
        )
        return (mgb_path, saved, None)
    except Exception as e:
        # Include the exception type: str(e) alone is empty for exceptions
        # raised without a message, which a truthiness check on the error
        # field would mistake for success.
        return (mgb_path, None, f"{type(e).__name__}: {e}")

# ---------- batch processing with file-level parallelism ----------
def batch_process_mpeg_files(input_dir, output_dir, pattern="*.mgb", 
                             colormaps=['viridis', 'gray', 'plasma', 'inferno'],
                             target_size=224, n_jobs=None):
    """
    Process all MPEG-G files in parallel (one worker task per file).

    Files matching ``pattern`` in ``input_dir`` are processed in sorted
    order by a process pool running ``process_single_mgb_file``. Per-file
    failures are reported and counted but do not stop the batch. Afterwards
    any leftover files in ``<output_dir>/temp_fastq`` are deleted and the
    directory is removed.
    
    Parameters:
    -----------
    input_dir : str
        Directory containing .mgb files
    output_dir : str
        Directory to save all images
    pattern : str
        File pattern to match
    colormaps : list
        List of matplotlib colormaps
    target_size : int
        Target image size
    n_jobs : int
        Number of parallel jobs (None = all cores)

    Returns:
    --------
    list of str
        Paths of all images saved by the successfully processed files.
    """
    import glob
    
    mgb_files = sorted(glob.glob(os.path.join(input_dir, pattern)))
    print(f"Found {len(mgb_files)} MPEG-G files to process")
    
    n_jobs = n_jobs or cpu_count()
    print(f"Using {n_jobs} CPU cores for parallel processing\n")
    
    # Prepare arguments
    args_list = [(mgb_path, output_dir, colormaps, target_size) 
                 for mgb_path in mgb_files]
    
    # Process files in parallel with progress bar; imap keeps input order
    all_saved = []
    with Pool(processes=n_jobs) as pool:
        results = list(tqdm(
            pool.imap(process_single_mgb_file, args_list),
            total=len(mgb_files),
            desc="Processing files",
            unit="file"
        ))
    
    # Collect results
    successful = 0
    failed = 0
    print()  # New line after progress bar
    for mgb_path, saved, error in results:
        # Compare against None rather than testing truthiness: an empty
        # error string still means the file failed (and `saved` is None).
        if error is not None:
            print(f"âœ— Error processing {os.path.basename(mgb_path)}: {error}")
            failed += 1
        else:
            all_saved.extend(saved)
            successful += 1
    
    print(f"\n{'='*60}")
    print(f"Processed: {successful} successful, {failed} failed")
    print(f"Total images saved: {len(all_saved)}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*60}")
    
    # Clean up temp directory (best effort: problems are reported, not raised)
    temp_dir = os.path.join(output_dir, 'temp_fastq')
    if os.path.exists(temp_dir):
        try:
            remaining = os.listdir(temp_dir)
            if remaining:
                print(f"\nWarning: {len(remaining)} FASTQ files remain in {temp_dir}")
                print("Cleaning up...")
                for f in remaining:
                    os.remove(os.path.join(temp_dir, f))
            os.rmdir(temp_dir)
            print("âœ“ Temp directory cleaned up")
        except Exception as e:
            print(f"Warning: Could not clean temp directory: {e}")
    
    return all_saved

# ---------- hybrid parallel processing ----------
def batch_process_hybrid(input_dir, output_dir, pattern="*.mgb", 
                        colormaps=['viridis', 'gray', 'plasma', 'inferno', 'cividis', 'magma'],
                        target_size=224):
    """
    Hybrid batch mode: files one after another, colormaps of each file in parallel.

    Every file in ``input_dir`` matching ``pattern`` is handed, in sorted
    order, to ``process_mpeg_to_images`` with ``n_jobs=None``. A file that
    raises is reported through ``tqdm.write`` and skipped. When the loop is
    done, ``<output_dir>/temp_fastq`` is emptied and removed if it exists.

    Returns the list of image paths saved for the files that succeeded.
    """
    import glob
    
    search_glob = os.path.join(input_dir, pattern)
    mgb_files = sorted(glob.glob(search_glob))
    print(f"Found {len(mgb_files)} MPEG-G files")
    print(f"Using all {cpu_count()} cores for colormap parallelization\n")
    
    collected = []
    for source_path in tqdm(mgb_files, desc="Processing files", unit="file"):
        file_label = os.path.basename(source_path)
        try:
            images = process_mpeg_to_images(
                source_path,
                output_dir,
                colormaps=colormaps,
                target_size=target_size,
                cleanup=True,   # drop the intermediate FASTQ, keep the images
                n_jobs=None,    # let the colormap pool use every core
                verbose=False,  # keep the progress bar readable
            )
            collected.extend(images)
        except Exception as exc:
            tqdm.write(f"  âœ— Error processing {file_label}: {str(exc)}")
    
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Total images saved: {len(collected)}")
    print(f"{rule}")
    
    # Remove the scratch directory; nothing to do when it was never created.
    scratch_dir = os.path.join(output_dir, 'temp_fastq')
    if not os.path.exists(scratch_dir):
        return collected
    
    try:
        leftovers = os.listdir(scratch_dir)
        if leftovers:
            print(f"\nWarning: {len(leftovers)} FASTQ files remain in {scratch_dir}")
            print("Cleaning up...")
            for entry in leftovers:
                os.remove(os.path.join(scratch_dir, entry))
        os.rmdir(scratch_dir)
        print("âœ“ Temp directory cleaned up")
    except Exception as exc:
        # Best effort only: report and carry on.
        print(f"Warning: Could not clean temp directory: {exc}")
    
    return collected

# ---------- Example Usage ----------
# ---------- Example Usage ----------
# Driver: converts every training .mgb file into 5-mer images, one PNG per
# colormap. Paths are Kaggle-specific; adjust them when running elsewhere.
if __name__ == "__main__":
    print(f"System has {cpu_count()} CPU cores available\n")
    
    # Option 1: File-level parallelism (best for many files)
    # Processes multiple MPEG-G files simultaneously
    input_dir = '/kaggle/input/mpeg-g-dialogue/TrainFiles/TrainFiles/'
    output_dir = '/kaggle/working/5mer_images_parallel'
    
    # Option 1 is kept for reference but is currently disabled:
    # batch_process_mpeg_files(
    #     input_dir=input_dir,
    #     output_dir=output_dir,
    #     colormaps=['viridis', 'gray', 'plasma', 'inferno', 'cividis', 'magma'],
    #     target_size=224,
    #     n_jobs=None  # Use all cores
    # )
    
    # Option 2: Hybrid approach (best for few files, many colormaps)
    # Processes files one at a time, but colormaps in parallel
    # NOTE: this is a long-running call -- the recorded run below took
    # roughly 11 hours for 2901 files on 4 cores.
    batch_process_hybrid(
        input_dir=input_dir,
        output_dir=output_dir,
        colormaps=['viridis', 'gray', 'plasma', 'inferno', 'cividis', 'magma'],
        target_size=224
    )

System has 4 CPU cores available

Found 2901 MPEG-G files
Using all 4 cores for colormap parallelization



Processing files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2901/2901 [10:55:06<00:00, 13.55s/file]


Total images saved: 17406
âœ“ Temp directory cleaned up





In [4]:
# # ✅ Safely remove all files and subfolders in /kaggle/working/
# !rm -rf /kaggle/working/*


In [1]:
import pyfiglet
from rich.console import Console

# Closing banner: render the team name as ASCII art with pyfiglet and print
# it through a rich Console so it can be styled.
console = Console()
# Banner text in pyfiglet's "slant" font, shown in bold cyan
banner = pyfiglet.figlet_format("EVER LEARNERS", font="slant")
console.print(banner, style="bold cyan")