In [None]:
import os
import time
import datetime
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import concurrent.futures

# Lock file used to detect an already-running job (relative path, so it is
# resolved against the current working directory).
lock_file = 'job-multipool-sentiment.lock'
timeout_seconds = 3600  # Lock age in seconds after which the job is considered halted (3600 s = 1 hour)
num_parallel_executions = 5  # Number of parallel executions
delay_between_starts = 2  # Delay in seconds between starts

def create_lock_file():
    """Create (or overwrite) the lock file with the current local time.

    The timestamp is stored as an ISO 8601 string so that
    ``check_lock_file`` can parse it back with ``fromisoformat``.
    """
    started_at = datetime.datetime.now().isoformat()
    with open(lock_file, 'w') as lock:
        lock.write(started_at)

def update_lock_file():
    """Refresh the lock file by rewriting it with the current local time.

    The previous content is discarded; only the newest ISO 8601 timestamp
    is kept, which restarts the staleness window used by
    ``check_lock_file``.
    """
    heartbeat = datetime.datetime.now().isoformat()
    with open(lock_file, mode='w') as lock:
        lock.write(heartbeat)

def check_lock_file(path=None, timeout=None):
    """Return True when a fresh lock file shows that a job is still running.

    Args:
        path: Lock file to inspect. Defaults to the module-level ``lock_file``.
        timeout: Maximum lock age in seconds before the lock is considered
            stale. Defaults to the module-level ``timeout_seconds``.

    Returns:
        True if the lock file exists and its timestamp is younger than
        ``timeout`` seconds; False if the file is missing, stale, or does
        not contain a parsable ISO 8601 timestamp.
    """
    if path is None:
        path = lock_file
    if timeout is None:
        timeout = timeout_seconds

    # EAFP: the file may disappear between an existence check and the open
    # (another run finishing and removing its lock), so just try to read it.
    try:
        with open(path, 'r') as f:
            timestamp = f.read().strip()
    except FileNotFoundError:
        return False  # No lock: job is not running

    # An empty or corrupt lock (e.g. the writer died between truncating and
    # writing) must not block every future run with a ValueError; treat it
    # as a stale lock instead.
    try:
        last_run_time = datetime.datetime.fromisoformat(timestamp)
    except ValueError:
        return False

    # The writers store a naive local timestamp, so compare against naive now().
    age_seconds = (datetime.datetime.now() - last_run_time).total_seconds()
    return age_seconds < timeout

def run_notebook(instance_id, notebook_path, timeout=600):
    """Execute a notebook and save the executed result back to the same path.

    Best-effort: any exception is reported on stdout and swallowed, so the
    function always returns None and never raises ``Exception`` subclasses
    to the caller.

    Args:
        instance_id: Label used in the progress messages and in the name of
            the temporary output file.
        notebook_path: Path of the ``.ipynb`` file to read, execute and
            overwrite with the executed version.
        timeout: Passed to ``ExecutePreprocessor(timeout=...)``.
    """
    try:
        print(f"Instance {instance_id} started.")
        # Notebooks are JSON documents; read/write them as UTF-8 explicitly
        # rather than relying on the locale's default encoding.
        with open(notebook_path, encoding='utf-8') as f:
            nb = nbformat.read(f, as_version=4)
        ep = ExecutePreprocessor(timeout=timeout, kernel_name='python3')
        ep.preprocess(nb, {'metadata': {'path': '/home/ubuntu/ml_project/notebook_directory/multipool/ai-case-study/'}})

        # Several instances may save to the same notebook_path at once.
        # Writing in place with mode 'w' lets writers interleave and lets
        # readers see a truncated file, so write to a per-instance temporary
        # file next to the target and swap it in with a single rename.
        tmp_path = f"{notebook_path}.instance-{instance_id}.tmp"
        try:
            with open(tmp_path, 'w', encoding='utf-8') as f:
                nbformat.write(nb, f)
            os.replace(tmp_path, notebook_path)
        finally:
            # After a successful replace the temp file is gone; this only
            # cleans up when the write or the rename failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(f"Instance {instance_id} finished.")
    except Exception as e:
        print(f"Instance {instance_id} failed: {e}")

# Driver: run the notebook instances only when no fresh lock is present.
if not check_lock_file():
    # Take the lock so that overlapping invocations back off
    create_lock_file()
    print("Job started.")
    try:
        # One worker thread per notebook instance
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel_executions) as executor:
            futures = {}
            for i in range(num_parallel_executions):
                future = executor.submit(
                    run_notebook,
                    i,
                    '/home/ubuntu/ml_project/notebook_directory/multipool/ai-case-study/google-sentiment.ipynb',
                )
                futures[future] = i
                # Stagger the launches instead of starting everything at once
                time.sleep(delay_between_starts)
            # Collect results in completion order, mapping each future back to its instance
            for future in concurrent.futures.as_completed(futures):
                instance_id = futures[future]
                try:
                    future.result()
                except Exception as exc:
                    print(f"Instance {instance_id} generated an exception: {exc}")
        print("All instances finished.")
    finally:
        # Always release the lock, even if scheduling or execution blew up
        if os.path.exists(lock_file):
            os.remove(lock_file)
else:
    # A lock younger than the timeout exists: assume another run is active
    print("Previous job is still running. Exiting.")


Job started.
Instance 0 started.
Instance 1 started.


2024-08-04 07:35:54.227268: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 07:35:54.249392: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 07:35:54.255109: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-04 07:35:54.268208: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instance 2 started.




Instance 3 started.


2024-08-04 07:35:57.000495: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 07:35:57.022671: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 07:35:57.028396: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-04 07:35:57.041562: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instance 4 started.


2024-08-04 07:35:59.218778: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 07:35:59.247046: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 07:35:59.255431: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-04 07:35:59.274855: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-04 07:36:02.362331: E external/local_xla/xla/

Instance 1 finished.
Instance 0 finished.
Instance 2 finished.
Instance 3 finished.
