Navigation Menu

Skip to content

Commit

Permalink
Initialize/finalize SIRIUS only when used
Browse files Browse the repository at this point in the history
  • Loading branch information
mkrack committed Oct 14, 2021
1 parent 180a1bf commit d3703bf
Showing 1 changed file with 47 additions and 52 deletions.
99 changes: 47 additions & 52 deletions src/start/cp2k_runs.F
Expand Up @@ -204,7 +204,7 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
accdrv_active_device_id=offload_get_device_id())
ELSE
CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)
ENDIF
END IF
#else
CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)
#endif
Expand All @@ -213,8 +213,6 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m

CALL pw_fpga_init()

CALL cp_sirius_init()

NULLIFY (globenv, force_env)

CALL cite_reference(Hutter2014)
Expand All @@ -240,13 +238,11 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
i_val=prog_name_id)
CALL section_vals_val_get(input_file, "GLOBAL%RUN_TYPE", &
i_val=run_type_id)
CALL section_vals_val_get(root_section, "FORCE_EVAL%METHOD", i_val=method_name_id)

IF (prog_name_id /= do_cp2k) THEN
! initial setup (cp2k does it in the creation of the force_env)
CALL globenv_create(globenv)
! XXXXXXXXX
! root_section => input_file
! XXXXXXXXX
CALL section_vals_retain(input_file)
CALL cp2k_init(para_env, output_unit, globenv, input_file_name=input_file_name)
CALL cp2k_read(root_section, para_env, globenv)
Expand All @@ -258,7 +254,7 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
cp_logger_would_log(logger, cp_note_level)) THEN
CALL dbcsr_print_config(unit_nr=output_unit)
WRITE (UNIT=output_unit, FMT='()')
ENDIF
END IF

! Configure the grid library.
CALL section_vals_val_get(root_section, "GLOBAL%GRID%BACKEND", i_val=grid_backend)
Expand All @@ -282,7 +278,7 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
! But, we don't want to change the public f77_interface.
! TODO: refactor cp2k's startup code
CALL dbcsr_finalize_lib()
CALL cp_sirius_finalize()
IF (method_name_id == do_sirius) CALL cp_sirius_finalize()
CALL pw_cuda_finalize()
CALL pw_fpga_finalize()
CALL farming_run(input_declaration, root_section, para_env, initial_variables)
Expand All @@ -292,13 +288,13 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
accdrv_active_device_id=offload_get_device_id())
ELSE
CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)
ENDIF
END IF
#else
CALL dbcsr_init_lib(mpi_comm, io_unit=output_unit)
#endif
CALL pw_cuda_init()
CALL pw_fpga_init()
CALL cp_sirius_init()
IF (method_name_id == do_sirius) CALL cp_sirius_init()
CASE (do_opt_basis)
CALL run_optimize_basis(input_declaration, root_section, para_env)
globenv%run_type_id = none_run
Expand All @@ -315,8 +311,7 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
force_env => f_env%force_env
CALL force_env_get(force_env, globenv=globenv)
CALL globenv_retain(globenv)
CALL section_vals_val_get(force_env%force_env_section, "METHOD", i_val=method_name_id)

IF (method_name_id == do_sirius) CALL cp_sirius_init()
CASE (do_test)
CALL lib_test(root_section, para_env, globenv)
CASE (do_tree_mc) ! TMC entry point
Expand Down Expand Up @@ -420,7 +415,7 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
!sample peak memory
CALL m_memory()

CALL cp_sirius_finalize()
IF (method_name_id == do_sirius) CALL cp_sirius_finalize()

CALL pw_cuda_finalize()

Expand All @@ -436,7 +431,7 @@ RECURSIVE SUBROUTINE cp2k_run(input_declaration, input_file_name, output_unit, m
WRITE (output_unit, *)
WRITE (output_unit, '(T2,"MEMORY| Estimated peak process memory [MiB]",T73,I8)') &
(m_memory_max_mpi + (1024*1024) - 1)/(1024*1024)
ENDIF
END IF

IF (prog_name_id == do_cp2k) THEN
f_env%force_env => force_env ! for mc
Expand Down Expand Up @@ -545,14 +540,14 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
CPASSERT(num_slaves == 1)
num_slaves = para_env%num_pe - 1
slave_rank = -1
ENDIF
END IF
CPASSERT(num_slaves == para_env%num_pe - 1)
ELSE
! all processes are slaves
IF (output_unit > 0) WRITE (output_unit, FMT="(T2,A)") "FARMING| using a slave-only setup"
CALL mp_comm_dup(para_env%group, slave_group)
CALL mp_environ(num_slaves, slave_rank, slave_group)
ENDIF
END IF
IF (output_unit > 0) WRITE (output_unit, FMT="(T2,A,I0)") "FARMING| number of slaves ", num_slaves

! keep track of which para_env rank is which slave/master
Expand All @@ -564,7 +559,7 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
primus_slave = 0
DO i = 1, para_env%num_pe - 1
IF (slave_distribution(i) == 0) primus_slave = i
ENDDO
END DO

! split the current communicator for the slaves
! in a new_group, new_size and new_rank according to the number of groups required according to the input
Expand All @@ -583,12 +578,12 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
ELSE
CALL mp_comm_split(slave_group, new_group, ngroups, group_distribution, &
n_subgroups=farming_env%ngroup_wish, stride=farming_env%stride)
ENDIF
END IF
ELSE
CPABORT("must set either group_size_wish or ngroup_wish")
ENDIF
END IF
CALL mp_environ(new_size, new_rank, new_group)
ENDIF
END IF

! transfer the info about the slave group distribution to the master
IF (farming_env%master_slave) THEN
Expand All @@ -597,14 +592,14 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
CALL mp_send(group_distribution, 0, tag, para_env%group)
tag = 2
CALL mp_send(ngroups, 0, tag, para_env%group)
ENDIF
END IF
IF (para_env%mepos == 0) THEN
tag = 1
CALL mp_recv(group_distribution, primus_slave, tag, para_env%group)
tag = 2
CALL mp_recv(ngroups, primus_slave, tag, para_env%group)
ENDIF
ENDIF
END IF
END IF

! write info on group distribution
IF (output_unit > 0) THEN
Expand All @@ -617,7 +612,7 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
END DO
WRITE (output_unit, *)
CALL m_flush(output_unit)
ENDIF
END IF

! protect against too many jobs being run in a single go. No more jobs are allowed than the number in the input file
! and determine the future restart point
Expand All @@ -628,14 +623,14 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
n_jobs_to_run = MIN(farming_env%njobs, farming_env%max_steps*ngroups)
n_jobs_to_run = MIN(n_jobs_to_run, farming_env%njobs - farming_env%restart_n + 1)
i_job_to_restart = n_jobs_to_run + farming_env%restart_n
ENDIF
END IF

! and write the restart now, that's the point where the next job starts, even if this one is running
iunit = cp_print_key_unit_nr(logger, root_section, "FARMING%RESTART", &
extension=".restart")
IF (iunit > 0) THEN
WRITE (iunit, *) i_job_to_restart
ENDIF
END IF
CALL cp_print_key_finished_output(iunit, logger, root_section, "FARMING%RESTART")
! this is the job range to be executed.
Expand All @@ -646,7 +641,7 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
WRITE (output_unit, FMT="(T2,A)") "FARMING| is the cycle keyword required ?"
WRITE (output_unit, FMT="(T2,A)") "FARMING| or is a stray RESTART file present ?"
WRITE (output_unit, FMT="(T2,A)") "FARMING| or is the group_size requested smaller than the number of CPUs?"
ENDIF
END IF
! actual executions of the jobs in two different modes
IF (farming_env%master_slave) THEN
Expand All @@ -670,7 +665,7 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
CALL mp_bcast(todo, 0, new_group)
ELSE
CALL mp_bcast(todo, 0, new_group)
ENDIF
END IF

! if the todo is do_nothing we are flagged to quit. Otherwise it is the job number
SELECT CASE (todo)
Expand All @@ -680,13 +675,13 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
DO
t2 = m_walltime()
IF (t2 - t1 > farming_env%wait_time) EXIT
ENDDO
END DO
CASE (do_nothing)
EXIT
CASE (1:)
CALL execute_job(todo)
END SELECT
ENDDO
END DO
ELSE ! master
ALLOCATE (slave_status(0:ngroups - 1))
slave_status = slave_status_wait
Expand All @@ -704,8 +699,8 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
IF (output_unit > 0) THEN
WRITE (output_unit, FMT=*) "Job finished: ", todo
CALL m_flush(output_unit)
ENDIF
ENDIF
END IF
END IF
! get the next job in line, this could be do_nothing, if we're finished
CALL get_next_job(farming_env, ijob_start, ijob_end, ijob_current, todo)
Expand All @@ -719,31 +714,31 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
WRITE (output_unit, FMT=*) "Job: ", todo, " Dir: ", TRIM(farming_env%Job(todo)%cwd), &
" assigned to group ", group_distribution(slave_distribution(dest))
CALL m_flush(output_unit)
ENDIF
END IF
ELSE
IF (todo == do_nothing) THEN
slave_status(group_distribution(slave_distribution(dest))) = slave_status_done
IF (output_unit > 0) THEN
WRITE (output_unit, FMT=*) "group done: ", group_distribution(slave_distribution(dest))
CALL m_flush(output_unit)
ENDIF
ENDIF
END IF
END IF
IF (todo == do_deadlock) THEN
IF (output_unit > 0) THEN
WRITE (output_unit, FMT=*) ""
WRITE (output_unit, FMT=*) "FARMING JOB DEADLOCKED ... CIRCULAR DEPENDENCIES"
WRITE (output_unit, FMT=*) ""
CALL m_flush(output_unit)
ENDIF
END IF
CPASSERT(todo .NE. do_deadlock)
ENDIF
ENDIF
END IF
END IF

ENDDO
END DO

DEALLOCATE (slave_status)

ENDIF
END IF
ELSE
! this is the non-master-slave mode way of executing the jobs
! the i-th job in the input is always executed by the MODULO(i-1,ngroups)-th group
Expand All @@ -755,10 +750,10 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
i = MODULO(ijob - 1, farming_env%njobs) + 1
WRITE (output_unit, FMT=*) "Job: ", i, " Dir: ", TRIM(farming_env%Job(i)%cwd), " Input: ", &
TRIM(farming_env%Job(i)%input), " MPI group:", MODULO(i - 1, ngroups)
ENDDO
ENDIF
END DO
END IF
CALL m_flush(output_unit)
ENDIF
END IF
DO ijob = ijob_start, ijob_end
i = MODULO(ijob - 1, farming_env%njobs) + 1
Expand All @@ -768,15 +763,15 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
WRITE (output_unit, FMT="(T2,A,I5.5,A)", ADVANCE="NO") " Running Job ", i, &
" in "//TRIM(farming_env%Job(i)%cwd)//"."
CALL m_flush(output_unit)
ENDIF
END IF
CALL execute_job(i)
IF (output_unit > 0) THEN
WRITE (output_unit, FMT="(A)") " Done, output in "//TRIM(output_file)
CALL m_flush(output_unit)
ENDIF
ENDIF
ENDDO
ENDIF
END IF
END IF
END DO
END IF
! keep information about how long each process has to wait
! i.e. the load imbalance
Expand All @@ -793,9 +788,9 @@ RECURSIVE SUBROUTINE farming_run(input_declaration, root_section, para_env, init
WRITE (output_unit, FMT='(A2,I6,A3,F8.3,A1)', ADVANCE="NO") &
" (", i, " : ", waittime(i), ")"
IF (MOD(i + 1, 4) == 0) WRITE (output_unit, '(A)') ""
ENDDO
END DO
CALL m_flush(output_unit)
ENDIF
END IF
DEALLOCATE (waittime)
! give back the communicators of the split groups
Expand Down Expand Up @@ -855,7 +850,7 @@ RECURSIVE SUBROUTINE execute_job(i)
CALL parser_release(my_parser)
ELSE
output_file = farming_env%Job(i)%output
ENDIF
END IF
CALL open_file(file_name=TRIM(output_file), &
file_action="WRITE", &
Expand All @@ -867,7 +862,7 @@ RECURSIVE SUBROUTINE execute_job(i)
! start writing output (to the same file, adding to confusion).
! error handling should be careful, asking for a local output unit if required
new_output_unit = -1
ENDIF
END IF
CALL cp2k_run(input_declaration, TRIM(farming_env%Job(i)%input), new_output_unit, new_group, initial_variables)
Expand Down

0 comments on commit d3703bf

Please sign in to comment.