! ! Copyright (c) 2015, Intel Corporation ! ! Redistribution and use in source and binary forms, with or without ! modification, are permitted provided that the following conditions ! are met: ! ! * Redistributions of source code must retain the above copyright ! notice, this list of conditions and the following disclaimer. ! * Redistributions in binary form must reproduce the above ! copyright notice, this list of conditions and the following ! disclaimer in the documentation and/or other materials provided ! with the distribution. ! * Neither the name of Intel Corporation nor the names of its ! contributors may be used to endorse or promote products ! derived from this software without specific prior written ! permission. ! ! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS ! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ! LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ! POSSIBILITY OF SUCH DAMAGE. !******************************************************************* ! ! NAME: transpose ! ! PURPOSE: This program measures the time for the transpose of a ! column-major stored matrix into a row-major stored matrix. ! ! USAGE: Program input is the matrix order and the number of times to ! repeat the operation: ! ! transpose <# iterations> [tile size] ! ! An optional parameter specifies the tile size used to divide the ! individual matrix blocks for improved cache and TLB performance. ! ! The output consists of diagnostics to make sure the ! transpose worked and timing statistics. ! ! HISTORY: Written by Rob Van der Wijngaart, February 2009. ! Converted to Fortran by Jeff Hammond, January 2015 ! ******************************************************************* function prk_get_wtime() result(t) use iso_fortran_env real(kind=REAL64) :: t integer(kind=INT64) :: c, r call system_clock(count = c, count_rate = r) t = real(c,REAL64) / real(r,REAL64) end function prk_get_wtime program main use iso_fortran_env #ifdef _OPENMP use omp_lib #endif implicit none real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen character(len=32) :: argtmp ! problem definition integer(kind=INT32) :: iterations ! number of times to do the transpose integer(kind=INT32) :: order ! order of a the matrix !dec$ attributes align:64 :: A, B real(kind=REAL64), allocatable :: A(:,:) ! buffer to hold original matrix real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance ! ******************************************************************** ! read and test input parameters ! ******************************************************************** #ifndef PRKVERSION #warning Your common/make.defs is missing PRKVERSION #define PRKVERSION "N/A" #endif write(*,'(a34,a10)') 'Parallel Research Kernels version ', PRKVERSION #ifdef _OPENMP write(*,'(a40)') 'Fortran OpenMP Matrix transpose: B = A^T' #else write(*,'(a34)') 'Fortran Matrix transpose: B = A^T' #endif if (command_argument_count().lt.2) then write(*,'(a,i1)') 'argument count = ', command_argument_count() write(*,'(a)') 'Usage: ./transpose <# iterations> []' stop 1 endif iterations = 1 call get_command_argument(1,argtmp,arglen,err) if (err.eq.0) read(argtmp,'(i32)') iterations if (iterations .lt. 1) then write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations stop 1 endif order = 1 call get_command_argument(2,argtmp,arglen,err) if (err.eq.0) read(argtmp,'(i32)') order if (order .lt. 1) then write(*,'(a,i5)') 'ERROR: order must be >= 1 : ', order stop 1 endif ! same default as the C implementation tile_size = 32 if (command_argument_count().gt.2) then call get_command_argument(3,argtmp,arglen,err) if (err.eq.0) read(argtmp,'(i32)') tile_size endif if ((tile_size .lt. 1).or.(tile_size.gt.order)) then write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& ' must be >= 1 and <= ',order tile_size = order ! no tiling endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix ! ******************************************************************** allocate( A(order,order), stat=err) if (err .ne. 0) then write(*,'(a,i3)') 'allocation of A returned ',err stop 1 endif allocate( B(order,order), stat=err ) if (err .ne. 0) then write(*,'(a,i3)') 'allocation of B returned ',err stop 1 endif bytes = 2 * int(order,INT64) * int(order,INT64) * storage_size(A)/8 #ifdef _OPENMP write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() #endif write(*,'(a,i8)') 'Matrix order = ', order write(*,'(a,i8)') 'Tile size = ', tile_size write(*,'(a,i8)') 'Number of iterations = ', iterations #ifdef _OPENMP !$omp parallel default(none) & !$omp& shared(A,B,t0,t1) & !$omp& firstprivate(order,iterations,tile_size) & !$omp& private(i,j,it,jt,k) #endif ! Fill the original matrix, set transpose to known garbage value. if (tile_size.lt.order) then #ifdef _OPENMP #if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_BUILD_DATE) \ && (__INTEL_COMPILER==1600) && (__INTEL_COMPILER_BUILD_DATE<20160101) #warning Disabling collapse because of IPS6000153696 !$omp do #else !$omp do collapse(2) #endif do jt=1,order,tile_size do it=1,order,tile_size #else do concurrent (jt=1:order:tile_size) do concurrent (it=1:order:tile_size) #endif do j=jt,min(order,jt+tile_size-1) do i=it,min(order,it+tile_size-1) A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo enddo enddo enddo #ifdef _OPENMP !$omp end do nowait #endif else #ifdef _OPENMP !$omp do collapse(2) do j=1,order do i=1,order #else do concurrent (j=1:order) do concurrent (i=1:order) #endif A(i,j) = real(order,REAL64) * real(j-1,REAL64) + real(i-1,REAL64) B(i,j) = 0.0 enddo enddo #ifdef _OPENMP !$omp end do nowait #endif endif ! need this because otherwise no barrier between initialization ! and iteration 0 (warmup), which will lead to incorrectness. !$omp barrier do k=0,iterations ! start timer after a warmup iteration if (k.eq.1) then #ifdef _OPENMP !$omp barrier !$omp master #endif t0 = prk_get_wtime() #ifdef _OPENMP !$omp end master #endif endif ! Transpose the matrix; only use tiling if the tile size is smaller than the matrix if (tile_size.lt.order) then #ifdef _OPENMP #if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_BUILD_DATE) \ && (__INTEL_COMPILER==1600) && (__INTEL_COMPILER_BUILD_DATE<20160101) !$omp do #else !$omp do collapse(2) #endif do jt=1,order,tile_size do it=1,order,tile_size #else do concurrent (jt=1:order:tile_size) do concurrent (it=1:order:tile_size) #endif do j=jt,min(order,jt+tile_size-1) do i=it,min(order,it+tile_size-1) B(j,i) = B(j,i) + A(i,j) A(i,j) = A(i,j) + 1.0 enddo enddo enddo enddo #ifdef _OPENMP !$omp end do nowait #endif else #ifdef _OPENMP !$omp do collapse(2) do j=1,order do i=1,order #else do concurrent (j=1:order) do concurrent (i=1:order) #endif B(j,i) = B(j,i) + A(i,j) A(i,j) = A(i,j) + 1.0 enddo enddo #ifdef _OPENMP !$omp end do nowait #endif endif enddo ! iterations #ifdef _OPENMP !$omp barrier !$omp master #endif t1 = prk_get_wtime() #ifdef _OPENMP !$omp end master #endif #ifdef _OPENMP !$omp end parallel #endif trans_time = t1 - t0 ! ******************************************************************** ! ** Analyze and output results. ! ******************************************************************** abserr = 0.0 ! this will overflow if iterations>>1000 addit = (0.5*iterations) * (iterations+1) do j=1,order do i=1,order temp = ((real(order,REAL64)*real(i-1,REAL64))+real(j-1,REAL64)) & * real(iterations+1,REAL64) abserr = abserr + abs(B(i,j) - (temp+addit)) enddo enddo deallocate( B ) deallocate( A ) if (abserr .lt. epsilon) then write(*,'(a)') 'Solution validates' avgtime = trans_time/iterations write(*,'(a,f13.6,a,f10.6)') 'Rate (MB/s): ',(1.d-6*bytes)/avgtime, & ' Avg time (s): ', avgtime else write(*,'(a,f30.15,a,f30.15)') 'ERROR: Aggregate squared error ',abserr, & 'exceeds threshold ',epsilon stop 1 endif end program main