Skip to content

Commit

Permalink
Made framework changes to initialize specific cache block sizes for TRSM.
Browse files Browse the repository at this point in the history

Details:
-This commit addresses performance optimization (single-threaded and
 multi-threaded) of DTRSM on zen2.
-This new optimization employs different MC, KC & NC values for TRSM than
 what is being used in other Level-3 routines like DGEMM.
-Changed TRSM framework code to choose these blocksizes for TRSM
 on zen family configurations.
-Added a new field called "trsm_blkszs" to the cntx structure in order to
 store TRSM-specific block sizes.
-Implemented routines to initialize, set and query the TRSM-specific
 block sizes.
-Defined a new macro "AOCL_BLIS_ZEN" in the configure script.
 This macro is automatically defined for zen family architectures.
 It enables us to choose different cache block sizes for TRSM instead of
 the common level-3 block sizes.

Change-Id: Id8557b1c962a316b1edecca9cd582675eaf35fe6
Signed-off-by: Meghana Vankadari <meghana.vankadari@amd.com>
AMD-Internal: [CPUPL-656]
  • Loading branch information
Meghana-vankadari committed Oct 28, 2021
1 parent e8caf20 commit 2643db0
Show file tree
Hide file tree
Showing 13 changed files with 314 additions and 19 deletions.
6 changes: 6 additions & 0 deletions build/bli_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@
#define BLIS_DISABLE_SYSTEM
#endif

// AOCL_BLIS_ZEN is defined only for zen-family configurations.
// The @enable_aocl_zen@ placeholder below is replaced with 1 or 0 by the
// configure script when this template is instantiated.
//This macro is enabled only for ZEN family configurations.
//This enables us to use different cache-blocking sizes for TRSM instead of common level-3 cache-block sizes.
#if @enable_aocl_zen@
#define AOCL_BLIS_ZEN
#endif

#if @enable_openmp@
#define BLIS_ENABLE_OPENMP
#endif
Expand Down
14 changes: 14 additions & 0 deletions config/zen/bli_cntx_init_zen.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,20 @@ void bli_cntx_init_zen( cntx_t* cntx )
cntx
);

// Update the context with the current architecture's register and cache
// blocksizes for level-3 TRSM execution.
bli_cntx_set_trsm_blkszs
(
5,
// level-3
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);

// -------------------------------------------------------------------------

// Initialize sup thresholds with architecture-appropriate values.
Expand Down
36 changes: 25 additions & 11 deletions config/zen2/bli_cntx_init_zen2.c
Original file line number Diff line number Diff line change
Expand Up @@ -174,17 +174,31 @@ void bli_cntx_init_zen2( cntx_t* cntx )

// -------------------------------------------------------------------------

// Initialize sup thresholds with architecture-appropriate values.
// s d c z
#if 1
bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 );
#else
bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 );
#endif
//Initialize TRSM blocksize objects with architecture-specific values.
//Using different cache block sizes for TRSM instead of common level-3 block sizes.
//Tuning is done for double-precision only.
// s d c z
bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 144, 72 );
bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 492, 256, 256 );
bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 1600, 4080, 4080 );

// Update the context with the current architecture's register and cache
// blocksizes for level-3 TRSM problems.
bli_cntx_set_trsm_blkszs
(
5,
BLIS_NC, &blkszs[ BLIS_NC ],
BLIS_KC, &blkszs[ BLIS_KC ],
BLIS_MC, &blkszs[ BLIS_MC ],
BLIS_NR, &blkszs[ BLIS_NR ],
BLIS_MR, &blkszs[ BLIS_MR ],
cntx
);

// Initialize sup thresholds with architecture-appropriate values. s d c z
bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 100, -1, -1 );
bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 120, -1, -1 );

// Initialize the context with the sup thresholds.
bli_cntx_set_l3_sup_thresh
Expand Down
13 changes: 13 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -3282,6 +3282,18 @@ main()
uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]')
config_name_define="#define BLIS_FAMILY_${uconf}\n"

# Create an AOCL-specific #define.
# The AOCL_BLIS_ZEN macro is enabled only for zen-family configurations,
# i.e. any configuration whose name contains the substring 'zen'. It lets
# TRSM use its own cache block sizes instead of the common level-3 block
# sizes.
#
# A 'case' pattern match is used (rather than storing a grep count in
# ${uconf}) so that the upper-cased configuration name held in ${uconf} is
# not overwritten. Note that shell assignments must not have whitespace
# around '='; otherwise the left-hand side is executed as a command and the
# variable is never set.
case "${config_name}" in
	*zen*)
		enable_aocl_zen='yes'
		enable_aocl_zen_01=1
		;;
	*)
		enable_aocl_zen='no'
		enable_aocl_zen_01=0
		;;
esac

# Create a list of #defines, one for each configuration in config_list.
config_list_defines=""
for conf in ${config_list}; do
Expand Down Expand Up @@ -3395,6 +3407,7 @@ main()
| perl -pe "s/\@config_list_defines\@/${config_list_defines}/g" \
| perl -pe "s/\@kernel_list_defines\@/${kernel_list_defines}/g" \
| sed -e "s/@enable_system@/${enable_system_01}/g" \
| sed -e "s/\@enable_aocl_zen\@/${enable_aocl_zen_01}/g" \
| sed -e "s/@enable_openmp@/${enable_openmp_01}/g" \
| sed -e "s/@enable_pthreads@/${enable_pthreads_01}/g" \
| sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \
Expand Down
6 changes: 3 additions & 3 deletions frame/3/bli_l3_blocksize.c
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
/*
/*
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -34,7 +35,6 @@

#include "blis.h"


dim_t bli_l3_determine_kc
(
dir_t direct,
Expand Down Expand Up @@ -311,7 +311,7 @@ dim_t PASTEMAC0(opname) \
/* Extract the execution datatype and use it to query the corresponding
blocksize and blocksize maximum values from the blksz_t object. */ \
dt = bli_obj_exec_dt( a ); \
bsize = bli_cntx_get_blksz( bszid, cntx ); \
bsize = TRSM_BLKSZ_FUNC( bszid, cntx ); \
b_alg = bli_blksz_get_def( dt, bsize ); \
b_max = bli_blksz_get_max( dt, bsize ); \
\
Expand Down
10 changes: 8 additions & 2 deletions frame/3/trsm/bli_trsm_blk_var1.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -80,9 +80,15 @@ void bli_trsm_blk_var1
{
obj_t a11_1, c1_1;

//For zen architectures, TRSM uses different MC, KC and NC blocking sizes than other Level-3 routines.
//Hence calling a different function to query TRSM-specific block sizes for zen family.
#ifdef AOCL_BLIS_ZEN
b_alg = bli_determine_blocksize_trsm( direct, i, my_end, &a11,
bli_cntl_bszid( cntl ), cntx );
#else
b_alg = bli_determine_blocksize( direct, i, my_end, &a11,
bli_cntl_bszid( cntl ), cntx );

#endif
// Acquire partitions for A1 and C1.
bli_acquire_mpart_mdim( direct, BLIS_SUBPART1,
i, b_alg, &a11, &a11_1 );
Expand Down
9 changes: 8 additions & 1 deletion frame/3/trsm/bli_trsm_blk_var2.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -67,8 +67,15 @@ void bli_trsm_blk_var2
for ( dim_t i = my_start; i < my_end; i += b_alg )
{
// Determine the current algorithmic blocksize.
//For zen family, TRSM uses different MC, KC and NC blocksizes than Level-3 routines.
//Hence calling a different function to query TRSM-specific block sizes for zen family.
#ifdef AOCL_BLIS_ZEN
b_alg = bli_determine_blocksize_trsm( direct, i, my_end, b,
bli_cntl_bszid( cntl ), cntx );
#else
b_alg = bli_determine_blocksize( direct, i, my_end, b,
bli_cntl_bszid( cntl ), cntx );
#endif

// Acquire partitions for B1 and C1.
bli_acquire_mpart_ndim( direct, BLIS_SUBPART1,
Expand Down
73 changes: 73 additions & 0 deletions frame/base/bli_blksz.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -301,6 +302,78 @@ dim_t bli_determine_blocksize_b
return b_use;
}

#ifdef AOCL_BLIS_ZEN

// Direction-dispatching front end for the TRSM-specific blocksize query.
//
//   direct - traversal direction; BLIS_FWD selects the forward variant,
//            any other value selects the backward variant.
//   i      - current offset along the dimension being partitioned.
//   dim    - extent of the dimension being partitioned.
//   obj    - object whose execution datatype selects the blocksize.
//   bszid  - identifier of the blocksize to look up.
//   cntx   - context holding the TRSM-specific blocksizes.
//
// Returns whatever the selected variant returns.
dim_t bli_determine_blocksize_trsm
     (
       dir_t   direct,
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	return ( direct == BLIS_FWD )
	       ? bli_determine_blocksize_trsm_f( i, dim, obj, bszid, cntx )
	       : bli_determine_blocksize_trsm_b( i, dim, obj, bszid, cntx );
}

// Forward-direction TRSM blocksize query.
//
// Looks up the TRSM-specific blocksize object identified by bszid in cntx,
// reads its default and maximum values for the execution datatype of obj,
// and hands those (together with i and dim) to
// bli_determine_blocksize_f_sub(), whose result is returned unchanged.
dim_t bli_determine_blocksize_trsm_f
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// The execution datatype of the object selects which entry of the
	// blksz_t object is consulted.
	const num_t exec_dt = bli_obj_exec_dt( obj );

	// Query the TRSM-specific (not the common level-3) blocksize object.
	blksz_t* trsm_bsz = bli_cntx_get_trsm_blksz( bszid, cntx );

	// Default ("algorithmic") and maximum blocksizes for this datatype.
	const dim_t blk_def = bli_blksz_get_def( exec_dt, trsm_bsz );
	const dim_t blk_max = bli_blksz_get_max( exec_dt, trsm_bsz );

	return bli_determine_blocksize_f_sub( i, dim, blk_def, blk_max );
}

// Backward-direction TRSM blocksize query.
//
// Looks up the TRSM-specific blocksize object identified by bszid in cntx,
// reads its default and maximum values for the execution datatype of obj,
// and hands those (together with i and dim) to
// bli_determine_blocksize_b_sub(), whose result is returned unchanged.
dim_t bli_determine_blocksize_trsm_b
     (
       dim_t   i,
       dim_t   dim,
       obj_t*  obj,
       bszid_t bszid,
       cntx_t* cntx
     )
{
	// The execution datatype of the object selects which entry of the
	// blksz_t object is consulted.
	const num_t exec_dt = bli_obj_exec_dt( obj );

	// Query the TRSM-specific (not the common level-3) blocksize object.
	blksz_t* trsm_bsz = bli_cntx_get_trsm_blksz( bszid, cntx );

	// Default ("algorithmic") and maximum blocksizes for this datatype.
	const dim_t blk_def = bli_blksz_get_def( exec_dt, trsm_bsz );
	const dim_t blk_max = bli_blksz_get_max( exec_dt, trsm_bsz );

	return bli_determine_blocksize_b_sub( i, dim, blk_def, blk_max );
}

#endif

dim_t bli_determine_blocksize_f_sub
(
dim_t i,
Expand Down
33 changes: 33 additions & 0 deletions frame/base/bli_blksz.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
libraries.
Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2020, Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -278,6 +279,38 @@ dim_t bli_determine_blocksize_b
cntx_t* cntx
);

#ifdef AOCL_BLIS_ZEN

// Determines the blocksize to use for a TRSM partitioning step, using the
// TRSM-specific blocksizes stored in the context. Dispatches to
// bli_determine_blocksize_trsm_f() when direct is BLIS_FWD and to
// bli_determine_blocksize_trsm_b() otherwise.
dim_t bli_determine_blocksize_trsm
(
dir_t direct,
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);

// Forward-direction variant: queries the TRSM-specific blocksize object for
// bszid from cntx and derives the blocksize to use from its default and
// maximum values for the execution datatype of obj.
dim_t bli_determine_blocksize_trsm_f
(
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);

// Backward-direction variant: queries the TRSM-specific blocksize object for
// bszid from cntx and derives the blocksize to use from its default and
// maximum values for the execution datatype of obj.
dim_t bli_determine_blocksize_trsm_b
(
dim_t i,
dim_t dim,
obj_t* obj,
bszid_t bszid,
cntx_t* cntx
);

#endif

dim_t bli_determine_blocksize_f_sub
(
dim_t i,
Expand Down

0 comments on commit 2643db0

Please sign in to comment.