@@ -512,6 +512,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
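
Since every hunk below leans on this helper, a quick note: printf and __trap() are both callable from device code, and [[noreturn]] tells the compiler that bad_arch() never returns, which is what lets callers drop the old "return 0.0f; // only to satisfy the compiler" placeholders. A minimal sketch of the resulting pattern, assuming this file's MIN_CC_DP4A constant (610, the minimum compute capability for the __dp4a intrinsic); the function name is hypothetical, not part of the patch:

    // Hypothetical example, not from the patch: on supported archs the fast
    // path runs; otherwise bad_arch() traps, and because it is [[noreturn]]
    // the compiler accepts the branch without a dummy return value.
    static __device__ __forceinline__ float dot_fallback_demo(int a, int b) {
    #if __CUDA_ARCH__ >= MIN_CC_DP4A
        return (float) __dp4a(a, b, 0); // 4-way byte-wise dot product
    #else
        bad_arch();
    #endif
    }
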
@@ -1972,8 +1980,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2010,8 +2017,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2046,8 +2052,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2092,8 +2097,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2114,8 +2118,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2145,8 +2148,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2181,8 +2183,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2219,8 +2220,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2260,8 +2260,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2286,8 +2285,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2320,8 +2318,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2354,8 +2351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2395,8 +2391,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2429,8 +2424,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2460,8 +2454,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2492,8 +2485,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3359,8 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3543,8 +3534,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3954,7 +3944,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
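A note on this mul_mat_q hunk and the eight matching ones below: the (void) vec_dot_*_q8_1_mul_mat; casts already existed to silence unused-function warnings when the fast path is compiled out, so only the assert(false) line changes. The motivation for bad_arch() over assert(false): device-side assert is compiled out when NDEBUG is defined, as in typical release builds, so a kernel launched on an unsupported architecture would previously continue and silently produce wrong results; __trap() fires unconditionally, printing the error and aborting the kernel, which the host then typically observes as cudaErrorLaunchFailure on its next CUDA call.
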
@@ -4023,7 +4013,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4090,7 +4080,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4157,7 +4147,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4224,7 +4214,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4291,7 +4281,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4360,7 +4350,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4429,7 +4419,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4496,7 +4486,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4565,7 +4555,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 