Skip to content

Latest commit

 

History

History
219 lines (159 loc) · 10.7 KB

20221111_02.md

File metadata and controls

219 lines (159 loc) · 10.7 KB

PostgreSQL SQL case: 优化器为什么不选择索引扫描?

作者

digoal

日期

2022-11-11

标签

PostgreSQL , 优化器 , 索引


背景

经常有同学问到为什么有时候有索引确不用? 值得写一个FAQ来提高ROI.

首先要解决的是第一类问题, 就是不知道自己要什么:

《德说-第167期, 水中捞月》

第二类问题是真的解释一下为什么有索引不用? 又可以分为两类

1、不能用

例如

  • column和索引的collate 不一致
  • pattern 选择不正确. 要么重新创建索引, 更换ops, 要么换SQL写法.
  • column类型和过滤条件类型不一致.
  • 索引不存在(例如过滤条件是表达式, 但是你的索引确不是表达式).
  • 该索引类型 unsupport 你的过滤操作符, 例如gin不支持排序操作, gin不支持比较操作. btree不支持数组过滤等.

2、能用但是不用

优化器认为使用索引不划算, 代价比其他方法更大, 执行时间更久.

例如

  • random page cost 设置过大, 与实际存储性能不符, 导致优化器误判. 或者采用 bitmap scan 代替index scan
  • where条件过滤性差, 需要过滤大量的无效数据, 由于索引本身需要随机访问, 同时回表需要多次IO, 代价大还不如来个全表扫描
  • 字段值与物理存储的相关性差, 如果排序或者范围扫描需要访问大量数据, 造成大量随机IO.
    • 可以尝试通过 cluster操作提升相关性
  • array 相交匹配, 随着element增加, 过滤变差

例子

1、过滤性差, 相关性差, 离散IO多

create unlogged table t (id int, info text);      
insert into t select random()*100, md5(random()::text) from generate_series(1,10000000);      
      
create index idx_t_id on t(id);      
      
explain (analyze) select count(*) from t where id >=10 and id<=100;      
      
      
postgres=# analyze t;      
ANALYZE      
postgres=# explain (analyze) select count(*) from t where id >=10 and id<=100;      
                                                     QUERY PLAN                                                            
---------------------------------------------------------------------------------------------------------------------      
 Aggregate  (cost=256068.26..256068.27 rows=1 width=8) (actual time=1694.248..1694.249 rows=1 loops=1)      
   ->  Seq Scan on t  (cost=0.00..233331.87 rows=9094554 width=0) (actual time=0.022..1171.259 rows=9049901 loops=1)      
         Filter: ((id >= 10) AND (id <= 100))      
         Rows Removed by Filter: 950099      
 Planning Time: 1.074 ms      
 Execution Time: 1694.304 ms      
(6 rows)      

使用cluster提升where条件字段存储与索引顺序的逻辑相关性, 使用索引

《PostgreSQL 统计信息之 - 逻辑与物理存储的线性相关性》

postgres=# cluster t using idx_t_id ;      
CLUSTER      
postgres=# vacuum analyze t;      
VACUUM      
postgres=# explain (analyze) select count(*) from t where id >=10 and id<=100;      
                                                                QUERY PLAN                                                                      
------------------------------------------------------------------------------------------------------------------------------------------      
 Aggregate  (cost=213612.70..213612.71 rows=1 width=8) (actual time=1296.871..1296.873 rows=1 loops=1)      
   ->  Index Only Scan using idx_t_id on t  (cost=0.43..190821.55 rows=9116456 width=0) (actual time=0.013..745.756 rows=9049901 loops=1)      
         Index Cond: ((id >= 10) AND (id <= 100))      
         Heap Fetches: 0      
 Planning Time: 0.362 ms      
 Execution Time: 1296.898 ms      
(6 rows)      

2、数组, 相交element选择太多, 导致过滤性差

create or replace function gen_rand_array(elements int, i_min int, i_max int) returns int[] as $$      
  select array(select i_min+(random()*(i_max-i_min))::int from generate_series(1,elements)) ;      
$$ language sql strict;      
      
postgres=# select gen_rand_array(10,1,100);      
         gen_rand_array                
---------------------------------      
 {73,76,47,41,19,32,26,82,76,68}      
(1 row)      
      
      
create unlogged table t1 (ids int[], info text);      
insert into t1 select gen_rand_array(10,1,20) , md5(random()::text) from generate_series(1,1000000);      
      
create index idx_t1_ids on t1 using gin (ids);      
      
analyze t1;      
      
postgres=# explain (analyze) select count(*) from t1 where ids && array[1];      
                                                               QUERY PLAN                                                                     
----------------------------------------------------------------------------------------------------------------------------------------      
 Aggregate  (cost=20858.38..20858.39 rows=1 width=8) (actual time=52.235..52.252 rows=1 loops=1)      
   ->  Bitmap Heap Scan on t1  (cost=1928.38..20267.55 rows=236333 width=0) (actual time=20.861..38.848 rows=234343 loops=1)      
         Recheck Cond: (ids && '{1}'::integer[])      
         Heap Blocks: exact=15385      
         ->  Bitmap Index Scan on idx_t1_ids  (cost=0.00..1869.30 rows=236333 width=0) (actual time=18.608..18.609 rows=234343 loops=1)      
               Index Cond: (ids && '{1}'::integer[])      
 Planning Time: 0.132 ms      
 Execution Time: 52.331 ms      
(8 rows)      
      
postgres=# explain (analyze) select count(*) from t1 where ids && array[1,2,3,4,5,6,7,8,9,10];      
                                                    QUERY PLAN                                                          
------------------------------------------------------------------------------------------------------------------      
 Aggregate  (cost=30370.15..30370.16 rows=1 width=8) (actual time=309.121..309.121 rows=1 loops=1)      
   ->  Seq Scan on t1  (cost=0.00..27885.00 rows=994062 width=0) (actual time=0.018..251.098 rows=999014 loops=1)      
         Filter: (ids && '{1,2,3,4,5,6,7,8,9,10}'::integer[])      
         Rows Removed by Filter: 986      
 Planning Time: 0.132 ms      
 Execution Time: 309.152 ms      
(6 rows)      
      
postgres=# set enable_seqscan =off;      
SET      
postgres=# explain (analyze) select count(*) from t1 where ids && array[1,2,3,4,5,6,7,8,9,10];      
                                                                QUERY PLAN                                                                      
------------------------------------------------------------------------------------------------------------------------------------------      
 Aggregate  (cost=38620.31..38620.32 rows=1 width=8) (actual time=388.240..388.241 rows=1 loops=1)      
   ->  Bitmap Heap Scan on t1  (cost=8324.38..36135.15 rows=994062 width=0) (actual time=259.238..329.995 rows=999014 loops=1)      
         Recheck Cond: (ids && '{1,2,3,4,5,6,7,8,9,10}'::integer[])      
         Heap Blocks: exact=15385      
         ->  Bitmap Index Scan on idx_t1_ids  (cost=0.00..8075.86 rows=994062 width=0) (actual time=257.339..257.339 rows=999014 loops=1)      
               Index Cond: (ids && '{1,2,3,4,5,6,7,8,9,10}'::integer[])      
 Planning Time: 0.100 ms      
 Execution Time: 388.340 ms      
(8 rows)      

3、ops选择错误, 导致不能使用索引

postgres=# create database db1 with template template0 lc_collate 'en_US';      
CREATE DATABASE      
      
db1=# \l+      
                                                               List of databases      
   Name    |  Owner   | Encoding | Collate | Ctype |   Access privileges   |  Size   | Tablespace |                Description                       
-----------+----------+----------+---------+-------+-----------------------+---------+------------+--------------------------------------------      
 db1       | postgres | UTF8     | en_US   | en_US |                       | 8649 kB | pg_default |       
      
      
      
      
db1=# create table t (id int, info text);      
CREATE TABLE      
db1=# insert into t select generate_series(1,100000), md5(random()::text);      
INSERT 0 100000      
db1=# create index idx_t_info on t (info);      
CREATE INDEX      
      
db1=# explain select * from t where info ~ '^abc';      
                      QUERY PLAN                            
------------------------------------------------------      
 Seq Scan on t  (cost=0.00..2084.00 rows=10 width=37)      
   Filter: (info ~ '^abc'::text)      
(2 rows)      

更换ops, 使用索引

db1=# create index idx_t_info on t (info text_pattern_ops);      
CREATE INDEX      
db1=# explain select * from t where info ~ '^abc';      
                              QUERY PLAN                                    
----------------------------------------------------------------------      
 Index Scan using idx_t_info on t  (cost=0.42..2.64 rows=10 width=37)      
   Index Cond: ((info ~>=~ 'abc'::text) AND (info ~<~ 'abd'::text))      
   Filter: (info ~ '^abc'::text)      
(3 rows)      

digoal's wechat