In [152]:
import psycopg2
import pandas as pd
import sqlalchemy

In [153]:
conn = psycopg2.connect(host="127.0.0.1", port="5433", database="online_store", 
                            user="admin", password="root")

In [142]:
cur = conn.cursor()

cur.execute('''
           select* from clients.sessions_info 
            ''')

results = cur.fetchall()
pd.DataFrame(results, columns=['session_id', 'client_id', 'src_id', 'Purchase_flg', 'visit_dttm', 'duration', 
                              'page_count', 'last_page_id'])

Unnamed: 0,session_id,client_id,src_id,Purchase_flg,visit_dttm,duration,page_count,last_page_id
0,2232,100,2,1,2022-09-25,15.0,4,5
1,4232,200,1,0,2022-09-26,2.2,12,1
2,5232,300,3,1,2022-09-24,10.0,2,4
3,7232,300,3,0,2022-09-20,10.0,2,4
4,6232,400,3,1,2022-09-21,10.0,2,4


## Обновление статистики

In [143]:
cur.execute('''
            analyze clients.sessions_info 
            ''')

### Смотрим время выполнения запроса без индексов
#### Посчитать кол-во сессий  источника 3

In [146]:
cur.execute('''
            explain analyze
            select 
            count(*) as sessions_count, 
            src_id
            from clients.sessions_info 
            where src_id = 3
            group by src_id
            ''')
cur.fetchall()

[('GroupAggregate  (cost=0.00..1.10 rows=2 width=12) (actual time=0.301..0.301 rows=1 loops=1)',),
 ('  Group Key: src_id',),
 ('  ->  Seq Scan on sessions_info  (cost=0.00..1.06 rows=3 width=4) (actual time=0.238..0.239 rows=3 loops=1)',),
 ('        Filter: (src_id = 3)',),
 ('        Rows Removed by Filter: 2',),
 ('Planning Time: 1.666 ms',),
 ('Execution Time: 0.760 ms',)]

### Посмотреть стоимость последовательного сканирования 

In [118]:
cur.execute('''
            show seq_page_cost;
            ''')
cur.fetchall()

[('1',)]

### Посмотреть стоимость произвольного чтения 

In [119]:
cur.execute('''
            show random_page_cost;
            ''')
cur.fetchall()

[('4',)]

### Изменить стоимость произвольного чтения 

In [172]:
cur.execute('''
            set random_page_cost = 0.1;
            ''')
cur.execute('''
            show random_page_cost;
            ''')
cur.fetchall()

[('0.1',)]

### Создаем индексы для таблицы sessions_info на поле src_id и смотрим время выполнения запроса

In [147]:
cur.execute('''
            CREATE INDEX idx_src_id
            ON clients.sessions_info (src_id);
            ''')
cur.execute('''
            explain analyze
            select 
            count(*) as sessions_count, 
            src_id
            from clients.sessions_info 
            where src_id = 3
            group by src_id
            ''')
cur.fetchall()

[('GroupAggregate  (cost=0.13..0.52 rows=2 width=12) (actual time=0.170..0.171 rows=1 loops=1)',),
 ('  Group Key: src_id',),
 ('  ->  Index Only Scan using idx_src_id on sessions_info  (cost=0.13..0.49 rows=3 width=4) (actual time=0.164..0.165 rows=3 loops=1)',),
 ('        Index Cond: (src_id = 3)',),
 ('        Heap Fetches: 3',),
 ('Planning Time: 0.875 ms',),
 ('Execution Time: 0.233 ms',)]

####  После добавления индекса скорость значительно увеличилась 

In [63]:
cur.execute('''
           select* from clients.clients_info 
            ''')

results = cur.fetchall()
pd.DataFrame(results, columns=['client_id', 'client_name', 'src_id', 'status_id', 'city', 'address', 
                              'registration_dttm', 'start_dttm', 'end_dttm'])

Unnamed: 0,client_id,client_name,src_id,status_id,city,address,registration_dttm,start_dttm,end_dttm
0,100,Andrew Link,2,1,Moscow,Ul. Zatonnaia 6k1,2022-08-25,2022-08-25,3000-01-01
1,200,German Gaban,1,1,Saint-Petersburg,Ul. Hermulich 15,2022-08-24,2022-08-24,3000-01-01
2,300,Fedor Vlasov,1,1,Tver,Ul. Dorojnaia 8,2022-08-24,2022-08-24,3000-01-01
3,400,Anna Vlasova,3,1,Tver,Ul. Dorojnaia 8,2022-08-21,2022-08-21,3000-01-01


### Смотрим время выполнения запроса без индексов
#### Посмотреть клиентов, проживающих на улице Дорожной

In [148]:
cur.execute('''
            explain analyze
            select 
            client_name
            from clients.clients_info 
            where address ilike ('%Dorojnaia%')
            and city = 'Moscow'
            ''')
cur.fetchall()

[('Seq Scan on clients_info  (cost=0.00..10.90 rows=1 width=118) (actual time=0.618..0.618 rows=0 loops=1)',),
 ("  Filter: (((address)::text ~~* '%Dorojnaia%'::text) AND ((city)::text = 'Moscow'::text))",),
 ('  Rows Removed by Filter: 4',),
 ('Planning Time: 4.649 ms',),
 ('Execution Time: 2.266 ms',)]

### Создаем индекс для полнотекстового поиска

In [158]:
cur.execute('''
            alter table clients.clients_info
            add column tsvector_address tsvector;
            ''')

cur.execute('''
            update clients.clients_info
            set tsvector_address = to_tsvector(address)
            ''')


cur.execute('''
            CREATE INDEX idx_gin_document 
            ON clients.clients_info
            USING gin ("tsvector_address")
            ''')

cur.execute('''
            analyze clients.clients_info 
            ''')

cur.execute('''
            explain analyze
            select 
            client_name
            from clients.clients_info 
            where address ilike ('%Dorojnaia%')
            and city = 'Moscow'
            ''')
cur.fetchall()

[('Seq Scan on clients_info  (cost=0.00..1.06 rows=1 width=12) (actual time=0.011..0.012 rows=0 loops=1)',),
 ("  Filter: (((address)::text ~~* '%Dorojnaia%'::text) AND ((city)::text = 'Moscow'::text))",),
 ('  Rows Removed by Filter: 4',),
 ('Planning Time: 0.354 ms',),
 ('Execution Time: 0.048 ms',)]

####  После добавления индекса скорость значительно увеличилась 

In [92]:
cur.execute('''
            select* from orders.orders_info 
            ''')

results = cur.fetchall()
pd.DataFrame(results, columns=['order_id', 'client_id', 'status_id', 'session_id', 'amount', 'discount_flg', 
                              'order_dttm', 'start_dttm', 'end_dttm'])

Unnamed: 0,order_id,client_id,status_id,session_id,amount,discount_flg,order_dttm,start_dttm,end_dttm
0,2367742,100,1,2232,3500.0,0,2022-09-25,2022-09-25,3000-01-01
1,3327834,300,1,5232,6000.0,0,2022-09-24,2022-09-24,3000-01-01
2,4834594,400,1,6232,1500.0,0,2022-09-21,2022-09-21,3000-01-01


### Смотрим время выполнения запроса без индексов
#### Посмотреть клиентов, c суммой заказов > 3000

In [167]:
cur.execute('''
            explain analyze
            select 
            client_id,
            amount
            from orders.orders_info 
            where amount > 3000
            ''')
cur.fetchall()

[('Seq Scan on orders_info  (cost=0.00..1.04 rows=1 width=50) (actual time=0.157..0.159 rows=2 loops=1)',),
 ("  Filter: (amount > '3000'::numeric)",),
 ('  Rows Removed by Filter: 1',),
 ('Planning Time: 0.621 ms',),
 ('Execution Time: 0.250 ms',)]

### Создаем индекс на часть таблицы 

In [168]:
cur.execute('''
            CREATE INDEX idx_amount_3000
            ON orders.orders_info(amount) 
            where amount > 3000
            ''')

cur.execute('''
            analyze orders.orders_info 
            ''')

cur.execute('''
            explain analyze
            select 
            client_id,
            amount
            from orders.orders_info 
            where amount > 3000
            ''')
cur.fetchall()

[('Index Scan using idx_amount_3000 on orders_info  (cost=0.13..0.46 rows=2 width=9) (actual time=0.123..0.125 rows=2 loops=1)',),
 ('Planning Time: 0.330 ms',),
 ('Execution Time: 0.162 ms',)]

### Смотрим время выполнения запроса без индексов
#### Посчитать кол-во сессий клиентов из источника 3 с флагом покупки

In [171]:
cur.execute('''
            explain analyze
            select 
            count(client_id) 
            from clients.sessions_info
            where src_id = 3
            and Purchase_flg = 1
            ''')
cur.fetchall()

[('Aggregate  (cost=1.08..1.09 rows=1 width=8) (actual time=0.373..0.373 rows=1 loops=1)',),
 ('  ->  Seq Scan on sessions_info  (cost=0.00..1.07 rows=1 width=4) (actual time=0.191..0.193 rows=2 loops=1)',),
 ('        Filter: ((src_id = 3) AND (purchase_flg = 1))',),
 ('        Rows Removed by Filter: 3',),
 ('Planning Time: 7.288 ms',),
 ('Execution Time: 1.661 ms',)]

### Создать индекс на несколько полей


In [173]:
cur.execute('''
            CREATE INDEX idx_amount_order_dttm
            ON clients.sessions_info(src_id, Purchase_flg);
            ''')

cur.execute('''
            analyze clients.sessions_info
            ''')

cur.execute('''
            explain analyze
            select 
            count(client_id) 
            from clients.sessions_info
            where src_id = 3
            and Purchase_flg = 1
            ''')
cur.fetchall()

[('Aggregate  (cost=0.38..0.39 rows=1 width=8) (actual time=0.195..0.196 rows=1 loops=1)',),
 ('  ->  Index Scan using idx_amount_order_dttm on sessions_info  (cost=0.13..0.37 rows=2 width=4) (actual time=0.181..0.182 rows=2 loops=1)',),
 ('        Index Cond: ((src_id = 3) AND (purchase_flg = 1))',),
 ('Planning Time: 1.072 ms',),
 ('Execution Time: 0.718 ms',)]

####  После добавления индекса скорость значительно увеличилась 