### Импорт библиотек для работы с бд и дф

In [3]:
import psycopg2
import pandas as pd
import sqlalchemy

### Кредиты для коннекта к бд

In [4]:
conn = psycopg2.connect(host="127.0.0.1", port="5432", database="Adventureworks", 
                            user="postgres", password="postgres")

In [14]:
query = f'''

select l.storeid, r.salesorderid, r.orderdate
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
  
'''
df = pd.read_sql_query(query, conn)
df.head()



Unnamed: 0,storeid,salesorderid,orderdate
0,1046.0,43659,2011-05-31
1,722.0,43660,2011-05-31
2,852.0,43661,2011-05-31
3,1418.0,43662,2011-05-31
4,484.0,43663,2011-05-31


### Функция ROW_NUMBER () - это оконная функция, которая присваивает последовательное целое число каждой строке в наборе результатов.


In [16]:
query = f'''

select row_number() over (order by storeid, salesorderid, orderdate),
l.storeid, r.salesorderid, r.orderdate
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
  
'''
df = pd.read_sql_query(query, conn)
df.head()

Unnamed: 0,row_number,storeid,salesorderid,orderdate
0,1,292.0,44132,2011-08-01
1,2,292.0,45579,2012-01-29
2,3,292.0,46389,2012-04-30
3,4,292.0,47454,2012-07-31
4,5,292.0,48395,2012-10-30


In [18]:
query = f'''

select l.storeid, r.salesorderid, r.orderdate, subtotal
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
  
'''
df = pd.read_sql_query(query, conn)
df.head()



Unnamed: 0,storeid,salesorderid,orderdate,subtotal
0,1046.0,43659,2011-05-31,20565.6206
1,722.0,43660,2011-05-31,1294.2529
2,852.0,43661,2011-05-31,32726.4786
3,1418.0,43662,2011-05-31,28832.5289
4,484.0,43663,2011-05-31,419.4589


### Функция RANK () показывает ранг связанных строк в связанном ранге.
### поэтому ранги могут быть не последовательными. 
### Кроме того, строки с одинаковыми значениями получат одинаковый ранг.
### вычисляет номер строки от предыдущего набора

In [14]:
query = f'''

select  rank() over (order by subtotal), 
l.storeid,
r.salesorderid, r.orderdate, subtotal
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid is not null
order by subtotal

'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,rank,storeid,salesorderid,orderdate,subtotal
0,1,1904,51782,2013-06-30,1.374
1,2,658,65214,2014-01-29,2.748
2,2,1074,53564,2013-07-31,2.748
3,4,1948,44303,2011-08-31,5.7
4,5,1906,65204,2014-01-29,10.788
5,5,1346,65301,2014-01-29,10.788
6,5,864,71842,2014-05-01,10.788
7,8,850,44080,2011-08-01,11.4
8,9,1814,57033,2013-09-30,12.144
9,9,1078,65169,2014-01-29,12.144


### DENSE_RANK () присваивает последовательный номер каждому набору результатов. В отличие от функции RANK (), 
### функция DENSE_RANK () всегда возвращает последовательные значения ранга. 

In [15]:
query = f'''

select  dense_rank() over (order by subtotal), 
l.storeid,
r.salesorderid, r.orderdate, subtotal
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid is not null
order by subtotal

'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,dense_rank,storeid,salesorderid,orderdate,subtotal
0,1,1904,51782,2013-06-30,1.374
1,2,658,65214,2014-01-29,2.748
2,2,1074,53564,2013-07-31,2.748
3,3,1948,44303,2011-08-31,5.7
4,4,1906,65204,2014-01-29,10.788
5,4,1346,65301,2014-01-29,10.788
6,4,864,71842,2014-05-01,10.788
7,5,850,44080,2011-08-01,11.4
8,6,1814,57033,2013-09-30,12.144
9,6,1078,65169,2014-01-29,12.144


### Функция LAG () обеспечивает доступ к строке, которая предшествует текущей строке с указанным физическим смещением. 
### Другими словами, из текущей строки функция LAG () может получить доступ к данным предыдущей строки или строки перед предыдущей строкой и так далее.
### Функция LAG () будет очень полезна для сравнения значений текущей и предыдущей строки. 
### LAG предыдущее значение в выбираемой строке со смещением
### Lead показывает следующую значение

In [27]:
query = f'''

select 
l.storeid,
r.salesorderid, r.orderdate, subtotal,
lag(subtotal) over (order by orderdate),
lead(subtotal) over (order by orderdate),
lag(subtotal,2) over (order by orderdate) as lag_2,
lead(subtotal,2) over (order by orderdate) as lead_2
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid = 292
order by orderdate

'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,storeid,salesorderid,orderdate,subtotal,lag,lead,lag_2,lead_2
0,292,44132,2011-08-01,4049.988,,4079.988,,1104.9968
1,292,45579,2012-01-29,4079.988,4049.988,1104.9968,,27429.5294
2,292,46389,2012-04-30,1104.9968,4079.988,27429.5294,4049.988,32562.6538
3,292,47454,2012-07-31,27429.5294,1104.9968,32562.6538,4079.988,24232.7654
4,292,48395,2012-10-30,32562.6538,27429.5294,24232.7654,1104.9968,37643.0609
5,292,49495,2013-01-28,24232.7654,32562.6538,37643.0609,27429.5294,
6,292,50756,2013-04-30,37643.0609,24232.7654,,32562.6538,


### partition - группировка
### первое или последнее значение по группировке
### Функция NTH_VALUE () возвращает значение из n-й строки в упорядоченном разделе набора результатов.

In [55]:
query = f'''

select distinct
date_trunc('year',orderdate),
first_value(subtotal) over (partition by date_trunc('year',orderdate)),
last_value(subtotal) over (partition by date_trunc('year',orderdate)),
nth_value(subtotal,2) over (partition by date_trunc('year',orderdate))
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid = 292


'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,date_trunc,first_value,last_value,nth_value
0,2011-01-01,4049.988,4049.988,
1,2012-01-01,4079.988,32562.6538,1104.9968
2,2013-01-01,24232.7654,37643.0609,37643.0609


### Накопительное

In [60]:
query = f'''

select 
salesorderid,
subtotal,
orderdate,
date_trunc('year',orderdate),
sum(subtotal) over (order by date_trunc('year',orderdate))
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid = 292


'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,salesorderid,subtotal,orderdate,date_trunc,sum
0,44132,4049.988,2011-08-01,2011-01-01,4049.988
1,45579,4079.988,2012-01-29,2012-01-01,69227.156
2,46389,1104.9968,2012-04-30,2012-01-01,69227.156
3,47454,27429.5294,2012-07-31,2012-01-01,69227.156
4,48395,32562.6538,2012-10-30,2012-01-01,69227.156
5,49495,24232.7654,2013-01-28,2013-01-01,131102.9823
6,50756,37643.0609,2013-04-30,2013-01-01,131102.9823


In [66]:
4049.9880+4079.9880+1104.9968+27429.5294+32562.6538

69227.156

In [75]:
query = f'''

select 
salesorderid,
subtotal,
orderdate,
date_trunc('year',orderdate),
sum(subtotal) over (partition by date_trunc('year',orderdate)) as prt,
sum(subtotal) over (partition by date_trunc('year',orderdate) order by orderdate) as prt_order
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid = 292


'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,salesorderid,subtotal,orderdate,date_trunc,prt,prt_order
0,44132,4049.988,2011-08-01,2011-01-01,4049.988,4049.988
1,45579,4079.988,2012-01-29,2012-01-01,65177.168,4079.988
2,46389,1104.9968,2012-04-30,2012-01-01,65177.168,5184.9848
3,47454,27429.5294,2012-07-31,2012-01-01,65177.168,32614.5142
4,48395,32562.6538,2012-10-30,2012-01-01,65177.168,65177.168
5,49495,24232.7654,2013-01-28,2013-01-01,61875.8263,24232.7654
6,50756,37643.0609,2013-04-30,2013-01-01,61875.8263,61875.8263


In [68]:
24232.7654 + 37643.0609

61875.8263

In [76]:
4079.9880+1104.9968	

5184.9848

### ntile  grouping
### Функция позволяет вам разделить упорядоченные строки в разделе на указанное количество ранжированных групп максимально равного размера. 
### Эти ранжированные группы называются контейнерами\buckets .
### разбиваем на 10 групп, с максимально равным составом в контейнерах

In [100]:
query = f'''

select 
storeid,
salesorderid,
subtotal,
orderdate,
ntile(4) over (order by subtotal) as groupid
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid in  (292, 300,658)


'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,storeid,salesorderid,subtotal,orderdate,groupid
0,658,65214,2.748,2014-01-29,1
1,658,47388,202.332,2012-07-31,1
2,658,50679,386.2702,2013-04-30,1
3,658,58941,564.624,2013-10-30,1
4,658,71867,858.9,2014-05-01,1
5,658,48329,1070.0565,2012-10-30,2
6,292,46389,1104.9968,2012-04-30,2
7,292,44132,4049.988,2011-08-01,2
8,292,45579,4079.988,2012-01-29,2
9,658,53495,4098.648,2013-07-31,2


### Иногда вам может потребоваться создать отчет, который показывает верхние или нижние значения x% из набора данных, например, верхний 1% продуктов по доходу. 
### PostgreSQL предоставляет нам функцию CUME_DIST () для его вычисления
### возвращает относительную позицию значения в наборе значений. Берется от 1, 1 - это все
### Функция PERCENT_RANK () оценивает относительное положение значения в наборе значений Берется от 1, 1 - это все

In [108]:
query = f'''

select 
storeid,
salesorderid,
subtotal,
orderdate,
cume_dist() over (partition by storeid order by  orderdate, subtotal) as cume_dist_pos_distrib,
percent_rank() over (partition by storeid order by orderdate, subtotal) as percent_rank_distrib
from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid in  (292, 300,658)


'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,storeid,salesorderid,subtotal,orderdate,cume_dist_pos_distrib,percent_rank_distrib
0,292,44132,4049.988,2011-08-01,0.142857,0.0
1,292,45579,4079.988,2012-01-29,0.285714,0.166667
2,292,46389,1104.9968,2012-04-30,0.428571,0.333333
3,292,47454,27429.5294,2012-07-31,0.571429,0.5
4,292,48395,32562.6538,2012-10-30,0.714286,0.666667
5,292,49495,24232.7654,2013-01-28,0.857143,0.833333
6,292,50756,37643.0609,2013-04-30,1.0,1.0
7,300,53485,57771.7641,2013-07-31,0.25,0.0
8,300,58931,49053.4638,2013-10-30,0.5,0.333333
9,300,65191,56353.869,2014-01-29,0.75,0.666667


### Максимальный и следующий платеж можно сделать через  rank


In [None]:
select  *
from 	(select customer.customer_id , customer.last_name , customer.first_name , payment.payment_date , payment.amount ,
		 rank() over ( partition by payment .customer_id  order by payment.amount desc) as rnk 
		 from payment 
		 	join customer 
		 		on customer.customer_id = payment.customer_id) as top_client
where rnk <=2;

In [119]:
query = f'''

select* 
    from (
        select 
        storeid,
        salesorderid,
        subtotal,
        orderdate,
        rank() over (partition by storeid order by subtotal desc) as rnk
        from sales.customer l
        join sales.salesorderheader r
        on l.customerid = r.customerid
        where storeid in  (292, 300,658)
    ) as top_amount
where rnk <=2

'''
df = pd.read_sql_query(query, conn)
df.head(20)

Unnamed: 0,storeid,salesorderid,subtotal,orderdate,rnk
0,292,50756,37643.0609,2013-04-30,1
1,292,48395,32562.6538,2012-10-30,2
2,300,71805,57990.6876,2014-05-01,1
3,300,53485,57771.7641,2013-07-31,2
4,658,53495,4098.648,2013-07-31,1
5,658,48329,1070.0565,2012-10-30,2


###  Помимо окна можно задать фрейм и считать по фрейму

In [139]:
query = f'''

select 
storeid,
orderdate,
subtotal,
sum(subtotal) over (partition by storeid) as sum_prt,

sum(subtotal) over (partition by storeid 
order by date_trunc('YEAR', orderdate)) as sum_prt_ord,

---- от текущей и до конца фрейма
sum(subtotal) over 
(partition by storeid 
order by date_trunc('YEAR', orderdate) 
rows between current row and unbounded following) as from_cur_to_end,

--считается текущая и все до нее

sum(subtotal) over 
(partition by storeid 
order by date_trunc('YEAR', orderdate) 
rows unbounded preceding) as cur_plus_prev,

--текущая и тре слелующих

sum(subtotal) over 
(partition by storeid 
order by date_trunc('YEAR', orderdate) 
rows between current row and 3 following) as cur_plus_3,

--две предыдущих и три последующих

sum(subtotal) over 
(partition by storeid 
order by date_trunc('YEAR', orderdate) 
rows between 2 preceding and 3 following ) as two_three

from sales.customer l
join sales.salesorderheader r
on l.customerid = r.customerid
where storeid in  (292, 300, 658)

'''
df = pd.read_sql_query(query, conn)
df.head(20)

Unnamed: 0,storeid,orderdate,subtotal,sum_prt,sum_prt_ord,from_cur_to_end,cur_plus_prev,cur_plus_3,two_three
0,292,2011-08-01,4049.988,131102.9823,4049.988,131102.9823,4049.988,41797.6266,41797.6266
1,292,2012-10-30,32562.6538,131102.9823,69227.156,127052.9943,36612.6418,65177.168,69227.156
2,292,2012-01-29,4079.988,131102.9823,69227.156,94490.3405,40692.6298,70257.5751,106870.2169
3,292,2012-04-30,1104.9968,131102.9823,69227.156,90410.3525,41797.6266,90410.3525,127052.9943
4,292,2012-07-31,27429.5294,131102.9823,69227.156,89305.3557,69227.156,89305.3557,94490.3405
5,292,2013-04-30,37643.0609,131102.9823,131102.9823,61875.8263,106870.2169,61875.8263,90410.3525
6,292,2013-01-28,24232.7654,131102.9823,131102.9823,24232.7654,131102.9823,24232.7654,89305.3557
7,300,2013-10-30,49053.4638,221169.7845,106825.2279,221169.7845,49053.4638,221169.7845,221169.7845
8,300,2013-07-31,57771.7641,221169.7845,106825.2279,172116.3207,106825.2279,172116.3207,221169.7845
9,300,2014-01-29,56353.869,221169.7845,221169.7845,114344.5566,163179.0969,114344.5566,221169.7845


In [128]:
858.9000+2.7480+386.2702

1247.9182

### Агрегатная функция с ORDER BY и определением рамки окна по умолчанию будет вычисляться как «бегущая сумма»
### Чтобы агрегатная функция работала со всем разделом, следует опустить ORDER BY или использовать ROWS BETWEEN 

In [142]:
query = f'''

        select 
        storeid,
        salesorderid,
        subtotal,
        orderdate,
        
        -- от текущей и до конца
        sum(subtotal) over (partition by storeid 
        order  by orderdate range 
        between current row and unbounded following) as range_forward,
        
        -- все предыдущие до текущей
        sum(subtotal) over (partition by storeid 
        order  by orderdate 
        range between unbounded preceding and current row) as range_back


        from sales.customer l
        join sales.salesorderheader r
        on l.customerid = r.customerid
        where storeid in  (292, 300,658)
'''
df = pd.read_sql_query(query, conn)
df.head(20)



Unnamed: 0,storeid,salesorderid,subtotal,orderdate,range_forward,range_back
0,292,44132,4049.988,2011-08-01,131102.9823,4049.988
1,292,45579,4079.988,2012-01-29,127052.9943,8129.976
2,292,46389,1104.9968,2012-04-30,122973.0063,9234.9728
3,292,47454,27429.5294,2012-07-31,121868.0095,36664.5022
4,292,48395,32562.6538,2012-10-30,94438.4801,69227.156
5,292,49495,24232.7654,2013-01-28,61875.8263,93459.9214
6,292,50756,37643.0609,2013-04-30,37643.0609,131102.9823
7,300,53485,57771.7641,2013-07-31,221169.7845,57771.7641
8,300,58931,49053.4638,2013-10-30,163398.0204,106825.2279
9,300,65191,56353.869,2014-01-29,114344.5566,163179.0969


### exclude - Исключение в разделе 

In [147]:
query = f'''

        select 
        storeid,
        salesorderid,
        subtotal,
        orderdate,
        sum(subtotal) over (partition by storeid) as total,
        
        
        -- exclude
        -- одна предыдущая и следущая без текущей 
        sum(subtotal) over (partition by storeid 
        order by subtotal 
        rows BETWEEN 1 PRECEDING AND 1 FOLLOWING EXCLUDE CURRENT ROW) as excl 
       
        from sales.customer l
        join sales.salesorderheader r
        on l.customerid = r.customerid
        where storeid in  (292, 300,658)

'''
df = pd.read_sql_query(query, conn)
df.head(20)

Unnamed: 0,storeid,salesorderid,subtotal,orderdate,total,excl
0,292,46389,1104.9968,2012-04-30,131102.9823,4049.988
1,292,44132,4049.988,2011-08-01,131102.9823,5184.9848
2,292,45579,4079.988,2012-01-29,131102.9823,28282.7534
3,292,49495,24232.7654,2013-01-28,131102.9823,31509.5174
4,292,47454,27429.5294,2012-07-31,131102.9823,56795.4192
5,292,48395,32562.6538,2012-10-30,131102.9823,65072.5903
6,292,50756,37643.0609,2013-04-30,131102.9823,32562.6538
7,300,58931,49053.4638,2013-10-30,221169.7845,56353.869
8,300,65191,56353.869,2014-01-29,221169.7845,106825.2279
9,300,53485,57771.7641,2013-07-31,221169.7845,114344.5566
