# ClickHouse. Функции массивов

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Display configuration: show every column on a single unwrapped line, allow
# long cell text (the array columns printed later are wide), cap rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_rows', 100)
# Must be set_option: the previous get_option call only read the option and
# left the precision untouched.
pd.set_option('display.precision', 2)
# Render floats with a thousands separator.
pd.set_option('display.float_format',  '{:,}'.format)

In [3]:
# Load the raw event log: a ';'-separated CSV whose `dt` column is parsed as datetimes.
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/data.csv',
    sep=';',
    parse_dates=['dt'],
)

In [4]:
# Preview up to the first 50 rows of the freshly loaded frame.
df.head(50)

Unnamed: 0,dt,user,event,source,amount
0,2022-06-23 12:00:00,u1,view,promo_super,
1,2022-06-23 12:05:00,u1,login,,
2,2022-06-23 12:10:00,u1,view,,
3,2022-06-23 12:20:00,u1,view,,
4,2022-06-23 12:25:00,u1,buy,,100.0
5,2022-06-23 12:30:00,u1,exit,,
6,2022-06-23 13:00:00,u1,view,site,
7,2022-06-23 13:10:00,u1,login,,
8,2022-06-23 13:15:00,u1,view,,
9,2022-06-23 13:20:00,u1,exit,,


In [5]:
# Split the wide log into three narrow frames, one per ClickHouse table created
# further down. Columns are selected by name rather than by position so the
# split cannot silently pick the wrong column if the CSV layout changes.
# .copy() gives each frame its own data, independent of `df`.
df_events = df[['dt', 'user', 'event']].copy()
df_sales = df[['dt', 'user', 'amount']].copy()
df_sources = df[['dt', 'user', 'source']].copy()

In [6]:
# Keep only the rows that actually carry a source / a sale amount.
df_sources = df_sources.dropna()
df_sales = df_sales.dropna()
# With the missing values gone, the amounts are stored as 32-bit integers.
df_sales = df_sales.astype({'amount': np.int32})

In [7]:
# First rows of the events frame (dt, user, event).
df_events.head()

Unnamed: 0,dt,user,event
0,2022-06-23 12:00:00,u1,view
1,2022-06-23 12:05:00,u1,login
2,2022-06-23 12:10:00,u1,view
3,2022-06-23 12:20:00,u1,view
4,2022-06-23 12:25:00,u1,buy


In [8]:
# First rows of the sales frame; only rows that had an amount remain after dropna.
df_sales.head()

Unnamed: 0,dt,user,amount
4,2022-06-23 12:25:00,u1,100
14,2022-06-23 14:50:00,u2,100
22,2022-06-23 15:35:00,u3,50
28,2022-06-23 16:25:00,u4,100
32,2022-06-23 17:10:00,u5,100


In [9]:
# First rows of the sources frame; only rows that had a source remain after dropna.
df_sources.head()

Unnamed: 0,dt,user,source
0,2022-06-23 12:00:00,u1,promo_super
6,2022-06-23 13:00:00,u1,site
10,2022-06-23 14:00:00,u2,promo_super
16,2022-06-23 15:00:00,u3,promo_super
19,2022-06-23 15:20:00,u3,site


In [10]:
%%capture
!sudo apt-get install apt-transport-https ca-certificates dirmngr
!sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4

!echo "deb https://repo.clickhouse.tech/deb/stable/ main/" | sudo tee \
    /etc/apt/sources.list.d/clickhouse.list
!sudo apt-get update

!sudo apt-get install -y clickhouse-server clickhouse-client

!sudo service clickhouse-server start
!clickhouse-client

In [11]:
%%capture
!pip install clickhouse-driver

In [12]:
# Imported here rather than in the first cell because the package is only
# installed by the preceding cell.
from clickhouse_driver import Client
# Only the host is specified; every other connection setting is left to the driver.
client = Client(host='localhost')
# Sanity check that the server answers: list its databases.
client.execute('SHOW DATABASES')

[('INFORMATION_SCHEMA',), ('default',), ('information_schema',), ('system',)]

In [13]:
# Recreate the working database from scratch so the cell can be re-run safely.
client.execute('DROP DATABASE IF EXISTS db')
client.execute('CREATE DATABASE db')
# Show the database list again to confirm `db` now exists.
client.execute('SHOW DATABASES')

[('INFORMATION_SCHEMA',),
 ('db',),
 ('default',),
 ('information_schema',),
 ('system',)]

In [14]:
# Close the connection opened by the earlier, database-less client before the
# name is rebound, so it is not left dangling.
client.disconnect()
# Reconnect with `db` as the default database. The port is given as an integer
# (it was previously the string '9000').
client = Client(host='localhost', user='default', port=9000, database='db')

In [15]:
# Remove any tables left over from a previous run; IF EXISTS keeps this safe on a fresh database.
client.execute('DROP TABLE IF EXISTS events')
client.execute('DROP TABLE IF EXISTS sales')
client.execute('DROP TABLE IF EXISTS sources')

[]

In [16]:
# One table per pandas frame prepared above, all using the Memory engine.
# events: one row per user action.
client.execute('CREATE TABLE events (dt DateTime, \
                                    user String, \
                                    event String \
                                      ) ENGINE = Memory')
# sales: purchase amounts; Int32 matches the np.int32 cast applied to df_sales.
client.execute('CREATE TABLE sales (dt DateTime, \
                                    user String, \
                                    amount  Int32 \
                                      ) ENGINE = Memory')
# sources: the source recorded for some of the events.
client.execute('CREATE TABLE sources (dt DateTime, \
                                    user String, \
                                    source  String \
                                      ) ENGINE = Memory')

# Confirm that all three tables now exist in `db`.
client.execute('SHOW TABLES FROM db')

[('events',), ('sales',), ('sources',)]

In [17]:
# Load each frame into its table; to_dict('records') turns a frame into a list
# of {column: value} dicts, one per row, which is passed as the insert data.
client.execute("INSERT INTO events VALUES", df_events.to_dict('records'))
client.execute("INSERT INTO sales VALUES", df_sales.to_dict('records'))
# The value displayed under the cell is the return of this last call -
# presumably the number of rows inserted into `sources`; verify against the driver docs.
client.execute("INSERT INTO sources VALUES", df_sources.to_dict('records'))

9

In [18]:
def select_clickhouse(sql):
    """Run a query through the notebook-level ``client`` and hand back its result.

    Args:
        sql: Query text; forwarded to the client unchanged.

    Returns:
        Whatever ``client.query_dataframe(sql)`` returns.
    """
    result_frame = client.query_dataframe(sql)
    return result_frame

In [19]:
# Rebuild the original wide table inside ClickHouse: events is the base, and
# sources and sales are left-joined onto it by (dt, user).
# As the displayed result shows, events without a match come back with an empty
# source and amount 0 rather than a missing value.
sql = '''SELECT ev.dt as dt,
                ev.user as user, 
                ev.event as event,
                so.source as source, 
                sa.amount as amount
         FROM events as ev left join sources as so 
         on ev.dt = so.dt and ev.user = so.user
         left join sales as sa
         on ev.dt = sa.dt and ev.user = sa.user
         order by ev.dt, ev.user'''

In [20]:
# Run the join and display the result as a DataFrame.
select_clickhouse(sql)

Unnamed: 0,dt,user,event,source,amount
0,2022-06-23 12:00:00,u1,view,promo_super,0
1,2022-06-23 12:05:00,u1,login,,0
2,2022-06-23 12:10:00,u1,view,,0
3,2022-06-23 12:20:00,u1,view,,0
4,2022-06-23 12:25:00,u1,buy,,100
5,2022-06-23 12:30:00,u1,exit,,0
6,2022-06-23 13:00:00,u1,view,site,0
7,2022-06-23 13:10:00,u1,login,,0
8,2022-06-23 13:15:00,u1,view,,0
9,2022-06-23 13:20:00,u1,exit,,0


In [55]:
# Array-function pipeline, one CTE per step:
#   tbl  - the same events/sources/sales left join as the previous query,
#          ordered by time and user.
#   tbl2 - collapses each user's rows into parallel arrays with groupArray
#          (timestamps, events, sources, amounts).
#   tbl3 - turns empty source strings into null and then applies arrayFill, so
#          every event carries the most recent non-empty source before it.
#   tbl4 - keeps only the array elements whose filled source is 'promo_super'
#          and adds a 1..N position array via arrayEnumerate.
#   tbl5 - minutes between consecutive kept events; a gap above 30 marks a
#          session start, and arrayCumSum turns the marks into session numbers.
#          For the first element the lookup is dt_array[0]; the displayed output
#          shows a huge delta there, so the first event always opens session 1.
#   final SELECT - zips session numbers with amounts and keeps only the pairs
#          whose amount is greater than 0.
sql = '''WITH tbl as (SELECT ev.dt as dt,
                            ev.user as user, 
                            ev.event as event,
                            so.source as source,
                            sa.amount as amount
                      FROM events as ev left join sources as so 
                                       on ev.dt = so.dt and ev.user = so.user
                                        left join sales as sa
                                       on ev.dt = sa.dt and ev.user = sa.user
                      ORDER BY ev.dt, ev.user),

              tbl2 as (SELECT t.user, 
                    groupArray(t.dt) as dt_array ,
                    groupArray(t.event) as event_array,
                    groupArray(t.source) as source_array,
                    groupArray(t.amount) as amount_array
              FROM tbl as t
              GROUP BY t.user
              ORDER BY t.user),
              
              tbl3 as (SELECT t.user,
                          t.dt_array,
                          t.event_array,
                          arrayFill(x -> not isNull(x),arrayMap(x->if(length(x)=0,null,x),t.source_array)) as source_array,
                          t.amount_array
                      FROM tbl2 as t),
                                           
              tbl4 as (SELECT t.user, 
                              arrayFilter((i, x) -> assumeNotNull(x)='promo_super', t.dt_array, t.source_array) as dt_array,
                              arrayFilter((i, x) -> assumeNotNull(x)='promo_super', t.event_array, t.source_array) as event_array,
                              arrayFilter((i, x) -> assumeNotNull(x)='promo_super', t.amount_array, t.source_array) as amount_array,
                              arrayEnumerate(amount_array) as index_array
                       FROM tbl3 as t),
              
              tbl5 as (SELECT t.*, 
                              arrayMap(x->date_diff('minute', t.dt_array[x-1],t.dt_array[x]),t.index_array) as delta_dt_array,
                              arrayMap(x->if(x>30,1,0),delta_dt_array) as session_point_array,
                              arrayCumSum(session_point_array) as session_array
                       FROM tbl4 as t)
                       
              SELECT t.*,
                     arrayZip(t.session_array, t.amount_array) as session_amount_array,
                     arrayFilter(x-> x.2>0, session_amount_array) as result_array
              FROM tbl5 as t'''

In [56]:
# Execute the array pipeline; the result has one row per user.
select_clickhouse(sql)

Unnamed: 0,user,dt_array,event_array,amount_array,index_array,delta_dt_array,session_point_array,session_array,session_amount_array,result_array
0,u1,"[2022-06-23 12:00:00, 2022-06-23 12:05:00, 2022-06-23 12:10:00, 2022-06-23 12:20:00, 2022-06-23 12:25:00, 2022-06-23 12:30:00]","[view, login, view, view, buy, exit]","[0, 0, 0, 0, 100, 0]","[1, 2, 3, 4, 5, 6]","[27599760, 5, 5, 10, 5, 5]","[1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]","[(1, 0), (1, 0), (1, 0), (1, 0), (1, 100), (1, 0)]","[(1, 100)]"
1,u2,"[2022-06-23 14:00:00, 2022-06-23 14:05:00, 2022-06-23 14:10:00, 2022-06-23 14:45:00, 2022-06-23 14:50:00, 2022-06-23 14:55:00]","[view, login, view, view, buy, exit]","[0, 0, 0, 0, 100, 0]","[1, 2, 3, 4, 5, 6]","[27599880, 5, 5, 35, 5, 5]","[1, 0, 0, 1, 0, 0]","[1, 1, 1, 2, 2, 2]","[(1, 0), (1, 0), (1, 0), (2, 0), (2, 100), (2, 0)]","[(2, 100)]"
2,u3,"[2022-06-23 15:00:00, 2022-06-23 15:10:00, 2022-06-23 15:15:00]","[login, view, exit]","[0, 0, 0]","[1, 2, 3]","[27599940, 10, 5]","[1, 0, 0]","[1, 1, 1]","[(1, 0), (1, 0), (1, 0)]",[]
3,u4,"[2022-06-23 16:00:00, 2022-06-23 16:05:00, 2022-06-23 16:10:00, 2022-06-23 16:20:00, 2022-06-23 16:25:00, 2022-06-23 16:30:00]","[view, login, view, view, buy, exit]","[0, 0, 0, 0, 100, 0]","[1, 2, 3, 4, 5, 6]","[27600000, 5, 5, 10, 5, 5]","[1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]","[(1, 0), (1, 0), (1, 0), (1, 0), (1, 100), (1, 0)]","[(1, 100)]"
4,u5,"[2022-06-23 17:00:00, 2022-06-23 17:05:00, 2022-06-23 17:10:00, 2022-06-23 17:45:00, 2022-06-23 17:50:00, 2022-06-23 17:55:00]","[view, login, buy, view, buy, exit]","[0, 0, 100, 0, 100, 0]","[1, 2, 3, 4, 5, 6]","[27600060, 5, 5, 35, 5, 5]","[1, 0, 0, 1, 0, 0]","[1, 1, 1, 2, 2, 2]","[(1, 0), (1, 0), (1, 100), (2, 0), (2, 100), (2, 0)]","[(1, 100), (2, 100)]"
5,u6,"[2022-06-23 18:00:00, 2022-06-23 18:05:00, 2022-06-23 18:10:00, 2022-06-23 18:20:00, 2022-06-23 18:25:00, 2022-06-23 18:30:00]","[view, login, view, buy, buy, exit]","[0, 0, 0, 100, 100, 0]","[1, 2, 3, 4, 5, 6]","[27600120, 5, 5, 10, 5, 5]","[1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 1, 1]","[(1, 0), (1, 0), (1, 0), (1, 100), (1, 100), (1, 0)]","[(1, 100), (1, 100)]"
6,u7,"[2022-06-23 19:00:00, 2022-06-23 19:05:00, 2022-06-23 19:10:00]","[login, buy, exit]","[0, 100, 0]","[1, 2, 3]","[27600180, 5, 5]","[1, 0, 0]","[1, 1, 1]","[(1, 0), (1, 100), (1, 0)]","[(1, 100)]"
