# ClickHouse. Агрегатные функции

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Notebook-wide pandas display settings (affect rendering only, not the data).
pd.set_option('display.max_columns', None)        # never elide columns
pd.set_option('display.expand_frame_repr', False)  # keep wide frames on one line
pd.set_option('display.max_colwidth', 250)
pd.set_option('display.max_rows', 100)
# Was `pd.get_option('display.precision', 2)`, which only reads the option and
# therefore never applied the intended precision.
# NOTE(review): the float_format set below likely takes precedence over
# precision when rendering float columns — confirm.
pd.set_option('display.precision', 2)
# Render floats with a thousands separator, e.g. 1234.5 -> '1,234.5'.
pd.set_option('display.float_format', '{:,}'.format)

In [4]:
# Raw event log: ';'-separated CSV whose 'dt' column is parsed as datetimes.
# NOTE(review): the path lives under /content/drive, which suggests a mounted
# Google Drive; no mount cell is visible in this notebook — confirm.
raw_csv_path = '/content/drive/MyDrive/Datasets/data.csv'
df = pd.read_csv(
    raw_csv_path,
    sep=';',
    parse_dates=['dt'],
)

In [5]:
# Preview the raw frame (at most the first 50 rows).
df.head(50)

Unnamed: 0,dt,user,event,source,amount
0,2022-06-23 12:00:00,u1,view,promo_super,
1,2022-06-23 12:05:00,u1,login,,
2,2022-06-23 12:10:00,u1,view,,
3,2022-06-23 12:20:00,u1,view,,
4,2022-06-23 12:25:00,u1,buy,,100.0
5,2022-06-23 12:30:00,u1,exit,,
6,2022-06-23 13:00:00,u1,view,site,
7,2022-06-23 13:10:00,u1,login,,
8,2022-06-23 13:15:00,u1,view,,
9,2022-06-23 13:20:00,u1,exit,,


In [6]:
# Split the wide frame into three narrow ones by column position.
# Positions in df: 0 = dt, 1 = user, 2 = event, 3 = source, 4 = amount.
key_positions = [0, 1]  # dt and user are shared by all three frames
df_events = df.iloc[:, key_positions + [2]].copy()
df_sales = df.iloc[:, key_positions + [4]].copy()
df_sources = df.iloc[:, key_positions + [3]].copy()

In [7]:
# Keep only the rows that actually carry a source / an amount, then store the
# amount as a 32-bit integer (the CSV load leaves it as float because of NaNs).
df_sources = df_sources.dropna()
df_sales = df_sales.dropna().astype({'amount': np.int32})

In [8]:
# Preview the events frame (dt, user, event).
df_events.head()

Unnamed: 0,dt,user,event
0,2022-06-23 12:00:00,u1,view
1,2022-06-23 12:05:00,u1,login
2,2022-06-23 12:10:00,u1,view
3,2022-06-23 12:20:00,u1,view
4,2022-06-23 12:25:00,u1,buy


In [9]:
# Preview the sales frame (dt, user, amount) after dropping rows without an amount.
df_sales.head()

Unnamed: 0,dt,user,amount
4,2022-06-23 12:25:00,u1,100
14,2022-06-23 14:50:00,u2,100
22,2022-06-23 15:35:00,u3,50
28,2022-06-23 16:25:00,u4,100
32,2022-06-23 17:10:00,u5,100


In [10]:
# Preview the sources frame (dt, user, source) after dropping rows without a source.
df_sources.head()

Unnamed: 0,dt,user,source
0,2022-06-23 12:00:00,u1,promo_super
6,2022-06-23 13:00:00,u1,site
10,2022-06-23 14:00:00,u2,promo_super
16,2022-06-23 15:00:00,u3,promo_super
19,2022-06-23 15:20:00,u3,site


In [11]:
%%capture
!sudo apt-get install apt-transport-https ca-certificates dirmngr
!sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4

!echo "deb https://repo.clickhouse.tech/deb/stable/ main/" | sudo tee \
    /etc/apt/sources.list.d/clickhouse.list
!sudo apt-get update

!sudo apt-get install -y clickhouse-server clickhouse-client

!sudo service clickhouse-server start
!clickhouse-client

In [12]:
%%capture
!pip install clickhouse-driver

In [13]:
# Imported here rather than at the top because the package is installed by the
# preceding cell.
from clickhouse_driver import Client

# Connect to the local server with the driver's defaults and list the
# databases as a quick connectivity check.
client = Client(host="localhost")
client.execute("SHOW DATABASES")

[('INFORMATION_SCHEMA',), ('default',), ('information_schema',), ('system',)]

In [14]:
# Recreate the working database from scratch so the cell can be re-run safely.
for ddl_statement in ("DROP DATABASE IF EXISTS db", "CREATE DATABASE db"):
    client.execute(ddl_statement)

# Left as the final expression so the database list is displayed.
client.execute("SHOW DATABASES")

[('INFORMATION_SCHEMA',),
 ('db',),
 ('default',),
 ('information_schema',),
 ('system',)]

In [15]:
# Reconnect with `db` as the default database so later statements can use
# unqualified table names. The port is passed as an integer (it was the
# string '9000'), which is the type the driver documents for this parameter.
client = Client(host='localhost', user='default', port=9000, database='db')

In [16]:
# Remove any tables left from a previous run before they are recreated.
drop_template = 'DROP TABLE IF EXISTS {}'
client.execute(drop_template.format('events'))
client.execute(drop_template.format('sales'))
client.execute(drop_template.format('sources'))

[]

In [17]:
# Create the three tables that mirror df_events, df_sales and df_sources.
# All of them are declared with ENGINE = Memory. The backslash continuations
# sit inside the string literals, so the indentation is part of the SQL text.

# events: one row per user action (dt, user, event).
client.execute('CREATE TABLE events (dt DateTime, \
                                    user String, \
                                    event String \
                                      ) ENGINE = Memory')
# sales: one row per purchase (dt, user, amount as Int32).
client.execute('CREATE TABLE sales (dt DateTime, \
                                    user String, \
                                    amount  Int32 \
                                      ) ENGINE = Memory')
# sources: one row per recorded traffic source (dt, user, source).
client.execute('CREATE TABLE sources (dt DateTime, \
                                    user String, \
                                    source  String \
                                      ) ENGINE = Memory')

# Final expression: list the tables in `db` to confirm they were created.
client.execute('SHOW TABLES FROM db')

[('events',), ('sales',), ('sources',)]

In [18]:
# Convert each frame to a list of row dicts (one dict per row, keyed by column
# name) and bulk-insert it into the table of the same name.
events_rows = df_events.to_dict(orient='records')
sales_rows = df_sales.to_dict(orient='records')
sources_rows = df_sources.to_dict(orient='records')

client.execute('INSERT INTO events VALUES', events_rows)
client.execute('INSERT INTO sales VALUES', sales_rows)
# Final expression: its return value is what the cell displays.
client.execute('INSERT INTO sources VALUES', sources_rows)

9

In [19]:
def select_clickhouse(sql):
    """Run a query through the notebook-level ``client`` and return its result.

    Args:
        sql: Query text, passed unchanged to ``client.query_dataframe``.

    Returns:
        Whatever ``client.query_dataframe(sql)`` returns.
    """
    result_frame = client.query_dataframe(sql)
    return result_frame

In [20]:
# Rebuild the original wide table by LEFT JOINing events with sources and
# sales on (dt, user), ordered by time and user.
# Note: in the result shown below, events without a matching sale come back
# with amount 0 rather than a missing value.
sql = '''SELECT ev.dt as dt,
                ev.user as user, 
                ev.event as event,
                so.source as source, 
                sa.amount as amount
         FROM events as ev left join sources as so 
         on ev.dt = so.dt and ev.user = so.user
         left join sales as sa
         on ev.dt = sa.dt and ev.user = sa.user
         order by ev.dt, ev.user'''

In [21]:
# Run the query currently held in `sql` and display the resulting frame.
select_clickhouse(sql)

Unnamed: 0,dt,user,event,source,amount
0,2022-06-23 12:00:00,u1,view,promo_super,0
1,2022-06-23 12:05:00,u1,login,,0
2,2022-06-23 12:10:00,u1,view,,0
3,2022-06-23 12:20:00,u1,view,,0
4,2022-06-23 12:25:00,u1,buy,,100
5,2022-06-23 12:30:00,u1,exit,,0
6,2022-06-23 13:00:00,u1,view,site,0
7,2022-06-23 13:10:00,u1,login,,0
8,2022-06-23 13:15:00,u1,view,,0
9,2022-06-23 13:20:00,u1,exit,,0


In [28]:
# Per-user event counts: one countIf per event type ('view', 'login', 'buy',
# 'exit'), each counting only the rows whose event equals that type.
sql = '''SELECT e.user,
                countIf(e.event, e.event='view') as view,
                countIf(e.event, e.event='login') as login,
                countIf(e.event, e.event='buy') as buy,
                countIf(e.event, e.event='exit') as exit
         FROM events as e
         GROUP BY e.user
         ORDER BY e.user'''

In [29]:
# Run the query currently held in `sql` and display the per-user counts.
select_clickhouse(sql)

Unnamed: 0,user,view,login,buy,exit
0,u1,5,2,1,2
1,u2,3,1,1,1
2,u3,3,2,1,2
3,u4,3,1,1,1
4,u5,2,1,2,1
5,u6,2,1,2,1
6,u7,0,1,1,1


In [30]:
# topK(3) over the whole events table: a single row whose top_event column is
# an array of three event names (see the output below).
# NOTE(review): ClickHouse documents topK as an approximate aggregate — confirm
# that is acceptable for this use.
sql = '''SELECT topK(3)(e.event) AS top_event
         FROM events as e'''

In [31]:
# Run the query currently held in `sql` and display the top-event array.
select_clickhouse(sql)

Unnamed: 0,top_event
0,"[view, login, buy]"
