### ClickHouse. Шаблон последовательности действий

In [1]:
import yaml
import os
from datetime import datetime

In [2]:
# Path to the YAML settings file that is opened and parsed in the next cell.
PATH_CONFIG = '/content/settings.yaml'

In [3]:
# Parse the YAML settings file; safe_load only builds plain Python objects.
with open(PATH_CONFIG, encoding='utf8') as config_file:
    settings = yaml.safe_load(config_file)

In [4]:
# Pull the ClickHouse connection details out of the DB_CH section of the settings.
user_ch, password_ch, host_ch, port_ch, name_ch = (
    settings['DB_CH'][key]
    for key in ('USER', 'PASSWORD', 'HOST', 'PORT', 'NAME')
)

In [5]:
# Export the password as an environment variable so the shell cell that runs
# the ClickHouse installer can read it as ${CH_PASSWORD}.
os.environ["CH_PASSWORD"] = password_ch

In [6]:
%%capture
# Fetch the script served at clickhouse.com and pipe it straight into sh.
# NOTE(review): this executes remote code unverified; acceptable only in a
# throwaway environment.
!curl https://clickhouse.com/ | sh

In [7]:
%%capture
# Run the ClickHouse installer from the current directory under sudo, feeding
# the CH_PASSWORD environment variable on stdin.
# NOTE(review): presumably the installer's password prompt consumes this value
# when sudo itself asks for nothing - confirm. The variable is unquoted, so
# a password containing whitespace would be word-split by the shell.
!echo ${CH_PASSWORD} | sudo -S ./clickhouse install

In [8]:
%%capture
# Start the installed ClickHouse server; the cell output is captured, not shown.
!sudo clickhouse start

In [9]:
%%capture
%%bash
# Install the clickhouse-connect client package that is imported in the next cell.
pip install clickhouse-connect

In [11]:
import clickhouse_connect

# Server-level connection (no database selected yet); it is used to
# (re)create the working database.
client = clickhouse_connect.get_client(
    host=host_ch,
    user=user_ch,
    password=password_ch,
    port=port_ch,
)

In [12]:
# Recreate the working database from scratch, then list all databases so the
# cell output confirms that it exists.
for ddl in (
    f'drop database if exists {name_ch};',
    f'create database {name_ch};',
):
    client.command(ddl)
client.command('show databases;')

'INFORMATION_SCHEMA\ndb\ndefault\ninformation_schema\nsystem'

In [13]:
# Reconnect with the freshly created database selected, so later statements
# can use unqualified table names.
client = clickhouse_connect.get_client(
    host=host_ch,
    user=user_ch,
    password=password_ch,
    port=port_ch,
    database=name_ch,
)

In [14]:
# Display the database the client is bound to, as a sanity check.
client.database

'db'

In [15]:
# Table layout for the demo event log: user id, action name, event timestamp.
tbl_name = 'tbl'
column_names = ['uid', 'action', 'dt']
timezone = 'Europe/Moscow'

# Start from a clean table, then create it with the in-memory engine.
client.command(f'drop table if exists {tbl_name};')
client.command(
    f"create table {tbl_name} ("
    f"{column_names[0]} Int8, "
    f"{column_names[1]} String,"
    f"{column_names[2]} DateTime('{timezone}')"
    f") ENGINE Memory;"
)

<clickhouse_connect.driver.summary.QuerySummary at 0x78cbfc1b1f30>

In [16]:
# Sample events in column-oriented form: one list each for user ids, action
# names and ISO-8601 timestamps. Position i in every list describes event i.
row_data = [[1, 2, 3, 4, 1, 2],
            ['pageview', 'pageview', 'pageview', 'pageview', 'submit', 'submit'],
            ['2023-12-14T20:17:21', '2023-12-14T21:17:21','2023-12-14T22:17:21','2023-12-14T23:17:21','2023-12-14T23:25:21','2023-12-14T17:17:21']]

In [17]:
# Transpose the column-oriented sample into row-oriented records
# [uid, action, dt], parsing each ISO-8601 string into a datetime object.
result_data = [
    [uid, action, datetime.fromisoformat(dt_text)]
    for uid, action, dt_text in zip(row_data[0], row_data[1], row_data[2])
]

In [19]:
# Preview the first converted record to check the row layout before inserting.
print(result_data[:1])

[[1, 'pageview', datetime.datetime(2023, 12, 14, 20, 17, 21)]]


In [20]:
# Insert the row-oriented records; column_names gives the target column for
# each position within a row.
client.insert(tbl_name, result_data, column_names=column_names)

<clickhouse_connect.driver.summary.QuerySummary at 0x78cbcb2a2200>

In [21]:
def select_clickhouse(sql, parameters=None):
    """Run a SELECT statement through the notebook-level ``client``.

    Args:
        sql: Query text to execute.
        parameters: Optional values to bind into ``sql``. They are forwarded
            unchanged to ``client.query_df``, so callers can avoid building
            SQL by string concatenation. Defaults to ``None`` (no binding),
            which matches the previous single-argument behaviour.

    Returns:
        Whatever ``client.query_df`` returns for the query.
    """
    return client.query_df(sql, parameters=parameters)

In [22]:
# Query 1: list every event with dt cast via ::datetime, ordered by user id
# and then by event time.
# NOTE(review): the table name 'tbl' is hard-coded here instead of using tbl_name.
sql_ch1 = '''select
                   t.uid,
                   t.action,
                   t.dt::datetime as dt
             from tbl as t
             order by t.uid, t.dt::datetime'''

In [23]:
# Run query 1 and display the result.
select_clickhouse(sql_ch1)

Unnamed: 0,uid,action,dt
0,1,pageview,2023-12-14 20:17:21
1,1,submit,2023-12-14 23:25:21
2,2,submit,2023-12-14 17:17:21
3,2,pageview,2023-12-14 21:17:21
4,3,pageview,2023-12-14 22:17:21
5,4,pageview,2023-12-14 23:17:21


In [24]:
# Query 2: funnel check per user and calendar day. The sequenceMatch pattern
# '(?1)(?2)' is evaluated over events ordered by tt.dt, where condition 1 is
# action='pageview' and condition 2 is action='submit'. The funnel column is
# 1 when a pageview is followed by a submit within the group, otherwise 0.
# NOTE(review): the table name 'tbl' is hard-coded here instead of using tbl_name.
sql_ch2 = '''select
                   tt.uid,
                   tt.dt::date as date,
                   sequenceMatch('(?1)(?2)')(tt.dt, action='pageview', action='submit') as funnel
             from (select
                         t.uid,
                         t.action,
                         t.dt::datetime as dt
                   from tbl as t) as tt
             group by tt.uid, tt.dt::date'''

In [25]:
# Run query 2 and display the per-user funnel flags.
select_clickhouse(sql_ch2)

Unnamed: 0,uid,date,funnel
0,1,2023-12-14,1
1,4,2023-12-14,0
2,2,2023-12-14,0
3,3,2023-12-14,0
