ClickHouse · alexey-milovidov · Dec 16, 2016 · Dec 16, 2016 · alexey-milovidov · Dec 16, 2016
diff --git a/dbms/benchmark/greenplum/README b/dbms/benchmark/greenplum/README
@@ -0,0 +1,43 @@
+Folder structure
+______________
+dump_dataset_from_ch.sh - bash script that dumps a dataset from Clickhouse
+schema.sql - schema for a Greenplum cluster to load dumped dataset in
+load_data_set.sql - the script that loads up a dumped dataset 
+queries.sql - SQL statements used in the benchmark
+benchmark.sh - this piece of bash conducts a benchmark
+result_parser.py - script to parse benchmark.sh's output and produce python code to build a graph to compare up to 4 benchmark results. 
+Requirements
+____________
+
+Greenplum uses a separate server as a point of entry, so you need at least 2 servers to run a cluster: a master host and a segment host. Two segment hosts with 56 segments (28 per host) were used while conducting the test.
+You have to put the segment hostnames into benchmark.sh.
+Greenplum quick installation instructions
+_________________________________________
+
+Obtain a stable Greenplum version here (4.3.9.1 was used while conducting the benchmark):
+https://network.pivotal.io/products/pivotal-gpdb
+
+and install it using this detailed guide: 
+http://gpdb.docs.pivotal.io/4340/install_guide/install_guide.html
+
+You should change gp_interconnect_type to 'tcp' if cluster members are connected via 1GB link or lower. 
+Some variables have to be changed prior to the first benchmark run: gp_vmem_protect_limit and max_statement_mem, to allow each segment to use more virtual memory. Here are the commands that change these GUCs; they have to be executed as gpadmin on the master host:
+
+    gpconfig -c gp_interconnect_type -v tcp
+    gpconfig -c gp_vmem_protect_limit -v 3000
+    gpconfig -c max_statement_mem -v '4000MB'
+
+How to prepare data
+-------------------
+
+One can prepare the datasets to run the benchmark on using the dump_dataset_from_ch.sh script from this repo. The script has to be run on the ClickHouse host. It takes a long time to get the dumps.
+
+Upload the datasets to the Greenplum master. Then run schema.sql to prepare the schema and load_data_set.sql to load the data. This operation also takes a long time.
+
+How to conduct the benchmark
+__________________________
+There is a benchmark.sh script that takes some arguments. Here is the syntax:
+
+./benchmark.sh sql_statements_file tablename dbname orca_switch
+
+If you are unsure about the last argument (orca_switch, which is passed to "set optimizer="), just omit it to use the default value, on.
diff --git a/dbms/benchmark/greenplum/benchmark.sh b/dbms/benchmark/greenplum/benchmark.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# Runs every query of a statements file against Greenplum three times and
+# prints psql's timing output between "####################" separators.
+#
+# Usage: ./benchmark.sh [sql_statements_file [tablename [dbname [orca_switch]]]]
+#   sql_statements_file  one query per line; "{table}" is replaced by tablename
+#                        (default: queries.sql)
+#   tablename            table the queries run against
+#   dbname               database name handed to psql
+#   orca_switch          value used in "set optimizer=" (default: on)
+
+filename=${1-queries.sql}
+table=$2
+dbname=$3
+orca=${4-on}
+# Segment hosts whose page cache is dropped before each query.
+# Replace the placeholders with the real hostnames.
+host1=somehost
+host2=somehost
+default_mem='15GB'
+
+# Runs the current $query once with the given effective_cache_size ($1) and the
+# current $mem as statement_mem, framed by begin/end timestamps.
+run_query() {
+    echo "Timestamp_begin:$(date)"
+    echo  "\\timing off \\\\set optimizer=$orca; set effective_cache_size='$1'; set statement_mem='$mem';\\timing on \\\\ $query;"  | psql -p 5432 -h 'localhost' -o /dev/null -U gpadmin ${dbname}
+    echo "Timestamp_end:$(date)"
+}
+
+# -r keeps backslashes in the queries intact (e.g. the '\xd0...' escapes used
+# in a LIKE pattern); a plain "read" would silently strip them.
+sed "s/{table}/$table/g" "$filename" | while read -r query ;
+do
+    # sync first: dirty pages cannot be dropped until they are written back.
+    ssh -n "$host1" 'sync; echo 3 |  tee /proc/sys/vm/drop_caches' > /dev/null
+    ssh -n "$host2" 'sync; echo 3 |  tee /proc/sys/vm/drop_caches' > /dev/null
+    sleep 5
+    # Start every query from the default limit; otherwise a lowered limit
+    # would leak into all the queries that follow.
+    mem=$default_mem
+    # Lower statement_mem for the known heavy queries. egrep also prints the
+    # matching query to stdout. Parentheses are escaped so they match literally.
+    echo "$query" | egrep "SELECT UserID, date_trunc\('minute', EventTime\) AS m|SELECT Referer AS key, avg\(length\(Referer\)\) AS l|SELECT URL, count\(1\) AS c FROM.*GROUP BY URL|SELECT 1, URL, count\(1\) AS c FROM.*GROUP BY 1" && mem='10GB'
+    echo "$query" | egrep 'SELECT DISTINCT|GROUP BY UserID, SearchPhrase LIMIT 10|count\(DISTINCT UserID\) AS u' && mem='5GB'
+    echo "####################"
+    echo "$query"
+    # First try uses a small effective_cache_size, the two retries a large one.
+    run_query '256MB'
+    run_query '50GB'
+    run_query '50GB'
+    # The query is echoed again after its timings; result_parser.py uses this
+    # second SELECT line to close the block.
+    echo "$query"
+    echo '####################'
+done
diff --git a/dbms/benchmark/greenplum/dump_dataset_from_ch.sh b/dbms/benchmark/greenplum/dump_dataset_from_ch.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+#
+# Dumps the three benchmark tables from ClickHouse as CSV, one file per table,
+# written to the current directory under the table's name (no extension).
+# NOTE(review): load_data_set.sql reads /data/<table>.dump, so the files
+# presumably have to be moved/renamed before loading -- confirm.
+
+for table in hits_10m_single hits_100m_single hits_1000m_single; do 
+    # WatchID, UserID, FUniqID, RefererHash and URLHash are exported as round(x/2);
+    # presumably so the unsigned 64-bit values fit a signed bigint -- TODO confirm against schema.sql.
+    clickhouse-client -q "SELECT (round(WatchID/2), JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID,round(UserID/2), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce,round(FUniqID/2), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID,round(RefererHash/2),round(URLHash/2), CLID) FROM $table FORMAT CSV" > $table 
+done
diff --git a/dbms/benchmark/greenplum/load_data_set.sql b/dbms/benchmark/greenplum/load_data_set.sql
@@ -0,0 +1,12 @@
+-- Loads the three dumped datasets (10m, 100m, 1000m rows) from CSV files under
+-- /data, then builds the indexes and refreshes the planner statistics.
+-- SEGMENT REJECT LIMIT 30 PERCENT: rows that fail to parse are skipped instead
+-- of aborting the load, until the reject limit is reached on a segment.
+
+-- 10m dataset
+COPY hits_all_10m FROM '/data/hits_10m_single.dump' CSV SEGMENT REJECT LIMIT 30 PERCENT;
+CREATE INDEX pk_counterid_eventdate_userid_10m ON hits_all_10m USING btree (counterid, eventdate, userid);
+CREATE INDEX idx_10m_counterid on hits_all_10m using btree (counterid); CREATE INDEX idx_10m_userid on hits_all_10m using btree (userid);
+ANALYZE  hits_all_10m;
+-- 100m dataset
+COPY hits_all_100m  from '/data/hits_100m_single.dump' CSV SEGMENT REJECT LIMIT 30 PERCENT;
+CREATE INDEX pk_counterid_eventdate_userid_100m ON hits_all_100m USING btree (counterid, eventdate, userid);
+CREATE INDEX idx_100m_counterid on hits_all_100m using btree (counterid); CREATE INDEX idx_100m_userid on hits_all_100m using btree (userid);
+ANALYZE  hits_all_100m;
+-- 1000m dataset
+COPY hits_all_1000m from '/data/hits_1000m_single.dump' CSV SEGMENT REJECT LIMIT 30 PERCENT; 
+CREATE INDEX pk_counterid_eventdate_userid_1000m ON hits_all_1000m USING btree (counterid, eventdate, userid);
+CREATE INDEX idx_1000m_counterid on hits_all_1000m using btree (counterid); CREATE INDEX idx_1000m_userid on hits_all_1000m using btree (userid);
+ANALYZE  hits_all_1000m;
diff --git a/dbms/benchmark/greenplum/queries.sql b/dbms/benchmark/greenplum/queries.sql
@@ -0,0 +1,43 @@
+SELECT count(1) FROM {table}
+SELECT count(1) FROM {table} WHERE AdvEngineID != 0
+SELECT sum(AdvEngineID), count(1), avg(ResolutionWidth) FROM {table}
+SELECT sum(UserID) FROM {table}
+SELECT count(UserID) FROM ( SELECT DISTINCT UserID FROM {table} ) AS d
+SELECT count(SearchPhrase) FROM ( SELECT DISTINCT SearchPhrase FROM {table} ) AS d 
+SELECT min(EventDate), max(EventDate) FROM {table}
+SELECT AdvEngineID, count(1) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY 2 DESC
+SELECT RegionID, count(DISTINCT UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10
+SELECT RegionID, sum(AdvEngineID), count(1) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10
+SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10
+SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10
+SELECT SearchPhrase, count(1) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10
+SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10
+SELECT SearchEngineID, SearchPhrase, count(1) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10
+SELECT UserID, count(1) FROM {table} GROUP BY UserID ORDER BY 2 DESC LIMIT 10
+SELECT UserID, SearchPhrase, count(1) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY 3 DESC LIMIT 10
+SELECT UserID, SearchPhrase, count(1) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10
+SELECT UserID, date_trunc('minute', EventTime) AS m, SearchPhrase, count(1) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(1) DESC LIMIT 10
+SELECT UserID FROM {table} WHERE UserID = 12345678901234567890
+SELECT count(1) FROM {table} WHERE URL LIKE '%metrika%'
+SELECT SearchPhrase, max(URL) as URL, count(1) AS c FROM {table} h WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10
+SELECT SearchPhrase, max(URL) as URL, min(Title) as Title, count(1) AS c, count(DISTINCT UserID) FROM {table} WHERE Title LIKE '%\xd0\xaf\xd0\xbd\xd0\xb4\xd0\xb5\xd0\xba\xd1\x81%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
+SELECT count(1) FROM {table}
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10
+SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10
+SELECT CounterID, avg(length(URL)) AS l, count(1) AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING count(1) > 100000 ORDER BY l DESC LIMIT 25
+SELECT Referer AS key, avg(length(Referer)) AS l, count(1) AS c, Referer FROM {table} WHERE Referer != '' GROUP BY key HAVING count(1) > 100000 ORDER BY l DESC LIMIT 25
+SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), 
sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table}
+SELECT SearchEngineID, ClientIP, count(1) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10
+SELECT WatchID, ClientIP, count(1) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
+SELECT WatchID, ClientIP, count(1) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
+SELECT URL, count(1) AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10
+SELECT 1, URL, count(1) AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10
+SELECT ClientIP AS x, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(1) AS c FROM {table} GROUP BY x, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10
+SELECT    URL,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34    AND EventDate between '2013-07-01'::timestamp   AND '2013-07-31'::timestamp    AND DontCountHits =0    AND Refresh = 0    AND URL <>'' GROUP BY URL ORDER BY PageViews DESC LIMIT 10
+SELECT    Title,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34    AND EventDate BETWEEN '2013-07-01'::timestamp  AND '2013-07-31'::timestamp    AND DontCountHits=0    AND Refresh=0   AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10
+SELECT    URL,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34    AND EventDate between '2013-07-01'::timestamp   AND '2013-07-31'::timestamp    AND Refresh = 0   AND IsLink <> 0    AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
+SELECT    TraficSourceID,    SearchEngineID,    AdvEngineID,   case when (SearchEngineID = 0 AND AdvEngineID = 0)  THEN Referer  ELSE '' END Src,    URL AS Dst,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34    AND eventDate between '2013-07-01'::timestamp   AND '2013-07-31'::timestamp   AND Refresh = 0 GROUP BY    TraficSourceID,    SearchEngineID,    AdvEngineID,    Src,    Dst ORDER BY PageViews DESC LIMIT 1000;
+SELECT    URLHash,    EventDate,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34    AND  eventDate between '2013-07-01'::timestamp   AND '2013-07-31'::timestamp   AND  Refresh =0    AND TraficSourceID IN (-1, 6)    AND RefererHash = 7135345792483900000 GROUP BY    URLHash,    EventDate ORDER BY PageViews DESC LIMIT 100
+SELECT    WindowClientWidth,    WindowClientHeight,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34  AND  eventDate between '2013-07-01'::timestamp   AND '2013-07-31'::timestamp    AND Refresh =0    AND  DontCountHits =0    AND URLHash = 7135345792483900000 GROUP BY    WindowClientWidth,    WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
+SELECT    date_trunc('minute', EventTime) AS Minute,    count(1) AS PageViews FROM {table} WHERE    CounterID = 34 AND  eventDate between '2013-07-01'::timestamp   AND '2013-07-31'::timestamp    AND Refresh =0    AND  DontCountHits =0    GROUP BY    Minute ORDER BY Minute;
diff --git a/dbms/benchmark/greenplum/result_parser.py b/dbms/benchmark/greenplum/result_parser.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+from __future__ import print_function 
+import sys
+import json
+
+def parse_block(block=None, options=None):
+    """Extract the query text and its timings from one benchmark block.
+
+    block   -- list of lines; block[0] is the query line, the remaining
+               entries are the 'Time: ...' lines collected for that query.
+    options -- parsed command-line options; the attributes show_queries and
+               show_first_timings are read (a missing attribute counts as off).
+
+    Returns a list holding the query text (only with show_queries) followed
+    by the three timings, or by just the first one with show_first_timings.
+    A timing is the second whitespace-separated token of its 'Time:' line and
+    is kept as a string.
+
+    Raises IndexError when the block has no query line or fewer timing lines
+    than the ones indexed below.
+    """
+    # None instead of a mutable [] default, which would be shared across calls.
+    if block is None:
+        block = []
+    query = block[0].strip()
+    # Blocks with more than four lines use every other timing line (1, 3, 5);
+    # shorter ones use lines 1, 2, 3.
+    # NOTE(review): presumably the skipped 'Time:' lines do not belong to the
+    # query itself -- confirm against real psql output.
+    step = 2 if len(block) > 4 else 1
+    timings = [block[1 + step * i].strip().split()[1] for i in range(3)]
+
+    result = []
+    if getattr(options, 'show_queries', False):
+        result.append(query)
+    if getattr(options, 'show_first_timings', False):
+        result.append(timings[0])
+    else:
+        result += timings
+    return result
+
+
+def read_stats_file(options, fname):
+    """Parse a benchmark output file into one parse_block() result per query.
+
+    options -- parsed command-line options, passed through to parse_block().
+    fname   -- path of the file to read.
+
+    A line containing 'SELECT' starts a new block and the 'Time:' lines that
+    follow are added to it; every other line is ignored. A block is emitted
+    only when the next 'SELECT' line is seen and it has at least one timing,
+    so trailing timings with no 'SELECT' line after them are dropped.
+
+    Returns the list of parse_block() results in file order.
+    """
+    result = []
+    block = []
+    with open(fname) as f:
+        # Iterate the file lazily instead of loading every line up front.
+        for line in f:
+            if 'SELECT' in line:
+                if len(block) > 1:
+                    result.append(parse_block(block, options))
+                block = [line]
+            elif 'Time:' in line:
+                block.append(line)
+
+    return result
+
+
+def compare_stats_files(options, arguments):
+    """Build one pyplot data series per result file.
+
+    options   -- parsed command-line options, passed on to read_stats_file().
+    arguments -- positional arguments; arguments[0] is skipped, every further
+                 entry is the path of a result file.
+
+    Returns a list with one [x_values, y_values, line_style] entry per file:
+    y_values are the first timings of each query as floats, x_values the
+    matching 0-based indexes and line_style a pyplot format string such as
+    'y^'.
+    """
+    pyplot_colors = ['y', 'b', 'g', 'r']
+    # parse_block() puts the query text first when show_queries is set, so the
+    # first timing then sits at index 1 instead of 0.
+    timing_idx = 1 if options.show_queries else 0
+
+    result = []
+    for idx, fname in enumerate(arguments[1:]):
+        y_values = [float(row[timing_idx])
+                    for row in read_stats_file(options, fname)]
+        x_values = list(range(len(y_values)))
+        # Colours repeat after the fourth file instead of raising IndexError.
+        color = pyplot_colors[idx % len(pyplot_colors)]
+        result.append([x_values, y_values, color + '^'])
+
+    return result
+
+def parse_args():
+    """Parse the command line and return (options, arguments).
+
+    The whole of sys.argv is handed to the parser, so arguments[0] is the
+    script name and the result files start at arguments[1]. Prints the usage
+    line and exits with status 1 when no result file was given.
+    """
+    from optparse import OptionParser
+    opt_parser = OptionParser(usage='usage: %prog [options] [result_file_path]..')
+    # The two boolean switches share everything except names and help text.
+    switches = (
+        ("-q", "--show-queries", "Show statements along with timings", "show_queries"),
+        ("-f", "--show-first-timings", "Show only first tries timings", "show_first_timings"),
+    )
+    for short_name, long_name, help_text, dest_name in switches:
+        opt_parser.add_option(short_name, long_name, help=help_text, action="store_true", dest=dest_name)
+    opt_parser.add_option("-c", "--compare-mode", help="Prepare output for pyplot comparing result files.", action="store", dest="compare_mode")
+    options, arguments = opt_parser.parse_args(sys.argv)
+    no_result_file = len(arguments) < 2
+    if no_result_file:
+        opt_parser.print_usage()
+        sys.exit(1)
+    return (options, arguments)
+
+def gen_pyplot_code(options, arguments):
+    """Print a matplotlib script that plots the series of all result files.
+
+    One plt.plot() call is emitted per series returned by
+    compare_stats_files(), labelled with the series' 0-based index.
+    """
+    plot_calls = [
+        "\nplt.plot(%s, %s, '%s', label='%s try')" % (xs, ys, style, series_no)
+        for series_no, (xs, ys, style)
+        in enumerate(compare_stats_files(options, arguments))
+    ]
+    print('import matplotlib.pyplot as plt')
+    print(''.join(plot_calls))
+    trailer = (
+        "plt.xlabel('Try number')",
+        "plt.ylabel('Timing')",
+        "plt.title('Benchmark query timings')",
+        'plt.legend()',
+        'plt.show()',
+    )
+    for statement in trailer:
+        print(statement)
+
+def gen_html_json(options, arguments):
+    """Print the timings of a single result file as a JSON document.
+
+    options   -- parsed command-line options, passed on to read_stats_file().
+    arguments -- positional arguments; only arguments[1], the path of the
+                 result file, is read.
+
+    The system, version and data_size fields are hard-coded. "result" holds
+    one array per query as returned by read_stats_file(); the values stay
+    strings, exactly as they were parsed.
+    """
+    rows = read_stats_file(options, arguments[1])
+    print('{')
+    # Key and value are both quoted; the original line left the key's quote
+    # open and the value bare, which made the document unparsable.
+    print('"system":       "GreenPlum(x2)",')
+    print('"version":      "%s",' % '4.3.9.1')
+    print('"data_size":    10000000,')
+    print('"time":         "",')
+    print('"comments":     "",')
+    print('"result":')
+    print('[')
+    # json.dumps gives double-quoted strings, and the rows need separating
+    # commas to form a valid JSON array.
+    print(',\n'.join(json.dumps(row) for row in rows))
+    print(']')
+    print('}')
+
+
+def main():
+    """Emit JSON for a single result file, pyplot code for several."""
+    options, arguments = parse_args()
+    # arguments[0] is the script name, so a length of 2 means one result file.
+    single_file = len(arguments) <= 2
+    if single_file:
+        gen_html_json(options, arguments)
+    else:
+        gen_pyplot_code(options, arguments)
+
+if __name__ == '__main__':
+    main()