In [22]:
import pandas as pd
import os
import re
import json
from sql_metadata import Parser
from collections import defaultdict
from itertools import combinations


def filter_interesting_queries(queries):
    """Keep the logged SQL statements that contain a WHERE, ORDER or JOIN keyword.

    Parameters
    ----------
    queries : iterable
        Raw log messages. Entries that are not strings (for example the
        ``NaN`` pandas produces for an empty CSV cell) are skipped instead of
        raising ``AttributeError``.

    Returns
    -------
    list of str
        The text of each retained message with its leading ``'statement: '``
        prefix removed, in input order.
    """
    # Only messages that start with 'statement' are considered; the guard on
    # the type keeps NaN/None entries from crashing ``.startswith``.
    res = [
        re.sub(r'^statement: ', '', q)
        for q in queries
        if isinstance(q, str) and q.startswith('statement')
    ]
    # Drop anything mentioning 'pg_' and session/transaction housekeeping.
    skipped_prefixes = ('SHOW ALL', 'COMMIT', 'SET', 'BEGIN')
    res = [q for q in res if 'pg_' not in q and not q.startswith(skipped_prefixes)]
    # Keyword checks are case-sensitive, except that lowercase 'join' is
    # accepted as well.
    wanted = ('WHERE', 'ORDER', 'JOIN', 'join')
    return [q for q in res if any(word in q for word in wanted)]
def cluster_queries(queries):
    """Group queries by the value of ``Parser(query).generalize``.

    Parameters
    ----------
    queries : iterable of str
        SQL statements to cluster.

    Returns
    -------
    tuple
        ``(gqueries, group_counts, group_repr)`` where ``gqueries`` is the set
        of generalized keys, ``group_counts`` (a ``defaultdict(int)``) maps
        each key to the number of queries that produced it, and ``group_repr``
        maps each key to the last query seen for it.

    Notes
    -----
    Queries for which the parser raises are skipped (best effort).
    NOTE(review): ``generalize`` presumably normalizes literals so that
    queries differing only in constants share a key -- confirm against the
    sql_metadata documentation.
    """
    group_counts = defaultdict(int)
    group_repr = {}
    gqueries = set()
    for query in queries:
        try:
            generalized = Parser(query).generalize
            group_counts[generalized] += 1
            # Later queries overwrite earlier ones, so the representative is
            # the last query of the group in input order.
            group_repr[generalized] = query
            gqueries.add(generalized)
        except Exception:
            # Skip statements that cannot be processed, but let
            # KeyboardInterrupt/SystemExit propagate (a bare ``except`` would
            # swallow those too).
            continue
    return gqueries, group_counts, group_repr
def get_relevant_columns(clusters, cluster_counts):
    """Rank dotted column names by how often they appear in where/order_by/join.

    Parameters
    ----------
    clusters : dict
        Maps a query-group key to one representative SQL query of that group.
    cluster_counts : mapping
        Maps the same query-group keys to the weight added for every matching
        column of that group's representative query.

    Returns
    -------
    list of tuple
        ``(column, count)`` pairs ordered by descending count. Only column
        names containing a ``'.'`` are counted.

    Notes
    -----
    Queries for which parsing (or the count lookup) raises are skipped.
    Earlier versions also unpacked ``col.split('.')`` into exactly two parts
    for bookkeeping that was never used; names with more than one dot raised
    there and silently dropped the rest of the query's columns.
    """
    column_usage_counts = defaultdict(int)
    clause_groups = ('where', 'order_by', 'join')

    for qgroup, query in clusters.items():
        try:
            columns = Parser(query).columns_dict
            for group in clause_groups:
                if group in columns:
                    for col in columns[group]:
                        # Unqualified names (no dot) are ignored.
                        if '.' in col:
                            column_usage_counts[col] += cluster_counts[qgroup]
        except Exception:
            # Best effort: skip this query, but do not swallow
            # KeyboardInterrupt/SystemExit.
            continue

    # Ascending stable sort followed by a reversal (rather than
    # ``reverse=True``) keeps the tie order of the original implementation.
    return sorted(column_usage_counts.items(), key=lambda x: x[1])[::-1]

def get_combinations_list(ordered_columns, max_size=None):
    """List the non-empty combinations of ``ordered_columns``, smallest first.

    Parameters
    ----------
    ordered_columns : sequence
        Items to combine; their order determines the order of the output.
    max_size : int, optional
        Largest combination size to generate. ``None`` (the default) generates
        every size up to ``len(ordered_columns)``, which yields ``2**n - 1``
        tuples and becomes impractical quickly as ``n`` grows.

    Returns
    -------
    list of tuple
        All combinations of size 1, then size 2, and so on.
    """
    limit = len(ordered_columns)
    if max_size is not None:
        limit = min(limit, max_size)
    return [
        combo
        for size in range(1, limit + 1)
        for combo in combinations(ordered_columns, size)
    ]
def get_columns_from_logs(logs_path, query_col=13):
    """Read a header-less CSV log and rank the columns its queries use.

    Parameters
    ----------
    logs_path : str or path-like
        Location of the CSV log file; it is read with ``header=None``.
    query_col : int, optional
        Positional index of the CSV column that holds the logged message.
        Defaults to 13.
        NOTE(review): 13 presumably corresponds to the message field of a
        PostgreSQL csvlog file -- confirm against the log format in use.

    Returns
    -------
    list of tuple
        ``(column, count)`` pairs as returned by ``get_relevant_columns``.
    """
    df = pd.read_csv(logs_path, header=None)
    queries = filter_interesting_queries(df[query_col].tolist())
    # The set of generalized keys is not needed here; the counts and the
    # representative queries carry the same keys.
    _, group_counts, group_repr = cluster_queries(queries)
    return get_relevant_columns(group_repr, group_counts)

In [23]:
# Rank the dotted column names used by the queries logged in the workload CSV.
cols = get_columns_from_logs('data/workload.csv')

  if (await self.run_code(code, result,  async_=asy)):


In [24]:
# Build every non-empty combination of the ranked (column, count) pairs.
# The list holds 2**n - 1 tuples for n ranked columns, and the bare
# expression below displays all of them.
cmbns = get_combinations_list(cols)
cmbns

[(('review.i_id', 8022),),
 (('trust.source_u_id', 4020),),
 (('review.u_id', 3956),),
 (('review.creation_date', 3919),),
 (('trust.target_u_id', 2017),),
 (('item.i_id', 1980),),
 (('useracct.u_id', 1939),),
 (('review.i_id', 8022), ('trust.source_u_id', 4020)),
 (('review.i_id', 8022), ('review.u_id', 3956)),
 (('review.i_id', 8022), ('review.creation_date', 3919)),
 (('review.i_id', 8022), ('trust.target_u_id', 2017)),
 (('review.i_id', 8022), ('item.i_id', 1980)),
 (('review.i_id', 8022), ('useracct.u_id', 1939)),
 (('trust.source_u_id', 4020), ('review.u_id', 3956)),
 (('trust.source_u_id', 4020), ('review.creation_date', 3919)),
 (('trust.source_u_id', 4020), ('trust.target_u_id', 2017)),
 (('trust.source_u_id', 4020), ('item.i_id', 1980)),
 (('trust.source_u_id', 4020), ('useracct.u_id', 1939)),
 (('review.u_id', 3956), ('review.creation_date', 3919)),
 (('review.u_id', 3956), ('trust.target_u_id', 2017)),
 (('review.u_id', 3956), ('item.i_id', 1980)),
 (('review.u_id', 3956), 