# Get all features together with the evaluation times for all queries

Features based on the query:   
*  number of relations
*  number of conditions
*  number of filters
*  number of joins

Features based on the join tree:
*  depth
*  container count (min, max, mean, median, q1, q3)
*  branching factors (min, max, mean, median)
*  balancedness factor 

Features based on the data in the database (EXPLAIN):
*  estimated total cost
*  estimated single table rows (min, max, mean, median, q1, q3)
*  estimated join rows (min, max, mean, median, q1, q3)

In [1]:
%%bash
pip install pandas
pip install psycopg2-binary









In [2]:
import csv
import re
import pandas as pd
import psycopg2
import numpy as np
import os
import json

### Get the features based on the structure of the query and database information

In [3]:
def iterate_through_plan(plan, table_rows, join_rows):
    """Collect the "Plan Rows" estimates from an EXPLAIN plan tree.

    The node itself is inspected first, then every entry of its optional
    "Plans" list is visited recursively in list order.

    Args:
        plan: dict describing one plan node; must contain "Node Type" and,
            for scan/join nodes, "Plan Rows". May contain a "Plans" list
            of child nodes with the same layout.
        table_rows: list that receives "Plan Rows" of every scan node
            (Seq Scan, Index Only Scan, Index Scan, Bitmap Index Scan).
        join_rows: list that receives "Plan Rows" of every join node
            (Hash Join, Merge Join, Nested Loop).

    Returns:
        The same two lists that were passed in (they are extended in place).
    """
    scan_node_types = ("Seq Scan", "Index Only Scan", "Index Scan", "Bitmap Index Scan")
    join_node_types = ("Hash Join", "Merge Join", "Nested Loop")

    node_type = plan["Node Type"]
    if node_type in scan_node_types:
        table_rows.append(plan["Plan Rows"])
    elif node_type in join_node_types:
        join_rows.append(plan["Plan Rows"])

    # Nodes of any other type contribute nothing themselves but may still
    # have children that do.
    for child in plan.get("Plans", []):
        iterate_through_plan(child, table_rows, join_rows)

    return table_rows, join_rows

In [4]:
def is_number(s):
    """Tell whether ``s`` can be converted with ``float()``.

    Returns True when ``float(s)`` succeeds and False when it raises
    ValueError. Any other exception raised by ``float`` is not caught.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

In [5]:
# Define input and output file paths
input_file = 'scala_commands_with_MIN.txt'
output_file = 'results/featuresDatabase.csv'

# Splits an input line into its double-quoted components. Compiled once
# because the same pattern is applied to every line of the input file.
component_splitter = re.compile(r'(?<!\\)\"|\"(?<!\\)(?=\s+\"|$)')


def _six_number_summary(values):
    """Return (min, max, mean, q25, median, q75) of ``values``.

    An empty list yields six NaNs instead of raising, so a plan without
    any join node (or without any recognised scan node) still produces a
    CSV row.
    """
    if not values:
        return (np.nan,) * 6
    return (np.min(values), np.max(values), np.mean(values),
            np.quantile(values, 0.25), np.median(values), np.quantile(values, 0.75))


# One connection per database, opened on first use and closed in the
# ``finally`` below (previously a new connection was opened for every
# input line and never closed).
connections = {}
try:
    # Open input and output files
    with open(input_file, 'r') as f_input, open(output_file, 'w', newline='') as f_output:
        csv_writer = csv.writer(f_output)

        # Write header to CSV file
        csv_writer.writerow(['bench', 'query', '#relations', '#conditions', '#filters', '#joins', 'total cost',
                             'min(table rows)', 'max(table rows)', 'mean(table rows)',
                             'q25(table rows)', 'median(table rows)', 'q75(table rows)',
                             'min(join rows)', 'max(join rows)', 'mean(join rows)',
                             'q25(join rows)', 'median(join rows)', 'q75(join rows)',
                             'list table rows', 'list join rows', 'text'])

        # Read input file line by line
        for line in f_input:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no query; skip it instead of failing on the indexing below.
            if not line.strip():
                continue

            # Split each line into components
            components = component_splitter.split(line)

            # Extract relevant information
            benchmark = components[3]
            number = components[5]
            query = components[1].strip()

            # FEATURES BASED ON QUERY STRUCTURE
            # get the number of relations: comma-separated entries between FROM and WHERE
            query_upper = query.upper()
            from_index = query_upper.find("FROM")
            where_index = query_upper.find("WHERE")
            number_of_relations = query[from_index:where_index].count(",") + 1

            # get the number of conditions
            # NOTE(review): this counts "AND" case-sensitively in the original
            # text, while the filter/join split below works on the upper-cased
            # query — confirm that all input queries spell AND in upper case.
            number_of_conditions = query.count("AND") + 1

            # get how many filter and join conditions
            filter_count = 0
            join_count = 0
            joins = []
            for condition in query_upper.split("WHERE")[1].split("AND"):
                sides = condition.split("=")
                # A join condition is a single equality whose right-hand side is
                # neither a quoted literal nor a number.
                is_join_condition = (
                    len(sides) == 2
                    and "'" not in sides[1]
                    and '"' not in sides[1]
                    and not is_number(sides[1].strip())
                )
                if is_join_condition:
                    # Keep only the part before the first "." on each side.
                    partners = [side.strip().split(".")[0] for side in sides]
                    # The same pair is counted once, whichever way round it is written.
                    if partners not in joins and partners[::-1] not in joins:
                        joins.append(partners)
                        join_count += 1
                else:
                    filter_count += 1

            # FEATURES BASED ON DATABASE
            # get features based on the PostgreSQL plan returned by EXPLAIN
            if benchmark == "JOB":
                database = "imdb"
            else:
                database = benchmark.lower()
            conn = connections.get(database)
            if conn is None:
                conn = psycopg2.connect(
                    host="postgres",
                    database=database,
                    user=database,
                    password=database
                )
                # EXPLAIN only reads; autocommit keeps the reused connection
                # from sitting inside an open transaction between queries.
                conn.autocommit = True
                connections[database] = conn

            query2 = query.replace('\\\\\\"', '')
            # The cursor is closed as soon as the plan has been fetched.
            with conn.cursor() as cur:
                cur.execute("EXPLAIN (format json) " + query2)
                result = cur.fetchall()
            plan = result[0][0][0]["Plan"]
            total_cost = plan["Total Cost"]
            # get the estimated rows for each single table and the intermediate results
            table_rows, join_rows = iterate_through_plan(plan, [], [])
            # calculate min, max, mean, 0.25-quantile, median and 0.75-quantile
            # (same order as the CSV header)
            table_stats = _six_number_summary(table_rows)
            join_stats = _six_number_summary(join_rows)

            # Write data to CSV file
            csv_writer.writerow([benchmark, number, number_of_relations, number_of_conditions,
                                 filter_count, join_count, total_cost,
                                 *table_stats, *join_stats, table_rows, join_rows, query])
finally:
    for conn in connections.values():
        conn.close()

### Get the features based on the join tree structure (calculated in Scala, imported and formatted here)

In [6]:
# Read every "*output.json" file in the directory below, parse the join-tree
# features stored under its "features" key and write one CSV row per file.
output_file = 'results/featuresScala.csv'
csv_header = ["bench", "query", "depth", "min(container counts)", "max(container counts)", "mean(container counts)", "q25(container counts)",
              "median(container counts)", "q75(container counts)", "min(branching factors)", "max(branching factors)", "mean(branching factors)", 
              "median(branching factors)", "balancedness factor", "container counts list", "branching factors list"]
directory = 'rewritten_benchmark_queries/'

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(csv_header)

    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the CSV reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith("output.json"):
            continue

        # File names start with "<benchmark>_<query number>_".
        name_parts = filename.split("_")
        benchmark = name_parts[0]
        number = name_parts[1]

        with open(os.path.join(directory, filename), 'r') as file:
            data = json.load(file)

        # FEATURES BASED ON THE JOIN TREE STRUCTURE
        # "features" is a single string that is cut at every "List(".
        # NOTE(review): the slicing below presumably expects a Scala rendering
        # such as "List(<depth>, List(<container counts>), List(<branching
        # factors>), <balancedness>)" — confirm against the Scala output.
        features = data["features"].split("List(")
        depth = int(features[1][:-2])

        container_counts = [int(x) for x in features[2][:-3].split(", ")]
        container_counts_min = np.min(container_counts)
        container_counts_max = np.max(container_counts)
        container_counts_mean = np.mean(container_counts)
        container_counts_median = np.median(container_counts)
        container_counts_q25 = np.quantile(container_counts, 0.25)
        # Upper quartile (this previously used 0.25 and duplicated q25).
        container_counts_q75 = np.quantile(container_counts, 0.75)

        # The last piece holds the branching factors, then "), ", then the
        # balancedness factor followed by one closing character.
        branching_part, balancedness_part = features[3].split("), ")[:2]
        branching_factors = [int(x) for x in branching_part.split(", ")]
        branching_factors_min = np.min(branching_factors)
        branching_factors_max = np.max(branching_factors)
        branching_factors_mean = np.mean(branching_factors)
        branching_factors_median = np.median(branching_factors)
        balancedness_factor = float(balancedness_part[:-1])

        # Write data to CSV file
        csv_row = [benchmark, number, depth, container_counts_min, container_counts_max, container_counts_mean, container_counts_q25,
                   container_counts_median, container_counts_q75, branching_factors_min, branching_factors_max, branching_factors_mean,
                   branching_factors_median, balancedness_factor, container_counts, branching_factors]
        writer.writerow(csv_row)

### Merge all features and evaluation times for each query

In [7]:
# Load the database-based features, the join-tree features and the
# evaluation-time comparison table.
df1 = pd.read_csv('results/featuresDatabase.csv')
df2 = pd.read_csv('results/featuresScala.csv')
df3 = pd.read_csv('results/POS_Scala_comparison_TO.csv')

# 'TO' entries in 'orig mean' are replaced by 1800 before the column is cast
# to float (presumably a timeout marker and the time limit — confirm).
orig_mean = df3['orig mean'].replace('TO', 1800).astype("float64")
df3['orig mean'] = orig_mean
df3['diff rewr+rewr-orig'] = df3['rewr mean+rewr'] - orig_mean
df3['diff rewr-orig'] = df3['rewr mean'] - orig_mean

# Inner-join all three tables on the benchmark name and the query identifier.
merge_keys = ["bench", "query"]
merged_df = (
    df1.merge(df2, on=merge_keys, how='inner')
       .merge(df3, on=merge_keys, how='inner')
)
# Last expression of the cell: shows a preview of a few columns.
merged_df[["#relations", "depth", "orig/rewr(mean)", "text"]]

Unnamed: 0,#relations,depth,orig/rewr(mean),text
0,3,1,orig,"SELECT MIN(u.Id) FROM comments as c, votes as ..."
1,5,2,orig,"SELECT MIN(c.Id) FROM comments as c, postHisto..."
2,6,1,orig,"SELECT MIN(c.Id) FROM comments as c, posts as ..."
3,4,2,orig,"SELECT MIN(pl.Id) FROM postLinks as pl, posts ..."
4,3,1,orig,"SELECT MIN(v.Id) FROM votes as v, badges as b,..."
...,...,...,...,...
198,7,4,orig,SELECT MIN(n.name) AS member_in_charnamed_movi...
199,10,2,orig,SELECT MIN(t.title) AS complete_downey_ironman...
200,10,3,orig,SELECT MIN(t.title) AS complete_downey_ironman...
201,10,7,rewr,"SELECT MIN(Country.CountryId) FROM Country, Ci..."


### Save the resulting dataframe as csv

In [8]:
# Columns of the final CSV, in output order.
output_columns = [
    # query identification and evaluation times
    "bench", "query", "orig/rewr(mean)", "orig/rewr+rewr(mean)",
    "orig mean", "rewr mean", "rewr mean+rewr",
    "diff rewr-orig", "diff rewr+rewr-orig",
    # features based on the query
    "#relations", "#conditions", "#filters", "#joins",
    # features based on EXPLAIN
    "total cost",
    "min(table rows)", "max(table rows)", "mean(table rows)",
    "q25(table rows)", "median(table rows)", "q75(table rows)",
    "min(join rows)", "max(join rows)", "mean(join rows)",
    "q25(join rows)", "median(join rows)", "q75(join rows)",
    # features based on the join tree
    "depth",
    "min(container counts)", "max(container counts)", "mean(container counts)",
    "q25(container counts)", "median(container counts)", "q75(container counts)",
    "min(branching factors)", "max(branching factors)", "mean(branching factors)",
    "median(branching factors)", "balancedness factor",
    # raw lists and the query text
    "list table rows", "list join rows",
    "container counts list", "branching factors list",
    "text",
]
merged_df[output_columns].to_csv('results/features_times.csv', index=False)