# Get all features together with the evaluation times for all queries (SparkSQL)

Features based on the query:   
*  number of relations
*  number of conditions
*  number of filters
*  number of joins

Features based on the join tree:
*  depth
*  container count (min, max, mean, median, q1, q3)
*  branching factors (min, max, mean, median, q1, q3)
*  balancedness factor 

In [1]:
%%bash
# Install pinned versions of the third-party packages imported by this notebook:
# pandas, and psycopg2-binary (which provides the psycopg2 module).
pip install pandas==2.2.3
pip install psycopg2-binary==2.9.9









In [2]:
import csv
import re
import pandas as pd
import psycopg2
import numpy as np
import os
import json

### Get the features based on the structure of the query 

In [3]:
def is_number(s):
    """Return True if ``s`` can be converted with ``float()``, otherwise False.

    Anything ``float()`` accepts counts as a number, e.g. ``"42"``, ``" 3.5 "``
    or ``"1e5"``. Values ``float()`` cannot handle at all (such as ``None``)
    yield False instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: not a string/number at all
        return False
    return True

In [4]:
# Define input and output file paths
input_file = 'scala_commands_augment_full_enum.txt'
output_file = 'results/featuresDatabase_SPA_full_enum.csv'

# Both patterns are compiled once here instead of once per input line.
# Splits an input line at its double quotes; the pieces at index 1, 3 and 5
# are used below as query text, benchmark name and query number.
pattern = r'(?<!\\)\"|\"(?<!\\)(?=\s+\"|$)'
line_splitter = re.compile(pattern)
# The keyword AND as a whole word and in any case, so that identifiers that
# merely contain the letters "and" (e.g. P_BRAND) are not treated as separators.
and_keyword = re.compile(r'\bAND\b', re.IGNORECASE)


def _query_structure_features(query):
    """Count relations, conditions, filters and joins of one SQL query string.

    Returns the tuple ``(relations, conditions, filters, joins)``:

    * relations:  commas between the first FROM and the first WHERE, plus one
    * conditions: occurrences of the keyword AND in the whole query, plus one
    * joins:      conditions of the form ``a.x = b.y`` (exactly one ``=``, and
                  the right-hand side is neither quoted nor a number), counted
                  once per pair of prefixes before the ``.``, in either order
    * filters:    every condition that does not have the join form

    A query without WHERE yields zero conditions, filters and joins.
    NOTE: the AND inside ``BETWEEN ... AND ...`` is still treated as a
    condition separator.
    """
    query_upper = query.upper()
    from_index = query_upper.find("FROM")
    where_index = query_upper.find("WHERE")

    if where_index == -1:
        # No WHERE clause: there is nothing to classify. The relation count
        # then covers everything after FROM.
        return query[from_index:].count(",") + 1, 0, 0, 0

    number_of_relations = query[from_index:where_index].count(",") + 1
    number_of_conditions = len(and_keyword.findall(query)) + 1

    filter_count = 0
    joins = []
    # Only the text between the first and a possible second WHERE is classified.
    for part in and_keyword.split(query_upper.split("WHERE")[1]):
        sides = part.split("=")
        is_join = (
            len(sides) == 2
            and "'" not in sides[1]
            and '"' not in sides[1]
            and not is_number(sides[1].strip())
        )
        if not is_join:
            filter_count += 1
            continue
        # A join is identified by the prefixes before the "." on both sides;
        # a pair that was already seen (in either order) is not counted again.
        partners = [side.strip().split(".")[0] for side in sides]
        if partners not in joins and partners[::-1] not in joins:
            joins.append(partners)

    return number_of_relations, number_of_conditions, filter_count, len(joins)


# Open input and output files
with open(input_file, 'r') as f_input, open(output_file, 'w', newline='') as f_output:
    csv_writer = csv.writer(f_output)

    # Write header to CSV file
    csv_writer.writerow(['bench', 'query', '#relations', '#conditions', '#filters', '#joins', 'text'])

    # Read input file line by line
    for line in f_input:
        # Blank lines carry no command and would fail the indexing below
        if not line.strip():
            continue

        # Split each line into components and extract the relevant information
        components = line_splitter.split(line)
        benchmark = components[3]
        number = components[5]
        query = components[1].strip()

        # FEATURES BASED ON QUERY STRUCTURE
        number_of_relations, number_of_conditions, filter_count, join_count = _query_structure_features(query)

        # Write data to CSV file
        csv_writer.writerow([benchmark, number, number_of_relations, number_of_conditions,
                             filter_count, join_count, query])

### Get the features based on the join tree structure (calculated in Scala, imported and formatted here)

In [5]:
output_file = 'results/featuresScala_SPA_full_enum.csv'
csv_header = [
    "bench", "query", "depth",
    "min(container counts)", "max(container counts)", "mean(container counts)",
    "q25(container counts)", "median(container counts)", "q75(container counts)",
    "min(branching factors)", "max(branching factors)", "mean(branching factors)",
    "median(branching factors)", "q25(branching factors)", "q75(branching factors)",
    "balancedness factor", "container counts list", "branching factors list",
]
directory = 'output/'

with open(output_file, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(csv_header)

    for filename in os.listdir(directory):
        # Only the files whose name ends in "output.json" are processed
        if not filename.endswith("output.json"):
            continue

        # Benchmark and query number are the first two "_"-separated name parts
        name_parts = filename.split("_")
        benchmark, number = name_parts[0], name_parts[1]

        with open(os.path.join(directory, filename), 'r') as file:
            data = json.load(file)

        # FEATURES BASED ON THE JOIN TREE STRUCTURE
        # The "features" entry is one string that is cut at every "List(":
        # piece 1 holds the depth, piece 2 the container counts and piece 3 the
        # branching factors followed by the balancedness factor.
        # NOTE(review): presumably the printed form of a Scala list - confirm.
        segments = data.get("features", []).split("List(")
        depth = int(segments[1][:-2])
        container_counts = [int(token) for token in segments[2][:-3].split(", ")]
        tail = segments[3].split("), ")
        branching_factors = [int(token) for token in tail[0].split(", ")]
        balancedness_factor = float(tail[1][:-1])

        # Write data to CSV file; the statistics follow the column order of
        # csv_header (q25 before median for the container counts, median
        # before q25 for the branching factors).
        writer.writerow([
            benchmark, number, depth,
            np.min(container_counts), np.max(container_counts), np.mean(container_counts),
            np.quantile(container_counts, 0.25), np.median(container_counts),
            np.quantile(container_counts, 0.75),
            np.min(branching_factors), np.max(branching_factors), np.mean(branching_factors),
            np.median(branching_factors), np.quantile(branching_factors, 0.25),
            np.quantile(branching_factors, 0.75),
            balancedness_factor, container_counts, branching_factors,
        ])

### Merge features and evaluation times for each query

1. Merge features of the query structure and the join tree with the evaluation time:

In [6]:
# Query-structure features, join-tree features and the table that is merged in last
df1 = pd.read_csv('results/featuresDatabase_SPA_full_enum.csv')
df2 = pd.read_csv('results/featuresScala_SPA_full_enum.csv')
df4 = pd.read_csv('results/SPA_Scala_comparison_TO_augment_server_full_enum_infos.csv')

# The join-tree features use the benchmark name IMDB; rename it to JOB before
# merging on "bench" (presumably the name used by the other tables).
df2["bench"] = df2["bench"].replace("IMDB", "JOB")

# Inner merges on (bench, query): only queries present in all tables remain
merged_df = (
    df1
    .merge(df2, on=["bench", "query"], how='inner')
    .merge(df4, on=["bench", "query"], how='inner')
)

2. Merge features of the query structure, the join tree and the PostgreSQL EXPLAIN with the evaluation time:

In [7]:
# Columns kept from the POS feature table: the merge keys, the total cost and
# the summary statistics of the table rows and join rows.
pos_extra_columns = [
    'bench', 'query', 'total cost',
    'min(table rows)', 'max(table rows)', 'mean(table rows)',
    'q25(table rows)', 'median(table rows)', 'q75(table rows)',
    'min(join rows)', 'max(join rows)', 'mean(join rows)',
    'q25(join rows)', 'median(join rows)', 'q75(join rows)',
]
df0_extra = pd.read_csv('results/featuresDatabase_POS_extra_full_enum.csv')[pos_extra_columns]

In [8]:
df1_extra = pd.read_csv('results/featuresDatabase_SPA_full_enum.csv')
df2_extra = pd.read_csv('results/featuresScala_SPA_full_enum.csv')
df4_extra = pd.read_csv('results/POS_Scala_comparison_TO_augment_server_full_enum_infos.csv')

# The join-tree features use the benchmark name IMDB; rename it to JOB before
# merging on "bench".
df2_extra["bench"] = df2_extra["bench"].replace("IMDB", "JOB")

# Inner merges on (bench, query), in the order df1_extra, df0_extra,
# df2_extra, df4_extra.
merged_df_extra = (
    df1_extra
    .merge(df0_extra, on=["bench", "query"], how='inner')
    .merge(df2_extra, on=["bench", "query"], how='inner')
    .merge(df4_extra, on=["bench", "query"], how='inner')
)

### Save the resulting dataframes as csv

In [9]:
# Column order of the exported file: keys, times, query features,
# join-tree features, query text.
spa_full_enum_columns = (
    ["bench", "query"]
    + ["orig/rewr(med)", "orig(med)", "rewr(med)",
       "stage0(med)", "stage1(med)", "stage2(med)", "stage3(med)"]
    + ["#relations", "#conditions", "#filters", "#joins"]
    + ["depth",
       "min(container counts)", "max(container counts)", "mean(container counts)",
       "q25(container counts)", "median(container counts)", "q75(container counts)",
       "min(branching factors)", "max(branching factors)", "mean(branching factors)",
       "median(branching factors)", "q25(branching factors)", "q75(branching factors)",
       "balancedness factor", "container counts list", "branching factors list"]
    + ["text"]
)
merged_df[spa_full_enum_columns].to_csv('results/features_times_SPA_full_enum_infos.csv', index=False)

In [10]:
# Column order of the exported file: keys, times, query features,
# cost/row statistics, join-tree features, query text.
spa_extra_full_enum_columns = (
    ["bench", "query"]
    + ["orig/rewr(med)", "orig(med)", "rewr(med)",
       "stage0(med)", "stage1(med)", "stage2(med)", "stage3(med)"]
    + ["#relations", "#conditions", "#filters", "#joins"]
    + ['total cost',
       'min(table rows)', 'max(table rows)', 'mean(table rows)',
       'q25(table rows)', 'median(table rows)', 'q75(table rows)',
       'min(join rows)', 'max(join rows)', 'mean(join rows)',
       'q25(join rows)', 'median(join rows)', 'q75(join rows)']
    + ["depth",
       "min(container counts)", "max(container counts)", "mean(container counts)",
       "q25(container counts)", "median(container counts)", "q75(container counts)",
       "min(branching factors)", "max(branching factors)", "mean(branching factors)",
       "median(branching factors)", "q25(branching factors)", "q75(branching factors)",
       "balancedness factor", "container counts list", "branching factors list"]
    + ["text"]
)
merged_df_extra[spa_extra_full_enum_columns].to_csv('results/features_times_SPA_extra_full_enum_infos.csv', index=False)

## The same for the 0MA data with additional stage information

### Merge features and evaluation times for each query

1. Merge features of the query structure and the join tree with the evaluation time:

In [11]:
# Query-structure, join-tree and hypergraph features plus the table merged in last
df1 = pd.read_csv('results/featuresDatabase_SPA.csv')
df2 = pd.read_csv('results/featuresScala_SPA.csv')
df3 = pd.read_csv('results/featuresHypergraph_SPA.csv')
df4 = pd.read_csv('results/SPA_Scala_comparison_TO_augment_server_infos.csv')

# Rename the benchmark IMDB to JOB in the join-tree features, as this notebook
# does everywhere else it loads a featuresScala file. Without the rename the
# inner merge on "bench" silently drops every row still labelled IMDB; if the
# file already uses JOB the replace is a no-op.
df2["bench"] = df2["bench"].replace("IMDB", "JOB")

# Inner merges on (bench, query): only queries present in all four tables remain
merged_df = pd.merge(df1, df2, on=["bench", "query"], how='inner').merge(df3, on=["bench", "query"], how='inner')\
                .merge(df4, on=["bench", "query"], how='inner')

2. Merge features of the query structure, the join tree and the PostgreSQL EXPLAIN with the evaluation time:

In [12]:
# Columns kept from the POS feature table: the merge keys, the total cost and
# the summary statistics of the table rows and join rows.
pos_extra_columns = [
    'bench', 'query', 'total cost',
    'min(table rows)', 'max(table rows)', 'mean(table rows)',
    'q25(table rows)', 'median(table rows)', 'q75(table rows)',
    'min(join rows)', 'max(join rows)', 'mean(join rows)',
    'q25(join rows)', 'median(join rows)', 'q75(join rows)',
]
df0_extra = pd.read_csv('results/featuresDatabase_POS_extra.csv')[pos_extra_columns]

In [13]:
df1_extra = pd.read_csv('results/featuresDatabase_SPA.csv')
df2_extra = pd.read_csv('results/featuresScala_SPA.csv')
df4_extra = pd.read_csv('results/POS_Scala_comparison_TO_augment_server_infos.csv')

# The join-tree features use the benchmark name IMDB; rename it to JOB before
# merging on "bench".
df2_extra["bench"] = df2_extra["bench"].replace("IMDB", "JOB")

# Inner merges on (bench, query), in the order df1_extra, df0_extra,
# df2_extra, df4_extra.
merged_df_extra = (
    df1_extra
    .merge(df0_extra, on=["bench", "query"], how='inner')
    .merge(df2_extra, on=["bench", "query"], how='inner')
    .merge(df4_extra, on=["bench", "query"], how='inner')
)

### Save the resulting dataframes as csv

In [14]:
# Column order of the exported file: keys, times (stage 0 and 1 only),
# query features, join-tree features, query text.
spa_columns = (
    ["bench", "query"]
    + ["orig/rewr(med)", "orig(med)", "rewr(med)", "stage0(med)", "stage1(med)"]
    + ["#relations", "#conditions", "#filters", "#joins"]
    + ["depth",
       "min(container counts)", "max(container counts)", "mean(container counts)",
       "q25(container counts)", "median(container counts)", "q75(container counts)",
       "min(branching factors)", "max(branching factors)", "mean(branching factors)",
       "median(branching factors)", "q25(branching factors)", "q75(branching factors)",
       "balancedness factor", "container counts list", "branching factors list"]
    + ["text"]
)
merged_df[spa_columns].to_csv('results/features_times_SPA_infos.csv', index=False)

In [15]:
# Column order of the exported file: keys, times (stage 0 and 1 only),
# query features, cost/row statistics, join-tree features, query text.
spa_extra_columns = (
    ["bench", "query"]
    + ["orig/rewr(med)", "orig(med)", "rewr(med)", "stage0(med)", "stage1(med)"]
    + ["#relations", "#conditions", "#filters", "#joins"]
    + ['total cost',
       'min(table rows)', 'max(table rows)', 'mean(table rows)',
       'q25(table rows)', 'median(table rows)', 'q75(table rows)',
       'min(join rows)', 'max(join rows)', 'mean(join rows)',
       'q25(join rows)', 'median(join rows)', 'q75(join rows)']
    + ["depth",
       "min(container counts)", "max(container counts)", "mean(container counts)",
       "q25(container counts)", "median(container counts)", "q75(container counts)",
       "min(branching factors)", "max(branching factors)", "mean(branching factors)",
       "median(branching factors)", "q25(branching factors)", "q75(branching factors)",
       "balancedness factor", "container counts list", "branching factors list"]
    + ["text"]
)
merged_df_extra[spa_extra_columns].to_csv('results/features_times_SPA_extra_infos.csv', index=False)