In [1]:
import glob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tools, config

from pandas.testing import assert_frame_equal

# Let wide/long result frames render in full instead of being truncated.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [2]:
# Restore `results_dir` from IPython's %store database (persisted by an earlier `%store results_dir`).
# NOTE(review): on a fresh setup where nothing was stored, `results_dir` stays undefined and the
# cells below fail -- confirm which notebook is expected to store it first.
%store -r results_dir

In [3]:
def splitter(fp):
    """Return the text after the last underscore of the second-to-last dot-separated piece of ``fp``.

    For a path such as ``.../query_result_sf_15.csv`` this is ``"15"``.

    Parameters
    ----------
    fp : str, file path containing at least one "."

    Returns
    -------
    str : trailing "_"-delimited token of the piece preceding the final "."

    Raises
    ------
    IndexError : if ``fp`` contains no "."
    """
    # Only the last two dots matter, so bound the split from the right.
    dotted_pieces = fp.rsplit(".", 2)
    before_extension = dotted_pieces[-2]
    return before_extension.rsplit("_", 1)[-1]

def equals(df1, df2):
    """Tell whether two DataFrames pass ``assert_frame_equal`` (ignoring index/column ``names``).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame, frames to compare

    Returns
    -------
    bool : True when the comparison passes, False when it raises AssertionError
    """
    try:
        assert_frame_equal(df1, df2, check_names=False)
    except AssertionError:
        return False
    else:
        return True
    
def equals_csv(fp1, fp2):
    """Report whether two CSV files hold approximately the same table.

    Both files are read with ``pd.read_csv``, passed through
    ``tools.to_consistent`` and compared with ``assert_frame_equal``
    (index/column ``names`` ignored, values compared approximately
    rather than exactly).

    Parameters
    ----------
    fp1, fp2 : str, paths of the two CSV files to compare

    Returns
    -------
    bool : True if the frames compare equal within tolerance, False if an
        AssertionError is raised while normalising or comparing them
    """
    import inspect

    _df1 = pd.read_csv(fp1)
    _df2 = pd.read_csv(fp2)

    # `check_less_precise` was deprecated in pandas 1.1 and removed in 2.0, where
    # passing it raises TypeError. Prefer rtol/atol when this pandas supports them;
    # 0.5e-2 is the tolerance pandas itself substituted for check_less_precise=2.
    if "rtol" in inspect.signature(assert_frame_equal).parameters:
        tolerance = {"rtol": 0.5e-2, "atol": 0.5e-2}
    else:
        # pandas < 1.1: only the legacy keyword exists
        tolerance = {"check_less_precise": 2}

    try:
        # NOTE(review): to_consistent is a project helper; presumably it normalises
        # both frames so platform formatting differences do not count -- confirm in tools.
        _df1 = tools.to_consistent(_df1)
        _df2 = tools.to_consistent(_df2)
        assert_frame_equal(_df1, _df2,
                           check_names=False,
                           check_exact=False,
                           **tolerance)
        return True
    except AssertionError:
        return False

def _index_by_query(fps, platform):
    """Tabulate result-file paths with the query number parsed from each name, sorted by query."""
    fp_col, q_col = "fp_" + platform, "q_" + platform
    frame = pd.DataFrame([[fp, int(splitter(fp))] for fp in fps],
                         columns=[fp_col, q_col])
    # An empty listing would otherwise leave the key column as object dtype.
    frame = frame.astype({q_col: "int64"})
    return frame.sort_values(by=q_col).reset_index(drop=True)


def results_qc(results_dir):
    """Compare the CSV results from a dual SF/BQ query sequence

    Result files are located with the glob patterns ``query_result_sf*`` and
    ``query_result_bq*`` inside ``results_dir``. Files are paired on the query
    number parsed from the file name by ``splitter``, so a query missing on
    one platform cannot cause two different queries to be compared.

    Parameters
    ----------
    results_dir : str, abs path to folder that contains results files

    Returns
    -------
    df : pandas.DataFrame
        One row per query number found for both platforms, with columns
        ``fp_bq``, ``q_bq``, ``fp_sf``, ``q_sf`` and an index starting at 1.
    result : pandas.Series of bool
        ``equals_csv`` outcome for each row of ``df`` (same index).
    """

    fps_query_sf = glob.glob(results_dir + config.sep + "query_result_sf*")
    fps_query_bq = glob.glob(results_dir + config.sep + "query_result_bq*")

    # Kept from the original: shows which SF files were picked up.
    print(fps_query_sf)

    dfbq = _index_by_query(fps_query_bq, "bq")
    dfsf = _index_by_query(fps_query_sf, "sf")

    # Pair on the query number rather than on sorted position.
    df = dfbq.merge(dfsf, how="inner", left_on="q_bq", right_on="q_sf")

    one_sided = sorted(set(dfbq["q_bq"]) ^ set(dfsf["q_sf"]))
    if one_sided:
        print("query numbers with results for only one platform (not compared):",
              one_sided)

    # 1-based row labels, as callers index rows with df.loc[n, ...]
    df.index = df.index + 1

    # Built explicitly so an empty df still yields an (empty) boolean Series.
    result = pd.Series([equals_csv(fp_bq, fp_sf)
                        for fp_bq, fp_sf in zip(df["fp_bq"], df["fp_sf"])],
                       index=df.index, dtype=bool)

    return df, result


def print_head(df, n, n0, n1):
    """Print matching previews of the BQ and SF result files referenced by one row of ``df``.

    For each platform the head, the rows labelled ``n0`` through ``n1``
    (inclusive ``.loc`` slice) and the tail are printed, followed by the
    element-wise equality table of the two frames.

    Parameters
    ----------
    df : pandas.DataFrame with "fp_bq" and "fp_sf" columns holding CSV paths
    n : index label of the row of ``df`` to inspect
    n0, n1 : first and last row labels of the middle window to print

    Returns
    -------
    (df_bq, df_sf) : the two loaded frames, with missing values replaced by
        -9999 and column names lower-cased
    """
    def _load(path_column):
        # Read the file, fill gaps with a sentinel and lower-case the column names.
        frame = pd.read_csv(df.loc[n, path_column]).fillna(value=-9999)
        frame.columns = [str.lower(label) for label in frame.columns]
        return frame

    bq_frame = _load("fp_bq")
    sf_frame = _load("fp_sf")

    thin_rule = "-" * 40
    thick_rule = "=" * 40

    previews = (
        lambda frame: frame.head(),
        lambda frame: frame.loc[n0:n1],
        lambda frame: frame.tail(),
    )
    for preview in previews:
        print("BQ:")
        print(preview(bq_frame))
        print(thin_rule)
        print("SF:")
        print(preview(sf_frame))
        print(thick_rule)

    print(bq_frame.eq(sf_frame))
    return bq_frame, sf_frame
    

In [4]:
# Collect the BQ/SF result files found in results_dir and compare each pair.
df, result = results_qc(results_dir)

['/home/colin/code/bq_snowflake_benchmark/results/bqsf_ds_01_results-ds_100GB_05-dev_test_06-2020-05-27_17:21:13.001544/query_result_sf_15.csv']


In [5]:
# Show which BQ and SF result files were paired up.
df

Unnamed: 0,fp_bq,q_bq,fp_sf,q_sf
1,/home/colin/code/bq_snowflake_benchmark/result...,15,/home/colin/code/bq_snowflake_benchmark/result...,15


### Every entry in `result` should be `True` when the BQ and SF results match

In [6]:
# Comparison outcome per row of df (same index).
result

1    True
dtype: bool

In [7]:
# With a single compared query, inspect that one row (df's index starts at 1).
# NOTE(review): n is left undefined for any other length of df, so the next cell
# then depends on n having been set by hand.
if len(df) == 1:
    n = 1

In [8]:
# Preview the head, rows 24-28 and the tail of both result files for row n of df.
d1, d2 = print_head(df=df, n=n, n0=24, n1=28)

BQ:
   ca_zip       r1
0   30059  1005.02
1   30069   701.38
2   30116   186.72
3   30150   718.28
4   30162  1283.10
----------------------------------------
SF:
   ca_zip       r1
0   30059  1005.02
1   30069   701.38
2   30116   186.72
3   30150   718.28
4   30162  1283.10
BQ:
    ca_zip       r1
24   31952  1327.58
25   32293   935.32
26   32477   366.55
27   32562   343.48
28   32808   852.92
----------------------------------------
SF:
    ca_zip       r1
24   31952  1327.58
25   32293   935.32
26   32477   366.55
27   32562   343.48
28   32808   852.92
BQ:
    ca_zip      r1
95   39310  651.19
96   39391  843.48
97   39431  465.56
98   39454  635.58
99   39532  622.27
----------------------------------------
SF:
    ca_zip      r1
95   39310  651.19
96   39391  843.48
97   39431  465.56
98   39454  635.58
99   39532  622.27
    ca_zip    r1
0     True  True
1     True  True
2     True  True
3     True  True
4     True  True
5     True  True
6     True  True
7     True  True
8   