In [1]:
import glob
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tools

from pandas.testing import assert_frame_equal

# Let wide/long comparison frames render in full (up to 999 rows / columns)
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [2]:
# Folder holding the query_result_bq* / query_result_sf* CSV files that results_qc compares.
# NOTE(review): absolute, machine-specific path — point it at your own results folder before running.
results_dir = "/home/colin/code/bq_snowflake_benchmark/results/bqsf_ds_01_results-ds_100GB_05-dev_test_06-2020-05-27_15:32:23.711930/"

In [3]:
def splitter(fp):
    """Extract the token after the last underscore of a file path's stem.

    The stem is the piece of ``fp`` between its last two dots (or before
    its only dot), e.g. ``"query_result_bq_07.csv"`` -> ``"07"``.

    Parameters
    ----------
    fp : str, file path containing at least one "."

    Returns
    -------
    str : text following the final "_" in the stem
    """
    # Piece located just before the final "."
    stem = fp.rsplit(".", 2)[-2]
    # Keep only what follows the last "_"
    return stem.rsplit("_", 1)[-1]

def equals(df1, df2):
    """Tell whether two DataFrames pass ``assert_frame_equal``.

    The comparison is run with ``check_names=False``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame, frames to compare

    Returns
    -------
    bool : True when no AssertionError is raised, False otherwise
    """
    try:
        assert_frame_equal(df1, df2, check_names=False)
    except AssertionError:
        matched = False
    else:
        matched = True
    return matched
    
def equals_csv(fp1, fp2):
    """Compare two CSV files as DataFrames with a two-decimal tolerance.

    Both files are read with ``pd.read_csv``, passed through
    ``tools.to_consistent`` and then compared with ``assert_frame_equal``
    using ``check_names=False`` and ``check_exact=False``.

    Parameters
    ----------
    fp1, fp2 : str, paths of the CSV files to compare

    Returns
    -------
    bool : True if the frames compare equal, False if an AssertionError
        was raised while normalising or comparing them
    """
    import inspect

    _df1 = pd.read_csv(fp1)
    _df2 = pd.read_csv(fp2)

    # `check_less_precise` was deprecated in pandas 1.1 and later removed, so
    # passing it fails on current pandas. rtol/atol of 0.5e-2 mirrors the
    # tolerance pandas 1.1+ derived from check_less_precise=2. Older pandas
    # releases without rtol/atol still get the original keyword.
    if "rtol" in inspect.signature(assert_frame_equal).parameters:
        tolerance = {"rtol": 0.5e-2, "atol": 0.5e-2}
    else:
        tolerance = {"check_less_precise": 2}

    try:
        _df1 = tools.to_consistent(_df1)
        _df2 = tools.to_consistent(_df2)
        assert_frame_equal(_df1, _df2,
                           check_names=False,
                           check_exact=False,
                           **tolerance)
        return True
    except AssertionError:
        return False

def _read_for_display(fp):
    """Read a result CSV, replace NaN with -9999 and lowercase the column names."""
    return pd.read_csv(fp).fillna(value=-9999).rename(columns=str.lower)


def print_head(df, n, n0, n1):
    """Print one BQ/SF result pair for manual inspection.

    Prints the head of both frames, the rows labelled ``n0`` through ``n1``
    of both frames, and the element-wise ``df1.eq(df2)`` comparison.

    Parameters
    ----------
    df : pandas.DataFrame, must have ``fp_bq`` and ``fp_sf`` columns holding CSV paths
    n : index label of the row in ``df`` to inspect
    n0, n1 : first and last row label of the slice to print (``.loc``, both inclusive)

    Returns
    -------
    (df1, df2) : the BQ and SF frames as printed (NaN filled with -9999,
        lowercase column names)
    """
    r = df.loc[n]

    # NaN never equals NaN, so a sentinel is substituted before comparing —
    # presumably so missing values on both sides count as equal in `eq` below.
    df1 = _read_for_display(r.fp_bq)
    df2 = _read_for_display(r.fp_sf)

    print(df1.head())
    print("-"*40)
    print(df2.head())
    print("-"*40)

    # Rows of interest, printed once (the original repeated this section verbatim)
    print("="*40)
    print(df1.loc[n0:n1])
    print("-"*40)
    print(df2.loc[n0:n1])
    print("-"*40)

    # NOTE: DataFrame.eq aligns on row/column labels, so columns that are named
    # differently in the two frames show up as all-False here.
    print(df1.eq(df2))
    return df1, df2
    
def results_qc(results_dir):
    """Compare the CSV results from a dual SF/BQ query sequence

    Result files are located with the glob patterns ``query_result_bq*`` and
    ``query_result_sf*`` inside ``results_dir``. The query number is parsed
    from each file name with ``splitter`` and the BQ and SF files are paired
    on that number. Each pair is then compared with ``equals_csv``.

    Parameters
    ----------
    results_dir : str, abs path to folder that contains results files

    Returns
    -------
    df : pandas.DataFrame with columns ``fp_bq``, ``q_bq``, ``fp_sf``, ``q_sf``,
        one row per query found for both engines, sorted by query number,
        with a 1-based positional index
    result : pandas.Series of bool sharing ``df``'s index, True where the
        two CSV files of the row compared equal

    Warns
    -----
    UserWarning : if some query numbers have a result file for only one
        engine; those queries are left out of ``df`` and ``result``
    """
    import os
    import warnings

    def _collect(prefix, fp_col, q_col):
        # One row per result file: its path and the query number in its name
        paths = glob.glob(os.path.join(results_dir, prefix + "*"))
        rows = [[fp, int(splitter(fp))] for fp in paths]
        return (pd.DataFrame(rows, columns=[fp_col, q_col])
                .astype({q_col: "int64"})  # keeps the key numeric even with no files
                .sort_values(by=q_col)
                .reset_index(drop=True))

    dfbq = _collect("query_result_bq", "fp_bq", "q_bq")
    dfsf = _collect("query_result_sf", "fp_sf", "q_sf")

    # Pairing by position would silently misalign every later query as soon as
    # one engine lacks a result file, so pair on the query number instead.
    unmatched = sorted(set(dfbq["q_bq"]) ^ set(dfsf["q_sf"]))
    if unmatched:
        warnings.warn("results_qc: skipping queries with a result file for "
                      "only one engine: %s" % unmatched)

    df = dfbq.merge(dfsf, left_on="q_bq", right_on="q_sf", how="inner")

    # 1-based index, as before
    df.index = df.index + 1

    # Built explicitly so that an empty folder still yields an empty bool Series
    result = pd.Series([equals_csv(bq, sf) for bq, sf in zip(df["fp_bq"], df["fp_sf"])],
                       index=df.index, dtype=bool)

    return df, result

In [4]:
# Pair up the BQ/SF result files in results_dir and compare each pair
df, result = results_qc(results_dir)

In [5]:
# Comparison outcome for the first 12 rows (True = the two CSV files compared equal)
result.head(12)

1      True
2     False
3      True
4      True
5     False
6      True
7      True
8     False
9      True
10     True
11     True
12    False
dtype: bool

In [6]:
# Row 2 compared unequal above: print both CSVs (heads and rows 24-28) for inspection
d1, d2 = print_head(df=df, n=2, n0=24, n1=28)

   d_week_seq1   f0_   f1_   f2_   f3_  f4_  f5_   f6_
0         5270  3.95  1.54  2.04  1.41  3.5  3.5  3.41
1         5270  3.95  1.54  2.04  1.41  3.5  3.5  3.41
2         5270  3.95  1.54  2.04  1.41  3.5  3.5  3.41
3         5270  3.95  1.54  2.04  1.41  3.5  3.5  3.41
4         5270  3.95  1.54  2.04  1.41  3.5  3.5  3.41
----------------------------------------
   d_week_seq1   f0_  round(mon_sales1/mon_sales2,2)  \
0         5270  3.95                            1.54   
1         5270  3.95                            1.54   
2         5270  3.95                            1.54   
3         5270  3.95                            1.54   
4         5270  3.95                            1.54   

   round(tue_sales1/tue_sales2,2)  round(wed_sales1/wed_sales2,2)  \
0                            2.04                            1.41   
1                            2.04                            1.41   
2                            2.04                            1.41   
3               

In [7]:
# Row 26 of the BQ frame returned by print_head
d1.loc[26]

d_week_seq1    5271.00
f0_               0.92
f1_               0.97
f2_               1.08
f3_               0.82
f4_               0.90
f5_               1.05
f6_               1.41
Name: 26, dtype: float64

In [8]:
# Row 26 of the SF frame returned by print_head
d2.loc[26]

d_week_seq1                       5271.00
f0_                                  0.92
round(mon_sales1/mon_sales2,2)       0.97
round(tue_sales1/tue_sales2,2)       1.08
round(wed_sales1/wed_sales2,2)       0.82
round(thu_sales1/thu_sales2,2)       0.90
round(fri_sales1/fri_sales2,2)       1.05
round(sat_sales1/sat_sales2,2)       1.41
Name: 26, dtype: float64