In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import qc, config, tools, sfa

from importlib import reload

# Effectively disable row/column truncation so comparison frames render in full.
pd.options.display.max_rows = 9999
pd.options.display.max_columns = 9999
# Use the fully-qualified option key. The bare 'precision' pattern is ambiguous
# on newer pandas releases (it also matches 'styler.format.precision') and
# raises OptionError there; 'display.precision' works on old and new versions.
pd.set_option('display.precision', 7)

### Single Query Result Comparison

In [2]:
# Number of the query to execute and compare; passed to run_single() below.
query_n = 75

# Configure a single-query QC run. The test / scale / cid / stream_n / desc
# values reappear in the label the run prints ("ds_1gb_01-q75-1-qc_query").
a = qc.QueryQC()
a.test = "ds"
a.scale = 1
a.cid = "01"
a.stream_n = 1
a.desc = "qc_query"  # plain literal: the previous f-string had no placeholders
a.seq_id = "NA"

# Logging / persistence switches. Their exact effect is defined in
# qc.QueryQC and is not visible from this notebook -- TODO confirm there.
a.verbose = False
a.verbose_query = True
a.qual = False
a.save = False
a.verbose_iter = True

a.set_timestamp_dir()

# Execute the query; the result frames are read back from a.result_sf and
# a.result_bq in the next cell.
a.run_single(query_n=query_n)

SNOWFLAKE QUERY TEXT
ALTER SESSION SET TIMEZONE = 'UTC'

SNOWFLAKE QUERY TEXT
ALTER SESSION SET QUERY_TAG = 'ds_1gb_01-xx-qc_query'

SNOWFLAKE QUERY TEXT
ALTER SESSION SET USE_CACHED_RESULT=false

SNOWFLAKE QUERY TEXT
USE WAREHOUSE TEST9000

SNOWFLAKE QUERY TEXT
USE DATABASE ds_1GB_01

Snowflake Start Query: 75
--------------------
Stream Completion: 1 / 1
Query Label: ds_1gb_01-q75-1-qc_query
--------------------

SNOWFLAKE QUERY TEXT
ALTER SESSION SET QUERY_TAG = 'ds_1gb_01-q75-1-qc_query'

SNOWFLAKE QUERY TEXT

WITH all_sales AS (
 SELECT d_year
       ,i_brand_id
       ,i_class_id
       ,i_category_id
       ,i_manufact_id
       ,SUM(sales_cnt) AS sales_cnt
       ,SUM(sales_amt) AS sales_amt
 FROM (SELECT d_year
             ,i_brand_id
             ,i_class_id
             ,i_category_id
             ,i_manufact_id
             ,cs_quantity - COALESCE(cr_return_quantity,0) AS sales_cnt
             ,cs_ext_sales_price - COALESCE(cr_return_amount,0.0) AS sales_amt
       FROM c

In [3]:
# Pull both result frames off the QC runner in a single step.
df_sf, df_bq = a.result_sf, a.result_bq

In [4]:
# Inspect the raw column dtypes of the a.result_sf frame before normalisation,
# so they can be compared against the a.result_bq frame in the next cell.
df_sf.dtypes

PREV_YEAR           int16
YEAR                int16
I_BRAND_ID          int32
I_CLASS_ID           int8
I_CATEGORY_ID        int8
I_MANUFACT_ID       int16
PREV_YR_CNT         int64
CURR_YR_CNT         int64
SALES_CNT_DIFF      int64
SALES_AMT_DIFF    float64
dtype: object

In [5]:
# Inspect the raw column dtypes of the a.result_bq frame before normalisation,
# to spot dtype or column-name differences from the a.result_sf frame.
df_bq.dtypes

prev_year           int64
year                int64
i_brand_id          int64
i_class_id          int64
i_category_id       int64
i_manufact_id       int64
prev_yr_cnt         int64
curr_yr_cnt         int64
sales_cnt_diff      int64
sales_amt_diff    float64
dtype: object

In [6]:
# Run both frames through the same project normaliser with identical settings
# so they can be compared cell by cell. The meaning of n=5 is defined by
# tools.to_consistent (presumably a rounding precision -- TODO confirm).
# NOTE(review): this rebinds df_sf / df_bq, so re-running the cell feeds
# already-normalised frames back through the helper.
df_sf, df_bq = (
    tools.to_consistent(df_sf, n=5),
    tools.to_consistent(df_bq, n=5),
)

In [7]:
# Compare the two normalised frames with the project's own helper; the cell
# displays whatever qc.assert_equal returns.
qc.assert_equal(df_sf, df_bq)

True

In [8]:
def color_false(val):
    """Return the CSS text-colour rule for one cell of a boolean match mask.

    Intended for use with ``Styler.applymap``: ``False`` cells are rendered in
    red, ``True`` cells in white. Any value that is not a key of the lookup
    table raises ``KeyError``.
    """
    css_template = 'color: %s'
    palette = {False: 'red', True: 'white'}
    return css_template % palette[val]

In [9]:
# Element-wise match mask between the two result frames. DataFrame.eq treats
# NaN as unequal to NaN, so cells that are missing in BOTH frames are
# explicitly counted as matches instead of being flagged as differences.
diff = df_sf.eq(df_bq) | (df_sf.isna() & df_bq.isna())
# Preview the first rows (.loc label slice 0..5 is inclusive); mismatching
# cells are rendered in red by color_false.
diff.loc[0:5].style.applymap(color_false)

Unnamed: 0,prev_year,year,i_brand_id,i_class_id,i_category_id,i_manufact_id,prev_yr_cnt,curr_yr_cnt,sales_cnt_diff,sales_amt_diff
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True


In [10]:
# Fraction of matching cells over the whole mask (1.0 means every cell agrees).
# diff.size is rows * columns; an empty mask yields NaN instead of a 0/0 division.
df_diff = diff.sum().sum() / diff.size if diff.size else float("nan")
# df_diff remains a 0-1 fraction; it is formatted as a percentage here so the
# printed value agrees with the "Percent" label.
print(f"Percent Data Match: {df_diff:.2%}")

Percent Data Match: 1.0


In [11]:
# Per-column share of matching rows: the mean of a boolean column is the
# number of True values divided by the row count.
diff.mean()

prev_year         1.0
year              1.0
i_brand_id        1.0
i_class_id        1.0
i_category_id     1.0
i_manufact_id     1.0
prev_yr_cnt       1.0
curr_yr_cnt       1.0
sales_cnt_diff    1.0
sales_amt_diff    1.0
dtype: float64

In [12]:
# Render the complete match mask with mismatching cells in red (see color_false).
# NOTE(review): this styles every row of diff, which can be slow and very long
# for large result sets -- consider filtering to rows containing a False.
diff.style.applymap(color_false)

Unnamed: 0,prev_year,year,i_brand_id,i_class_id,i_category_id,i_manufact_id,prev_yr_cnt,curr_yr_cnt,sales_cnt_diff,sales_amt_diff
0,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True
