In [20]:
import json
import os
import pandas as pd
import numpy as np
# Load the run configuration. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform's locale default.
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

In [21]:
# List both output directories and sort each listing in place, so that the
# two lists can later be walked side by side in a stable order.
cpp_files = os.listdir(config["path_of_cpp"])
cpp_files.sort()
py_files = os.listdir(config["path_of_python"])
py_files.sort()

In [42]:
def compute_err(config, diff):
    """Aggregate an element-wise difference array into a single error value.

    Parameters
    ----------
    config : dict
        Must contain the key ``"error_compute"`` naming the metric:
        ``"SAE"`` (NaN-ignoring sum of ``diff``), ``"SSE"`` (NaN-ignoring sum
        of squares), ``"MAE"`` (NaN-ignoring mean of ``diff``) or ``"MSE"``
        (NaN-ignoring mean of squares).
    diff : array-like
        Element-wise differences. No absolute value is taken here, so pass
        absolute differences if "SAE"/"MAE" should be true absolute errors.

    Returns
    -------
    list
        A one-element list holding the aggregated error.

    Raises
    ------
    ValueError
        If ``config["error_compute"]`` is not one of the supported metrics.
    """
    metric = config["error_compute"]
    if metric == "SAE":
        err = np.nansum(diff)
    elif metric == "SSE":
        err = np.nansum(np.square(diff))
    elif metric == "MAE":
        err = np.nanmean(diff)
    elif metric == "MSE":
        # Divide by the number of non-NaN entries only, so NaNs do not
        # dilute the mean.
        n = np.count_nonzero(~np.isnan(diff))
        err = np.divide(np.nansum(np.square(diff)), n)
    else:
        # Raise instead of calling exit(): exiting would end the whole
        # interpreter/kernel session without saying what was wrong.
        raise ValueError(
            "unsupported error_compute %r; expected one of 'SAE', 'SSE', 'MAE', 'MSE'"
            % (metric,)
        )
    return [err]

In [112]:
# Compare every C++ parquet output with its Python counterpart, column by column.
# err_all maps each C++ file name to a list with one entry per float64/int64 column:
#   - nan_handling == "labeling": (compute_err(...), indices), where indices
#     locate the rows that are NaN in either table
#   - nan_handling == "ignore":   compute_err(...) only
#   - any other value:            the file's list is reset to empty
# Files whose column dtypes differ between the two tables keep an empty list.
err_all = {cpp_file: [] for cpp_file in cpp_files}
precision = config["float_acc"]
nan_handling = config["nan_handling"]

# NOTE(review): files are paired by position in the two sorted listings, so
# both directories are assumed to contain matching file names — confirm.
for cpp_file, py_file in zip(cpp_files, py_files):
    cpp_table = pd.read_parquet(os.path.join(config["path_of_cpp"], cpp_file))
    py_table = pd.read_parquet(os.path.join(config["path_of_python"], py_file))
    if cpp_table.dtypes.to_dict() != py_table.dtypes.to_dict():
        continue
    for col_name, col_type in cpp_table.dtypes.to_dict().items():
        if col_type == np.dtype('float64'):
            # Round both sides first so differences below the configured
            # precision do not count as errors.
            cpp_col = np.around(cpp_table[col_name].to_numpy(), decimals=precision)
            py_col = np.around(py_table[col_name].to_numpy(), decimals=precision)
        elif col_type == np.dtype('int64'):
            cpp_col = cpp_table[col_name].to_numpy()
            py_col = py_table[col_name].to_numpy()
        else:
            # Only float64 and int64 columns are compared.
            continue
        diff = np.absolute(cpp_col - py_col)
        # Results are stored under the C++ file name: that is the key the dict
        # was initialised with, so a differently named Python file can neither
        # raise KeyError nor create a stray entry.
        if nan_handling == "labeling":
            indices = np.where(np.logical_or(np.isnan(cpp_col), np.isnan(py_col)))
            err_all[cpp_file].append((compute_err(config, diff), indices))
        elif nan_handling == "ignore":
            err_all[cpp_file].append(compute_err(config, diff))
        else:
            err_all[cpp_file] = []

In [113]:
# Show the per-file error lists collected by the comparison loop.
err_all

{'000001.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000002.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000004.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000005.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000006.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000007.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000008.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]],
 '000009.XSHE.parquet': [[0.0], [0.0], [0.0], [0.0], [0.0]]}

In [78]:
# normal
# Read the first C++ table and take the keys of its dtype mapping as the
# list of column names.
cpp_table = pd.read_parquet("/".join((config["path_of_cpp"], cpp_files[0])))
col_names = list(cpp_table.dtypes.to_dict().keys())
# Print the collected error list of every C++ file, one line per file.
for file in cpp_files:
    err = err_all[file]
    print(err)
# per_col   (placeholder — presumably per-column reporting; TODO confirm)
# per_table (placeholder — presumably per-table reporting; TODO confirm)

[0.0, [0.0]]
[0.0, [0.0]]
[0.0, [0.0]]
[0.0, [0.0]]
[0.0, [0.0]]
[0.0, [0.0]]
[0.0, [0.0]]
[0.0, [0.0]]


In [12]:
# Format-specific branch whose body was never written. An `if` without a body
# is a SyntaxError, so `pass` keeps the cell runnable on Restart & Run All.
if config["format"] == "parquet":
    pass  # TODO: implement the parquet-specific handling

['000008.XSHE.parquet',
 '000006.XSHE.parquet',
 '000005.XSHE.parquet',
 '000004.XSHE.parquet',
 '000009.XSHE.parquet',
 '000001.XSHE.parquet',
 '000002.XSHE.parquet',
 '000007.XSHE.parquet']