# Reconciliation of Text/Numerical Mixed Data

Author: [Edward Lu](https://github.com/edwardlu71/notebooks)
Date: Oct 2020

In [None]:
from IPython.display import display, HTML
import pyodbc
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

In [None]:
# load sql query
# The SQL statements live in the project configuration rather than in this
# notebook; the "sql" entry of CONFIG_DICT is looked up by query name when the
# query to reconcile is chosen.
from lib.edkit import constants
c = constants.EdkitConstants()
c.load_config_from_file()  # presumably populates c.CONFIG_DICT -- confirm in lib.edkit
sqls = c.CONFIG_DICT['sql']

In [None]:
# Query to reconcile: keep exactly one of the alternatives below active.
#query = sqls["bondsquotesrt"]
query = sqls["bondsquoteshist"]
#query = sqls["fxrates"]
#query = sqls["spotrates"]
#query = sqls["floatingrates"]
# Largest numeric deviation that still counts as "equal".
tolerance=0.01  # percent. 0.01 means 0.01%
# "percent" compares the difference relative to the df2 value (in percent);
# any other value compares the absolute difference against `tolerance`.
tolerance_type = "percent"
# None: the tolerance comparison covers every float64/int64 column.
# Otherwise: the column names to compare with tolerance; all remaining columns
# are compared for exact equality.
selected=None

In [None]:
# sybase
# NOTE(review): user name and password are hard-coded in the connection
# strings below -- consider loading them from configuration or the environment.
server1 = "server_a"
server2 = "server_b"
uri1 = f"DRIVER=FreeTDS;SERVER={server1};PORT=5555;DATABASE=Data;UID=tester;PWD=password;"
uri2 = f"DRIVER=FreeTDS;SERVER={server2};PORT=5555;DATABASE=Data;UID=tester;PWD=password;"

def creator1():
    """Open a new pyodbc connection to the first server using ``uri1``."""
    return pyodbc.connect(uri1)
def creator2():
    """Open a new pyodbc connection to the second server using ``uri2``."""
    return pyodbc.connect(uri2)

# The engine URL names only the dialect and driver; the actual connections are
# produced by the creator callables above.
engine1 = sqlalchemy.create_engine("sybase+pyodbc://", creator=creator1)
engine2 = sqlalchemy.create_engine("sybase+pyodbc://", creator=creator2)

In [None]:
# Run the same query on both servers: df1 is read through engine1 and df2
# through engine2. Both raw result sets are shown for a first visual check.
df1 = pd.read_sql_query(sql=query, con=engine1)
df2 = pd.read_sql_query(sql=query, con=engine2)
display(df1, df2)

In [None]:
# first column of data frame can't have duplication
# The first column is used as the record key: the cells that follow match the
# rows of the two frames on it. A True result here means at least one frame
# contains a repeated key. The value is only displayed, not enforced.
df1.iloc[:,0].duplicated().any() or df2.iloc[:,0].duplicated().any()

In [None]:
# Rows whose key (first column) occurs in one frame only:
#   df1_only -- rows of df1 with no matching key in df2
#   df2_only -- rows of df2 with no matching key in df1
# Both results are renumbered from 0.
df1_only = df1.loc[~df1.iloc[:, 0].isin(df2.iloc[:, 0])].reset_index(drop=True)
df2_only = df2.loc[~df2.iloc[:, 0].isin(df1.iloc[:, 0])].reset_index(drop=True)

In [None]:
# Side-by-side listing of the unmatched keys: "Sample_x" holds the keys found
# only in df1, "Sample_y" the keys found only in df2. The two columns are
# independent lists placed next to each other by position, not matched pairs.
df_only = pd.DataFrame({"Sample_x": df1_only.iloc[:, 0], "Sample_y": df2_only.iloc[:, 0]})

In [None]:
# Show the keys that exist on one side only.
display(df_only)

In [None]:
# Rows whose key (first column) exists in both frames. Each frame drops the
# rows already reported as "only" on its side, is sorted by the key column and
# renumbered from 0, so that row i of df1_intersection and row i of
# df2_intersection carry the same key.
df1_intersection = (
    df1.loc[~df1.iloc[:, 0].isin(df1_only.iloc[:, 0].tolist())]
    .sort_values(by=[df1.columns[0]])
    .reset_index(drop=True)
)
df2_intersection = (
    df2.loc[~df2.iloc[:, 0].isin(df2_only.iloc[:, 0].tolist())]
    .sort_values(by=[df2.columns[0]])
    .reset_index(drop=True)
)

In [None]:
# Name of the first column, i.e. the column the rows were matched and sorted on.
df1_intersection.columns[0]

In [None]:
# Split the matched rows into a text part (`*_abc`, compared for exact equality)
# and a numeric part (`*_123`, compared with tolerance).
#
# The key column (first column) always stays first in the text part, even when
# it is numeric or listed in `selected`: the mask cell keeps column 0 of the text
# part visible and the final merge joins on it, so it has to be the key.
#
# The split is decided once and applied to both frames, so both sides always
# carry identical column labels. A column is treated as numeric only when it is
# float64/int64 in BOTH frames; otherwise a dtype mismatch between the servers
# would yield differently-labelled frames that cannot be compared element-wise.
key_column = df1_intersection.columns[0]
if selected is None:  # tolerance comparison is applied to all numerical fields
    numeric_in_df1 = df1_intersection.select_dtypes(include=['float64', 'int64']).columns
    numeric_in_df2 = df2_intersection.select_dtypes(include=['float64', 'int64']).columns
    tolerance_candidates = numeric_in_df1.intersection(numeric_in_df2)
else:  # tolerance comparison is applied to selected numerical fields
    tolerance_candidates = list(selected)

# Keep the original column order inside each part.
numeric_columns = [
    col for col in df1_intersection.columns
    if col != key_column and col in tolerance_candidates
]
text_columns = [col for col in df1_intersection.columns if col not in numeric_columns]

df1_intersection_abc = df1_intersection[text_columns]
df1_intersection_123 = df1_intersection[numeric_columns]
df2_intersection_abc = df2_intersection[text_columns]
df2_intersection_123 = df2_intersection[numeric_columns]

# Print the resulting column/dtype layout of both parts for a quick check.
df1_intersection_abc.info()
df1_intersection_123.info()

In [None]:
# Build a boolean mask laid out as [text columns | numeric columns] that is True
# for the key column and for every cell that differs between the two frames.

# index of difference of abc
# Text part: exact comparison. Element-wise `!=` reports a missing value as
# different from everything, itself included, so cells that are missing on BOTH
# sides are explicitly treated as equal.
both_missing_abc = df1_intersection_abc.isna() & df2_intersection_abc.isna()
index_diff_abc = np.where((df1_intersection_abc != df2_intersection_abc) & ~both_missing_abc)
# prepare mask of abc
np_mask_abc = np.full(df1_intersection_abc.shape, False)
np_mask_abc[:, 0] = True  # the key column is always kept visible
np_mask_abc[index_diff_abc] = True

# index of difference of 123
# Numeric part: deviation beyond `tolerance`, relative to the df2 value (in
# percent) or absolute, depending on `tolerance_type`.
if tolerance_type == "percent":
    deviation_123 = abs((df1_intersection_123 - df2_intersection_123)*100/df2_intersection_123)
else:
    deviation_123 = abs(df1_intersection_123 - df2_intersection_123)
# Any comparison against NaN is False, so a value that is missing on one side
# only would pass as "within tolerance"; flag that case explicitly.
one_sided_missing_123 = df1_intersection_123.isna() ^ df2_intersection_123.isna()
index_diff_123 = np.where((deviation_123 > tolerance) | one_sided_missing_123)
# prepare mask of 123
np_mask_123 = np.full(df1_intersection_123.shape, False)
np_mask_123[index_diff_123] = True

np_mask = np.concatenate((np_mask_abc, np_mask_123), axis=1)

np_mask

In [None]:
# Rebuild both frames with the text columns on the left and the numeric columns
# on the right, i.e. in the same column layout as np_mask.
df1_intersection = pd.concat((df1_intersection_abc, df1_intersection_123), axis="columns")
df2_intersection = pd.concat((df2_intersection_abc, df2_intersection_123), axis="columns")

In [None]:
# Build the difference table: one row per key whose records differ, with the
# df1 value (`<column>_x`) next to the df2 value (`<column>_y`) for every column
# that shows at least one difference.

# Cells outside the mask are blanked on both sides, so an outer merge over all
# columns only fails to match rows that contain a flagged difference.
masked_diff = pd.merge(df1_intersection.where(np_mask), df2_intersection.where(np_mask), how='outer', indicator=True)
df1_masked_diff = masked_diff[masked_diff['_merge'] == 'left_only'].drop(columns=['_merge']).reset_index(drop=True)
df2_masked_diff = masked_diff[masked_diff['_merge'] == 'right_only'].drop(columns=['_merge']).reset_index(drop=True)

# Join the two sides back together on the key (first column); every other
# column exists on both sides and therefore gets the default _x/_y suffixes.
diff_key = df1_masked_diff.columns[0]
masked_diff = pd.merge(df1_masked_diff, df2_masked_diff, on=[diff_key])

if len(masked_diff) > 0:
    # Pair the columns by name and drop a pair only when BOTH sides are empty.
    # Dropping all-empty columns one by one and then splitting the remainder in
    # halves misaligns the pairs whenever a value is missing on one side only.
    ordered_columns = [diff_key]
    for column in df1_masked_diff.columns[1:]:
        pair = [f"{column}_x", f"{column}_y"]
        if masked_diff[pair].notna().any().any():
            ordered_columns.extend(pair)
    df_diff = masked_diff[ordered_columns]
else:
    df_diff = pd.DataFrame([])
df_diff

In [None]:
# conclusion
# Summarise the reconciliation: number of rows found on one side only and
# number of matched rows that differ, together with how many columns differ
# (df_diff holds the key column plus one x/y pair per differing column).
if len(df1_only) == 0 and len(df2_only) == 0 and len(df_diff) == 0:
    print("df1 and df2 are identical")
else:
    print(f"df1_only = {len(df1_only)}")
    print(f"df2_only = {len(df2_only)}")
    diffcol_num = (df_diff.shape[1] - 1) // 2 if len(df_diff) > 0 else 0
    print(f"df_diff = {len(df_diff)} records on {diffcol_num} columns")