# Validating distinct tables with ambiguous column names

In [1]:
import pandas
import datetime
import os

## Import and check data

In [2]:
# A base dataset, will be used to compare against the comparison
base = pandas.read_excel("/Users/danielcorcoran/Desktop/test_validation.xlsx", sheet_name = "ORIGINAL")

# A comparison dataset, similar to the base dataset, with different metadata and a few value variances
comparison = pandas.read_excel("/Users/danielcorcoran/Desktop/test_validation.xlsx", sheet_name = "DUPLICATE")

In [3]:
base.head()

Unnamed: 0,name,score,year
0,a,13,2001
1,b,13,2001
2,c,16,2001
3,d,15,2001
4,e,17,2001


In [4]:
comparison.head()

Unnamed: 0,name_copy,score_copy,year_copy
0,a,13,2001.0
1,b,13,2002.0
2,c,16,2001.0
3,d,15,2001.0
4,e,15,2001.0


In [5]:
base_columns = base.columns.tolist()
base_columns

['name', 'score', 'year']

In [6]:
comparison_columns = comparison.columns.tolist()
comparison_columns

['name_copy', 'score_copy', 'year_copy']

Identifying two possible inputs, one where the source data is indexed, and one where the data is not. e.g datetime stamps or names associated with the values.

## Method 1: without index labels 

In [7]:
'''
This method will involve taking a copy of each base column and running it against 
every column from the comparison dataset.

A counter variable will be created and added up for each column in the comparison dataset and with that, 
compute a percentage of values contained in the list

This method should accurately identify which base dataset columns reference which comparison dataset columns,
however will not give great insight as to which cells are incorrect.
'''

'\nThis method will involve taking a copy of each base column and running it against \nevery column from the comparison dataset.\n\nA counter variable will be created and added up for each column in the comparison dataset and with that, \ncompute a percentage of values contained in the list\n\nThis method should accurately identify which base dataset columns reference which comparison dataset columns,\nhowever will not give great insight as to which cells are incorrect.\n'

### Assumptions

In [8]:
'''
Both data sets have the same amount of columns
Values are numeric
Assumes data is NOT the the same order across base and comparison.
'''

'\nBoth data sets have the same amount of columns\nValues are numeric\nAssumes data is NOT the the same order across base and comparison.\n'

In [9]:
def output_percent_match_from_lists(base_list, comparison_list):

    item_count = len(base_list)

    containing = 0
    
    anomolies = {}

    for index in range(len(comparison_list)):
        item = comparison_list[index]
        if item in base_list:
            base_list.remove(item)
            containing = containing + 1
        else:
            anomolies[index] = item

    contains_percent = containing/item_count *100
    
    return contains_percent

### Testing the output_percent_match_from_lists() function

In [10]:
list1 = [1,2,3,4,5,6,7,9,8,12]
list2 = [1,2,3,4,5,6,7,8,9,10]

output_percent_match_from_lists(list1, list2)

90.0

### Start

In [11]:
#create a dictionary to hold the scores
# the keys will be set to the column names of the base dataset, the value will be set to 
# a list initially to store each score for each column in the comparison dataset

storage = {}
for column in base_columns:
    storage[column] = []

storage

{'name': [], 'score': [], 'year': []}

In [12]:
storage

{'name': [], 'score': [], 'year': []}

In [13]:
#store number of columns
number_columns = len(base_columns)
number_columns

3

###  Iterate through every combination of base column and comparison column and print/store the results

In [14]:
for base_index in range(len(base_columns)):
    
    for current_comparison_column in comparison_columns:
        
        current_base_column = base_columns[base_index]
        current_base_series = base[current_base_column]
        current_base_series_list = current_base_series.tolist()
        comparison_series = comparison[current_comparison_column]
        comparison_series_list = comparison_series.tolist()
        percent = output_percent_match_from_lists(current_base_series_list, comparison_series_list)
        
        print("THE SIZE OF BASE SERIES LIST IS: ", len(current_base_series_list))
        print("BASE COLUMN NAME:", current_base_column)
        print(current_base_series_list)
        print("COMPARISON COLUMN NAME:", current_comparison_column)
        print(comparison_series_list)
        print("MATCH PERCENT:", percent,"\n")
        
        storage[current_base_column].append(percent)

THE SIZE OF BASE SERIES LIST IS:  3
BASE COLUMN NAME: name
['f', 'f', 'a']
COMPARISON COLUMN NAME: name_copy
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', nan, 'b', 'g', 'h', 'd', 'a', 'd', 'h', 'f', 'h', 'f', 'a']
MATCH PERCENT: 85.0 

THE SIZE OF BASE SERIES LIST IS:  20
BASE COLUMN NAME: name
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'f', 'a', 'g', 'h', 'f', 'a', 'd', 'h', 'f', 'h', 'f', 'a']
COMPARISON COLUMN NAME: score_copy
[13, 13, 16, 15, 15, 12, 16, 20, 15, 12, 16, 14, 12, 11, 12, 14, 11, 14, 16, 18]
MATCH PERCENT: 0.0 

THE SIZE OF BASE SERIES LIST IS:  20
BASE COLUMN NAME: name
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'f', 'a', 'g', 'h', 'f', 'a', 'd', 'h', 'f', 'h', 'f', 'a']
COMPARISON COLUMN NAME: year_copy
[2001.0, 2002.0, 2001.0, 2001.0, 2001.0, 2001.0, nan, nan, 2001.0, 2001.0, 2001.0, 2001.0, 2001.0, 2001.0, 2002.0, 2002.0, 2002.0, 2002.0, 2002.0, 2002.0]
MATCH PERCENT: 0.0 

THE SIZE OF BASE SERIES LIST IS:  20
BASE COLUMN NAME: score
[13, 13, 16, 15, 17, 12, 16, 20, 15, 1

In [15]:
storage

{'name': [85.0, 0.0, 0.0], 'score': [0.0, 90.0, 0.0], 'year': [0.0, 0.0, 85.0]}

### A function to return the index and value of a maximum value in a python list

In [16]:
def find_maximum_values_index(input_list):
    
    argument_type = type(input_list)
    
    if argument_type == list:
        
        current_top_index = None
        current_top_value = None

        for index in range(len(input_list)):

                value = input_list[index]

                if current_top_value is None:
                   
                    current_top_value = value
                    current_top_index = index

                elif value > current_top_value:
                    
                    current_top_index = index
                    current_top_value = value

        return current_top_index, current_top_value
    
    else:
        print("You must pass in a list to find_maximum_values_index() function.")
    
    
    

In [17]:
storage

{'name': [85.0, 0.0, 0.0], 'score': [0.0, 90.0, 0.0], 'year': [0.0, 0.0, 85.0]}

In [18]:
predicted_matches = {}

for column_name in base_columns:
    
    current_list = storage[column_name]
    tuple_result = find_maximum_values_index(current_list)
    
    top_index = tuple_result[0]
    top_value = tuple_result[1]
    
    predicted_matches[column_name] = {"Match column" : comparison_columns[top_index],"Match Percent" : top_value}

In [19]:
predicted_matches

{'name': {'Match column': 'name_copy', 'Match Percent': 85.0},
 'score': {'Match column': 'score_copy', 'Match Percent': 90.0},
 'year': {'Match column': 'year_copy', 'Match Percent': 85.0}}