# Similarity Modeling Approach

In order to compare elements of the dataset, we will match elements that have the highest cosine similarity, using the D-statistic from the KS 2-sample test to match features that have the highest similarity in distribution. 

In [1]:
from typing import List
import pandas as pd
import numpy as np
a_comp = pd.read_csv("../data/artifacts/a_comp.csv")
b_comp = pd.read_csv("../data/artifacts/b_comp.csv")
ks_stat_df = pd.read_csv("../data/artifacts/ks_stat_df.csv")

In [2]:
# create mapping a and b cols according to distribution similarity
b_col_list = []
ks_stat_values = ks_stat_df.drop("a_col_map", axis=1).values.astype('float')
for val in np.argmax(ks_stat_values, axis=1):
    b_col_list.append(ks_stat_df.drop("a_col_map", axis=1).columns.to_numpy()[val])

# get last index for each group, assuming append only structure
max_index_per_entity = b_comp.reset_index().groupby('b_entity_id')['index'].max()
# fill NA with null class flag 0
b_model_df = b_comp.iloc[list(max_index_per_entity.values)].set_index('b_entity_id').fillna(0).reset_index()[['b_entity_id']+b_col_list]
a_model_df = a_comp.fillna(0)

In [3]:
b_model_df

Unnamed: 0,b_entity_id,fax_country_b__address,fax_country_b__address.1,fax_country_b__address.2,fax_country_b__address.3,primary_sic_code_b__company,industry_code_b__company,nace_code_b__company,sector_code_b__company,sector_code_b__company.1,...,fax_country_b__address.4,tele_country_b__address,tele_area_b__address,tele_area_b__address.1,fax_country_b__address.5,tele_area_b__address.2,sector_code_b__company.2,primary_sic_code_b__company.1,sector_code_b__company.3,industry_code_b__company.1
0,000BFG-E,0.0,0.0,0.0,0.0,5331.0,3520.0,47.19,3500.0,3500.0,...,0.0,1.0,210.0,210.0,0.0,210.0,3500.0,5331.0,3500.0,3520.0
1,000FF7-E,1.0,1.0,1.0,1.0,7819.0,3430.0,59.12,3400.0,3400.0,...,1.0,1.0,510.0,510.0,1.0,510.0,3400.0,7819.0,3400.0,3430.0
2,000FJL-E,1.0,1.0,1.0,1.0,7371.0,3308.0,62.01,3300.0,3300.0,...,1.0,1.0,617.0,617.0,1.0,617.0,3300.0,7371.0,3300.0,3308.0
3,000HNQ-E,1.0,1.0,1.0,1.0,3716.0,1250.0,29.10,1200.0,1200.0,...,1.0,1.0,574.0,574.0,1.0,574.0,1200.0,3716.0,1200.0,1250.0
4,000HNW-E,1.0,1.0,1.0,1.0,3695.0,1315.0,26.80,1300.0,1300.0,...,1.0,1.0,650.0,650.0,1.0,650.0,1300.0,3695.0,1300.0,1315.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196493,0HPZTX-E,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196494,0HPZV0-E,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196495,0HPZV5-E,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196496,0HPZY3-E,0.0,0.0,0.0,0.0,1499.0,1125.0,8.92,1100.0,1100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1100.0,1499.0,1100.0,1125.0


### Apply Balance Distribution to Dataset for Comparison

The comparison algorithm fits each of the respective feature values to the hypothetical class balance distribution used to construct the K-S 2-sample test. These comparison values are then tested for cosine similarity and mapped to respective entity values.

In [27]:
a_model = a_model_df.drop("vendor_id", axis=1)[ks_stat_df.a_col_map.values]
b_model = b_model_df.drop("b_entity_id", axis=1)

def get_balance_distribution_mapping(
    features: np.array,
    col_names: np.array
) -> List: 
    """Get the balance distribution for each feature in numpy array
    
    Args:
        features (np.array) : feature set
        col_names (np.array) : column name for feature set
    Returns: 
        balance_distribution_mapping (List) : map of unique values to balance distributions for each column in array
    """
    balance_distribution_mapping = {}
    for i in range(features.shape[1]):
        feature = features[:, i]
        # get unique values for feature
        unique, counts = np.unique(feature, return_counts=True)
        # get count for each unique value in array
        counts = np.asarray((unique, counts)).T[:, 1]
        # get count distribution
        count_distribution = counts/sum(counts)
        balance_distribution_mapping[col_names[i]] = dict(zip(unique, count_distribution))
    return balance_distribution_mapping

a_balance_mapping = get_balance_distribution_mapping(a_model.values.astype('float'), a_model.columns.to_numpy())
b_balance_mapping = get_balance_distribution_mapping(b_model.values.astype('float'), b_model.columns.to_numpy())
for i, col in enumerate(a_model.columns.to_numpy()):
    a_model.iloc[:, i] = a_model.iloc[:, i].astype(float).map(a_balance_mapping[col])
for i, col in enumerate(b_model.columns.to_numpy()):
    b_model.iloc[:, i] = b_model.iloc[:, i].astype(float).map(b_balance_mapping[col])

  a_model.iloc[:, i] = a_model.iloc[:, i].astype(float).map(a_balance_mapping[col])


In [13]:
import torch
import torch.nn.functional as F
# now compute similarity between class distribution balance mappings
a_input = a_model.values.astype("float")
b_input = b_model.values.astype("float")

a_entities = a_model_df.vendor_id.values
b_entities = b_model_df.b_entity_id.values

response_data = []
# for each row in a_input, get cosine similarity for each row and b and find argmax
for i in range(a_input.shape[0]):
    # multiply matrices in order to recieve cosine across all b entities
    cosine = b_input@a_input[i, :]
    softmax_res = F.softmax(torch.from_numpy(cosine), dim=0).numpy()
    # get location of maximum likelihood
    pick = np.argmax(softmax_res)
    likelihood = softmax_res.max()
    # get probability from normalized softmax
    res = {
        "vendor_id":a_entities[i],
        "b_entity_id":b_entities[pick],
        "likelihood":likelihood
    }
    response_data.append(res)

In [30]:
res = pd.DataFrame(data=response_data)
res

Unnamed: 0,vendor_id,b_entity_id,likelihood
0,285924451,000NYL-E,0.000011
1,47653720,000NYL-E,0.000011
2,149196787,000NYL-E,0.000010
3,274892372,000NYL-E,0.000010
4,197539987,000NYL-E,0.000011
...,...,...,...
76339,49244286,000NYL-E,0.000010
76340,21988338,000NYL-E,0.000011
76341,36866816,000NYL-E,0.000010
76342,63214740,000NYL-E,0.000010


In [33]:
res.to_csv("../output/output.csv", index=False)