In [11]:
import numpy as np
import pandas as pd

def simple_matching(v1, v2):
    matching = np.sum(v1 == v2)
    
    similarity = matching / len(v1)
    return similarity

def extended_jaccard(v1, v2): 
    intersection = np.sum(np.logical_and(v1, v2))
    union = np.sum(np.logical_or(v1, v2))
    
    similarity = intersection / union
    return similarity

def cosine(v1, v2):
    dot_product = np.dot(v1, v2)
    
    # Compute Euclidean lengths
    norm1 = np.linalg.norm(v1) 
    norm2 = np.linalg.norm(v2)
    
    similarity = dot_product / (norm1 * norm2)
    return similarity

data = pd.read_csv('preprocessed_data.csv')

numerical = data.select_dtypes(include=['int', 'float']).columns.tolist()

data = data[numerical]

vector_11000 = data[data['CustomerKey'] == 11000].drop(columns=['CustomerKey']).values.flatten()
vector_11001 = data[data['CustomerKey'] == 11001].drop(columns=['CustomerKey']).values.flatten()
vector_11012 = data[data['CustomerKey'] == 11012].drop(columns=['CustomerKey']).values.flatten()


sim_simple_matching = simple_matching(vector_11000, vector_11001)
sim_ej = extended_jaccard(vector_11000, vector_11001)
sim_cosine = cosine(vector_11000, vector_11001)

print("Similarities between 11000 and 11001")
print("Simple matching:", sim_simple_matching)
print("Extended Jaccard:", sim_ej)
print("Cosine:", sim_cosine)

sim_simple_matching = simple_matching(vector_11000, vector_11012)
sim_ej = extended_jaccard(vector_11000, vector_11012)
sim_cosine = cosine(vector_11000, vector_11012)

print("\nSimilarities between 11000 and 11012")
print("Simple matching:", sim_simple_matching)
print("Extended Jaccard:", sim_ej)
print("Cosine:", sim_cosine)

Similarities between 11000 and 11001
Simple matching: 0.25
Extended Jaccard: 0.75
Cosine: 0.9996178177253329

Similarities between 11000 and 11012
Simple matching: 0.25
Extended Jaccard: 0.5
Cosine: 0.9988883234689685
