In [1]:
import os 
print(os.getcwd())

# Campaign configs are read from the "config" folder under the working directory.
path = os.path.join(os.getcwd(), 'config')
# Match on the file extension rather than the substring 'json', so that a file
# that merely has "json" somewhere in its name is not picked up.
# NOTE: os.listdir returns entries in arbitrary order, and a later cell picks
# its input by position (fs[1]), so the selected file depends on that order.
fs = [f for f in os.listdir(path) if f.endswith('.json')]
fs

/Users/ginazhou/Documents/GitHub/cards


['prod transfer cbi fct f24.json',
 'student.json',
 'sbx points.json',
 'welcome.json',
 'aero fx.json',
 'unique ac bistro.json',
 'aero 40 yr engagement.json',
 'f25 winter btv.json',
 'aero mobile wallet.json',
 'mbna estmt.json',
 'unique ac target.json',
 'aero 40 yr acquisition.json',
 'mbna spend stim.json',
 'f25 winter aero.json',
 'unique ac apply buy?.json',
 'n2c.json',
 'amazon grocery spend stim.json',
 'aero 10 yr contest.json']

In [2]:
def calc_jaccard(new_json, old_jsons,n):
    """Rank ``old_jsons`` by Jaccard similarity to ``new_json``.

    Each dict is reduced to the set of keys whose value is exactly ``True``;
    the score of a pair is ``|A & B| / |A | B|`` over those two sets.

    Args:
        new_json: dict to compare against every candidate.
        old_jsons: iterable of candidate dicts.
        n: upper bound applied to the ranked list via ``[:n]``.

    Returns:
        A list of ``(candidate, score)`` tuples, highest score first. Ties keep
        the relative order the candidates had in ``old_jsons``.
    """

    def true_keys(record):
        # Only a literal True counts as "set"; truthy values such as 1 or "yes" do not.
        return {name for name, flag in record.items() if flag is True}

    def jaccard(json1, json2):
        left = true_keys(json1)
        right = true_keys(json2)
        combined = left | right

        # Two records without any True flag: score 0.0 instead of dividing by zero.
        if len(combined) == 0:
            return 0.0
        return len(left & right) / len(combined)

    # Score every candidate, then order by score (stable, descending).
    ranked = sorted(
        ((candidate, jaccard(new_json, candidate)) for candidate in old_jsons),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return ranked[:n]


In [3]:
def calc_hamming(new_json, old_jsons,n):
    """Rank ``old_jsons`` by a Hamming-style similarity to ``new_json``.

    For each pair, the compared positions are the keys whose value is exactly
    ``True`` in at least one of the two dicts. The score is
    ``1 - mismatches / positions``, where a mismatch is a position whose
    values differ between the two dicts (a missing key counts as ``False``).

    NOTE(review): because positions where both sides are False/missing are
    never compared, this score coincides with the Jaccard score whenever the
    compared values are strict booleans. If a true Hamming similarity over all
    flags is wanted, the set of compared keys needs to be widened.

    Args:
        new_json: dict to compare against every candidate.
        old_jsons: iterable of candidate dicts.
        n: upper bound applied to the ranked list via ``[:n]``.

    Returns:
        A list of ``(candidate, score)`` tuples, highest score first. Ties keep
        the relative order the candidates had in ``old_jsons``.
    """

    def hamming(json1, json2):
        # keys flagged True on either side are the positions being compared
        set1 = {key for key, value in json1.items() if value is True}
        set2 = {key for key, value in json2.items() if value is True}
        keys = set1.union(set2)

        # Nothing to compare: return 0.0 (same convention as calc_jaccard)
        # instead of raising ZeroDivisionError.
        if not keys:
            return 0.0

        # Count positions where the two records disagree.
        mismatches = sum(json1.get(key, False) != json2.get(key, False) for key in keys)

        return 1 - (mismatches / len(keys))

    # repeat for every json
    scored_jsons = [(old_json, hamming(new_json, old_json)) for old_json in old_jsons]

    # sort in descending order (list.sort is stable, so ties keep input order)
    scored_jsons.sort(key=lambda x: x[1], reverse=True)

    # Return the top N similar JSONs with their scores
    return scored_jsons[:n]


In [None]:
import json

# How many ranked matches to request from each metric.
TOP_N = 10

# Load with context managers so every file handle is closed after parsing.
with open(os.path.join(path, fs[1]), 'r', encoding='utf-8') as fh:
    new_json = json.load(fh)

# NOTE: fs[1] is loaded here as well, so the new campaign is also compared
# against its own config.
old_jsons = []
for f in fs:
    with open(os.path.join(path, f), 'r', encoding='utf-8') as fh:
        old_jsons.append(json.load(fh))


# most similar offers by Jaccard score
top_matches = calc_jaccard(new_json, old_jsons, TOP_N)

# The heading reports the number of rows actually printed.
print(f"the top {len(top_matches)} campaigns matching {fs[1]} are: ")
for idx, (json_obj, score) in enumerate(top_matches, start=1):
    print(f"{idx}. {json_obj['title']} with a score of {score:.2f}")


# try hamming
top_matches = calc_hamming(new_json, old_jsons, TOP_N)
print()
print(f"the top {len(top_matches)} campaigns matching {fs[1]} are: ")
for idx, (json_obj, score) in enumerate(top_matches, start=1):
    print(f"{idx}. {json_obj['title']} with a score of {score:.2f}")

the top 3 campaigns matching student.json are: 
1. TD Cash Back Visa* Card Student Limited Time Offer with a score of 1.00
2. TD Rewards Visa and Other TD Visa Cards – Starbucks Rewards Redemption Bonus Offer with a score of 0.50
3. Mobile Wallet Offer with a score of 0.50
4. Limited Time Promotional TD Aeroplan Offer (Mobile Wallet Bonus) with a score of 0.40
5. MBNA Paperless Statements $5 Credit Offer with a score of 0.33
6. TD Cash Back Visa* Card New to Canada Offer with a score of 0.33
7. TD FX Offer with a score of 0.25
8. 40th Anniversary - Engagement Offer with a score of 0.25
9. TD Business Travel Visa Card 2025 Welcome Offer with a score of 0.25
10. Aeroplan 40th Anniversary Card Acquisition Offer with a score of 0.25

the top 3 campaigns matching student.json are: 
1. TD Cash Back Visa* Card Student Limited Time Offer with a score of 1.00
2. TD Rewards Visa and Other TD Visa Cards – Starbucks Rewards Redemption Bonus Offer with a score of 0.50
3. Mobile Wallet Offer with a 

In [None]:
import json
from statistics import mean

from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calc_cosine(new_json, old_jsons, n):
    """Score ``new_json`` against every entry of ``old_jsons`` by cosine similarity.

    All dicts are vectorized together in one ``DictVectorizer.fit_transform``
    call so that they share a single feature space; the vector of ``new_json``
    is then compared with each of the other vectors.

    Args:
        new_json: dict to compare against every candidate.
        old_jsons: list of candidate dicts.
        n: NOTE(review): accepted but not used. Every score is returned,
            unranked. Confirm whether a top-n cut like the other calc_*
            helpers was intended.

    Returns:
        ``(scores, average_score)`` where ``scores`` is a list of floats in
        the same order as ``old_jsons`` and ``average_score`` is their
        arithmetic mean. Returns ``([], 0.0)`` when ``old_jsons`` is empty.
    """
    # Nothing to compare against: avoid vectorizing / averaging an empty set.
    if not old_jsons:
        return [], 0.0

    # The new record goes last so it can be split off again by position.
    all_jsons = [*old_jsons, new_json]

    # vectorize json
    vectorizer = DictVectorizer(sparse=False)
    feature_vectors = vectorizer.fit_transform(all_jsons)

    # calc cosine similarity: one row (new) against all remaining rows (old)
    new_vector = feature_vectors[-1].reshape(1, -1)
    old_vectors = feature_vectors[:-1]
    scores = cosine_similarity(new_vector, old_vectors).flatten().tolist()

    # Calculate average similarity score
    average_score = mean(scores)

    return scores, average_score


# Calculate similarity scores
# calc_cosine is the cosine helper defined in this notebook; its third
# argument is required by the signature, so the number of candidates is passed.
scores, avg_score = calc_cosine(new_json, old_jsons, len(old_jsons))

print("Similarity Scores:", scores)
print("Average Similarity Score:", avg_score)