In [None]:
from datasketch import MinHash, MinHashLSH
import json

# Load the route data from disk.
file_name = "./data/actual.json"

# The encoding is pinned to UTF-8 so that reading the JSON file does not depend
# on the platform's default locale encoding (which open() would otherwise use).
with open(file_name, 'r', encoding='utf-8') as file:
    data = json.load(file)


In [None]:
# Gather per-merchandise statistics that are later used to bin the carried
# amount into a size indicator (small, medium, large):
#   mapminmax    -> "<merch>-min" / "<merch>-max": smallest / largest amount seen
#   possiblecomb -> "<merch>": number of trips on which that merchandise appears
mapminmax = {}
possiblecomb = {}

for route in data:
    for trip in route["route"]:
        for merch, quantity in trip["merchandise"].items():
            min_key = merch + "-min"
            max_key = merch + "-max"
            # On first sight the current amount is both the minimum and the maximum.
            mapminmax[min_key] = min(mapminmax.get(min_key, quantity), quantity)
            mapminmax[max_key] = max(mapminmax.get(max_key, quantity), quantity)
            # Count how many trips carry this merchandise.
            possiblecomb[merch] = possiblecomb.get(merch, 0) + 1

# Work out the bin borders per merchandise: the observed [min, max] range is
# cut into thirds, giving a small/medium border and a medium/large border.
mapborders = {}

for item, occurrences in possiblecomb.items():
    if occurrences <= 1:
        # Seen at most once: there is no range to split, store the [0] marker.
        mapborders[item] = [0]
        continue
    lowest = mapminmax[item + "-min"]
    span = mapminmax[item + "-max"] - lowest
    mapborders[item] = [lowest + ((1/3) * span), lowest + ((2/3) * span)]

def convert_frozensets_to_string(input_set):
    """Return a copy of ``input_set`` with every frozenset flattened to a string.

    Each frozenset element is replaced by its members, converted with ``str``,
    sorted and joined with ``"-"``. Sorting makes the result independent of the
    frozenset's iteration order, so the same combination always yields the same
    string. Elements that are not frozensets are copied over unchanged.

    Args:
        input_set: iterable of elements, some of which may be frozensets.

    Returns:
        A new set containing the non-frozenset elements and one joined string
        per non-empty frozenset. Empty frozensets are skipped because they
        carry no members to join.
    """
    result_set = set()
    for element in input_set:
        if not isinstance(element, frozenset):
            result_set.add(element)
            continue
        if not element:
            # Nothing to join; avoid emitting an empty-string feature.
            continue
        # str() on every member (not just the later ones) so that non-string
        # members cannot break the concatenation.
        result_set.add("-".join(sorted(str(merch) for merch in element)))
    return result_set

def create_minhash(set_data, num_perm=256):
    """Build a MinHash signature from an iterable of strings.

    The original code constructed ``MinHash(num_perm=256)`` but discarded it and
    used a default-constructed ``MinHash()`` instead, so the requested number of
    permutations was never applied. The value is now passed to the object that
    is actually used.

    Args:
        set_data: iterable of ``str`` items; each one is UTF-8 encoded and fed
            into the signature.
        num_perm: number of permutation functions for the signature. Signatures
            that are compared with each other must share the same value.

    Returns:
        The populated ``MinHash`` object.
    """
    minhash = MinHash(num_perm=num_perm)
    for item in set_data:
        minhash.update(item.encode('utf-8'))
    return minhash

def calculate_jaccard_similarity(minhash1, minhash2):
    """Return the result of ``minhash1.jaccard(minhash2)``.

    Args:
        minhash1: object exposing a ``jaccard(other)`` method.
        minhash2: object passed as the argument to that method.

    Returns:
        Whatever ``minhash1.jaccard(minhash2)`` returns, unchanged.
    """
    similarity = minhash1.jaccard(minhash2)
    return similarity
#0 add the single cities
#1 add the connection between cities
#2 add the single merchandise
#3 add the merchandise with binned quantities (s, m, l)
#4 add the connection between cities together with the product
#5 combine the "from" city with the product and the "to" city with the product (without adding from or to)
#6 combine every product with every other product within a specific trip

# Build one feature set per route and store its MinHash signature, paired with
# the route id, in minhash_list.
minhash_list = []

for route in data:
    vector = set()
    for trip in route["route"]:
        origin = trip["from"]
        destination = trip["to"]
        cargo = trip["merchandise"]

        # Step 0: the individual cities.
        vector.add(origin)
        vector.add(destination)
        # Step 1: the connection between the two cities.
        leg = origin + "-" + destination
        vector.add(leg)

        for merch in cargo:
            # Step 2: the merchandise on its own.
            vector.add(merch)

            # Step 3: the merchandise with its binned quantity.
            borders = mapborders[merch]
            quantity = cargo[merch]
            if borders[0] == 0:
                # A first border of 0 means "no usable range": always medium.
                size = "medium"
            elif quantity < borders[0]:
                size = "small"
            elif quantity > borders[1]:
                size = "large"
            else:
                size = "medium"
            vector.add(merch + "-" + size)

            # Step 4: the connection together with the merchandise.
            vector.add(leg + "-" + merch)
            # Step 5: each end of the trip together with the merchandise.
            vector.add(origin + "-" + merch)
            vector.add(destination + "-" + merch)

            # Step 6: unordered pairs of this merchandise with every other
            # merchandise carried on the same trip.
            for comb in cargo:
                if comb != merch:
                    vector.add(frozenset((comb, merch)))

    # Flatten the frozenset pairs to strings before hashing.
    vector = convert_frozensets_to_string(vector)
    # Keep the signature together with the id of the route it describes.
    minhash_list.append([route["id"], create_minhash(vector)])


In [None]:
# Compare every route signature with every route signature (including itself
# and both orderings) and collect [id_a, id_b, estimated similarity] rows.
result = [
    [name1, name2, calculate_jaccard_similarity(minhash1, minhash2)]
    for name1, minhash1 in minhash_list
    for name2, minhash2 in minhash_list
]
# Show only the first 100 rows to keep the output readable.
print(result[:100])
