In [28]:
def rankroutesfordriver(driverdata3, rank_routes1, prints=False):
    from datasketch import MinHash, MinHashLSH
    
    def convert_frozensets_to_string(input_set):
        result_set = set()
        for element in input_set:
            if isinstance(element, frozenset):
                for i, merch in enumerate(element):
                    if i==0:
                        result_string=merch
                    else:
                        result_string += "-"+str(merch)
                result_set.add(result_string)
            else:
                result_set.add(element)
        return result_set

    def create_minhash(set_data, numperm4):
        minhash = MinHash(num_perm=numperm4)
        for item in set_data:
            minhash.update(item.encode('utf-8'))
        return minhash

    def calculate_jaccard_similarity(minhash1, minhash2):
        return minhash1.jaccard(minhash2)

    def route_to_minhash(data1, mapborders, num_perm5):
        #0add the single cities
        #1add the conn between cities
        #2add single merch
        #3add the merch with binned quantities (s, m, l)
        #4add the conn between cities with the product
        #5combine "from" city with product and "to" city with product (without adding from or to)
        #6combine products with every other product within a specific trip 

        #add all the generated sets to a minhashing list
        minhash_list=[]
        
        for route in data1:
            vector=set()
            for trip in route["route"]:
                #0:
                vector.update([trip["from"],trip["to"]])
                #1:
                vector.update([trip["from"]+"-"+trip["to"]])
                for merch in trip["merchandise"]:
                    #2:
                    vector.update([merch])
                    #3:
                    if mapborders[merch][0]==0:
                            vector.update([merch+"-"+"medium"])
                    else:
                        if trip["merchandise"][merch]<mapborders[merch][0]:
                            vector.update([merch+"-"+"small"])
                        elif trip["merchandise"][merch]>mapborders[merch][1]:
                            vector.update([merch+"-"+"large"])
                        else:
                            vector.update([merch+"-"+"medium"])
                    #4:
                    vector.update([trip["from"]+"-"+trip["to"]+"-"+merch])
                    #5:
                    vector.update([trip["from"]+"-"+merch, trip["to"]+"-"+merch])
                    #6
                    for comb in trip["merchandise"]:
                        if comb != merch:
                            vector.add(frozenset((comb, merch)))
            #first normalise the vector to set elements or tuples
            vector=convert_frozensets_to_string(vector)
            #then add the set to the minhash_list with the reference actual route
            minhash_list.append([route["id"], create_minhash(vector, num_perm5)])
        return minhash_list
    
    def space_borders(data):
        #find vector dimensions for combination merchandise and the amount that is carried on that route where 
        #the amount is specified by a indicator (small, medium, large)
        #to do that the max and min is found for every merchandise

        #find min and max for a merch and the possible merch to be carried 
        mapminmax={}
        possiblecomb={}

        for route in data:
            for trip in route["route"]:
                for merch in trip["merchandise"]:
                    if merch+"-min" in mapminmax:
                        mapminmax[merch+"-min"]=min(mapminmax[merch+"-min"], trip["merchandise"][merch])
                    else:
                        mapminmax[merch+"-min"]= trip["merchandise"][merch]
                    if merch+"-max" in mapminmax:
                        mapminmax[merch+"-max"]=max(mapminmax[merch+"-max"], trip["merchandise"][merch])
                    else:
                        mapminmax[merch+"-max"]= trip["merchandise"][merch]
                    
                    if merch in possiblecomb:
                        possiblecomb[merch]=possiblecomb[merch]+1
                    else:
                        possiblecomb[merch]=1

        #determine borders for dividing range into partitions
        mapborders={}

        for item in possiblecomb:
            if possiblecomb[item]>1:
                smallmediumborder=mapminmax[item+"-"+"min"]+((1/3)*(mapminmax[item+"-"+"max"]-mapminmax[item+"-"+"min"]))
                mediumlargeborder=mapminmax[item+"-"+"min"]+((2/3)*(mapminmax[item+"-"+"max"]-mapminmax[item+"-"+"min"]))
                mapborders[item]=[smallmediumborder, mediumlargeborder]
            else:
                mapborders[item]=[0]
                
        return mapborders

    def rank_routes(minhaslistdata, minhashlistroutes):
        score1=[]
        for routes1 in minhashlistroutes:
            summation=0.0
            for routes2 in minhaslistdata:
                summation=summation+calculate_jaccard_similarity(routes1[1], routes2[1])
            score1.append([routes1[0], summation/len(minhaslistdata)])
        score1=sorted(score1, key=lambda x: x[1], reverse=True)
        return score1

    numperm3=128
    
    if(prints==True): print("map borders")
    mapborders=space_borders(driverdata3+rank_routes1)
    
    if(prints==True): print("convert routes into minhash signatures")
    minhash_list_data1=route_to_minhash(driverdata3, mapborders, numperm3)
    minhash_list_rankroutes1=route_to_minhash(rank_routes1, mapborders, numperm3)
        
    if(prints==True): print("Compare using jaccard")
    result=rank_routes(minhash_list_data1, minhash_list_rankroutes1)
    
    return result

In [31]:
import json

def load_data(filename):
    with open(filename, 'r') as file:
        data4 = json.load(file)
    return data4

print("load data")
#load driverdata
file_name_driver = "./data/actual.json"
driver_data=load_data(file_name_driver)

#load routes to be ranked
file_name_rroutes = "./data/standard.json"
rank_routes=load_data(file_name_rroutes)

driver_data1=[]
for route in driver_data:
    if route["driver"]=="I":
        driver_data1.append(route)

print("rankroutes")
result=rankroutesfordriver(driver_data1, rank_routes, prints=True)
print(result)


load data
rankroutes
map borders
convert routes into minhash signatures
Compare using jaccard
[['s16', 0.16987541333530748], ['s6', 0.16868013399466983], ['s18', 0.1637895136215576], ['s3', 0.15531922441022603], ['s17', 0.15181204348040667], ['s1', 0.14326772406475174], ['s12', 0.142796552660152], ['s13', 0.14151336245188037], ['s11', 0.1378218771592143], ['s8', 0.13562102087651762], ['s7', 0.12584749160991018], ['s10', 0.12578117288520382], ['s20', 0.12342685815812851], ['s15', 0.12058517791925773], ['s4', 0.10882285805942157], ['s14', 0.10830156203731123], ['s9', 0.10795608775046886], ['s19', 0.10302074079557792], ['s2', 0.10202673107294442], ['s5', 0.08865579656499852]]
