In [22]:
def rankroutes(actual_routes_file, routes_to_sort_file, driver_id, limit_actual_routes=0,
               limit_routes_to_sort=0, prints=False, numperm3=128, findbest=0, top_n=5):
    """Rank candidate routes by MinHash similarity to one driver's routes.

    Every route is turned into a set of string tokens (cities, city pairs,
    merchandise, binned quantities and several combinations of those), the
    token set is hashed into a MinHash signature, and each candidate route is
    scored by its mean estimated Jaccard similarity to the driver's routes.

    Expected route shape (as read by this function): a dict with the keys
    ``"id"``, ``"route"`` (a list of trips) and, for the driver routes,
    ``"driver"``. Each trip is a dict with ``"from"``, ``"to"`` and
    ``"merchandise"`` (a mapping from merchandise name to quantity).

    Args:
        actual_routes_file: Path of the JSON file holding the driver routes.
        routes_to_sort_file: When ``findbest == 0``, path of the JSON file
            holding the candidate routes. When ``findbest != 0``, the
            candidate routes themselves (an already loaded list).
        driver_id: Only routes whose ``"driver"`` equals this value are used
            as reference. In ``findbest`` mode the value ``0`` disables the
            filter so that all routes in the file are used.
        limit_actual_routes: If non-zero, keep only the first N reference
            routes (applied after the driver filter).
        limit_routes_to_sort: If non-zero, keep only the first N candidate
            routes. Ignored in ``findbest`` mode.
        prints: When truthy, print a progress message before each stage.
        numperm3: Number of permutations used for every MinHash signature.
        findbest: ``0`` returns a ranking of ids; any other value returns the
            trips of the single best candidate.
        top_n: Length of the returned ranking (default 5, the previously
            hard-coded value). ``None`` returns the complete ranking.

    Returns:
        ``findbest == 0``: list with the ids of the ``top_n`` best candidates,
        best first. ``findbest != 0``: the ``"route"`` value of the best
        candidate, or ``None`` when there are no candidates.
    """
    from datasketch import MinHash
    from itertools import combinations
    import json

    def log(message):
        # Progress output is opt-in through the ``prints`` argument.
        if prints:
            print(message)

    def load_json(filename):
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(filename, 'r', encoding='utf-8') as file:
            return json.load(file)

    def create_minhash(tokens):
        minhash = MinHash(num_perm=numperm3)
        for token in tokens:
            minhash.update(token.encode('utf-8'))
        return minhash

    def space_borders(data):
        """Map each merchandise to its (small/medium, medium/large) borders.

        The observed quantity range of a merchandise is cut into thirds.
        Merchandise that occurs only once has no usable range and maps to
        ``None``; a dedicated sentinel keeps that case distinct from a real
        border whose value happens to be 0.
        """
        # merch -> [minimum quantity, maximum quantity, number of occurrences]
        stats = {}
        for route in data:
            for trip in route["route"]:
                for merch, quantity in trip["merchandise"].items():
                    if merch in stats:
                        entry = stats[merch]
                        entry[0] = min(entry[0], quantity)
                        entry[1] = max(entry[1], quantity)
                        entry[2] += 1
                    else:
                        stats[merch] = [quantity, quantity, 1]

        borders = {}
        for merch, (lowest, highest, count) in stats.items():
            if count > 1:
                spread = highest - lowest
                borders[merch] = (lowest + ((1/3)*spread), lowest + ((2/3)*spread))
            else:
                borders[merch] = None
        return borders

    def quantity_bin(borders, quantity):
        # Without a usable range every quantity counts as "medium".
        if borders is None:
            return "medium"
        small_medium, medium_large = borders
        if quantity < small_medium:
            return "small"
        if quantity > medium_large:
            return "large"
        return "medium"

    def route_to_minhash(routes, mapborders):
        """Return ``[route id, MinHash signature]`` for every route."""
        signatures = []
        for route in routes:
            tokens = set()
            for trip in route["route"]:
                origin, destination = trip["from"], trip["to"]
                merchandise = trip["merchandise"]
                # 0: the single cities
                tokens.add(origin)
                tokens.add(destination)
                # 1: the connection between the cities
                tokens.add(origin+"-"+destination)
                for merch, quantity in merchandise.items():
                    # 2: the single merchandise
                    tokens.add(merch)
                    # 3: the merchandise with its binned quantity
                    tokens.add(merch+"-"+quantity_bin(mapborders[merch], quantity))
                    # 4: the connection combined with the merchandise
                    tokens.add(origin+"-"+destination+"-"+merch)
                    # 5: each end of the trip combined with the merchandise
                    tokens.add(origin+"-"+merch)
                    tokens.add(destination+"-"+merch)
                # 6: every unordered pair of merchandise within the trip.
                # Sorting gives each pair one canonical spelling, so the same
                # pair yields the same token in every route regardless of the
                # order in which the merchandise is listed.
                for pair in combinations(merchandise, 2):
                    tokens.add("-".join(sorted(pair)))
            signatures.append([route["id"], create_minhash(tokens)])
        return signatures

    def rank_routes(reference_signatures, candidate_signatures):
        """Return candidate ids ordered by mean similarity, best first."""
        scored = []
        for route_id, candidate in candidate_signatures:
            if reference_signatures:
                total = sum(
                    (candidate.jaccard(reference) for _, reference in reference_signatures),
                    0.0,
                )
                mean_similarity = total / len(reference_signatures)
            else:
                # No reference routes (e.g. unknown driver): nothing to
                # compare against, so score 0 instead of dividing by zero.
                mean_similarity = 0.0
            scored.append((route_id, mean_similarity))
        # The sort is stable, so ties keep their input order.
        scored.sort(key=lambda entry: entry[1], reverse=True)
        return [route_id for route_id, _ in scored]

    log("loading data")
    driver_routes = load_json(actual_routes_file)
    if findbest != 0:
        # Candidates are handed over in memory; driver_id 0 means "all drivers".
        if driver_id != 0:
            driver_routes = [route for route in driver_routes if route["driver"] == driver_id]
        candidates = list(routes_to_sort_file)
    else:
        candidates = load_json(routes_to_sort_file)
        driver_routes = [route for route in driver_routes if route["driver"] == driver_id]
        if limit_routes_to_sort != 0:
            candidates = candidates[:limit_routes_to_sort]
    if limit_actual_routes != 0:
        driver_routes = driver_routes[:limit_actual_routes]

    log("mapping partition borders")
    # Borders are derived from both data sets so that every merchandise that
    # can show up during hashing has an entry.
    mapborders = space_borders(driver_routes + candidates)

    log("converting routes into minhash signatures")
    reference_signatures = route_to_minhash(driver_routes, mapborders)
    candidate_signatures = route_to_minhash(candidates, mapborders)

    log("comparing using jaccard")
    ranking = rank_routes(reference_signatures, candidate_signatures)

    if findbest != 0:
        if not ranking:
            return None
        best_id = ranking[0]
        for route in candidates:
            if route["id"] == best_id:
                return route["route"]
        return None
    return ranking[:top_n]

In [23]:
# Absolute paths of the two JSON inputs handed to rankroutes: the file with
# the actual routes and the file with the routes to be sorted.
filename = "/home/felix/Documents/Python/DataMining/DataMining/data/actual.json"
filename2 = "/home/felix/Documents/Python/DataMining/DataMining/data/standard.json"

# Rank the routes for driver "A", using at most 100 of that driver's actual
# routes and printing a progress message for every stage.
result = rankroutes(
    filename,
    filename2,
    "A",
    limit_actual_routes=100,
    prints=True,
)
print(result)

loading data
mapping partition borders
converting routes into minhash signatures
comparing using jaccard
['s1', 's11', 's17', 's2', 's6']
