In [1]:
import os
import json

# Get the current working directory, select file path for actual routes in subfolder 'data'
current_directory = os.getcwd()
subfolder_path = 'data'
file_name = "actual.json"
file_path = os.path.join(current_directory, subfolder_path, file_name)
prints = True

#import data
if(prints==True): print("loading data into memory")
with open(file_path, 'r') as file:
    data = json.load(file)
if(prints==True): print("loaded " + str(len(data)) + " routes into memory")

loading data into memory
loaded 100000 routes into memory


In [2]:
#find the amount of dimensions (possible merch) for all the possible 
#connections between cities 
if(prints==True): print("determine subspaces and its dimensions")
dim_count = {}

for route_info in data:
    for trip_info in route_info["route"]:
        
        set_of_items = set()
        
        for merchandise in trip_info["merchandise"]:
            set_of_items.add(merchandise)

        conn_name = trip_info["from"] + "-" + trip_info["to"]
        
        if conn_name not in dim_count:
            dim_count[conn_name] = set_of_items
        else:
            dim_count[conn_name].update(set_of_items)

#create a mapping method by converting the sets (first for speed) into
#tuples which take less memory than lists
#we do this so we can say that 'pens' is for example the first dimension
#and 'milk' the second  
for conn_name in dim_count:
    dim_count[conn_name] = (tuple(dim_count[conn_name]))
if(prints==True): print("found " + str(len(dim_count)) + " subspaces:")
#if(prints==True): print(dim_count)

determine subspaces and its dimensions
found 380 subspaces:


In [3]:
#convert the data into lists of data points so the clustering can be applied
if(prints==True): print("converting the data into data points for each subspaces")
data_points = {}
for conn_name in dim_count:
    data_points[conn_name] = []

for route_info in data:
    for trip_info in route_info["route"]:
        conn_name = trip_info["from"] + "-" + trip_info["to"]
        
        temp_point = [0] * len(dim_count[conn_name])
        
        for merch in trip_info["merchandise"]:
            index = dim_count[conn_name].index(merch)
            temp_point[index] = trip_info["merchandise"][merch]
                    
        data_points[conn_name].append(temp_point) #change to list again if needed
if(prints==True): print("done")
#if(prints==True): print(data_points)

converting the data into data points for each subspaces
done


In [4]:
import numpy as np
import warnings
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
prints = False


#ignore warnings temporarily for better readability
warnings.filterwarnings("ignore")

def sample(upper_limit1, amount_of_samples1, data1):
    rand_numbs = random.sample(range(0, upper_limit1), amount_of_samples1)
    sample_space = [list(data1[i]) for i in rand_numbs]
    return sample_space

def round_list(lst, decimal_places=1):
    rounded_list = [round(element, decimal_places) for element in lst]
    return rounded_list

ext_data_points = {}
clusterinfo = {}
count = 0
for city in data_points:
    count = count + 1
    max_expected_clusters = 0
    labeling = {}
    data = data_points[city]
    #limit the sample space
    datasize = len(data)
    upper_limit = datasize
    if datasize < 1000:
        amount_of_samples = min(datasize, 100)
        max_expected_clusters = round(float(amount_of_samples)**(1/2))
    elif datasize < 10000:
        amount_of_samples = round(datasize/10)
        max_expected_clusters = round(float(amount_of_samples)**(1/2))

    else:
        amount_of_samples = round(datasize/100)
        max_expected_clusters = round(float(amount_of_samples)**(1/2))

    sample_space = []
    
    #check if all the samplepoint    print(amount_of_samples)
    #are the same and if they are try again 5 times
    #if still the same pass the knowledge on
    for _ in range(5):
        one_point_marker = True
        sample_space = sample(upper_limit, amount_of_samples, data)
        if all(x == data[0] for x in sample_space) != True:
            one_point_marker = False
            break
            
    # Calculate silhouette scores for different values of k using samples 
    silhouette_scores = []
    #limit expected amount of clusters proportional to the amount of datapoints we can maybe say that for every 2% of dataset there may be a cluster existing
    
    K_range = range(2, max_expected_clusters)

    if(prints==True): print(str(count) + "/"+str(len(data_points)) + " trying to find " + str(max_expected_clusters) + " clusters for subspace: " + city + " in " + str(amount_of_samples) + " samples with datasize of " + str(datasize))

    
    if (one_point_marker == False) and (max_expected_clusters > 2):
        #loop through possible amount of clusters and stop for first silhouette with score >0.7
        for k in K_range:
            try:
                scaler = MinMaxScaler()
                sample_space = scaler.fit_transform(sample_space)
                kmeans = KMeans(n_clusters=k)
                kmeans.fit(sample_space)
                labels = kmeans.labels_
                silhouette_scores.append(silhouette_score(sample_space, labels))
            except ConvergenceWarning as e:
                break
            if max(silhouette_scores) > 0.7:
                break
        
        #determine k using the sampled space
        max_ss = max(silhouette_scores)
        if max_ss > 0.6:
            k = silhouette_scores.index(max_ss) + 2
        else:
            k = 1
            
    else:
        k = 1
    if(prints==True): print("found " + str(k) + " clusters. Now perform clustering on the data:")
    #now use kmeans to find the labels of the points for the entire set
    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data)

    if(prints==True): print("done")
    if(prints==True): print("building map with labels and cluster info")
    #build the extended data points set which also contain the labels of the points
    labels = kmeans.labels_
    
    for index, point in enumerate(data_points[city]):
        labeling[tuple(point)] = labels[index]
    ext_data_points[city] = labeling
    
    #build a dataset which contains info about each cluster
    for i in range(k):
        temp = {}
        oneclusterdata = data[labels==i]
        temp["count"] = (len(oneclusterdata))
        kmeans1 = KMeans(n_clusters=1)
        kmeans1.fit(oneclusterdata)
        temp["inertia"] = (kmeans1.inertia_)
        temp["centroid"] = tuple(round_list(scaler.inverse_transform(kmeans.cluster_centers_)[i]))
        clusterinfo[city + "-" + str(i)] = temp
if(prints==True): print("found " + str(len(clusterinfo)) + " clusters")

#if(prints==True): print(ext_data_points)
#if(prints==True): print(clusterinfo)

In [5]:
#Now build the new, transformed dataset out of the old dataset and the gathered data
#the data we gathered are the mapping tool (dim_count) and the ext_data_points
with open(file_path, 'r') as file:
    data = json.load(file)

dataset = []
dim_map = dim_count
clust_map = ext_data_points

for route_info in data:
    conv_route = []
    cont_trips = []
    
    
    for trip_info in route_info["route"]:
        conn_name = trip_info["from"] + "-" + trip_info["to"]
        
        temp_point = [0] * len(dim_map[conn_name])
        
        for merch in trip_info["merchandise"]:
            index = dim_map[conn_name].index(merch)
            temp_point[index] = trip_info["merchandise"][merch]
        cluster = clust_map[conn_name][tuple(temp_point)]
        trip_name = conn_name + "-" + str(cluster)
        cont_trips.append(trip_name)
        
    conv_route.append(route_info["id"])
    conv_route.append(route_info["driver"])
    conv_route.append(route_info["sroute"])
    conv_route.append(tuple(cont_trips))
    dataset.append(tuple(conv_route))
    
dataset = tuple(dataset)
#for x in dataset:
#    if(prints==True): print(x)


#the structure is as follows:
#(id, driver, sroute, (city1-city2-clusternumber))

In [6]:
#remove the singletons not frequent enough, depending on a certain thresehold. 
#In this case, I've chosen 100. 
#The function outputs a tuple with the frequent items and not frequently enough items.
def prune(X, thresehold):
    reduce = []
    for i in X:
        if X[i] < thresehold:
            reduce.append(((i,), X[i]))
    for i in reduce:
        X.pop(i[0][0])
    return (X, reduce)

In [7]:
#A function that, given two trips, outputs wether they could be chained or not,
#for example, if the first trip ends in Milano, it tells you wether the second one
#starts in Milano too. In this way, we only create "possible couples" with the next function.
def match(x, y):
    w1 = x
    l1 = len(w1)
    w1 = w1[:l1-2]
    index1 = w1.index('-')
    w1 = w1[index1+1:]
    
    w2 = y
    l2 = len(w2)
    w2 = w2[:l2-2]
    index2 = w2.index('-')
    w2 = w2[:index2]
    
    return w1 == w2 

In [8]:
#using the above function we create couples.It also gives us a list 
#of the singletons that we could not extend, and a list of the ones we could.
#We need the latter in case the couples don't pass the prune in the next step;
#in that case, we would go back one step and offer the routes that created that couple.

def couples(X):
    nextcandidates = []
    extend = []
    for i in X:
        extend.append(((i,), X[i]))
    lista = []
    for i in X:
        for j in X:
            if match(i, j):
                if ((i,), X[i]) in extend:
                    extend.remove(((i,), X[i]))
                if ((j,), X[j]) in extend:
                    extend.remove(((j,), X[j]))
                    
                if (i, j) not in lista:
                    lista.append((i, j))
                if ((i,), X[i]) not in nextcandidates:
                        nextcandidates.append(((i,), X[i]))
                if ((j,), X[j]) not in nextcandidates:
                        nextcandidates.append(((j,), X[j]))
    return (lista, extend, nextcandidates)

In [9]:
#this function tells us wether a given sequence is a subsequence of a longer one.
#basically we are certifying if a given subroute is part of a bigger route in the dataset.

def subsequence(m, S):
    l = len(m)
    L = len(S)
    for i in range(0, L-l+1):
        if S[i:i+l] == m:
            return True
    return False

In [10]:
#given possible frequent tuples it creates a dictionary with the occurrences
#of such tuples as subsequences in the dataset.

def find(candidates, X):
    freq = {}
    for i in candidates:
        for j in X:
            if subsequence(i, j[3]):
                if i in freq:
                    freq[i] = freq[i]+1
                else:
                    freq[i] = 1
    return freq

In [11]:
#this function takes a list of tuples of the same length and combines them
#to create new tuples of length + 1. Furthermore, it only creates "logical"
#tuples. That is, it won't combine (1,2,3) with (1,2,5) but it will combine 
#(1,2,3) with (2,3,4) to create (1,2,3,4). It also gives as a list of tuples
#we could not extend, and the ones we could, for analogous reasons as in the 
#case of the function couples.

def combine(X):
    extend = []
    for i in X:
        extend.append((i, X[i]))
    nextcandidates = []
    lista = []
    if len(X) != 0:
        for i in X:
            l = len(i)
            break
        for tuple1 in X:
            for tuple2 in X:
                if tuple2[:l-1] == tuple1[1:]:
                    newtuple = tuple1 + (tuple2[l-1],)
                    if (tuple1, X[tuple1]) in extend:
                        extend.remove((tuple1, X[tuple1]))
                    if (tuple2, X[tuple2]) in extend:
                        extend.remove((tuple2, X[tuple2]))
                    if newtuple not in lista:
                        lista.append(newtuple)
                    if (tuple1, X[tuple1]) not in nextcandidates:
                        nextcandidates.append((tuple1, X[tuple1]))
                    if (tuple2, X[tuple2]) not in nextcandidates:
                        nextcandidates.append((tuple2, X[tuple2]))
    return (lista, extend, nextcandidates)

In [12]:
#It takes a trip-clusters and converts it into an actual trip.

def cluster_to_info(x,clusterinfo,dim_map):
    
    w1 = x
    l1 = len(w1)
    w1 = w1[:l1-2]
    index1 = w1.index('-')
    w1 = w1[index1+1:]
    
    w2 = x[:l1-2]
    index2=w2.index('-')
    w2 = w2[:index2]
    
    trip = {}
    trip['from'] = w2
    
    
    info = clusterinfo[x]['centroid']
    l = len(x) - 2
    merch = dim_map[x[:l]]
    s = {}
    for i in range(len(merch)):
         if info[i] != 0:
            s[merch[i]] = info[i]
            
    trip['merchandise'] = s
    trip['to'] = w1
    
    return trip

In [13]:
#It takes all of the cluster-routes and convert them into actual routes.

def offeroutes(OFFER):
    lista = []
    L = len(OFFER)
    for i in range(L):
        ruta = {}
        ruta['id'] = 's'+str(i+1)
        TRIPS = []
        for j in OFFER[i][0]:
            trip = cluster_to_info(j, clusterinfo, dim_map)
            TRIPS.append(trip)
        ruta['route'] = TRIPS
        lista.append(ruta)
    return lista

In [14]:
def FREQUENT_ITEMS(dataset, thresehold):
    OFFER = []
        
    freq = {}
    for x in dataset:
        for i in x[3]:
            if i not in freq:
                freq[i] = 1
            else:
                freq[i] = freq[i] + 1
    
    #we can also use clusterinfo to get freq
    #freq = {}
    #for i in clusterinfo:
    #    freq[i] = clusterinfo[i]['count']
    
    
    S = prune(freq,thresehold)[0]
    (candidates, extend, nextcandidates) = couples(S)
    OFFER = OFFER + extend
    
    
    while len(candidates) != 0:
        new = find(candidates, dataset)
        (S, H) = prune(new, thresehold)
        count=0
        for j in nextcandidates:
            for i in H:
                if subsequence(j[0], i[0]):
                    OFFER = OFFER + [j]
                    count = count+1
            if count == 0:
                for i in S:
                    if subsequence(j[0], i):
                        count = count+1
            if count == 0:
                OFFER = OFFER + [j]
                
        (candidates, extend, nextcandidates) = combine(S)
        OFFER = OFFER + extend
    return offeroutes(OFFER)

In [15]:
OFFER = FREQUENT_ITEMS(dataset, len(dataset)/40)
OFFER

[{'id': 's1',
  'route': [{'from': 'Modena',
    'merchandise': {'Water': 8.0,
     'Potatoes': 3.9,
     'Butter': 14.6,
     'Honey': 27.0,
     'Pasta': 4.8},
    'to': 'Genova'}]},
 {'id': 's2',
  'route': [{'from': 'Genova',
    'merchandise': {'Water': 0.2,
     'Milk': 0.1,
     'Chocolate': 0.2,
     'Rice': 0.2,
     'Pens': 0.1,
     'Potatoes': 0.2,
     'Butter': 0.1,
     'Apples': 3.4,
     'Honey': 0.1,
     'Carrots': 0.2,
     'Bananas': 28.0,
     'Fish': 0.2,
     'Bread': 0.2,
     'Pasta': 0.2,
     'Yogurt': 0.2,
     'Tea': 0.2,
     'Tomatoes': 0.2,
     'Meat': 10.5},
    'to': 'Bolzano'}]},
 {'id': 's3',
  'route': [{'from': 'Lecce',
    'merchandise': {'Chocolate': 25.0,
     'Potatoes': 15.0,
     'Yogurt': 11.0,
     'Tomatoes': 11.3,
     'Meat': 9.3},
    'to': 'Padua'},
   {'from': 'Padua',
    'merchandise': {'Cheese': 1.1,
     'Chocolate': 7.5,
     'Potatoes': 5.0,
     'Beer': 18.2,
     'Honey': 1.2,
     'Fish': 19.2,
     'Meat': 16.5},
    'to':