In [None]:
import json

# Path of the JSON file with the actual routes; a later cell opens it again.
filename = "actual_routes.json"

# Read the complete routes file into memory in one go.
print("loading data into memory")
with open(filename, "r") as routes_file:
    data = json.load(routes_file)
print(f"loaded {len(data)} routes into memory")

In [None]:
#find the dimensions (every merchandise name ever seen) for each possible
#connection between two cities; one connection "<from>-<to>" is one subspace
print("determine subspaces and its dimensions")
dim_count={}

for route_info in data:
    for trip_info in route_info["route"]:
        conn_name=trip_info["from"]+"-"+trip_info["to"]
        #collect in a set first so repeated merchandise names are dropped cheaply
        dim_count.setdefault(conn_name, set()).update(trip_info["merchandise"])

#freeze every set into a tuple so each merchandise name gets a fixed position
#(e.g. 'milk' is dimension 0 and 'pens' dimension 1).
#The names are sorted because plain set iteration order for strings changes
#between interpreter runs (hash randomization), which would make the
#dimension order - and with it the centroid coordinates - irreproducible.
for conn_name in dim_count:
    dim_count[conn_name]=tuple(sorted(dim_count[conn_name]))
print("found "+str(len(dim_count))+" subspaces:")
print(dim_count)

In [None]:
#turn every trip into a numeric point inside the subspace of its connection,
#so that clustering can be run per connection afterwards
print("converting the data into data points for each subspaces")
data_points={known_conn: [] for known_conn in dim_count}

for route_info in data:
    for trip_info in route_info["route"]:
        conn_name=trip_info["from"]+"-"+trip_info["to"]
        dimensions=dim_count[conn_name]

        #merchandise that is absent from this trip keeps the value 0
        temp_point=[0] * len(dimensions)
        for merch, quantity in trip_info["merchandise"].items():
            temp_point[dimensions.index(merch)]=quantity

        data_points[conn_name].append(temp_point)
print("done")
print(data_points)

In [None]:
import numpy as np
import warnings
import matplotlib.pyplot as plt
import random
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler


#ignore all warnings for better readability of the cell output.
#NOTE: this installs a blanket "ignore" filter for every warning category and
#no visible later line restores the previous filters, so it is not actually
#temporary; warnings are neither shown nor raised as exceptions afterwards.
warnings.filterwarnings("ignore")

def sample(upper_limit1, amount_of_samples1, data1):
    """Draw rows from ``data1`` at random, without replacement.

    Args:
        upper_limit1: exclusive upper bound of the row indices to draw from.
        amount_of_samples1: number of distinct indices to draw.
        data1: indexable collection of rows.

    Returns:
        A list with one ``list`` copy per drawn row, in drawing order.

    Raises:
        ValueError: raised by ``random.sample`` when more indices are
            requested than ``range(upper_limit1)`` contains.
    """
    picked_rows = []
    for position in random.sample(range(upper_limit1), amount_of_samples1):
        picked_rows.append(list(data1[position]))
    return picked_rows

def round_list(lst, decimal_places=1):
    """Return a new list holding every element of ``lst`` rounded with ``round``.

    Args:
        lst: iterable of numbers.
        decimal_places: number of decimals passed to ``round`` (default 1).
    """
    return list(map(lambda value: round(value, decimal_places), lst))

#Cluster every subspace (connection between two cities) on its own:
# 1. draw a random sample of the points of the subspace,
# 2. choose the number of clusters k via the silhouette score on the
#    min-max scaled sample,
# 3. run k-means with that k on the whole min-max scaled subspace,
# 4. store the label of every point (ext_data_points) and statistics for
#    every cluster (clusterinfo).
#NOTE: neither random.sample nor KMeans is seeded in this cell, so the
#resulting clusters can differ between runs.
ext_data_points={} #connection name -> {point as tuple: cluster label}
clusterinfo={} #"<connection name>-<label>" -> {"count", "inertia", "centroid"}
count=0
for city in data_points:
    count=count+1
    labeling={}
    #deliberately not called `data`: that name holds the routes loaded in the
    #first cell and rebinding it here would break re-running earlier cells
    subspace_points = data_points[city]
    datasize=len(subspace_points)

    #limit the sample space: at most 100 points for small subspaces,
    #10% for medium ones and 1% for large ones
    if datasize<1000:
        amount_of_samples=min(datasize, 100)
    elif datasize<10000:
        amount_of_samples=round(datasize/10)
    else:
        amount_of_samples=round(datasize/100)
    #heuristic upper bound for the number of clusters: square root of the sample size
    max_expected_clusters=round(float(amount_of_samples)**(1/2))

    #check if all the sample points are the same and if they are, draw again
    #(up to 5 times); if they are still the same pass that knowledge on.
    #The comparison is made against the first point of the sample itself, so
    #a sample of identical points is detected whatever value it consists of.
    one_point_marker=True
    sample_space=[]
    for _ in range(5):
        sample_space=sample(datasize, amount_of_samples, subspace_points)
        if any(x != sample_space[0] for x in sample_space):
            one_point_marker=False
            break

    print(str(count)+"/"+str(len(data_points))+" trying to find "+str(max_expected_clusters)+ " clusters for subspace: "+ city +" in " + str(amount_of_samples)+" samples with datasize of " + str(datasize))

    # Calculate silhouette scores for different values of k using the sample
    silhouette_scores = []
    if one_point_marker==False and max_expected_clusters>2:
        #scale once; refitting the scaler on already scaled data changes nothing
        scaled_sample = MinMaxScaler().fit_transform(sample_space)
        #loop through possible amounts of clusters and stop at the first
        #silhouette score above 0.7
        for candidate_k in range(2, max_expected_clusters):
            try:
                candidate = KMeans(n_clusters=candidate_k)
                candidate.fit(scaled_sample)
                silhouette_scores.append(silhouette_score(scaled_sample, candidate.labels_))
            except ValueError:
                #silhouette_score rejects labelings it cannot score (e.g. only
                #one distinct label); larger k will not help, so stop here.
                #Warnings are globally ignored above and therefore never
                #arrive here as exceptions.
                break
            if max(silhouette_scores)>0.7:
                break

        #determine k using the sampled space; scores start at k=2, hence "+2".
        #An empty score list (first candidate failed) falls back to one cluster.
        max_ss=max(silhouette_scores, default=float("-inf"))
        if max_ss>0.6:
            k=silhouette_scores.index(max_ss)+2
        else:
            k=1
    else:
        k=1
    print("found " + str(k) + " clusters. Now perform clustering on the data:")
    #now use kmeans to find the labels of the points for the entire set
    scaler = MinMaxScaler()
    scaled_points = scaler.fit_transform(subspace_points)
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(scaled_points)

    print("done")
    print("building map with labels and cluster info")
    #build the extended data points set which also contains the labels of the points
    labels=kmeans.labels_

    for index, point in enumerate(subspace_points):
        #plain int so the label prints and formats like an ordinary number
        labeling[tuple(point)]=int(labels[index])
    ext_data_points[city]=labeling

    #build a dataset which contains info about each cluster;
    #centroids are mapped back to the original (unscaled) units once, up front
    centroids=scaler.inverse_transform(kmeans.cluster_centers_)
    for i in range(k):
        temp={}
        oneclusterdata=scaled_points[labels==i]
        temp["count"]=(len(oneclusterdata))
        #a 1-cluster k-means on the members yields the inertia of this cluster
        #alone (measured in the scaled space)
        kmeans1=KMeans(n_clusters=1)
        kmeans1.fit(oneclusterdata)
        temp["inertia"]=(kmeans1.inertia_)
        temp["centroid"]=tuple(round_list(centroids[i]))
        clusterinfo[city+"-"+str(i)]=temp
print("found "+str(len(clusterinfo))+" clusters")

print(ext_data_points)
print(clusterinfo)


In [None]:
#Now build the new, transformed dataset out of the old dataset and the gathered data
#the data we gathered are the mapping tool (dim_count) and the ext_data_points

#reload the routes from disk so this cell does not depend on whatever earlier
#cells left in `data`
with open(filename, 'r') as routes_file:
    data = json.load(routes_file)
#NOTE(review): only the first 10000 routes are converted - confirm this cap is intended
data=data[0:10000]

dim_map=dim_count
clust_map=ext_data_points

dataset=[]
for route_info in data:
    cont_trips=[]

    for trip_info in route_info["route"]:
        conn_name=trip_info["from"]+"-"+trip_info["to"]
        dimensions=dim_map[conn_name]

        #rebuild the trip's point exactly like it was built for the clustering,
        #so it can be used as lookup key for its cluster label
        temp_point=[0] * len(dimensions)
        for merch, quantity in trip_info["merchandise"].items():
            temp_point[dimensions.index(merch)]=quantity

        cluster=clust_map[conn_name][tuple(temp_point)]
        cont_trips.append(conn_name+"-"+str(cluster))

    dataset.append((
        route_info["id"],
        route_info["driver"],
        route_info["sroute"],
        tuple(cont_trips),
    ))

dataset=tuple(dataset)
for converted_route in dataset:
    print(converted_route)


#the structure of every entry is as follows:
#(id, driver, sroute, (city1-city2-clusternumber, ...))

In [None]:
#the output variables of this notebook are:
print(clusterinfo) #per-cluster info keyed "<from>-<to>-<label>": count, inertia and centroid
print(dim_map) #per-connection tuple of merchandise names; position i names dimension i of that connection's centroids
print(dataset) #converted actual routes: (id, driver, sroute, (trip cluster names...))