In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ortools.linear_solver import pywraplp as OR
import random
import gurobipy as gp
from gurobipy import GRB
import matplotlib.colors as mcolors

import pickle

# Notebook-wide configuration. Later cells read these names directly.

# Names of every CSS4 colour known to matplotlib.
COLOR_LIST = list(mcolors.CSS4_COLORS.keys())
# Number of random 2-D points to generate.
NUM_POINTS = 500
# Number of cluster centers the optimisation models must select.
NUM_CLUSTERS = 5
# Scale factor applied to the uniform [0, 1) samples, i.e. the side length
# of the square the random points live in.
SQUARE_SIZE = 600

# Side length of a square whose area is 1/NUM_CLUSTERS of the whole square.
# Used as the maximum distance at which a point may be paired with a
# candidate center, which limits the number of pair variables in the model.
TOLERANCE = ((SQUARE_SIZE**2/NUM_CLUSTERS)**0.5)
print(TOLERANCE)

The cell below generates `NUM_POINTS` (500) random points inside a `SQUARE_SIZE` × `SQUARE_SIZE` square, to be split into `NUM_CLUSTERS` (5) clusters.

In [None]:
# Containers shared with the following cells:
#   distances_dict[(a, b)] -> distance between points a and b
#   possible_pairs[a]      -> labels of the points close enough to a
distances_dict = {}
possible_pairs = {}

# Row 0 holds the x coordinates, row 1 the y coordinates.
random_points = SQUARE_SIZE*np.random.rand(2,NUM_POINTS)

# Label every point "point_<index>" and store its (x, y) coordinates.
point_dict = {
    "point_"+str(idx): (x_coord, y_coord)
    for idx, (x_coord, y_coord) in enumerate(zip(random_points[0], random_points[1]))
}

In [None]:
# Fill distances_dict with the Euclidean distance between every ordered pair
# of points (including each point with itself), and record in possible_pairs
# the points that lie within TOLERANCE of each point.
for src, src_xy in point_dict.items():
    within_tolerance = []

    for dst, dst_xy in point_dict.items():
        delta_x = src_xy[0] - dst_xy[0]
        delta_y = src_xy[1] - dst_xy[1]
        separation = (delta_x**2+delta_y**2)**0.5
        distances_dict[(src,dst)] = separation
        if separation <= TOLERANCE:
            within_tolerance.append(dst)

    possible_pairs[src] = within_tolerance

In [None]:
# Scatter plot of the raw, not-yet-clustered points
# (row 0 of random_points on the x axis, row 1 on the y axis).
plot = plt.scatter(random_points[0],random_points[1],c="green")
plt.xlabel("x")
plt.ylabel("y")

In [None]:
# Standard k-medians implementation, written as a binary program for Gurobi.
#
# Variables
#   isCenter[j] = 1 if point j is selected as a cluster center
#   pairs[i,j]  = 1 if point i is assigned to center j; only created for
#                 j in possible_pairs[i]
#
# Expressions are built with gp.quicksum rather than the built-in sum:
# sum() re-creates the whole linear expression for every added term, which is
# quadratic in the number of terms and slow for the large sums used here.

m = gp.Model("clustering")
pairs = {}
isCenter = {}
for i in possible_pairs.keys():
    isCenter[i] = m.addVar(vtype=GRB.BINARY, name = "isCenter[%s]" %i)
    for j in possible_pairs[i]:
        pairs[i,j] = m.addVar(vtype=GRB.BINARY, name = "pair{%s,%s}" % (i,j))

# Link isCenter with the assignment variables.
# Indexing pairs[i,j] for i in possible_pairs[j] relies on possible_pairs
# being symmetric (j in possible_pairs[i] exactly when i in possible_pairs[j]).
for j in possible_pairs.keys():
    # a center must have at least one point assigned to it
    m.addConstr(isCenter[j] <= gp.quicksum(pairs[i,j] for i in possible_pairs[j]))
    for i in possible_pairs[j]:
        # point i may only be assigned to j if j is a center
        m.addConstr(isCenter[j] - pairs[i,j] >= 0)

for i in possible_pairs.keys():
    # every point is assigned to exactly one center
    m.addConstr(gp.quicksum(pairs[i,j] for j in possible_pairs[i]) == 1)
    # a center is always assigned to itself
    m.addConstr(pairs[i,i] >= isCenter[i])

# exactly NUM_CLUSTERS centers are opened
m.addConstr(gp.quicksum(isCenter[j] for j in possible_pairs.keys()) == NUM_CLUSTERS)

# minimise the total distance between points and their assigned centers
m.setObjective(gp.quicksum(distances_dict[i,j]*pairs[i,j] for i in possible_pairs.keys() for j in possible_pairs[i]), GRB.MINIMIZE)
m.optimize()

# for v in m.getVars():
#     print('%s %g' % (v.varName, v.x))
# print('Obj: %g' % m.objVal)


In [None]:
# solve the k-medians problem while allowing 30 points to go un-classified
#
# Variables
#   isCenter[j] = 1 if point j is selected as a cluster center
#   outlier[i]  = 1 if point i is declared an outlier
#   pairs[i,j]  = 1 if point i is assigned to center j; only created for
#                 j in possible_pairs[i]
#
# Expressions are built with gp.quicksum rather than the built-in sum:
# sum() re-creates the whole linear expression for every added term, which is
# quadratic in the number of terms and slow for the large sums used here.

NUM_OUTLIERS = 30

m = gp.Model("clustering")
pairs = {}
isCenter = {}
outlier = {}
for i in possible_pairs.keys():
    isCenter[i] = m.addVar(vtype=GRB.BINARY, name = "isCenter[%s]" %i)
    outlier[i] = m.addVar(vtype=GRB.BINARY, name = "outlier[%s]" %i)
    for j in possible_pairs[i]:
        pairs[i,j] = m.addVar(vtype=GRB.BINARY, name = "pair{%s,%s}" % (i,j))

# Link isCenter with the assignment variables.
# Indexing pairs[i,j] for i in possible_pairs[j] relies on possible_pairs
# being symmetric (j in possible_pairs[i] exactly when i in possible_pairs[j]).
for j in possible_pairs.keys():
    # a center must have at least one point assigned to it
    m.addConstr(isCenter[j] <= gp.quicksum(pairs[i,j] for i in possible_pairs[j]))
    for i in possible_pairs[j]:
        # point i may only be assigned to j if j is a center
        m.addConstr(isCenter[j] - pairs[i,j] >= 0)

for i in possible_pairs.keys():
    # this constraint was changed to ensure that each point i is either part of at least one cluster, or declared an outlier
    # NOTE(review): because this is ">=" and outlier[i] carries no objective
    # cost, the model does not forbid a point from being both assigned and
    # flagged as an outlier; use "== 1" if the two must be mutually exclusive.
    m.addConstr(gp.quicksum(pairs[i,j] for j in possible_pairs[i]) + outlier[i] >= 1)
    # a center is always assigned to itself
    m.addConstr(pairs[i,i] >= isCenter[i])

# exactly NUM_CLUSTERS centers are opened, and at most NUM_OUTLIERS points are dropped
m.addConstr(gp.quicksum(isCenter[j] for j in possible_pairs.keys()) == NUM_CLUSTERS)
m.addConstr(gp.quicksum(outlier[i] for i in possible_pairs.keys()) <= NUM_OUTLIERS)

# minimise the total distance between points and their assigned centers
m.setObjective(gp.quicksum(distances_dict[i,j]*pairs[i,j] for i in possible_pairs.keys() for j in possible_pairs[i]), GRB.MINIMIZE)
m.optimize()

# for v in m.getVars():
#     print('%s %g' % (v.varName, v.x))
# print('Obj: %g' % m.objVal)


In [None]:
# Group the solved assignment by cluster center.
#
# Binary variables come back from the solver as floats that are only integral
# up to a tolerance (e.g. 0.9999999), so they are tested with "> 0.5" instead
# of "== 1". An exact comparison can silently skip a chosen center and then
# fail with a KeyError when a point assigned to that center is looked up.
centerDict = {}
for center in isCenter.keys():
    if isCenter[center].x > 0.5:
        centerDict[center] = []

# pair is (point, center); collect the coordinates of each assigned point.
for pair in pairs.keys():
    if pairs[pair].x > 0.5:
        centerDict[pair[1]].append(point_dict[pair[0]])

# Turn each list of (x, y) tuples into an {x: y} dict, the shape the plotting
# cell expects. Note that two points sharing the same x would collapse into one entry.
for center in centerDict.keys():
    centerDict[center] = dict(centerDict[center])

In [None]:
# Coordinates of every point the solver flagged as an outlier.
# Solver values for binaries are floats that are integral only up to a
# tolerance, so test with "> 0.5" rather than "== 1".
outs = []
for out in outlier:
    if outlier[out].x > 0.5:
        outs.append(point_dict[out])

In [None]:
# Draw every cluster in its own random colour and the outliers in red.

# NOTE(review): color_selection is not used below; the per-cluster colour is
# drawn from np.random.rand instead. Kept so the cell's names are unchanged.
color_selection = np.random.choice(COLOR_LIST,NUM_CLUSTERS)
for center in centerDict.keys():
    # one random RGB triple per cluster, shaped (1, 3)
    current_color = np.random.rand(1,3)
    # centerDict[center] maps x -> y; split it into parallel coordinate tuples
    (keys,values) = zip(*centerDict[center].items())
    plt.plot(keys,values,c=current_color)
    plt.scatter(keys,values,c=current_color)

# zip(*outs) yields nothing for an empty list, which would call plt.scatter()
# without its required x/y arguments, so only plot when there are outliers.
if outs:
    plt.scatter(*zip(*outs),c="red")


In [None]:
# Now, trying the above code on the taxi data.
# Locally, I copied the taxi_count_dict.pickle file from the MST clustering lab.
# These are number of rides hailed in 15? minute intervals for every day of one year.
# See the other lab for more info I guess

# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load a pickle that comes from a trusted source.
with open('data/taxi_count_dict.pickle', 'rb') as handle:
    # The unpickled object is handed straight to the DataFrame constructor.
    taxi_counts = pd.DataFrame(pickle.load(handle))
# Preview the first rows. .loc slices by label and includes both endpoints,
# so this shows labels 0 through 6 (assumes the default integer index — TODO confirm).
print(taxi_counts.loc[0:6])

In [None]:
# Number of rows in the taxi table; displayed as the cell's output.
len(taxi_counts)

In [None]:
# Preprocess again, this time for the taxi data: reset the shared containers
# and label each row "day_<row label>", storing its "count_vector" entry.
distances_dict = {}
possible_pairs = {}
point_dict = {
    "day_"+str(row): taxi_counts.loc[row]["count_vector"]
    for row in range(len(taxi_counts))
}

In [None]:
# Pairwise L1 (sum of absolute differences) distance between the count
# vectors of every ordered pair of days.

# Convert each count vector to an array once, instead of re-converting both
# operands inside the O(n^2) inner loop.
count_arrays = {day: np.array(counts) for day, counts in point_dict.items()}

for i, counts_i in count_arrays.items():
    temp_list = []

    for j, counts_j in count_arrays.items():
        distances_dict[(i,j)] = np.linalg.norm(counts_i - counts_j, ord=1)
        # Not sure what tolerance should be, so every day is kept as a
        # candidate center for every other day. If it is too slow, consider
        # filtering here with: distances_dict[(i,j)] <= TOLERANCE
        temp_list.append(j)
    possible_pairs[i] = temp_list

In [None]:
# Copy-pasted from above. TODO: put in a single function
#
# k-medians with outliers on the taxi data.
#
# Variables
#   isCenter[j] = 1 if day j is selected as a cluster center
#   outlier[i]  = 1 if day i is declared an outlier
#   pairs[i,j]  = 1 if day i is assigned to center j; only created for
#                 j in possible_pairs[i]
#
# Expressions are built with gp.quicksum rather than the built-in sum:
# sum() re-creates the whole linear expression for every added term, which is
# quadratic in the number of terms and slow for the large sums used here.

# NOTE: these rebind the notebook-wide NUM_OUTLIERS / NUM_CLUSTERS values.
NUM_OUTLIERS = 12
NUM_CLUSTERS = 4

m = gp.Model("clustering")
pairs = {}
isCenter = {}
outlier = {}
for i in possible_pairs.keys():
    isCenter[i] = m.addVar(vtype=GRB.BINARY, name = "isCenter[%s]" %i)
    outlier[i] = m.addVar(vtype=GRB.BINARY, name = "outlier[%s]" %i)
    for j in possible_pairs[i]:
        pairs[i,j] = m.addVar(vtype=GRB.BINARY, name = "pair{%s,%s}" % (i,j))

# Link isCenter with the assignment variables.
# Indexing pairs[i,j] for i in possible_pairs[j] relies on possible_pairs
# being symmetric (j in possible_pairs[i] exactly when i in possible_pairs[j]).
for j in possible_pairs.keys():
    # a center must have at least one day assigned to it
    m.addConstr(isCenter[j] <= gp.quicksum(pairs[i,j] for i in possible_pairs[j]))
    for i in possible_pairs[j]:
        # day i may only be assigned to j if j is a center
        m.addConstr(isCenter[j] - pairs[i,j] >= 0)

for i in possible_pairs.keys():
    # every day is assigned to at least one center, or declared an outlier
    m.addConstr(gp.quicksum(pairs[i,j] for j in possible_pairs[i]) + outlier[i] >= 1)
    # a center is always assigned to itself
    m.addConstr(pairs[i,i] >= isCenter[i])

# exactly NUM_CLUSTERS centers are opened, and at most NUM_OUTLIERS days are dropped
m.addConstr(gp.quicksum(isCenter[j] for j in possible_pairs.keys()) == NUM_CLUSTERS)
m.addConstr(gp.quicksum(outlier[i] for i in possible_pairs.keys()) <= NUM_OUTLIERS)

# minimise the total distance between days and their assigned centers
m.setObjective(gp.quicksum(distances_dict[i,j]*pairs[i,j] for i in possible_pairs.keys() for j in possible_pairs[i]), GRB.MINIMIZE)
m.optimize()

# for v in m.getVars():
#     print('%s %g' % (v.varName, v.x))
# print('Obj: %g' % m.objVal)


In [None]:
# TODO: The above cell took ~ 15 seconds for me to run. 
# It may be worth seeing how long it takes if re-implemented in or-tools with their free MIP solver.


In [None]:
# Read the solution back: which days are centers, which days belong to each
# center, and which days were declared outliers.
#
# Binary variables come back from the solver as floats that are only integral
# up to a tolerance (e.g. 0.9999999), so they are tested with "> 0.5" instead
# of "== 1". An exact comparison can silently skip a chosen center and then
# fail with a KeyError when a day assigned to that center is looked up.
centerDict = {}
for center in isCenter.keys():
    if isCenter[center].x > 0.5:
        centerDict[center] = []

# pair is (day, center)
for pair in pairs.keys():
    if pairs[pair].x > 0.5:
        centerDict[pair[1]].append(pair[0])

outs = []
for out in outlier:
    if outlier[out].x > 0.5:
        outs.append(out)


def _print_day(day):
    """Print a day label followed by its "m", "d" and "weekday" columns.

    `day` is a label of the form "day_<n>"; the text after the 4-character
    "day_" prefix is the row label in taxi_counts.
    """
    entry = taxi_counts.loc[int(day[4:])]
    print(day, "%s,%s:  %s" % (entry["m"],entry["d"],entry["weekday"]))


# A little bit to try to visualize the output.
# Indicates information about each of the outliers (month,day,day-of-week)
# and the same information about all the days, grouped by clusters.

print("OUTLIERS")
for day in outs:
    _print_day(day)

for center in centerDict:
    print("")
    print("CLUSTER")
    for day in centerDict[center]:
        _print_day(day)

In [None]:
# Visualization.
# Below is NOT working atm
# We want to embed the high-dimensional points into a 2-dimensional space so we can do a scatter plot.

# This code is copied from Vivek from some urban analytics class 
# def embed():
#     '''Generate 2D embedding by hellinger distance of 24D global "points" and store in global
#     "em". The computed distances and 24D vectors are also stored globally. The SMACOF algorithm 
#     is used for the projection.
#     '''
#     global vecs
#     global distances
#     global em
#     shuffle(points)
#     # Use normalized vectors
#     vecs = [norm(i.vec) for i in points]
#     distances = [[hellinger(a, b) for a in vecs] for b in tqdm(vecs)]

#     mds = manifold.MDS(
#         n_components=2, 
#         max_iter=300,
#         eps=1e-9,
#         random_state=np.random.RandomState(seed=0), 
#         dissimilarity="precomputed", 
#         n_jobs=1
#     )
    
#     em = mds.fit(distances).embedding_

# Here is an outline of what the above does:
# - treat the points as probability distributions (by normalizing)
# - find the pairwise hellinger distance between them
# - use "multidimensional scaling" to embed into two dimensions (See https://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html)


# Problems:
# - I don't think hellinger distance is appropriate
#   - In particular, we shouldn't normalize the data (unless we also want to normalize before clustering)
# - The MDS requires sklearn, which is very big. We can use this to generate a plot, but I don't think
#     running this part is appropriate for student use. 

# Possible steps 
# - Use some other standard clustering metrics to compare this clustering result against the MST clustering results.
