In [267]:
import random
import numpy as np
from scipy.optimize import linprog

# Generate a random assignment problem

In [333]:
n_elements = 30             # number of elements partitioned by the old clustering
n_added_elements = 0        # extra elements that exist only in the new clustering

# Generate old clusters: shuffle the elements, then deal them round-robin into N sets.
N = 4
# Materialise a list: random.shuffle works in place and cannot shuffle a Python 3 range.
elements = list(range(n_elements))
random.shuffle(elements)
old = [set(elements[i::N]) for i in range(N)]

# Generate new clusters the same way, over the (possibly larger) element set.
M = 5
elements = list(range(n_elements + n_added_elements))
random.shuffle(elements)
new = [set(elements[i::M]) for i in range(M)]

# Add M virtual empty old clusters, so that every new cluster can be matched to
# some old cluster even when there are fewer real old clusters than new ones.
# From here on N counts the real AND the virtual old clusters.
N = N + M
old.extend(set() for _ in range(M))

print(old)
print(new)

[set([1, 8, 10, 13, 15, 18, 22, 25]), set([6, 7, 9, 12, 14, 19, 20, 23]), set([0, 2, 3, 4, 21, 24, 28]), set([5, 11, 16, 17, 26, 27, 29]), set([]), set([]), set([]), set([]), set([])]
[set([6, 7, 11, 15, 20, 23]), set([0, 3, 8, 18, 21, 22]), set([2, 4, 5, 16, 19, 26]), set([9, 12, 14, 24, 25, 28]), set([1, 10, 13, 17, 27, 29])]


# Fixed problem

In [337]:
# Fixed problem: a small hand-written instance that replaces the random one above,
# so the cells below produce the same result on every run.
N = 2
M = 2
old = [set([1]), set([2, 3, 4, 5])]
new = [set([2]), set([1, 3, 4, 5])]

# Add M virtual empty old clusters so that every new cluster can be matched;
# from here on N counts the real AND the virtual old clusters.
N = N + M
old.extend(set() for _ in range(M))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(old)
print(new)

[set([1]), set([2, 3, 4, 5]), set([]), set([])]
[set([2]), set([1, 3, 4, 5])]


# Solve using linear programming

In [338]:
# cost[i, j] is the price of matching old cluster i with new cluster j:
# every element the two sets share lowers it by one, every element that is
# in only one of them raises it by one.
cost = np.zeros((N, M))

for i, s_i in enumerate(old):
    for j, s_j in enumerate(new):
        cost[i, j] = len(s_i.symmetric_difference(s_j)) - len(s_i.intersection(s_j))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(cost)

[[ 2.  2.]
 [ 2. -1.]
 [ 1.  4.]
 [ 1.  4.]]


In [339]:
# One LP variable per (old cluster, new cluster) edge.
n_edges = N * M
# ravel() flattens the (N, M) cost matrix row by row, so edge (i, j) is variable i*M + j.
c = cost.ravel()

# Each old cluster may be assigned to at most one new cluster:
# row i sums the M consecutive variables that belong to old cluster i.
A_ub = np.zeros((N, n_edges))
for i in range(N):
    A_ub[i, i*M:(i+1)*M] = 1.0
b_ub = np.ones(N)

# Each new cluster has to be assigned to exactly one old cluster:
# row j sums every M-th variable starting at j, i.e. all edges into new cluster j.
A_eq = np.zeros((M, n_edges))
for j in range(M):
    A_eq[j, j::M] = 1.0
b_eq = np.ones(M)

# No explicit bounds are passed, so linprog's default bounds apply to every variable.
r = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq)
print(r)

# Stop here if the solver did not converge, so later cells never interpret
# an invalid solution vector as an assignment.
if not r.success:
    raise RuntimeError("linprog failed: %s" % r.message)

  status: 0
   slack: array([ 1.,  0.,  0.,  1.])
 success: True
     fun: -0.0
       x: array([ 0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.])
 message: 'Optimization terminated successfully.'
     nit: 7


In [340]:
# r.x holds floats produced by the LP solver, so a selected edge may come back as
# 0.9999999 instead of exactly 1. Threshold at 0.5 rather than testing == 1.
# Reshaping to (N, M) undoes the row-by-row flattening used to build the LP.
assignment = np.asarray(r.x).reshape(N, M) > 0.5

for i in range(N):
    for j in range(M):
        if assignment[i, j]:
            print("%25s -> %25s" % (sorted(old[i]), sorted(new[j])))
            break
    else:
        # No edge selected for this old cluster: it has no successor.
        print("%25s -> %25s" % (sorted(old[i]), "DELETED"))

                      [1] ->                   DELETED
             [2, 3, 4, 5] ->              [1, 3, 4, 5]
                       [] ->                       [2]
                       [] ->                   DELETED
