<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2019-02-28 19:24:47 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.2
pandas 0.23.4
sklearn 0.20.2
matplotlib 2.2.3


- https://vene.ro/blog/kemeny-young-optimal-rank-aggregation-in-python.html
- http://www.aaai.org/Papers/AAAI/2006/AAAI06-099.pdf
- https://github.com/ethen8181/programming/blob/master/OR/OR_starter.ipynb

- The Athletic Wizard: Alicia Spinnet, Ginny Weasley, Gwendolyn Morgan, Robin Higgy, Debbie Muntz 
- The Daily Prophet: Alicia, Ginny, Robin, Gwendolyn, Debbie 
- Quidditch News: Robin, Ginny, Gwendolyn, Debbie, Alicia 
- Seeker Weekly: Gwendolyn, Ginny, Robin, Debbie, Alicia 
- The Quibbler: Debbie, Ginny, Robin, Gwendolyn, Alicia

In [2]:
cols = "Alicia Ginny Gwendolyn Robin Debbie".split()

ranks = np.array([[0, 1, 2, 3, 4],
                  [0, 1, 3, 2, 4],
                  [3, 1, 2, 4, 0],
                  [1, 2, 3, 4, 0],
                  [4, 1, 3, 2, 0]])

# ranks = np.array([[0, 1, 2, 3, 4],
#                   [0, 1, 3, 2, 4],
#                   [4, 1, 2, 0, 3],
#                   [4, 1, 0, 2, 3],
#                   [4, 1, 3, 2, 0]])

The Kendall tau distance between two ranked lists is defined as the number of pairwise disagreements in the relative rankings of items in the two lists.

In [3]:
from itertools import combinations


def kendall_tau_dist(rank1, rank2):
    tau = 0
    n_candidates = len(rank1)
    for i, j in combinations(range(n_candidates), 2):
        tau += np.sign(rank1[i] - rank1[j]) == np.sign(rank2[j] - rank2[i])

    return tau


kendall_tau_dist(ranks[0], ranks[3])

4

In [4]:
def kendall_tau_distance(s,t):
    """
    Computes the Kendall tau distance between two full lists of ranks,
    which counts all discordant pairs (where s(i) < s(j) but t(i) > t(j),
    or vice versa) and divides by:
            k*(k-1)/2
    This is a slow version of the distance; a faster version can be
    implemented using a version of merge sort (TODO).
    s,t should be array-like (lists are OK).
    If s,t are *not* full, this function should not be used.
    """
    numDiscordant = 0
    for i in range(0,len(s)):
        for j in range(i+1,len(t)):
            if (s[i] < s[j] and t[i] > t[j]):# or (s[i] > s[j] and t[i] < t[j]):
                numDiscordant += 1
    return numDiscordant# 2.0*numDiscordant/(len(s)*(len(s)-1))


kendall_tau_distance(ranks[0], ranks[3])

4

Now that we have a distance metric, we can formulate a loss function to minimize in rank-space. We are looking for a ranking $\hat\tau$ that satisfies

\begin{align}
\sum_i d(\hat\tau, \tau_i) \leq \sum_i d(\tau, \tau_i) \text{ for all } \tau
\end{align}

In [5]:
from itertools import permutations


def rank_agg_brute(ranks):
    min_dist = np.inf
    best_rank = None
    n_candidates = ranks.shape[1]
    
    count = 0
    for candidate_rank in permutations(range(n_candidates)):
        dist = np.sum(kendall_tau_dist(candidate_rank, rank) for rank in ranks)
        if dist < min_dist:
            min_dist = dist
            best_rank = candidate_rank
            
            
        count += 1
    
    return min_dist, best_rank

In [6]:
best_dist, best_rank = rank_agg_brute(ranks)
best_rank_name = [cols[i] for i in best_rank]

print('best dist: ', best_dist)
print('best rank: ', best_rank)
print('best rank name: ', best_rank_name)

best dist:  15
best rank:  (1, 2, 3, 4, 0)
best rank name:  ['Ginny', 'Gwendolyn', 'Robin', 'Debbie', 'Alicia']


In [9]:
def build_graph(ranks):
    n_voters, n_candidates = ranks.shape
    edge_weights = np.zeros((n_candidates, n_candidates), np.int)
    for i, j in combinations(range(n_candidates), 2):
        preference = ranks[:, i] - ranks[:, j]
        prefer_i_over_j = np.sum(preference < 0)
        prefer_j_over_i = np.sum(preference > 0)
        if prefer_i_over_j > prefer_j_over_i:
            edge_weights[i, j] = prefer_i_over_j - prefer_j_over_i
        elif prefer_i_over_j < prefer_j_over_i:
            edge_weights[j, i] = prefer_j_over_i - prefer_i_over_j
        # draw no edge if equivalent preference

    return edge_weights


build_graph(ranks)

array([[0, 1, 1, 3, 0],
       [0, 0, 5, 5, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0]])

In [10]:
from lp_solve import lp_solve

def rankaggr_lp(ranks):
    """Kemeny-Young optimal rank aggregation"""

    n_voters, n_candidates = ranks.shape
    
    # maximize c.T * x
    edge_weights = _build_graph(ranks)
    c = -1 * edge_weights.ravel()  

    idx = lambda i, j: n_candidates * i + j

    # constraints for every pair
    pairwise_constraints = np.zeros(((n_candidates * (n_candidates - 1)) / 2,
                                     n_candidates ** 2))
    for row, (i, j) in zip(pairwise_constraints,
                           combinations(range(n_candidates), 2)):
        row[[idx(i, j), idx(j, i)]] = 1

    # and for every cycle of length 3
    triangle_constraints = np.zeros(((n_candidates * (n_candidates - 1) *
                                     (n_candidates - 2)),
                                     n_candidates ** 2))
    for row, (i, j, k) in zip(triangle_constraints,
                              permutations(range(n_candidates), 3)):
        row[[idx(i, j), idx(j, k), idx(k, i)]] = 1

    constraints = np.vstack([pairwise_constraints, triangle_constraints])
    constraint_rhs = np.hstack([np.ones(len(pairwise_constraints)),
                                np.ones(len(triangle_constraints))])
    constraint_signs = np.hstack([np.zeros(len(pairwise_constraints)),  # ==
                                  np.ones(len(triangle_constraints))])  # >=

    obj, x, duals = lp_solve(c, constraints, constraint_rhs, constraint_signs,
                             xint=range(1, 1 + n_candidates ** 2))

    x = np.array(x).reshape((n_candidates, n_candidates))
    aggr_rank = x.sum(axis=1)

    return obj, aggr_rank

ModuleNotFoundError: No module named 'lp_solve'

Borda’s Method:

(1) Voters rank the entire list of candidates from first choice to last choice.

(2) For each ballot, the lowest rank candidate is given 1 point, the second lowest 2
points, and so on until the highest ranked candidate is given a number of points
equal to the number of candidates.

(3) The total number of points for each candidate is summed across all ballots. This
number of points is called the Borda count.

(4) The candidate with the highest Borda count wins

https://www2.math.uconn.edu/~gageonea/math1030f13/1-2_notes.pdf