In [1]:
# adapted from https://github.com/tdunning/python-llr

def cmp(a, b):
    return (a > b) - (a < b) 

from collections import Counter 
import math
from functools import reduce

def denormEntropy(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts. If the inputs sum to one, this is just the normal definition of entropy'''
    counts = list(counts)
    total = float(sum(counts))
    # Note tricky way to avoid 0*log(0)
    return -sum([k * math.log(k/total + (k==0)) for k in counts])

def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table'''
    return 2 * abs(denormEntropy([k11+k12, k21+k22]) +
                denormEntropy([k11+k21, k12+k22]) -
                denormEntropy([k11, k12, k21, k22]))

def llr_root(k11, k12, k21, k22):
    '''Computes a score for a 2x2 contingency table, but then adds a sign according to whether k11 is larger (result is positive) or smaller (result is negative) than might be expected. The magnitude of the result can be roughly interpreted on a scale similar to standard deviations'''
    row = k11 + k21
    total = (k11 + k12 + k21 + k22)
    sign = cmp(float(k11) / (k11 + k12), float(row) / total)
    return math.copysign(math.sqrt(llr_2x2(k11, k12, k21, k22)), sign)

In [2]:
import numpy as np

rawdata = np.array([
    [5,5,0,0,0,0],
    [0,0,5,5,0,0],
    [0,0,0,0,5,5],
    [0,1,5,5,5,0],
    [1,1,5,0,5,5],
    [5,5,0,5,1,1],
    [5,0,0,5,0,1],
    [5,5,5,0,1,0]
    ])
    
likes = np.array([[1 if x == 5 else 0 for x in row] for row in rawdata])
likes

array([[1, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 0],
       [0, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0],
       [1, 1, 1, 0, 0, 0]])

In [3]:
cooccurrence_matrix = np.dot(likes.transpose(), likes)
cooccurrence_matrix

array([[4, 3, 1, 2, 0, 0],
       [3, 3, 1, 1, 0, 0],
       [1, 1, 4, 2, 2, 1],
       [2, 1, 2, 4, 1, 0],
       [0, 0, 2, 1, 3, 2],
       [0, 0, 1, 0, 2, 2]])

In [4]:
np.fill_diagonal(cooccurrence_matrix, 0)
cooccurrence_matrix

array([[0, 3, 1, 2, 0, 0],
       [3, 0, 1, 1, 0, 0],
       [1, 1, 0, 2, 2, 1],
       [2, 1, 2, 0, 1, 0],
       [0, 0, 2, 1, 0, 2],
       [0, 0, 1, 0, 2, 0]])

In [5]:
size = cooccurrence_matrix.shape[0]
sums = np.array([row.sum() for row in cooccurrence_matrix[:,0:size]])
total = sums.sum()
size, sums, total

(6, array([6, 5, 7, 6, 5, 3]), 32)

In [6]:
for i in range(0, size):
    for j in range(0, size):
        a_b = cooccurrence_matrix[i,j].tolist()
        a_not_b = (sums[i] - a_b).tolist()
        b_not_a = (sums[j] - a_b).tolist()
        not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
        print(i, j, llr_root(a_b, a_not_b, b_not_a, not_ab))

0 0 -1.671630571360789
0 1 2.033620752749888
0 2 -0.4444608330501006
0 3 0.7721212322079151
0 4 -1.5102415112211136
0 5 -1.1470194399358786
1 0 2.033620752749888
1 1 -1.3647652539835755
1 2 -0.19582196118803194
1 3 1.1920928955078125e-07
1 4 -1.3647652539835755
1 5 -1.0369825130804433
2 0 -0.4444608330501006
2 1 -0.19582196118803194
2 2 -1.9932131287507497
2 3 0.5199643772432565
2 4 0.8219975087691875
2 5 0.4161095064184788
3 0 0.7721212322079151
3 1 1.1920928955078125e-07
3 2 0.5199643772432565
3 3 -1.671630571360789
3 4 1.1920928955078125e-07
3 5 -1.1470194399358786
4 0 -1.5102415112211136
4 1 -1.3647652539835755
4 2 0.8219975087691875
4 3 1.1920928955078125e-07
4 4 -1.3647652539835755
4 5 2.027561328129231
5 0 -1.1470194399358786
5 1 -1.0369825130804433
5 2 0.4161095064184788
5 3 -1.1470194399358786
5 4 2.027561328129231
5 5 -0.7885438972362838


### Exercise

So far we've used the "people who liked A were unusually likely to like B" statistics to make predictions, but we've ignored the information we have about dislikes. Can you find a way to improve the results by leveraging the 1 values in the rawdata (the dislikes)?