In [1]:
# adapted from https://github.com/tdunning/python-llr

def cmp(a, b):
    return (a > b) - (a < b) 

from collections import Counter 
import math
from functools import reduce

def denormEntropy(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts. If the inputs sum to one, this is just the normal definition of entropy'''
    counts = list(counts)
    total = float(sum(counts))
    # Note tricky way to avoid 0*log(0)
    return -sum([k * math.log(k/total + (k==0)) for k in counts])

def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table'''
    return 2 * abs(denormEntropy([k11+k12, k21+k22]) +
                denormEntropy([k11+k21, k12+k22]) -
                denormEntropy([k11, k12, k21, k22]))

def llr_root(k11, k12, k21, k22):
    '''Computes a score for a 2x2 contingency table, but then adds a sign according to whether k11 is larger (result is positive) or smaller (result is negative) than might be expected. The magnitude of the result can be roughly interpreted on a scale similar to standard deviations'''
    row = k11 + k21
    total = (k11 + k12 + k21 + k22)
    sign = cmp(float(k11) / (k11 + k12), float(row) / total)
    return math.copysign(math.sqrt(llr_2x2(k11, k12, k21, k22)), sign)

In [2]:
import numpy as np

rawdata = np.array([
    [5,5,0,0,0,0],
    [0,0,5,5,0,0],
    [0,0,0,0,5,5],
    [0,1,5,5,5,0],
    [1,1,5,0,5,5],
    [5,5,0,5,1,1],
    [5,0,0,5,0,1],
    [5,5,5,0,1,0]
    ])
    
likes = np.array([[1 if x == 5 else 0 for x in row] for row in rawdata])
dislikes = np.array([[1 if x == 1 else 0 for x in row] for row in rawdata])


In [3]:
cooccurrence_matrix = np.dot(likes.transpose(), likes)
cooccurrence_matrix
cooccurrence_matrix_d = np.dot(dislikes.transpose(), dislikes)
cooccurrence_matrix_d 


array([[1, 1, 0, 0, 0, 0],
       [1, 2, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 2, 1],
       [0, 0, 0, 0, 1, 2]])

In [4]:
np.fill_diagonal(cooccurrence_matrix, 0)
cooccurrence_matrix
np.fill_diagonal(cooccurrence_matrix_d, 0)
cooccurrence_matrix_d


array([[0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0]])

In [5]:
size = cooccurrence_matrix.shape[0]
sums = np.array([row.sum() for row in cooccurrence_matrix[:,0:size]])
total = sums.sum()
size, sums, total

(6, array([6, 5, 7, 6, 5, 3]), 32)

In [6]:
size_d = cooccurrence_matrix_d.shape[0]
sums_d = np.array([row.sum() for row in cooccurrence_matrix_d[:,0:size]])
total_d = sums.sum()
size_d, sums_d, total_d

(6, array([1, 1, 0, 0, 1, 1]), 32)

In [7]:
size

6

In [8]:
conc_mult = np.zeros((size, size))
for i in range(0, size):
    for j in range(0, size):
        a_b = cooccurrence_matrix[i,j].tolist()
        a_not_b = (sums[i] - a_b).tolist()
        b_not_a = (sums[j] - a_b).tolist()
        not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
        conc_mult[i,j] = llr_root(a_b, a_not_b, b_not_a, not_ab)
        print(i, j, llr_root(a_b, a_not_b, b_not_a, not_ab))

0 0 -1.671630571360789
0 1 2.033620752749888
0 2 -0.4444608330501006
0 3 0.7721212322079151
0 4 -1.5102415112211136
0 5 -1.1470194399358786
1 0 2.033620752749888
1 1 -1.3647652539835755
1 2 -0.19582196118803194
1 3 1.1920928955078125e-07
1 4 -1.3647652539835755
1 5 -1.0369825130804433
2 0 -0.4444608330501006
2 1 -0.19582196118803194
2 2 -1.9932131287507497
2 3 0.5199643772432565
2 4 0.8219975087691875
2 5 0.4161095064184788
3 0 0.7721212322079151
3 1 1.1920928955078125e-07
3 2 0.5199643772432565
3 3 -1.671630571360789
3 4 1.1920928955078125e-07
3 5 -1.1470194399358786
4 0 -1.5102415112211136
4 1 -1.3647652539835755
4 2 0.8219975087691875
4 3 1.1920928955078125e-07
4 4 -1.3647652539835755
4 5 2.027561328129231
5 0 -1.1470194399358786
5 1 -1.0369825130804433
5 2 0.4161095064184788
5 3 -1.1470194399358786
5 4 2.027561328129231
5 5 -0.7885438972362838


In [9]:
conc_mult 

array([[ -1.67163057e+00,   2.03362075e+00,  -4.44460833e-01,
          7.72121232e-01,  -1.51024151e+00,  -1.14701944e+00],
       [  2.03362075e+00,  -1.36476525e+00,  -1.95821961e-01,
          1.19209290e-07,  -1.36476525e+00,  -1.03698251e+00],
       [ -4.44460833e-01,  -1.95821961e-01,  -1.99321313e+00,
          5.19964377e-01,   8.21997509e-01,   4.16109506e-01],
       [  7.72121232e-01,   1.19209290e-07,   5.19964377e-01,
         -1.67163057e+00,   1.19209290e-07,  -1.14701944e+00],
       [ -1.51024151e+00,  -1.36476525e+00,   8.21997509e-01,
          1.19209290e-07,  -1.36476525e+00,   2.02756133e+00],
       [ -1.14701944e+00,  -1.03698251e+00,   4.16109506e-01,
         -1.14701944e+00,   2.02756133e+00,  -7.88543897e-01]])

In [10]:
np.dot(conc_mult, likes[0,:].T)

array([ 0.36199018,  0.6688555 , -0.64028279,  0.77212135, -2.87500677,
       -2.18400195])

In [11]:
rawdata = np.array([
    [5,5,0,0,0,0],
    [0,0,5,5,0,0],
    [0,0,0,0,5,5],
    [0,1,5,5,5,0],
    [1,1,5,0,5,5],
    [5,5,0,5,1,1],
    [5,0,0,5,0,1],
    [5,5,5,0,1,0]
    ])

In [12]:
np.dot(conc_mult, likes[4,:].T)

array([-3.10172178, -2.59756973, -0.75510611, -0.62705494,  1.48479358,
        1.65512694])

In [13]:
conc_mult_d = np.zeros((size, size))
for i in range(0, size):
    for j in range(0, size):
        a_b = cooccurrence_matrix_d[i,j].tolist()
        a_not_b = (sums[i] - a_b).tolist()
        b_not_a = (sums[j] - a_b).tolist()
        not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
        conc_mult_d[i,j] = llr_root(a_b, a_not_b, b_not_a, not_ab)
        print(i, j, llr_root(a_b, a_not_b, b_not_a, not_ab))

0 0 -1.671630571360789
0 1 1.1920928955078125e-07
0 2 -1.825093906345677
0 3 -1.671630571360789
0 4 -1.5102415112211136
0 5 -1.1470194399358786
1 0 1.1920928955078125e-07
1 1 -1.3647652539835755
1 2 -1.648455857370266
1 3 -1.5102415112211136
1 4 -1.3647652539835755
1 5 -1.0369825130804433
2 0 -1.825093906345677
2 1 -1.648455857370266
2 2 -1.9932131287507497
2 3 -1.825093906345677
2 4 -1.648455857370266
2 5 -1.2514057472789795
3 0 -1.671630571360789
3 1 -1.5102415112211136
3 2 -1.825093906345677
3 3 -1.671630571360789
3 4 -1.5102415112211136
3 5 -1.1470194399358786
4 0 -1.5102415112211136
4 1 -1.3647652539835755
4 2 -1.648455857370266
4 3 -1.5102415112211136
4 4 -1.3647652539835755
4 5 0.7499903863888109
5 0 -1.1470194399358786
5 1 -1.0369825130804433
5 2 -1.2514057472789795
5 3 -1.1470194399358786
5 4 0.7499903863888203
5 5 -0.7885438972362838


In [14]:
for i in range(0, size):
    for j in range(0, size):
        a_b = cooccurrence_matrix[i,j].tolist()
        a_not_b = (sums[i] + cooccurrence_matrix_d[i,j] - a_b).tolist()
        b_not_a = (sums[j] + cooccurrence_matrix_d[j,i] - a_b).tolist()
        not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
        print(i, j, llr_root(a_b, a_not_b, b_not_a, not_ab))

0 0 -1.671630571360789
0 1 1.520155306996323
0 2 -0.4444608330501006
0 3 0.7721212322079151
0 4 -1.5102415112211136
0 5 -1.1470194399358786
1 0 1.520155306996323
1 1 -1.3647652539835755
1 2 -0.19582196118803194
1 3 1.1920928955078125e-07
1 4 -1.3647652539835755
1 5 -1.0369825130804433
2 0 -0.4444608330501006
2 1 -0.19582196118803194
2 2 -1.9932131287507497
2 3 0.5199643772432565
2 4 0.8219975087691875
2 5 0.4161095064184788
3 0 0.7721212322079151
3 1 1.1920928955078125e-07
3 2 0.5199643772432565
3 3 -1.671630571360789
3 4 1.1920928955078125e-07
3 5 -1.1470194399358786
4 0 -1.5102415112211136
4 1 -1.3647652539835755
4 2 0.8219975087691875
4 3 1.1920928955078125e-07
4 4 -1.3647652539835755
4 5 1.4677093811172777
5 0 -1.1470194399358786
5 1 -1.0369825130804433
5 2 0.4161095064184788
5 3 -1.1470194399358786
5 4 1.4677093811172777
5 5 -0.7885438972362838


In [15]:
conc_mult_d

array([[ -1.67163057e+00,   1.19209290e-07,  -1.82509391e+00,
         -1.67163057e+00,  -1.51024151e+00,  -1.14701944e+00],
       [  1.19209290e-07,  -1.36476525e+00,  -1.64845586e+00,
         -1.51024151e+00,  -1.36476525e+00,  -1.03698251e+00],
       [ -1.82509391e+00,  -1.64845586e+00,  -1.99321313e+00,
         -1.82509391e+00,  -1.64845586e+00,  -1.25140575e+00],
       [ -1.67163057e+00,  -1.51024151e+00,  -1.82509391e+00,
         -1.67163057e+00,  -1.51024151e+00,  -1.14701944e+00],
       [ -1.51024151e+00,  -1.36476525e+00,  -1.64845586e+00,
         -1.51024151e+00,  -1.36476525e+00,   7.49990386e-01],
       [ -1.14701944e+00,  -1.03698251e+00,  -1.25140575e+00,
         -1.14701944e+00,   7.49990386e-01,  -7.88543897e-01]])

### Exercise

So far we've used the "people who liked A were unusually likely to like B" statistics to make predictions, but we've ignored the information we have about dislikes. Can you find a way to improve the results by leveraging the 1 values in the rawdata (the dislikes)?

In [16]:
def get_llr_likes(i, j):
    a_b = cooccurrence_matrix[i,j].tolist()
    print(i,j)
    a_not_b = (sums[i] - a_b).tolist()
    b_not_a = (sums[j] - a_b).tolist()
    not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
    print(a_b, a_not_b, b_not_a, not_ab) 
    return llr_root(a_b, a_not_b, b_not_a, not_ab)
def get_one_like(i, j):
 
    return cooccurrence_matrix[i,j]

np.fromfunction(lambda i, j :  get_llr_likes(i,j), (cooccurrence_matrix.shape[0],cooccurrence_matrix.shape[1]) , dtype=int)

[[0 0 0 0 0 0]
 [1 1 1 1 1 1]
 [2 2 2 2 2 2]
 [3 3 3 3 3 3]
 [4 4 4 4 4 4]
 [5 5 5 5 5 5]] [[0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [0 1 2 3 4 5]]
[[0, 3, 1, 2, 0, 0], [3, 0, 1, 1, 0, 0], [1, 1, 0, 2, 2, 1], [2, 1, 2, 0, 1, 0], [0, 0, 2, 1, 0, 2], [0, 0, 1, 0, 2, 0]] [[6, 3, 5, 4, 6, 6], [2, 5, 4, 4, 5, 5], [6, 6, 7, 5, 5, 6], [4, 5, 4, 6, 5, 6], [5, 5, 3, 4, 5, 3], [3, 3, 2, 3, 1, 3]] [[6, 2, 6, 4, 5, 3], [3, 5, 6, 5, 5, 3], [5, 4, 7, 4, 3, 2], [4, 4, 5, 6, 4, 3], [6, 5, 5, 5, 5, 1], [6, 5, 6, 6, 3, 3]] [[20, 18, 18, 18, 21, 23], [18, 22, 19, 20, 22, 24], [18, 19, 18, 17, 18, 21], [18, 20, 17, 20, 20, 23], [21, 22, 18, 20, 22, 22], [23, 24, 21, 23, 22, 26]]


TypeError: float() argument must be a string or a number, not 'list'

In [None]:
 cooccurrence_matrix

In [None]:
conc_mult_d , conc_mult

In [None]:
np.dot(conc_mult)