In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
# Load the document/attribute table. The path is relative, so it is resolved
# against the kernel's current working directory.
attributes_df = pd.read_csv(filepath_or_buffer=r'attributes.csv')

In [3]:
# Preview the table exactly as parsed from attributes.csv.
attributes_df

Unnamed: 0,docid,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family,Unnamed: 11,Unnamed: 12
0,1,1,0,1,0,1,1,0,0,0,1,,
1,2,0,1,1,1,0,0,0,1,0,0,,
2,3,0,0,0,1,1,1,0,0,0,0,,
3,4,0,0,1,1,0,0,1,1,0,0,,
4,5,0,1,0,0,0,0,0,0,1,1,,
5,6,1,0,0,1,0,0,0,0,0,0,,
6,7,0,0,0,0,0,0,0,1,0,1,,
7,8,0,0,1,1,0,0,1,0,0,1,,
8,9,0,0,0,0,0,1,0,0,1,0,,
9,10,0,1,0,0,1,0,1,0,0,0,,


In [4]:
# this is our item content matrix (ICM)
#
# The parsed table ends with placeholder columns that pandas labelled
# "Unnamed: 11" and "Unnamed: 12". Keep every column whose label does not
# start with "Unnamed", instead of dropping the hard-coded positions 11:13,
# so the cell still removes the right columns if attributes are added or removed.

attributes_df = attributes_df.loc[:, ~attributes_df.columns.str.startswith('Unnamed')]
attributes_df

Unnamed: 0,docid,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family
0,1,1,0,1,0,1,1,0,0,0,1
1,2,0,1,1,1,0,0,0,1,0,0
2,3,0,0,0,1,1,1,0,0,0,0
3,4,0,0,1,1,0,0,1,1,0,0
4,5,0,1,0,0,0,0,0,0,1,1
5,6,1,0,0,1,0,0,0,0,0,0
6,7,0,0,0,0,0,0,0,1,0,1
7,8,0,0,1,1,0,0,1,0,0,1
8,9,0,0,0,0,0,1,0,0,1,0
9,10,0,1,0,0,1,0,1,0,0,0


In [5]:
# Load the user ratings. Missing entries (NaN after parsing) are replaced with 0,
# so they add nothing to the dot products computed later.
ratings_df = pd.read_csv(filepath_or_buffer=r'ratings.csv').fillna(value=0)

In [6]:
# Preview the ratings table after missing values were filled with 0.
ratings_df

Unnamed: 0,docid,user1,user2
0,1,1.0,-1.0
1,2,-1.0,1.0
2,3,0.0,0.0
3,4,0.0,1.0
4,5,0.0,0.0
5,6,1.0,0.0
6,7,0.0,0.0
7,8,0.0,0.0
8,9,0.0,0.0
9,10,0.0,0.0


In [7]:
# Strip the first column from each table and convert the remainder to NumPy:
# rows stay aligned with the documents, columns are attributes (or users).
attributes_np = attributes_df.iloc[:, 1:].to_numpy()

reviews_np = ratings_df.iloc[:, 1:].to_numpy()

reviews_np[:, 0]  # user 1

reviews_np[:, 1]  # user 2

array([-1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.])

Create basic user profiles: multiply each user's rating for a document by that document's attribute row, then sum each attribute column over all documents.

In [8]:
# Build one profile per user: for every attribute column, take the dot product
# of that column with the user's rating column (i.e. sum the user's ratings
# over the documents that carry the attribute).
#
# The number of attributes is read from the matrix itself instead of the
# hard-coded 10, so the cell keeps working if the attribute set changes.
profile1 = [np.dot(attributes_np[:, j], reviews_np.T[0])
            for j in range(attributes_np.shape[1])]
profile2 = [np.dot(attributes_np[:, j], reviews_np.T[1])
            for j in range(attributes_np.shape[1])]

In [9]:
# Turn user 1's profile list into a NumPy vector for the scoring step.
profile1_np = np.asarray(profile1)

In [10]:
# Show user 1's profile: one weight per attribute column.
profile1_np

array([ 3., -2., -1.,  0.,  0.,  2., -1., -1.,  1.,  0.])

In [11]:
# Turn user 2's profile list into a NumPy vector for the scoring step.
profile2_np = np.asarray(profile2)

In [12]:
# Show user 2's profile: one weight per attribute column.
profile2_np

array([-2.,  2.,  2.,  3., -1., -2.,  0.,  3.,  0., -1.])

Compute the predicted score for each user for each document (a simple dot-product).

In [13]:
# Score each document for user 1: dot product of the document's attribute row
# with user 1's profile vector.
user1_prediction = [doc_row.dot(profile1_np) for doc_row in attributes_np]

In [14]:
# Show user 1's predicted score for every document, in document order.
user1_prediction

[4.0,
 -4.0,
 2.0,
 -3.0,
 -1.0,
 3.0,
 -1.0,
 -2.0,
 3.0,
 -3.0,
 0.0,
 4.0,
 -2.0,
 -2.0,
 0.0,
 6.0,
 -4.0,
 1.0,
 -4.0,
 -1.0]

In [15]:
# Score each document for user 2: dot product of the document's attribute row
# with user 2's profile vector.
user2_prediction = [doc_row.dot(profile2_np) for doc_row in attributes_np]

In [16]:
# Show user 2's predicted score for every document, in document order.
user2_prediction

[-4.0,
 10.0,
 0.0,
 8.0,
 1.0,
 1.0,
 2.0,
 4.0,
 -2.0,
 1.0,
 1.0,
 -4.0,
 7.0,
 7.0,
 4.0,
 -4.0,
 10.0,
 3.0,
 2.0,
 5.0]

The model is consistent with the users’ ratings: it predicts liking for all the positively rated documents and disliking for all the negatively rated ones.

In our computation, an article that had many attributes checked could have more influence on the overall profile than one that had only a few.

Next, explore whether our simple model may be counting these attribute-heavy documents too much. For example, we might conclude that liking doc6 says more about liking baseball (since baseball is one of only two attributes for that article, along with Europe) than liking doc1 does (since doc1 is also about politics, Asia, soccer, and family).

### Attempt to remove bias to attribute heavy documents

Make a copy of the attributes matrix on another sheet, then normalize each row of the copy to be a unit-length vector.

normalize each row to be a unit length vector. We can do this in two steps:

Count the total number of items in the row (you can do this via SUM or COUNT function).
Divide each value by the square root of that number of items. If you do this right, doc1’s values will all change from 1 to 0.447214 (approx). Documents with 4 attributes will change to 0.5 (since 4 * .5^2 = 1), and so forth. 

In [17]:
# Work on an independent float64 copy so the original 0/1 matrix stays untouched.
attributes_normalized = np.array(attributes_np, dtype='float64', order='C')

In [18]:
# Scale every document row to unit length: a row with k attributes set has
# Euclidean norm sqrt(k), so dividing by sqrt(k) turns each 1 into 1/sqrt(k).
#
# The result is rebuilt from attributes_np on every run (rather than rescaling
# attributes_normalized in place), so re-executing this cell cannot normalize
# an already-normalized matrix. Rows with no attributes are left as all zeros
# instead of being divided by zero.
attribute_counts = attributes_np.sum(axis=1, keepdims=True)
row_norms = np.sqrt(np.where(attribute_counts > 0, attribute_counts, 1))
attributes_normalized = np.array(attributes_np, dtype='float64', order='C') / row_norms

In [19]:
# Inspect the row-normalized attribute matrix.
attributes_normalized

array([[0.4472136 , 0.        , 0.4472136 , 0.        , 0.4472136 ,
        0.4472136 , 0.        , 0.        , 0.        , 0.4472136 ],
       [0.        , 0.5       , 0.5       , 0.5       , 0.        ,
        0.        , 0.        , 0.5       , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.57735027,
        0.57735027, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.5       , 0.5       , 0.        ,
        0.        , 0.5       , 0.5       , 0.        , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57735027, 0.57735027],
       [0.70710678, 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.70710678],
       [0.        , 0.        , 0.5      

In [20]:
# Rebuild both user profiles from the row-normalized attribute matrix: for each
# attribute column, take the dot product with the user's rating column.
#
# The number of attributes is read from the matrix itself instead of the
# hard-coded 10, so the cell keeps working if the attribute set changes.
profile1_normalized = [np.dot(attributes_normalized[:, j], reviews_np.T[0])
                       for j in range(attributes_normalized.shape[1])]
profile2_normalized = [np.dot(attributes_normalized[:, j], reviews_np.T[1])
                       for j in range(attributes_normalized.shape[1])]

In [21]:
# Show user 1's profile built from the normalized attribute matrix.
profile1_normalized

[1.731670645876131,
 -0.9472135954999579,
 -0.5,
 0.20710678118654746,
 0.0,
 1.0245638646895838,
 -0.4472135954999579,
 -0.5,
 0.5773502691896258,
 0.0]

In [22]:
# Show user 2's profile built from the normalized attribute matrix.
profile2_normalized

[-1.0245638646895838,
 1.0,
 1.0527864045000421,
 1.5,
 -0.4472135954999579,
 -1.0245638646895838,
 -0.07735026918962584,
 1.5,
 0.0,
 -0.4472135954999579]

In [23]:
# Score each document for user 1 using the normalized rows and normalized profile.
user1_prediction_normalized = []
for doc_vector in attributes_normalized:
    user1_prediction_normalized.append(np.dot(doc_vector, profile1_normalized))

In [24]:
# Show user 1's scores computed from the normalized matrix, in document order.
user1_prediction_normalized

[1.0090187477611812,
 -0.8700534071567052,
 0.7111053789495447,
 -0.6200534071567052,
 -0.21354069100864065,
 1.370922665887427,
 -0.35355339059327373,
 -0.3700534071567052,
 1.132724346944564,
 -0.8050729140891353,
 0.044658198738520505,
 1.3331138468776909,
 -0.39644660940672627,
 -0.3313782725618923,
 0.1422285251880867,
 1.9246460699581855,
 -0.8700534071567052,
 0.5546948998705892,
 -0.8472135954999579,
 -0.08137827256189228]

In [25]:
# Score each document for user 2 using the normalized rows and normalized profile.
user2_prediction_normalized = []
for doc_vector in attributes_normalized:
    user2_prediction_normalized.append(np.dot(doc_vector, profile2_normalized))

In [26]:
# Show user 2's scores computed from the normalized matrix, in document order.
user2_prediction_normalized

[-0.8455773862443854,
 2.526393202250021,
 0.01629429095678303,
 1.9877180676552082,
 0.3191513794424647,
 0.3361841152991202,
 0.7444324057629833,
 1.0141112699052293,
 -0.724476056480701,
 0.2744931807039442,
 0.34962762429011646,
 -1.2277226448995098,
 1.8027864045000421,
 1.776393202250021,
 0.9490429330603951,
 -1.1830644461609892,
 2.526393202250021,
 1.0606601717798212,
 0.48344189675271276,
 1.2377180676552082]

In [27]:
# Pair user 2's normalized scores with the document ids and rank them,
# highest score first.
user2_predictions_df = pd.DataFrame({
    'normalized': user2_prediction_normalized,
    'docid': attributes_df['docid'],
})
sorted_user2_predictions_df = user2_predictions_df.sort_values('normalized', ascending=False)
sorted_user2_predictions_df

Unnamed: 0,normalized,docid
16,2.526393,17
1,2.526393,2
3,1.987718,4
12,1.802786,13
13,1.776393,14
19,1.237718,20
17,1.06066,18
7,1.014111,8
14,0.949043,15
6,0.744432,7


### Using IDF to account for frequencies of attributes

This will account for the fact that the content attributes have vastly different frequencies.

In [28]:
# inverse document frequency: 1/df
# Display the raw 0/1 attribute matrix that the document frequencies are derived from.

attributes_np

array([[1, 0, 1, 0, 1, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 1, 1, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 1, 1, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 1, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 1, 0, 1, 0, 1, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [29]:
# Document frequency: column sums of the 0/1 matrix, i.e. one count per attribute.
documentfrequency = np.sum(attributes_np, axis=0)

In [30]:
# Display the counts as floats; the cast is not stored, so documentfrequency
# itself is left unchanged.
documentfrequency.astype(np.float64)

array([ 4.,  6., 10., 11.,  6.,  6.,  7.,  6.,  7.,  5.])

In [31]:
# Inverse document frequency: 1/df for each attribute, so attributes that occur
# in fewer documents get a larger weight.
# An attribute that occurs in no document (df == 0) gets weight 0 instead of
# inf, which would otherwise turn every score it touches into nan (0 * inf).
inverse_df = np.zeros(documentfrequency.shape, dtype='float64')
np.divide(1, documentfrequency, out=inverse_df, where=documentfrequency > 0)
list(inverse_df)

[0.25,
 0.16666666666666666,
 0.1,
 0.09090909090909091,
 0.16666666666666666,
 0.16666666666666666,
 0.14285714285714285,
 0.16666666666666666,
 0.14285714285714285,
 0.2]

In [32]:
# Re-display the row-normalized attribute matrix for reference.
attributes_normalized

array([[0.4472136 , 0.        , 0.4472136 , 0.        , 0.4472136 ,
        0.4472136 , 0.        , 0.        , 0.        , 0.4472136 ],
       [0.        , 0.5       , 0.5       , 0.5       , 0.        ,
        0.        , 0.        , 0.5       , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.57735027, 0.57735027,
        0.57735027, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.5       , 0.5       , 0.        ,
        0.        , 0.5       , 0.5       , 0.        , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57735027, 0.57735027],
       [0.70710678, 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.70710678],
       [0.        , 0.        , 0.5      

In [33]:
# For every document: weight its (normalized) attribute row by the IDF vector,
# multiply element-wise by user 1's profile, and add the products up.

user1_prediction_idf = []
for doc_vector in attributes_normalized:
    idf_weighted_vector = doc_vector * inverse_df
    user1_prediction_idf.append(sum(idf_weighted_vector * profile1_normalized))

In [34]:
# Show user 1's IDF-weighted score for every document, in document order.
user1_prediction_idf

[0.2476124657905287,
 -0.13618718835894128,
 0.10945899074393546,
 -0.08919655031727514,
 -0.043526623104614706,
 0.3194323422480594,
 -0.05892556509887895,
 -0.04752988365060848,
 0.17906719376543057,
 -0.1280312264018282,
 0.018751534159566336,
 0.3116482765546726,
 -0.05725272206727815,
 -0.05328121675015849,
 0.02118377174019017,
 0.3961528798518861,
 -0.13618718835894128,
 0.07163451247986463,
 -0.12153324130475628,
 -0.006290578708492346]

In [35]:
# For every document: weight its (normalized) attribute row by the IDF vector,
# multiply element-wise by user 2's profile, and add the products up.
user2_prediction_idf = []
for doc_vector in attributes_normalized:
    idf_weighted_vector = doc_vector * inverse_df
    user2_prediction_idf.append(sum(idf_weighted_vector * profile2_normalized))

In [36]:
# Show user 2's IDF-weighted score for every document, in document order.
user2_prediction_idf

[-0.21716749806965677,
 0.3291544717401536,
 -0.06289226997572091,
 0.24029611917898988,
 0.0445852669155054,
 -0.08469536214019152,
 0.11353114209326928,
 0.0705747596289941,
 -0.12074600941345015,
 0.04681215389681261,
 0.017749503112534246,
 -0.2528515022938042,
 0.2085533387818238,
 0.2041544717401536,
 0.10227647689652244,
 -0.24647175961687273,
 0.3291544717401536,
 0.09642365197998373,
 0.04334257781348446,
 0.11529611917898988]

In [37]:
# Inspect the inputs behind row index 5 of the attribute matrix: its attribute
# flags, the per-attribute document frequencies, the IDF weights, and one entry
# of user 2's normalized profile.
# NOTE(review): profile2_normalized is indexed by attribute, so [5] is the sixth
# attribute column rather than the sixth document -- confirm this is the entry intended.
print(
    attributes_np[5],
    documentfrequency,
    inverse_df,
    profile2_normalized[5],
    sep='\n',
)

[1 0 0 1 0 0 0 0 0 0]
[ 4  6 10 11  6  6  7  6  7  5]
[0.25       0.16666667 0.1        0.09090909 0.16666667 0.16666667
 0.14285714 0.16666667 0.14285714 0.2       ]
-1.0245638646895838


The inverse document frequency (IDF) concept from TF-IDF says that terms that appear infrequently, like baseball, are perhaps more important than terms that appear frequently, like Europe. At least, they are more important in the sense that, when they do show up, they should get a higher weight because they are rarer.

For document 6, user 2's predicted preference is now negative, whereas before it was positive. This changed because IDF reduced the weight of the more frequent term 'Europe'.

In [38]:
# Row index 5 for user 2: the raw (un-normalized) score first, then the IDF-weighted score.
print(user2_prediction[5], user2_prediction_idf[5], sep=' ')

1.0 -0.08469536214019152


In [45]:
# top documents for user 2 before using IDF
# (the earlier comment said "user 1", but the scores ranked here are user2_prediction)
# NOTE(review): 'before' holds the raw, un-normalized scores, so comparing this
# table with the IDF ranking also includes the effect of row normalization.

simple = pd.DataFrame({
    'before': user2_prediction,
    'docid': attributes_df['docid'],
})
sorted_simple_df = simple.sort_values(by='before', ascending=False).reset_index(drop=True)

In [46]:
# Ranked table of the scores stored in the 'before' column, highest first.
sorted_simple_df

Unnamed: 0,before,docid
0,10.0,17
1,10.0,2
2,8.0,4
3,7.0,13
4,7.0,14
5,5.0,20
6,4.0,8
7,4.0,15
8,3.0,18
9,2.0,19


In [43]:
# top documents for user 2 after using IDF
# (the earlier comment said "user 1", but the scores ranked here are user2_prediction_idf)

after_df = pd.DataFrame({
    'after': user2_prediction_idf,
    'docid': attributes_df['docid'],
})
sorted_after_df = after_df.sort_values(by='after', ascending=False).reset_index(drop=True)

In [44]:
# Ranked table of the scores stored in the 'after' column, highest first.
sorted_after_df

Unnamed: 0,after,docid
0,0.329154,17
1,0.329154,2
2,0.240296,4
3,0.208553,13
4,0.204154,14
5,0.115296,20
6,0.113531,7
7,0.102276,15
8,0.096424,18
9,0.070575,8
