# Content based filtering in recommender systems

This notebook demonstrates the content-based approach to recommender systems on a document recommendation problem.

In [1]:
# import libraries for matrix manipulation

import pandas as pd
import numpy as np

## Step 1. Data import and cleaning 

In [4]:
# Import the raw data from the Excel file.
#
# The first spreadsheet column holds the document ids (doc1, doc2, ...) under a
# blank header cell. Ask for it as the index explicitly: the row selections in
# this notebook are label-based (.loc['doc1':'doc20', ...]), and recent pandas
# versions otherwise load that column as ordinary data named "Unnamed: 0" with
# a 0..n-1 integer index, which makes those label slices fail.
raw_data = pd.read_excel("cbf.xls", index_col=0)
raw_data

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family,Unnamed: 10,num-attr,Unnamed: 12,User 1,User 2,Unnamed: 15,Pred1,Pred2
doc1,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,,5.0,,1.0,-1.0,,,
doc2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,,4.0,,-1.0,1.0,,,
doc3,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,3.0,,,,,,
doc4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,,4.0,,,1.0,,,
doc5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,,3.0,,,,,,
doc6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,2.0,,1.0,,,,
doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,,2.0,,,,,,
doc8,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,,4.0,,,,,,
doc9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,,2.0,,,,,,
doc10,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,,3.0,,,,,,


From our raw data, we need to select the *documents x tags* matrix. 

The value of 1 means that the topic appears in the document.

In [5]:
# Document x tag matrix: rows doc1..doc20, columns 'baseball'..'family'.
# Label slices passed to .loc include both endpoints.
docs = raw_data.loc[slice('doc1', 'doc20'), slice('baseball', 'family')]
docs

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family
doc1,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
doc2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
doc3,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
doc4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
doc5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
doc6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
doc8,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
doc9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
doc10,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


We have ratings from two users. 

The value of 1.0 means the user liked the document; the value of -1.0 means the user disliked it.

NaN means that the user has never seen the document (and we have to predict the rating).

In [6]:
# Ratings given by the two users to each document (rows doc1..doc20).
# Missing ratings (NaN) are replaced with 0 so that unrated documents add
# nothing to the matrix products computed from this table.
#
# fillna() is chained and returns a new frame instead of using inplace=True:
# filling a .loc slice in place raises SettingWithCopyWarning and, when the
# slice is a view, can silently overwrite the NaNs in raw_data as well.
user_ranks = raw_data.loc['doc1':'doc20', 'User 1':'User 2'].fillna(0)
user_ranks

Unnamed: 0,User 1,User 2
doc1,1.0,-1.0
doc2,-1.0,1.0
doc3,0.0,0.0
doc4,0.0,1.0
doc5,0.0,0.0
doc6,1.0,0.0
doc7,0.0,0.0
doc8,0.0,0.0
doc9,0.0,0.0
doc10,0.0,0.0


Let's use basic matrix multiplication to predict each user's interest in a particular topic.

In [8]:
# User profiles: (tags x docs) times (docs x users) gives a tags x users
# matrix, i.e. for every tag the sum of a user's ratings over the documents
# that carry that tag.
user_profiles = np.matmul(docs.values.T, user_ranks.values)
pd.DataFrame(user_profiles, index=docs.columns, columns=user_ranks.columns)

Unnamed: 0,User 1,User 2
baseball,3.0,-2.0
economics,-2.0,2.0
politics,-1.0,2.0
Europe,0.0,3.0
Asia,0.0,-1.0
soccer,2.0,-2.0
war,-1.0,0.0
security,-1.0,3.0
shopping,1.0,0.0
family,0.0,-1.0


In [29]:
# Predicted preference of every user for every document:
# (docs x tags) times (tags x users) gives a docs x users score matrix.
user_preferences = docs.values @ user_profiles
updf = pd.DataFrame(
    user_preferences,
    index=docs.index,
    columns=user_ranks.columns,
)
updf

Unnamed: 0,User 1,User 2
doc1,4.0,-4.0
doc2,-4.0,10.0
doc3,2.0,0.0
doc4,-3.0,8.0
doc5,-1.0,1.0
doc6,3.0,1.0
doc7,-1.0,2.0
doc8,-2.0,4.0
doc9,3.0,-2.0
doc10,-3.0,1.0


In [32]:
# Documents ranked for User 1, highest predicted score first.
updf['User 1'].sort_values(ascending=False)

doc16    6.0
doc1     4.0
doc12    4.0
doc9     3.0
doc6     3.0
doc3     2.0
doc18    1.0
doc15    0.0
doc11    0.0
doc5    -1.0
doc7    -1.0
doc20   -1.0
doc8    -2.0
doc13   -2.0
doc14   -2.0
doc4    -3.0
doc10   -3.0
doc19   -4.0
doc17   -4.0
doc2    -4.0
Name: User 1, dtype: float64

In [33]:
# Documents ranked for User 2, highest predicted score first.
updf['User 2'].sort_values(ascending=False)

doc17    10.0
doc2     10.0
doc4      8.0
doc14     7.0
doc13     7.0
doc20     5.0
doc15     4.0
doc8      4.0
doc18     3.0
doc7      2.0
doc19     2.0
doc5      1.0
doc6      1.0
doc10     1.0
doc11     1.0
doc3      0.0
doc9     -2.0
doc12    -4.0
doc16    -4.0
doc1     -4.0
Name: User 2, dtype: float64

In [55]:
# Divide every document row by the square root of its number of tags, so that
# documents with many tags do not dominate the profiles. With 0/1 entries the
# row sum equals the squared row length, so each row is scaled to unit length.
normalized_docs = docs.div(np.sqrt(docs.sum(axis=1)), axis=0)
normalized_docs

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family
doc1,0.447214,0.0,0.447214,0.0,0.447214,0.447214,0.0,0.0,0.0,0.447214
doc2,0.0,0.5,0.5,0.5,0.0,0.0,0.0,0.5,0.0,0.0
doc3,0.0,0.0,0.0,0.57735,0.57735,0.57735,0.0,0.0,0.0,0.0
doc4,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.5,0.0,0.0
doc5,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735
doc6,0.707107,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107
doc8,0.0,0.0,0.5,0.5,0.0,0.0,0.5,0.0,0.0,0.5
doc9,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.707107,0.0
doc10,0.0,0.57735,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0


In [56]:
# User profiles rebuilt from the length-normalized document vectors:
# (tags x docs) times (docs x users) gives a tags x users matrix.
normalized_profiles = normalized_docs.values.T @ user_ranks.values
pd.DataFrame(normalized_profiles, index=docs.columns, columns=user_ranks.columns)

Unnamed: 0,User 1,User 2
baseball,1.731671,-1.024564
economics,-0.947214,1.0
politics,-0.5,1.052786
Europe,0.207107,1.5
Asia,0.0,-0.447214
soccer,1.024564,-1.024564
war,-0.447214,-0.07735
security,-0.5,1.5
shopping,0.57735,0.0
family,0.0,-0.447214


In [57]:
# Score each normalized document vector against each normalized user profile:
# (docs x tags) times (tags x users) gives a docs x users score matrix.
normalized_preferences = normalized_docs.values @ normalized_profiles
npdf = pd.DataFrame(
    normalized_preferences,
    index=docs.index,
    columns=user_ranks.columns,
)
npdf

Unnamed: 0,User 1,User 2
doc1,1.009019,-0.845577
doc2,-0.870053,2.526393
doc3,0.711105,0.016294
doc4,-0.620053,1.987718
doc5,-0.213541,0.319151
doc6,1.370923,0.336184
doc7,-0.353553,0.744432
doc8,-0.370053,1.014111
doc9,1.132724,-0.724476
doc10,-0.805073,0.274493


In [58]:
# Documents ranked for User 1 by normalized score, highest first.
npdf['User 1'].sort_values(ascending=False)

doc16    1.924646
doc6     1.370923
doc12    1.333114
doc9     1.132724
doc1     1.009019
doc3     0.711105
doc18    0.554695
doc15    0.142229
doc11    0.044658
doc20   -0.081378
doc5    -0.213541
doc14   -0.331378
doc7    -0.353553
doc8    -0.370053
doc13   -0.396447
doc4    -0.620053
doc10   -0.805073
doc19   -0.847214
doc17   -0.870053
doc2    -0.870053
Name: User 1, dtype: float64

In [59]:
# Show the document x tag matrix again for reference before computing DF/IDF.
docs

Unnamed: 0,baseball,economics,politics,Europe,Asia,soccer,war,security,shopping,family
doc1,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
doc2,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
doc3,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
doc4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
doc5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
doc6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
doc8,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
doc9,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
doc10,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [62]:
# DF: for each tag, the column sum of the 0/1 matrix, i.e. the number of
# documents that carry the tag.
DF = docs.sum(axis=0)
# IDF is taken here as the plain reciprocal 1/DF (no logarithm), so tags that
# appear in fewer documents get a larger weight.
IDF = DF.rdiv(1.0)
IDF.values

array([0.25      , 0.16666667, 0.1       , 0.09090909, 0.16666667,
       0.16666667, 0.14285714, 0.16666667, 0.14285714, 0.2       ])

In [78]:
# Scale every tag row of the tags x users profile matrix by that tag's IDF
# weight, then score the normalized documents against the weighted profiles:
# (docs x tags) times (tags x users) gives a docs x users score matrix.
weighted_preferences = normalized_docs.values @ (
    normalized_profiles * IDF.values[:, np.newaxis]
)
pd.DataFrame(weighted_preferences, index=docs.index, columns=user_ranks.columns)

Unnamed: 0,User 1,User 2
doc1,0.247612,-0.217167
doc2,-0.136187,0.329154
doc3,0.109459,-0.062892
doc4,-0.089197,0.240296
doc5,-0.043527,0.044585
doc6,0.319432,-0.084695
doc7,-0.058926,0.113531
doc8,-0.04753,0.070575
doc9,0.179067,-0.120746
doc10,-0.128031,0.046812
