# Content-based recommendations (using cosine similarity)
This notebook shows the process used to calculate recommendations for user based on their previous ratings and in features extracted from content texts available in +TV4E platform.

Library requirements:
* Pandas
* NumPy
* Scikit-learn 
* NLTK (w/ portuguese stop words)

In [1]:
# for Python 2: use print only as a function
from __future__ import print_function

### Part 1 - Examine the data

In [2]:
# read CSV using a relative path
import pandas as pd
path = 'ratings_reduced.csv'
df_ratings = pd.read_csv(path)
path = 'videos_reduced.csv'
df_videos = pd.read_csv(path)

In [3]:
# show dataframe first row
df_ratings.head()

Unnamed: 0,user_id,content_id,rating,rating_timestamp,rating_type
0,1,11,1.0,2017-08-21 12:29:27+00:00,explicit
1,1,12,0.5,2017-08-21 12:29:27+00:00,explicit
2,1,13,-1.0,2017-08-21 12:29:27+00:00,explicit
3,1,14,1.0,2017-08-21 12:29:27+00:00,explicit
4,2,11,0.5,2017-08-22 08:21:16+00:00,explicit


In [4]:
#show the shape of dataframe
df_ratings.shape

(8, 5)

In [5]:
# show overall statistics of the dataframe
df_ratings.describe()

Unnamed: 0,user_id,content_id,rating
count,8.0,8.0,8.0
mean,1.5,12.875,0.125
std,0.534522,1.457738,0.954314
min,1.0,11.0,-1.0
25%,1.0,11.75,-1.0
50%,1.5,13.0,0.5
75%,2.0,14.0,1.0
max,2.0,15.0,1.0


In [6]:
# filter ratings from user 1 and show statistics
df_ratings_user = df_ratings[(df_ratings.user_id==1)]
df_ratings_user.describe()

Unnamed: 0,user_id,content_id,rating
count,4.0,4.0,4.0
mean,1.0,12.5,0.375
std,0.0,1.290994,0.946485
min,1.0,11.0,-1.0
25%,1.0,11.75,0.125
50%,1.0,12.5,0.75
75%,1.0,13.25,1.0
max,1.0,14.0,1.0


### Part 2 - Clean ratings data

In [7]:
# rtemoving unused columns
df_ratings.drop('rating_type', axis=1, inplace=True)
df_ratings.drop('rating_timestamp', axis=1, inplace=True)
df_ratings

Unnamed: 0,user_id,content_id,rating
0,1,11,1.0
1,1,12,0.5
2,1,13,-1.0
3,1,14,1.0
4,2,11,0.5
5,2,13,-1.0
6,2,14,-1.0
7,2,15,1.0


In [8]:
ratings_matrix = df_ratings.pivot_table(index=['user_id'], columns=['content_id'], values=['rating'])
ratings_matrix                                     

Unnamed: 0_level_0,rating,rating,rating,rating,rating
content_id,11,12,13,14,15
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,1.0,0.5,-1.0,1.0,
2,0.5,,-1.0,-1.0,1.0


### Part 3 - Vectorize texts

In [9]:
# import and retrieve portuguese stop words
# stop words are not considered as token as usually they carry no meaning (!)
from nltk.corpus import stopwords
portuguese_stop_words = stopwords.words('portuguese')

In [10]:
# import and instantiate CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.5, stop_words=portuguese_stop_words)

In [11]:
# fit (train) the vectorizer with the corpus from video text contents
tfidf_matrix = tfidf.fit_transform(df_videos['text_contents'])
tfidf_matrix.toarray()

array([[ 0.        ,  0.24360439,  0.24360439,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.73081318,  0.        ,
         0.        ,  0.        ,  0.        ,  0.24360439,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.24360439,  0.41133514,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.24360439,  0.        ],
       [ 0.33878283,  0.        ,  0.        ,  0.33878283,  0.33878283,
         0.        ,  0.33878283,  0.        ,  0.        ,  0.        ,
         0.28602374,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.33878283,  0.        ,  0.        ,
         0.        ,  0.        ,  0.33878283,  0.33878283,  0.        ,
         0.        ,  0.        ,  0.33878283,  0.        ,  0.

In [12]:
tfidf.get_feature_names()

['30',
 'acordo',
 'ano',
 'aveiro',
 'ações',
 'breve',
 'candidatura',
 'central',
 'centro',
 'criar',
 'câmara',
 'dar',
 'devido',
 'dia',
 'eficácia',
 'estudo',
 'estudos',
 'freguesia',
 'gestão',
 'hoje',
 'instituto',
 'início',
 'município',
 'município local',
 'nacional',
 'novas',
 'novos',
 'onde',
 'permite',
 'poderão',
 'prevista',
 'primeira',
 'ser',
 'tecnologia',
 'terreno',
 'trabalho',
 'universidade',
 'visa',
 'zona']

### Part 4 - Build up user profile

In [13]:
# create dict content_id ==>> tfidf weights
tfidf_tokens = {}
tfidf_array = tfidf_matrix.toarray()
line_count = 0
for idx, row in df_videos.iterrows():
    tfidf_tokens[row.id] = tfidf_array[line_count]
    line_count += 1
tfidf_tokens

{11: array([ 0.        ,  0.24360439,  0.24360439,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.73081318,  0.        ,
         0.        ,  0.        ,  0.        ,  0.24360439,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.24360439,  0.41133514,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.24360439,  0.        ]),
 12: array([ 0.33878283,  0.        ,  0.        ,  0.33878283,  0.33878283,
         0.        ,  0.33878283,  0.        ,  0.        ,  0.        ,
         0.28602374,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.33878283,  0.        ,  0.        ,
         0.        ,  0.        ,  0.33878283,  0.33878283,  0.        ,
         0.        ,  0.        ,  0.33878283,  0.    

In [14]:
# define user ratings
user_ratings = df_ratings[(df_ratings.user_id==1)]
user_ratings

Unnamed: 0,user_id,content_id,rating
0,1,11,1.0
1,1,12,0.5
2,1,13,-1.0
3,1,14,1.0


In [15]:
# created weighted user profile vector (dotproduct of vectors of items consumed and user ratings)
#   "In the original implementation, the profile was the sum of the item-tag vectors of all items 
#   the user has rated positively (>= 3.5 stars). This approach was later improved with weighted 
#   user profile (with the older implementation commented out for reference). Weighted profile is 
#   computed with weighted sum of the item vectors for all items, with weights being based on the 
#   user's rating."
#   See: http://eugenelin89.github.io/recommender_content_based/
user_profile = [0] * len(tfidf.get_feature_names())
for i in range(len(user_profile)):
    for idx, row in user_ratings.iterrows():
        user_profile[i] += row.rating * tfidf_tokens[row.content_id][i]
    #user_profile = [v/len(user_ratings) for v in user_profile] # weight-ing user vector (?)
# normalize user profile vector
import numpy as np
user_profile_norm = user_profile / np.linalg.norm(user_profile)

### Part 5 - Calculate similarities between user profile and contents

In [16]:
# apply cosine similarity between user profile vector and content vectors
# See: http://eugenelin89.github.io/recommender_content_based/
from math import sqrt
def cosine_similarity(x, y):
    def square_rooted(v):
        return round(sqrt(sum([a * a for a in v])), 3)
   
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return numerator/float(denominator)

estimated_user_ratings = {}
for content_id, token_weights in tfidf_tokens.items(): 
    if content_id not in user_ratings.content_id.values: # not calculating for contents already consumed 
        estimated_user_ratings[content_id] = cosine_similarity(user_profile, token_weights)
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[::-1]

[(16, 0.066996955787480938), (15, 0.035996853726467576)]

In [17]:
# Using the dotproduct of user profile vector, content IDF and content vector
# See: https://www.analyticsvidhya.com/blog/2015/08/beginners-guide-learn-content-based-recommender-systems/
estimated_user_ratings = {}
for content_id, token_weights in tfidf_tokens.items():
    if content_id not in user_ratings.content_id.values: # removing contents already consumed 
        estimated_user_ratings[content_id] = 0
        for i in range(len(token_weights)):
            if token_weights[i] > 0:
                estimated_user_ratings[content_id] += user_profile[i] * tfidf.idf_[i] * token_weights[i]
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[::-1]

[(16, 0.19923750192859285), (15, 0.11143069202983215)]