# Content-based recommendations (using cosine similarity)
This notebook shows the process used to calculate recommendations for user based on their previous ratings and in features extracted from content texts available in +TV4E platform.

Library requirements:
* Pandas
* NumPy
* Scikit-learn 
* NLTK (w/ portuguese stop words)

In [34]:
# for Python 2: use print only as a function
from __future__ import print_function

### Part 1 - Examine the data

In [35]:
# read CSV using a relative path
import pandas as pd
import numpy as np
import requests

URL_VIDEOS='http://api_mysql.tv4e.pt/api/recommendations/videos'
data= requests.get(URL_VIDEOS)
df_videos=pd.DataFrame(data.json())

In [36]:
# show dataframe first rows
df_videos.head()

Unnamed: 0,video_asgie_id,video_asgie_title_en,video_asgie_title_pt,video_date_creation,video_desc,video_id,video_location,video_title
0,3,Financial Services,Finanças,2017-09-26 15:21:16,Seja qual for o cenário para os escalões do IR...,4049,,As “habilidadezinhas” do Governo para descer o...
1,1,Health Care and Welfare Services,Saúde e Bem-Estar,2017-09-26 15:36:20,Mais pessoas recorrem aos centros de saúde par...,4051,,Deixar de fumar
2,1,Health Care and Welfare Services,Saúde e Bem-Estar,2017-09-26 15:38:36,Quase sete mil unidades de medicamentos apreen...,4053,,Medicamentos falsificados
3,1,Health Care and Welfare Services,Saúde e Bem-Estar,2017-09-26 15:40:12,Ministro da Saúde participa no encontro em Coi...,4054,,Conferência Mundial de Saúde Urbana
4,1,Health Care and Welfare Services,Saúde e Bem-Estar,2017-09-26 15:40:19,Rastreio visa despiste da ambliopia em criança...,4055,,CHTS | Saúde visual infantil


In [37]:
URL='http://api_mysql.tv4e.pt/api/recommendations/ratings'
data= requests.get(URL)
df_ratings=pd.DataFrame(data.json())

# XXX use a function to calculate implicit rating considering the video lead time
df_ratings['rating_implicit'] = (df_ratings['video_watch_time']/100) * 0.3
df_ratings['rating_explicit'] = (df_ratings['rating_value'])         * 0.7
# If the explicit rating was negative, the implicit will be negative
df_ratings['rating_implicit'][df_ratings.rating_explicit < 0] = df_ratings['rating_implicit'] * -1
# create a new column to put implicit or explicit rating rating_value
df_ratings['overall_rating_value'] = df_ratings['rating_implicit'] + df_ratings['rating_explicit']

# implicit rating is the watched time / explicit rating is the like-0-dislike
df_ratings['rating_implicit'] = df_ratings['video_watch_time']/100
df_ratings['rating_explicit'] = df_ratings['rating_value']
df_ratings['overall_rating_value']=df_ratings['overall_rating_value'].fillna(0)

# Right now, the overall rating will be NONE/NaN if no explicit rating was set
# So, we consider the implicit rating as positive if the user has seen at least 20% of the video
df_ratings.loc[(df_ratings['overall_rating_value'] == 0) & (df_ratings['video_watch_time'] >= 20),'overall_rating_value'] = (df_ratings['video_watch_time']/100) * 0.5

df_ratings = df_ratings[(df_ratings.video_watched_type != 'forced')]

df_ratings = df_ratings[(df_ratings.overall_rating_value > 0)]

import datetime
NOW = datetime.datetime.now()
df_ratings['rating_date_diff'] = NOW - pd.to_datetime(df_ratings['rating_date_creation'])
df_ratings = df_ratings[(df_ratings.rating_date_diff.dt.days < 14)]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [38]:
# show dataframe first rows
df_ratings.head()

Unnamed: 0,rating_date_creation,rating_value,user_id,video_id,video_watch_time,video_watched_type,rating_implicit,rating_explicit,overall_rating_value,rating_date_diff
12,2018-01-10 15:10:35,0.0,1,5162,100,injected,1.0,0.0,0.3,0 days 22:06:30.858811
13,2018-01-10 10:38:25,,1,5168,100,injected,1.0,,0.5,1 days 02:38:40.858811
14,2018-01-10 10:30:44,,1,5171,100,injected,1.0,,0.5,1 days 02:46:21.858811
16,2018-01-10 15:36:15,,1,5186,100,injected,1.0,,0.5,0 days 21:40:50.858811
45,2018-01-10 16:43:35,0.0,4,5039,100,injected,1.0,0.0,0.3,0 days 20:33:30.858811


In [39]:
df_ratings[['user_id', 'video_id', 'overall_rating_value']]

Unnamed: 0,user_id,video_id,overall_rating_value
12,1,5162,0.300
13,1,5168,0.500
14,1,5171,0.500
16,1,5186,0.500
45,4,5039,0.300
47,4,5068,0.500
49,4,5158,0.015
51,4,5184,0.300
52,4,5186,0.500
53,4,5187,0.300


In [40]:
#show the shape of dataframe
df_ratings.shape

(121, 10)

In [41]:
# show overall statistics of the dataframe
df_ratings.describe()

Unnamed: 0,rating_value,user_id,video_id,video_watch_time,rating_implicit,rating_explicit,overall_rating_value,rating_date_diff
count,59.0,121.0,121.0,121.0,121.0,59.0,121.0,121
mean,0.084746,8.231405,5131.090909,91.53719,0.915372,0.084746,0.404215,1 days 08:14:16.437323
std,0.280894,2.848275,54.52935,24.531626,0.245316,0.280894,0.158306,1 days 20:22:48.380712
min,0.0,1.0,4991.0,1.0,0.01,0.0,0.003,0 days 11:19:04.858811
25%,0.0,8.0,5095.0,100.0,1.0,0.0,0.3,0 days 13:34:04.858811
50%,0.0,8.0,5150.0,100.0,1.0,0.0,0.5,0 days 15:26:03.858811
75%,0.0,9.0,5172.0,100.0,1.0,0.0,0.5,0 days 20:28:30.858811
max,1.0,20.0,5195.0,100.0,1.0,1.0,0.901,13 days 00:20:05.858811


### Part 2 - Clean ratings data

In [43]:
# rtemoving unused columns
df_ratings.drop('video_watched_type', axis=1, inplace=True)
df_ratings.drop('rating_date_creation', axis=1, inplace=True)
df_ratings

Unnamed: 0,rating_value,user_id,video_id,video_watch_time,rating_implicit,rating_explicit,overall_rating_value,rating_date_diff
12,0.0,1,5162,100,1.00,0.0,0.300,0 days 22:06:30.858811
13,,1,5168,100,1.00,,0.500,1 days 02:38:40.858811
14,,1,5171,100,1.00,,0.500,1 days 02:46:21.858811
16,,1,5186,100,1.00,,0.500,0 days 21:40:50.858811
45,0.0,4,5039,100,1.00,0.0,0.300,0 days 20:33:30.858811
47,,4,5068,100,1.00,,0.500,0 days 20:41:46.858811
49,0.0,4,5158,5,0.05,0.0,0.015,2 days 19:48:32.858811
51,0.0,4,5184,100,1.00,0.0,0.300,0 days 20:07:45.858811
52,,4,5186,100,1.00,,0.500,0 days 20:13:30.858811
53,0.0,4,5187,100,1.00,0.0,0.300,0 days 20:16:31.858811


In [44]:
ratings_matrix = df_ratings.pivot_table(index=['user_id'], columns=['video_id'], values=['overall_rating_value'])
ratings_matrix                                  

Unnamed: 0_level_0,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value
video_id,4991,4993,4997,5000,5002,5004,5005,5006,5007,5031,...,5185,5186,5187,5188,5189,5190,5191,5192,5194,5195
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,,0.5,,,,,,,,
4,,,,,,,,,,,...,,0.5,0.3,0.5,0.5,,0.3,,0.5,
8,,,,,,,,,,0.5,...,0.5,0.3,,,,0.5,0.3,0.5,0.5,0.3
9,0.3,0.5,0.3,0.5,0.3,0.3,0.495,0.3,0.886,,...,0.3,0.3,,,,,0.901,0.5,0.3,0.3
20,,,,,,,,,,,...,,,,,,,,,,


### Part 3 - Vectorize texts

In [45]:
# import and retrieve portuguese stop words
# stop words are not considered as token as usually they carry no meaning (!)
from nltk.corpus import stopwords
portuguese_stop_words = stopwords.words('portuguese')

In [46]:
# import and instantiate TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.5, stop_words=portuguese_stop_words)

In [47]:
# fit (train) the vectorizer with the corpus from video text contents
tfidf_matrix = tfidf.fit_transform(df_videos['video_desc'])
tfidf_matrix.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [48]:
tfidf.get_feature_names()

['00',
 '00 00',
 '00 horas',
 '00 suspeitos',
 '000',
 '000 00',
 '000 euros',
 '000 habitantes',
 '000 m2',
 '000 pessoas',
 '000 prazo',
 '000 utentes',
 '018',
 '018 2017',
 '02',
 '033',
 '033 euros',
 '04',
 '041',
 '041 espaços',
 '06',
 '08',
 '09',
 '10',
 '10 10saúdealarga',
 '10 15',
 '10 30',
 '10 535',
 '10 aniversário',
 '10 anos',
 '10 dias',
 '10 euros',
 '10 horas',
 '10 maio',
 '10 meses',
 '10 mil',
 '10 milhões',
 '10 minutos',
 '10 novembro',
 '10 número',
 '10 outubro',
 '100',
 '100 aquisição',
 '100 doentes',
 '100 gramas',
 '100 metros',
 '100 mil',
 '100 milhões',
 '100 ramais',
 '1000',
 '1000 euros',
 '102',
 '102 mil',
 '10268',
 '10268 2017',
 '10285',
 '10285 2017',
 '103',
 '103 alunos',
 '104',
 '105',
 '105 129',
 '105 anos',
 '105 milhões',
 '106',
 '106 793',
 '106 milhões',
 '108',
 '108 acima',
 '109',
 '109 448',
 '10h30',
 '10h30 fundação',
 '10saúdealarga',
 '10saúdealarga elenco',
 '10º',
 '11',
 '11 00',
 '11 27saúde',
 '11 733',
 '11 anos',
 

### Part 4 - Build up user profile

In [49]:
# create dict video_id ==>> tfidf weights
tfidf_tokens = {}
tfidf_array = tfidf_matrix.toarray()
line_count = 0
for idx, row in df_videos.iterrows():
    tfidf_tokens[row.video_id] = tfidf_array[line_count]
    line_count += 1
tfidf_tokens

{4049: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4051: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4053: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4054: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4055: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4057: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4058: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4059: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4060: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4061: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4062: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4063: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4068: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4070: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4071: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4072: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4073: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4074: array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.04412282,  0.        ]),
 4075: array([ 0.,  0.,  0

In [50]:
# define user ratings
user_ratings = df_ratings[(df_ratings.user_id==9)]
user_ratings

Unnamed: 0,rating_value,user_id,video_id,video_watch_time,rating_implicit,rating_explicit,overall_rating_value,rating_date_diff
138,,8,5031,100,1.0,,0.5,13 days 00:20:05.858811
140,0.0,8,5077,100,1.0,0.0,0.3,8 days 01:07:05.858811
141,,8,5081,100,1.0,,0.5,8 days 01:09:05.858811
142,,8,5087,100,1.0,,0.5,0 days 11:19:04.858811
143,0.0,8,5088,100,1.0,0.0,0.3,0 days 11:24:04.858811
144,,8,5089,100,1.0,,0.5,0 days 11:29:04.858811
145,,8,5090,100,1.0,,0.5,0 days 11:39:03.858811
146,0.0,8,5091,100,1.0,0.0,0.3,0 days 11:44:04.858811
147,0.0,8,5092,100,1.0,0.0,0.3,0 days 11:49:04.858811
148,,8,5094,100,1.0,,0.5,0 days 11:54:04.858811


In [51]:
# created weighted user profile vector (dotproduct of vectors of items consumed and user ratings)
#   "In the original implementation, the profile was the sum of the item-tag vectors of all items 
#   the user has rated positively (>= 3.5 stars). This approach was later improved with weighted 
#   user profile (with the older implementation commented out for reference). Weighted profile is 
#   computed with weighted sum of the item vectors for all items, with weights being based on the 
#   user's rating."
#   See: http://eugenelin89.github.io/recommender_content_based/
user_profile = [0] * len(tfidf.get_feature_names())
for i in range(len(user_profile)):
    for idx, row in user_ratings.iterrows():
        # Apply time decay to ratings also!
        # timedelta = self.__NOW - dateutil.parser.parse(row.rating_date_creation)
        # weighted_rating = float(row.overall_rating_value/(timedelta.days + 1))
        weighted_rating = float(row.overall_rating_value/(row.rating_date_diff.days + 1))
        weight_of_word = float(tfidf_tokens[row.video_id][i])
        user_profile[i] += weighted_rating * weight_of_word    #user_profile = [v/len(user_ratings) for v in user_profile] # weight-ing user vector (?)
# normalize user profile vector
import numpy as np
user_profile = user_profile / np.linalg.norm(user_profile)
user_profile

array([ 0.00786749,  0.00637422,  0.        , ...,  0.        ,
        0.00569074,  0.01067612])

### Part 5 - Calculate similarities between user profile and contents

In [52]:
# apply cosine similarity between user profile vector and content vectors
# See: http://eugenelin89.github.io/recommender_content_based/
from math import sqrt
def my_cosine_similarity(x, y):
    def square_rooted(v):
        return round(sqrt(sum([a * a for a in v])), 3)
   
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return numerator/float(denominator)

estimated_user_ratings = {}
for video_id, token_weights in tfidf_tokens.items(): 
    if video_id not in user_ratings.video_id.values: # not calculating for contents already consumed 
        estimated_user_ratings[video_id] = my_cosine_similarity(user_profile, token_weights)
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[:-10:-1]

[(4950, 0.35643542412891738),
 (5060, 0.32281360530764247),
 (4711, 0.30018506032887204),
 (4595, 0.29529932849919327),
 (5202, 0.28943200323152063),
 (5141, 0.28103871309070833),
 (4922, 0.28026723987064273),
 (4748, 0.26818058632981157),
 (4201, 0.26649487558507834)]

In [54]:
# Using sklearn cosine_similarity (faster!)
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
estimated_user_ratings = []
for video_id, token_weights in tfidf_tokens.items(): 
    # not calculating for contents already consumed
    if video_id not in user_ratings.video_id.values:
        estimated_user_ratings.append((
            video_id,
            df_videos[(df_videos.video_id == video_id)].video_date_creation.values[0],
            cosine_similarity([user_profile], [token_weights])[0]
        ))
#     if video_id not in user_ratings.video_id.values: # not calculating for contents already consumed 
#         estimated_user_ratings[video_id] = cosine_similarity([user_profile], [token_weights])
# order ratings
estimated_user_ratings = sorted(estimated_user_ratings, key=lambda tup: tup[2], reverse=True)[:12]
estimated_user_ratings

[(4950, '2017-12-21 08:12:11', array([ 0.35643542])),
 (5060, '2017-12-29 13:12:31', array([ 0.32281361])),
 (4711, '2017-11-30 13:13:41', array([ 0.30018506])),
 (4595, '2017-11-23 18:02:49', array([ 0.29529933])),
 (5202, '2018-01-11 13:12:58', array([ 0.289432])),
 (5141, '2018-01-05 19:14:44', array([ 0.28103871])),
 (4922, '2017-12-19 13:12:30', array([ 0.28026724])),
 (4748, '2017-12-05 08:13:22', array([ 0.26818059])),
 (4201, '2017-10-09 22:31:09', array([ 0.26649488])),
 (4288, '2017-10-18 14:22:43', array([ 0.26442881])),
 (4200, '2017-10-09 22:30:28', array([ 0.25965298])),
 (4978, '2017-12-22 16:14:35', array([ 0.25865496]))]

In [55]:
# Using the dotproduct of user profile vector, content IDF and content vector
# See: https://www.analyticsvidhya.com/blog/2015/08/beginners-guide-learn-content-based-recommender-systems/
estimated_user_ratings = {}
for video_id, token_weights in tfidf_tokens.items():
    if video_id not in user_ratings.video_id.values: # removing contents already consumed 
        estimated_user_ratings[video_id] = 0
        for i in range(len(token_weights)):
            if token_weights[i] > 0:
                estimated_user_ratings[video_id] += user_profile[i] * tfidf.idf_[i] * token_weights[i]
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[:-10:-1]

[(4950, 1.2835843193366281),
 (5060, 1.2378568871735132),
 (5141, 1.0522533282854669),
 (5202, 1.0403757723657376),
 (5059, 0.96850868997472139),
 (4774, 0.95788278863166765),
 (4711, 0.95571288931670706),
 (4595, 0.94923996674596467),
 (4922, 0.93388428522361389)]