# Content-based recommendations (using cosine similarity)
This notebook shows the process used to calculate recommendations for user based on their previous ratings and in features extracted from content texts available in +TV4E platform.

Library requirements:
* Pandas
* NumPy
* Scikit-learn 
* NLTK (w/ portuguese stop words)

In [1]:
# for Python 2: use print only as a function
from __future__ import print_function

### Part 1 - Examine the data

In [2]:
# read CSV using a relative path
import pandas as pd
import numpy as np
import requests
URL_RATINGS='http://api_mysql.tv4e.pt/api/recommendations/ratings'
data=requests.get(URL_RATINGS)
df_ratings=pd.DataFrame(data.json())
URL_VIDEOS='http://api_mysql.tv4e.pt/api/recommendations/videos'
data= requests.get(URL_VIDEOS)
df_videos=pd.DataFrame(data.json())

In [3]:
# show dataframe first rows
df_videos = df_videos.transpose()
df_videos.head()

Unnamed: 0,video_asgie_id,video_asgie_title_en,video_asgie_title_pt,video_date_creation,video_desc,video_id,video_location,video_title
1,3,Financial Services,Finanças,2017-09-26 15:21:16,Seja qual for o cenário para os escalões do IR...,4049,,As “habilidadezinhas” do Governo para descer o...
10,6,Local Authority Services,Serviços Autárquicos,2017-09-26 15:40:34,Na próxima quinta-feira terão início os trabal...,4058,,QUALIFICAÇÃO DA PONTE DE SÃO JOÃO
100,6,Local Authority Services,Serviços Autárquicos,2017-10-04 10:30:50,"A presidente da Viver 100 Fronteiras, organiza...",4159,5.0,Feira: Instituição suspeita de vender roupa do...
101,6,Local Authority Services,Serviços Autárquicos,2017-10-04 10:31:14,Apesar de ter falhado a eleição para a presidê...,4160,5.0,"Oliveira do Bairro: ""Representatividade do UPO..."
102,1,Health Care and Welfare Services,Saúde e Bem-Estar,2017-10-04 11:25:45,Portugueses vão consumir menos 4.225 toneladas...,4161,,Redução do consumo de açúcar


In [4]:
# show dataframe first rows
df_ratings.head()

Unnamed: 0,rating_date_creation,rating_value,user_id,video_id,video_watch_time,video_watched_type
0,2017-11-03 16:48:50,0,1,4409,10,notified
1,2017-11-03 16:25:21,0,1,4410,30,notified
2,2017-11-03 16:23:42,0,1,4411,17,notified
3,2017-11-03 16:10:04,0,1,4412,44,notified
4,2017-11-03 16:01:12,0,1,4413,100,notified


In [5]:
df_ratings[['user_id', 'video_id', 'rating_value']]

Unnamed: 0,user_id,video_id,rating_value
0,1,4409,0
1,1,4410,0
2,1,4411,0
3,1,4412,0
4,1,4413,0
5,1,4414,0
6,1,4415,0
7,1,4416,0
8,1,4417,0
9,1,4418,0


In [6]:
#show the shape of dataframe
df_ratings.shape

(60, 6)

In [7]:
# show overall statistics of the dataframe
df_ratings.describe()

Unnamed: 0,rating_value,user_id,video_id,video_watch_time
count,60.0,60.0,60.0,60.0
mean,0.0,4.8,4207.716667,15.016667
std,0.0,2.984707,153.477884,26.665214
min,0.0,1.0,4051.0,1.0
25%,0.0,1.0,4092.75,1.0
50%,0.0,5.0,4112.5,1.0
75%,0.0,8.0,4411.25,13.25
max,0.0,9.0,4457.0,100.0


In [8]:
# filter ratings from user 1 and show statistics
df_ratings_user = df_ratings[(df_ratings.user_id==1)]
df_ratings_user.describe()

Unnamed: 0,rating_value,user_id,video_id,video_watch_time
count,18.0,18.0,18.0,18.0
mean,0.0,1.0,4428.444444,40.388889
std,0.0,0.0,19.039347,35.363888
min,0.0,1.0,4409.0,3.0
25%,0.0,1.0,4413.25,11.0
50%,0.0,1.0,4417.5,30.5
75%,0.0,1.0,4446.75,51.25
max,0.0,1.0,4457.0,100.0


### Part 2 - Clean ratings data

In [9]:
# rtemoving unused columns
df_ratings.drop('video_watched_type', axis=1, inplace=True)
df_ratings.drop('rating_date_creation', axis=1, inplace=True)
df_ratings

Unnamed: 0,rating_value,user_id,video_id,video_watch_time
0,0,1,4409,10
1,0,1,4410,30
2,0,1,4411,17
3,0,1,4412,44
4,0,1,4413,100
5,0,1,4414,14
6,0,1,4415,100
7,0,1,4416,3
8,0,1,4417,53
9,0,1,4418,9


In [10]:
# calculate implicit and explicit ratings
# XXX use a function to calculate implicit rating considering the video lead time
df_ratings['rating_implicit'] = (df_ratings['video_watch_time']/100)*0.3
df_ratings['rating_explicit'] = (df_ratings['rating_value'])        *0.7

# create a new column to put implicit or explicit rating value
df_ratings['overall_rating_value'] = df_ratings['rating_implicit'] + df_ratings['rating_explicit']
# OLD: if there's no explicit rating, consider the implicit rating as 
# df_ratings['overall_rating_value'] = np.where(df_ratings['rating_value'] == 0, df_ratings['video_watch_time'], df_ratings['rating_value'])
df_ratings


Unnamed: 0,rating_value,user_id,video_id,video_watch_time,rating_implicit,rating_explicit,overall_rating_value
0,0,1,4409,10,0.03,0.0,0.03
1,0,1,4410,30,0.09,0.0,0.09
2,0,1,4411,17,0.051,0.0,0.051
3,0,1,4412,44,0.132,0.0,0.132
4,0,1,4413,100,0.3,0.0,0.3
5,0,1,4414,14,0.042,0.0,0.042
6,0,1,4415,100,0.3,0.0,0.3
7,0,1,4416,3,0.009,0.0,0.009
8,0,1,4417,53,0.159,0.0,0.159
9,0,1,4418,9,0.027,0.0,0.027


In [11]:
ratings_matrix = df_ratings.pivot_table(index=['user_id'], columns=['video_id'], values=['overall_rating_value'])
ratings_matrix                                     

Unnamed: 0_level_0,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value,overall_rating_value
video_id,4051,4061,4062,4063,4068,4089,4090,4091,4092,4093,...,4417,4418,4419,4445,4446,4447,4453,4454,4456,4457
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,,,,,...,0.159,0.027,0.03,0.288,0.138,0.009,0.3,0.105,0.078,0.093
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,0.003,0.003,0.003,0.003,...,,,,,,,,,,
8,0.039,0.003,0.003,0.003,0.003,0.006,0.003,0.003,,0.147,...,,,,,,,,,,
9,,,,,0.012,0.024,0.003,0.003,,0.003,...,,,,,,,,,,


### Part 3 - Vectorize texts

In [12]:
# import and retrieve portuguese stop words
# stop words are not considered as token as usually they carry no meaning (!)
from nltk.corpus import stopwords
portuguese_stop_words = stopwords.words('portuguese')

In [13]:
# import and instantiate TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, max_df=0.5, stop_words=portuguese_stop_words)

In [14]:
# fit (train) the vectorizer with the corpus from video text contents
tfidf_matrix = tfidf.fit_transform(df_videos['video_desc'])
tfidf_matrix.toarray()

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.10325919,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [15]:
tfidf.get_feature_names()

['00',
 '000',
 '000 00',
 '000 euros',
 '018',
 '018 2017',
 '06',
 '09',
 '10',
 '10 10saúdealarga',
 '10 aniversário',
 '10 anos',
 '10 horas',
 '10 mil',
 '10 milhões',
 '10 novembro',
 '10 outubro',
 '100',
 '100 aquisição',
 '100 mil',
 '102',
 '10h30',
 '10saúdealarga',
 '10saúdealarga elenco',
 '11',
 '11 novembro',
 '11 outubro',
 '112',
 '112 acordo',
 '112 instituto',
 '112 recorda',
 '113',
 '113 2011',
 '118',
 '12',
 '12 anos',
 '12 meses',
 '12 mil',
 '12 outubro',
 '120',
 '120 médicos',
 '125',
 '127',
 '13',
 '13 milhões',
 '13 novembro',
 '13 outubro',
 '131',
 '131 2017',
 '14',
 '14 17',
 '14 mil',
 '14 novembro',
 '140',
 '140 2016',
 '142',
 '15',
 '15 horas',
 '15 milhões',
 '15 novembro',
 '15 outubro',
 '15 pontos',
 '150',
 '150 000',
 '150 200',
 '150 mil',
 '150 milhões',
 '154',
 '154 administração',
 '154 profissionais',
 '154 vagas',
 '157',
 '16',
 '16 horas',
 '16 milhões',
 '16 outubro',
 '17',
 '17 anos',
 '17 horas',
 '17 junho',
 '17 outubro',
 '18

### Part 4 - Build up user profile

In [16]:
# create dict video_id ==>> tfidf weights
tfidf_tokens = {}
tfidf_array = tfidf_matrix.toarray()
line_count = 0
for idx, row in df_videos.iterrows():
    tfidf_tokens[row.video_id] = tfidf_array[line_count]
    line_count += 1
tfidf_tokens

{4049: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4051: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4053: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4054: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4055: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4057: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4058: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4059: array([ 0.        ,  0.18485889,  0.        , ...,  0.        ,
         0.        ,  0.        ]),
 4060: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4061: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4062: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4063: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4068: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4070: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4071: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4072: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4073: array([ 0.,  0.,  0., ...,  0.,  0.,  0.]),
 4074: array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
      

In [17]:
# define user ratings
user_ratings = df_ratings[(df_ratings.user_id==8)]
user_ratings

Unnamed: 0,rating_value,user_id,video_id,video_watch_time,rating_implicit,rating_explicit,overall_rating_value
41,0,8,4051,13,0.039,0.0,0.039
42,0,8,4061,1,0.003,0.0,0.003
43,0,8,4062,1,0.003,0.0,0.003
44,0,8,4063,1,0.003,0.0,0.003
45,0,8,4068,1,0.003,0.0,0.003
46,0,8,4089,2,0.006,0.0,0.006
47,0,8,4090,1,0.003,0.0,0.003
48,0,8,4091,1,0.003,0.0,0.003
49,0,8,4093,49,0.147,0.0,0.147
50,0,8,4094,7,0.021,0.0,0.021


In [18]:
# created weighted user profile vector (dotproduct of vectors of items consumed and user ratings)
#   "In the original implementation, the profile was the sum of the item-tag vectors of all items 
#   the user has rated positively (>= 3.5 stars). This approach was later improved with weighted 
#   user profile (with the older implementation commented out for reference). Weighted profile is 
#   computed with weighted sum of the item vectors for all items, with weights being based on the 
#   user's rating."
#   See: http://eugenelin89.github.io/recommender_content_based/
user_profile = [0] * len(tfidf.get_feature_names())
for i in range(len(user_profile)):
    for idx, row in user_ratings.iterrows():
        user_profile[i] += row.overall_rating_value * tfidf_tokens[row.video_id][i]
    #user_profile = [v/len(user_ratings) for v in user_profile] # weight-ing user vector (?)
# normalize user profile vector
import numpy as np
user_profile = user_profile / np.linalg.norm(user_profile)
user_profile

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

### Part 5 - Calculate similarities between user profile and contents

In [19]:
# apply cosine similarity between user profile vector and content vectors
# See: http://eugenelin89.github.io/recommender_content_based/
from math import sqrt
def my_cosine_similarity(x, y):
    def square_rooted(v):
        return round(sqrt(sum([a * a for a in v])), 3)
   
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return numerator/float(denominator)

estimated_user_ratings = {}
for video_id, token_weights in tfidf_tokens.items(): 
    if video_id not in user_ratings.video_id.values: # not calculating for contents already consumed 
        estimated_user_ratings[video_id] = my_cosine_similarity(user_profile, token_weights)
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[:-10:-1]

[(4450, 0.16676989875617934),
 (4454, 0.15453816751607408),
 (4097, 0.14764841483066724),
 (4227, 0.13628945583659333),
 (4195, 0.12967272427555721),
 (4157, 0.12624504666864095),
 (4432, 0.12279163289120062),
 (4148, 0.12183797422857444),
 (4452, 0.11345403557767844)]

In [20]:
# Using sklearn cosine_similarity (faster!)
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
estimated_user_ratings = {}
for video_id, token_weights in tfidf_tokens.items(): 
    if video_id not in user_ratings.video_id.values: # not calculating for contents already consumed 
        estimated_user_ratings[video_id] = cosine_similarity([user_profile], [token_weights])
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[:-10:-1]

[(4450, array([[ 0.1667699]])),
 (4454, array([[ 0.15453817]])),
 (4097, array([[ 0.14764841]])),
 (4227, array([[ 0.13628946]])),
 (4195, array([[ 0.12967272]])),
 (4157, array([[ 0.12624505]])),
 (4432, array([[ 0.12279163]])),
 (4148, array([[ 0.12183797]])),
 (4452, array([[ 0.11345404]]))]

In [21]:
# Using the dotproduct of user profile vector, content IDF and content vector
# See: https://www.analyticsvidhya.com/blog/2015/08/beginners-guide-learn-content-based-recommender-systems/
estimated_user_ratings = {}
for video_id, token_weights in tfidf_tokens.items():
    if video_id not in user_ratings.video_id.values: # removing contents already consumed 
        estimated_user_ratings[video_id] = 0
        for i in range(len(token_weights)):
            if token_weights[i] > 0:
                estimated_user_ratings[video_id] += user_profile[i] * tfidf.idf_[i] * token_weights[i]
# order ratings
import operator
estimated_user_ratings = sorted(estimated_user_ratings.items(), key=operator.itemgetter(1))
estimated_user_ratings[:-10:-1]

[(4450, 0.74044083484521728),
 (4097, 0.62542084141745002),
 (4148, 0.47950784158590992),
 (4454, 0.47598643489034625),
 (4421, 0.46544185692352602),
 (4157, 0.45886086260068237),
 (4195, 0.44730721679871327),
 (4227, 0.44111922098781664),
 (4432, 0.42567263347495227)]