In [18]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.metrics.pairwise import cosine_similarity

from functions import train_test

In [19]:
# Load the rating table. The CSV header arrives as strings, so every column
# label is converted to an integer (later cells index this table by
# (row, column) position and treat rows as users).
rating_df = pd.read_csv('data/user_rating_pt.csv')
rating_df.columns = [int(label) for label in rating_df.columns]

In [20]:
def get_metrics(test_data, rec_matrix):
    """Print precision, recall and F1 score of recommendations against held-out pairs.

    Parameters
    ----------
    test_data : iterable of (user, item) pairs
        Held-out interactions that count as relevant.
    rec_matrix : numpy.ndarray
        2-D array; row ``user`` holds the item indices recommended to that user.

    Returns
    -------
    None
        The three metrics are printed, not returned.

    Notes
    -----
    A metric whose denominator is zero (nothing recommended, nothing held
    out, or no hits at all) is reported as 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    # Materialise once: the pairs are needed both as an ordered list (false
    # negatives are counted per held-out entry) and as a set for O(1) lookups.
    held_out = [(u, m) for u, m in test_data]
    held_out_lookup = set(held_out)

    # One set of recommended items per user, for the false-negative pass.
    rec_lookup = [set(np.asarray(row).tolist()) for row in rec_matrix]

    true_positive = 0
    false_positive = 0
    false_negative = 0

    # Every recommendation is either a hit (held out for that user) or a miss.
    for user in range(rec_matrix.shape[0]):
        for rec in rec_matrix[user]:
            if (user, rec) in held_out_lookup:
                true_positive += 1
            else:
                false_positive += 1

    # Every held-out pair that was not recommended is a false negative.
    for u, m in held_out:
        if m not in rec_lookup[u]:
            false_negative += 1

    recommended_total = true_positive + false_positive
    relevant_total = true_positive + false_negative

    precision = true_positive/recommended_total if recommended_total else 0.0
    recall = true_positive/relevant_total if relevant_total else 0.0

    # precision + recall is zero exactly when there are no true positives.
    pr_sum = precision + recall
    F1_score = 2*(precision*recall)/pr_sum if pr_sum else 0.0

    print("Precision :", precision)
    print("Recall :", recall)
    print("F1 Score :", F1_score)

In [21]:
rating_treshold = 3.5

# Binarise the ratings in place: anything below the threshold becomes 0,
# anything at or above it becomes 1. Both masks are taken from the raw
# values before any cell is overwritten.
# NOTE: this cell mutates rating_df, so it is not re-runnable: on a second
# run every value is already 0/1 (all below 3.5) and the table becomes all
# zeros. Reload the CSV before re-executing.
below_threshold = rating_df < rating_treshold
at_or_above_threshold = rating_df >= rating_treshold
rating_df[below_threshold] = 0
rating_df[at_or_above_threshold] = 1

# Relabel the columns 0..n-1 so a column label equals its position.
rating_df.columns = range(len(rating_df.columns))

rating_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [22]:
rating_matrix = rating_df.to_numpy()

# Collect every (user, item) position holding a positive entry.
# np.argwhere scans in row-major order, i.e. the same order as a nested
# "for user / for item" loop, but without millions of Python-level
# iterations. .tolist() converts the indices to plain Python ints.
known = [(u, i) for u, i in np.argwhere(rating_matrix > 0).tolist()]

# Hold out 30% of the known positive pairs for evaluation.
# NOTE(review): train_test comes from the project's `functions` module; if it
# shuffles randomly, no seed is set anywhere in this notebook, so the split
# (and the metrics below) will differ between runs. Confirm and seed.
training, testing = train_test(known, test_size=0.3)

In [23]:
# Dense 0/1 matrices with the same shape as the rating matrix.
matrix_shape = rating_matrix.shape
train_matrix = np.zeros(matrix_shape)
# test_matrix is only allocated here; none of the visible cells fill it.
test_matrix = np.zeros(matrix_shape)

# Flag every training interaction.
for user_idx, item_idx in training:
    train_matrix[user_idx, item_idx] = 1

In [24]:
# Genre vocabulary used as the TF-IDF "terms".
# NOTE(review): some MovieLens releases also tag titles with 'IMAX', which is
# not listed here. Confirm the omission is intentional.
genre_list = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
              'Drama' ,'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 
              'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)']

# Ids of the movies that appear in at least one rating.
movies_rated = pd.read_csv('ml-latest-small/ratings.csv')['movieId'].unique()

# Keep only rated movies and renumber the rows 0..n-1.
# Presumably this makes row positions line up with the item columns of the
# rating matrix. TODO confirm.
movie_df = (
    pd.read_csv('ml-latest-small/movies.csv')
    .loc[lambda movies: movies['movieId'].isin(movies_rated)]
    .reset_index(drop=True)
)

In [25]:
# One list of genre labels per movie (the raw field is '|'-separated), in
# movie_df row order, plus the number of labels each movie carries.
movie_genres = [genres.split('|') for genres in movie_df['genres']]
total_genres = [len(genres) for genres in movie_genres]

## TF-IDF Implementation

In [26]:
# Term frequency: each genre a movie carries gets an equal share 1/len(genres)
# of that movie's weight; genres it does not carry get 0.
# Rows follow movie_genres order, columns follow genre_list order.
tf = pd.DataFrame({
    genre: [1/len(movie) if genre in movie else 0 for movie in movie_genres]
    for genre in genre_list
})

In [27]:
# Spot-check: term-frequency row of the movie at position 22.
tf.loc[22]

Action                0.333333
Adventure             0.000000
Animation             0.000000
Children              0.000000
Comedy                0.000000
Crime                 0.333333
Documentary           0.000000
Drama                 0.000000
Fantasy               0.000000
Film-Noir             0.000000
Horror                0.000000
Musical               0.000000
Mystery               0.000000
Romance               0.000000
Sci-Fi                0.000000
Thriller              0.333333
War                   0.000000
Western               0.000000
(no genres listed)    0.000000
Name: 22, dtype: float64

In [28]:
# Same position in the movie table, to compare its genres with the TF row.
movie_df.loc[22]

movieId                       23
title           Assassins (1995)
genres     Action|Crime|Thriller
Name: 22, dtype: object

In [29]:
# Display the term-frequency table (one row per movie, one column per genre).
tf

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,0.00,0.200000,0.200000,0.200000,0.200000,0.0,0.0,0.000000,0.200000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.00,0.333333,0.000000,0.333333,0.000000,0.0,0.0,0.000000,0.333333,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.00,0.000000,0.000000,0.000000,0.500000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.500000,0.0,0.0,0.0,0.0,0.0
3,0.00,0.000000,0.000000,0.000000,0.333333,0.0,0.0,0.333333,0.000000,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
4,0.00,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.25,0.000000,0.250000,0.000000,0.250000,0.0,0.0,0.000000,0.250000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9720,0.00,0.000000,0.333333,0.000000,0.333333,0.0,0.0,0.000000,0.333333,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9721,0.00,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9722,0.50,0.000000,0.500000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [30]:
# Inverse document frequency per genre: log10(number of movies / number of
# movies carrying the genre).
total_documents = len(movie_df)
idf = []

for genre in genre_list:
    # Document frequency: how many movies list this genre.
    genre_count = sum(genre in movie for movie in movie_genres)

    if genre_count:
        # math.log10 is more accurate than math.log(x, 10).
        idf.append(math.log10(total_documents/genre_count))
    else:
        # A listed genre that no movie carries would divide by zero. Its TF
        # column is all zeros, so any finite weight gives the same TF-IDF.
        idf.append(0.0)

idf = pd.Series(idf, index=genre_list)

In [31]:
# Display the per-genre IDF weights.
idf

Action                0.725869
Adventure             0.886786
Animation             1.202515
Children              1.165677
Comedy                0.413466
Crime                 0.910114
Documentary           1.346371
Drama                 0.349456
Fantasy               1.096865
Film-Noir             2.058426
Horror                0.997950
Musical               1.465401
Mystery               1.229690
Romance               0.786175
Sci-Fi                0.996619
Thriller              0.711613
War                   1.406920
Western               1.765128
(no genres listed)    2.456366
dtype: float64

In [32]:
# TF-IDF: the IDF Series is indexed by genre, so the product aligns it with
# tf's genre columns and scales each column by its genre weight.
tf_idf = tf*idf
tf_idf

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,0.000000,0.177357,0.240503,0.233135,0.082693,0.0,0.0,0.000000,0.219373,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.295595,0.000000,0.388559,0.000000,0.0,0.0,0.000000,0.365622,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.206733,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.393087,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.137822,0.0,0.0,0.116485,0.000000,0.0,0.0,0.0,0.0,0.262058,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.413466,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.181467,0.000000,0.300629,0.000000,0.103367,0.0,0.0,0.000000,0.274216,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9720,0.000000,0.000000,0.400838,0.000000,0.137822,0.0,0.0,0.000000,0.365622,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9721,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.349456,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
9722,0.362934,0.000000,0.601258,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [33]:
# Pairwise cosine similarity between the TF-IDF rows (movie x movie).
sim_matrix = cosine_similarity(tf_idf)
# Zero the diagonal so a movie's similarity to itself is not counted when
# similarities are averaged later.
np.fill_diagonal(sim_matrix, 0)

sim_matrix

array([[0.        , 0.82118246, 0.08636985, ..., 0.        , 0.46201134,
        0.18555274],
       [0.82118246, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.08636985, 0.        , 0.        , ..., 0.        , 0.        ,
        0.46547329],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.46201134, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.18555274, 0.        , 0.46547329, ..., 0.        , 0.        ,
        0.        ]])

# Get Recommendations

In [34]:
# Exploration on the first user: positions of the items flagged 1 in training.
a = train_matrix[0]
ind = np.nonzero(a == 1)

In [35]:
# Average, over the user's liked items, of each liked item's similarity row:
# one score per movie.
avg = sim_matrix[ind].mean(axis=0)

In [36]:
# Scores of the items the user already liked (these get masked out when
# recommendations are built).
avg[ind] 

array([0.25267234, 0.23429092, 0.14402859, 0.19231368, 0.12270105,
       0.14049198, 0.23876175, 0.15952952, 0.20376691, 0.27027087,
       0.14049198, 0.10850078, 0.30144601, 0.17291776, 0.29581054,
       0.09770998, 0.11546242, 0.13888142, 0.22761099, 0.13090214,
       0.22310901, 0.0664324 , 0.26499757, 0.08258514, 0.12262098,
       0.16977324, 0.22761099, 0.16355483, 0.26913562, 0.16355483,
       0.18354646, 0.14402859, 0.10372153, 0.14402859, 0.15778703,
       0.25869907, 0.27943691, 0.31705236, 0.21156777, 0.17653551,
       0.09862563, 0.16994298, 0.10372153, 0.19231368, 0.10087617,
       0.23027639, 0.13323752, 0.186111  , 0.28709774, 0.17300728,
       0.26913562, 0.08044213, 0.25731468, 0.17581417, 0.24901543,
       0.28709774, 0.09297401, 0.1134082 , 0.14049198, 0.18399523,
       0.17653551, 0.08258514, 0.20441964, 0.30967876, 0.22937698,
       0.22295434, 0.20627163, 0.28417994, 0.21936678, 0.2317757 ,
       0.27943691, 0.12262098, 0.18836706, 0.27943691, 0.22937

In [37]:
# Range of the averaged similarity scores for this user.
avg.min(), avg.max()

(0.0, 0.3562080011130355)

In [40]:
# Build the top-k recommendation list for every user: score each movie by its
# mean content similarity to the movies the user liked in training, then keep
# the k best-scoring movies the user has not already liked.
rec_matrix = []
k = 15

for user in train_matrix:
    liked_index = np.where(user==1)
    
    if len(liked_index[0]) != 0:
        # Mean similarity of every movie to the user's liked movies.
        avg_sim = sim_matrix[liked_index].mean(axis=0)
        # Exclude already-liked movies with -inf rather than 0: a score of 0
        # ties with genuinely dissimilar movies (cosine similarity of these
        # non-negative vectors bottoms out at 0), which let training items
        # slip back into the top-k.
        avg_sim[liked_index] = -np.inf
    else:
        # Cold-start user with no training likes: all scores tie at zero, so
        # the k picks below are arbitrary.
        avg_sim = np.zeros(train_matrix.shape[1])
    
    # argpartition puts the k largest scores in the last k slots (unsorted).
    top_sim = np.argpartition(avg_sim, -k)[-k:]
    rec_matrix.append(top_sim)

rec_matrix = np.array(rec_matrix)

In [41]:
# Score the top-k recommendations against the held-out `testing` pairs.
get_metrics(testing, rec_matrix)

Precision : 0.010710382513661203
Recall : 0.005293005671077505
F1 Score : 0.007084764142418219


In [46]:
# Debugging aid: recompute the first user's liked-item positions.
user = train_matrix[0]
liked_index = np.nonzero(user == 1)

In [52]:
# Shape check: (number of liked items, number of movies).
sim_matrix[liked_index].shape

(139, 9724)