In [1]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.metrics.pairwise import cosine_similarity
import time

from functions import train_test

In [2]:
# Load the rating table from CSV.
rating_df = pd.read_csv('data/user_rating_pt.csv')
# read_csv yields string column labels; cast them to integers.
# NOTE(review): presumably rows are users and columns are movie ids - confirm against how the CSV was produced.
rating_df.columns = rating_df.columns.astype(int)

In [3]:
# Ratings at or above this value count as "liked" (1); anything lower becomes 0.
rating_treshold = 3.5

# Order matters: the low values are zeroed first, then the remaining high values are set to 1.
# NOTE: this mutates rating_df in place, so the cell is not idempotent - running it a second
# time on the already-binarised frame turns every 1 into 0 (1 < 3.5).
rating_df[rating_df < rating_treshold] = 0
rating_df[rating_df >= rating_treshold] = 1
# Relabel the columns positionally (0..n-1) so a column label can double as an array index.
rating_df.columns = range(len(rating_df.columns))

rating_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [4]:
# Collect every (user, movie) position that holds a positive ("liked") entry and
# split those pairs into training and testing sets.
rating_matrix = rating_df.to_numpy()

# np.argwhere scans the array once in row-major order, so it yields the same pairs
# in the same order as a nested loop over rows then columns, without performing
# one DataFrame .loc lookup per cell. int() keeps the tuples as plain Python ints.
# Positions equal labels here because the columns were relabelled 0..n-1 and the
# frame keeps the default row index from read_csv.
known = [(int(u), int(i)) for u, i in np.argwhere(rating_matrix > 0)]

# train_test comes from the local `functions` module; 30% of the known pairs are held out.
training, testing = train_test(known, test_size=0.3)

In [5]:
# Dense matrices with the same shape as the rating matrix, initialised to zero.
train_matrix = np.zeros(rating_matrix.shape)
# test_matrix is only allocated here; nothing in this cell writes to it.
test_matrix = np.zeros(rating_matrix.shape)

# Mark each training (user, movie) pair as liked.
for user_pos, movie_pos in training:
    train_matrix[user_pos, movie_pos] = 1

## TF-IDF Implementation

In [6]:
# Genre vocabulary used for the TF-IDF features, in column order.
genre_list = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
    'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western', '(no genres listed)',
]

# Ids of the movies that appear at least once in the ratings file.
movies_rated = pd.read_csv('ml-latest-small/ratings.csv')['movieId'].unique()

# Keep only the rated movies and renumber the rows 0..n-1.
movie_df = pd.read_csv('ml-latest-small/movies.csv')
movie_df = movie_df.loc[movie_df['movieId'].isin(movies_rated)].reset_index(drop=True)

In [7]:
# One list of genre names per movie, obtained by splitting the pipe-separated 'genres' field.
movie_genres = [genres.split('|') for genres in movie_df['genres']]

# Number of genres attached to each movie, in the same order as movie_genres.
total_genres = [len(genres) for genres in movie_genres]

In [8]:
# Term frequency: one row per movie, one column per genre.
# A genre present on a movie gets 1 / (number of genres on that movie); otherwise 0.
tf = pd.DataFrame({
    genre: [1/len(movie) if genre in movie else 0 for movie in movie_genres]
    for genre in genre_list
})

In [9]:
# Inverse document frequency per genre: log10(number of movies / movies carrying the genre).
total_documents = len(movie_df)
idf = []

for genre in genre_list:
    # Number of movies tagged with this genre.
    genre_count = sum(genre in movie for movie in movie_genres)

    if genre_count:
        idf.append(math.log(total_documents/genre_count, 10))
    else:
        # A genre that tags no movie would divide by zero. Its TF column is all
        # zeros anyway, so a zero weight leaves the TF-IDF product unchanged.
        idf.append(0.0)

# Index by genre name so the Series lines up with the genre-named TF columns.
idf = pd.Series(idf, index=genre_list)

In [10]:
# TF x IDF: pandas aligns the idf Series (indexed by genre) with tf's genre columns,
# so every column is scaled by the IDF of its own genre.
tf_idf = tf*idf

In [25]:
# Sanity check: one row per movie, one column per genre.
tf_idf.shape

(9724, 19)

In [11]:
# Pairwise cosine similarity between the rows of tf_idf (movie x movie).
sim_matrix = cosine_similarity(tf_idf)
# Zero the diagonal in place so a movie's similarity to itself is not counted later.
np.fill_diagonal(sim_matrix, 0)

sim_matrix.shape

(9724, 9724)

In [12]:
# For every user row of train_matrix, average the content-similarity rows of the
# movies that user liked, giving one score per movie.
movie_count = train_matrix.shape[1]
user_profiles = []

for liked_flags in train_matrix:
    liked_positions = np.flatnonzero(liked_flags == 1)

    if liked_positions.size == 0:
        # No liked movies in training: every score is zero.
        profile = [0] * movie_count
    else:
        profile = sim_matrix[liked_positions].mean(axis=0)
        # Movies the user already liked get a zero score.
        profile[liked_positions] = 0

    user_profiles.append(profile)

avg_sim_matrix = np.array(user_profiles)

In [13]:
# Sanity check: one row per row of train_matrix, one column per movie.
avg_sim_matrix.shape

(610, 9724)

## Hybrid Implementation TSUISIMCF + TFIDF

In [14]:
def _as_lookup(items):
    """Return ``items`` as a set for fast membership tests, or unchanged if its elements are unhashable."""
    try:
        return set(items)
    except TypeError:
        return items


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def get_metrics(test_set, recommend_list):
    """Print and return precision, recall and F1 of a recommendation list.

    Parameters
    ----------
    test_set : sequence
        Held-out relevant items. The notebook passes (user, movie) tuples.
    recommend_list : sequence
        Recommended items, in the same form as ``test_set``.

    Returns
    -------
    tuple of float
        ``(precision, recall, F1_score)``. A metric whose denominator is zero
        (no recommendations, no test items, or no hits at all) is reported as
        0.0 instead of raising ZeroDivisionError.
    """
    # Sets make each membership test O(1); the counting below still walks the
    # original sequences, so duplicate entries are counted exactly as before.
    relevant = _as_lookup(test_set)
    recommended = _as_lookup(recommend_list)

    true_positive = 0
    false_positive = 0
    for item in recommend_list:
        if item in relevant:
            true_positive += 1
        else:
            false_positive += 1

    false_negative = 0
    for item in test_set:
        if item not in recommended:
            false_negative += 1

    precision = _safe_ratio(true_positive, true_positive + false_positive)
    recall = _safe_ratio(true_positive, true_positive + false_negative)
    F1_score = _safe_ratio(2*(precision*recall), precision + recall)

    print("Precision :", precision)
    print("Recall :", recall)
    print("F1 Score :", F1_score)

    return precision, recall, F1_score

In [15]:
def common_neighbors(dataset, by='user'):
    """Count common neighbours between every pair of users or of movies.

    Because the entries are either 1 or 0, the dot product of two user rows
    (or two movie columns) equals the number of items (or users) they share.

    Parameters
    ----------
    dataset : 2-D array or DataFrame
        Binary matrix with users on the rows and movies on the columns.
    by : {'user', 'movie'}
        'user' compares rows with rows; 'movie' compares columns with columns.

    Returns
    -------
    pandas.DataFrame
        Square matrix of pairwise counts with a default 0..n-1 index and columns.

    Raises
    ------
    ValueError
        If ``by`` is neither 'user' nor 'movie'.
    """
    if by == 'user':
        common_matrix = np.dot(dataset, dataset.T)
    elif by == 'movie':
        common_matrix = np.dot(dataset.T, dataset)
    else:
        # Previously an unknown value fell through to an UnboundLocalError.
        raise ValueError("by must be 'user' or 'movie', got {!r}".format(by))

    return pd.DataFrame(common_matrix)

In [16]:
def jacard_coefﬁcient(dataset, by='user'):
    """Pairwise Jaccard coefficient between users or between movies.

    For two binary vectors the coefficient is
    ``|intersection| / |union|`` with ``|union| = |a| + |b| - |intersection|``.

    Parameters
    ----------
    dataset : 2-D array or DataFrame
        Binary matrix with users on the rows and movies on the columns.
    by : {'user', 'movie'}
        'user' compares rows with rows; 'movie' compares columns with columns.

    Returns
    -------
    pandas.DataFrame
        Square float matrix with a default 0..n-1 index and columns. Pairs whose
        union is empty (both vectors all zero) get a coefficient of 0.

    Raises
    ------
    ValueError
        If ``by`` is neither 'user' nor 'movie'.
    """
    values = np.asarray(dataset, dtype=float)

    if by == 'user':
        vectors = values
    elif by == 'movie':
        vectors = values.T
    else:
        raise ValueError("by must be 'user' or 'movie', got {!r}".format(by))

    # Dot product of binary vectors = size of their intersection.
    common = vectors @ vectors.T
    # Number of ones in each vector.
    counts = vectors.sum(axis=1)

    # Broadcasting builds |a| + |b| - |a & b| for every pair purely by position,
    # so the result no longer depends on the labels of the input's index.
    union_count = counts[:, None] + counts[None, :] - common

    # Leave 0 where the union is empty instead of producing NaN from 0/0.
    jc = np.divide(common, union_count, out=np.zeros_like(common), where=union_count != 0)

    return pd.DataFrame(jc)

In [17]:
def get_recommendations(train_data, user_sim, movie_sim, k_user, n_movie, users_evaluated, weight=.5,
                        content_sim=None):
    """Recommend up to ``n_movie`` movies to each evaluated user with a hybrid score.

    Candidates are the movies liked by the ``k_user`` most similar users that the
    target user has not liked. Each candidate is scored as

        weight * max(movie_sim[candidate, liked movies]) + (1 - weight) * content_sim[user][candidate]

    Parameters
    ----------
    train_data : pandas.DataFrame
        Binary like matrix; rows are looked up by user label, columns are movie labels.
    user_sim : pandas.DataFrame
        User x user similarity, indexed by the same user labels.
    movie_sim : pandas.DataFrame
        Movie x movie similarity, indexed by the same movie labels.
    k_user : int
        Number of neighbouring users to draw candidates from.
    n_movie : int
        Maximum number of recommendations per user.
    users_evaluated : iterable
        User labels to produce recommendations for.
    weight : float
        Weight of the ``movie_sim`` term; the content term gets ``1 - weight``.
    content_sim : 2-D indexable, optional
        Per-user, per-movie content score, read as ``content_sim[user][movie]``.
        Defaults to the notebook-level ``avg_sim_matrix``.

    Returns
    -------
    list of tuple
        ``(user, movie)`` pairs. A user with fewer than ``n_movie`` candidates
        gets all of them; a user with no candidates gets none.
    """
    if content_sim is None:
        # Backward-compatible default: the matrix built earlier in the notebook.
        content_sim = avg_sim_matrix

    # list of recommended movies to users
    recommended_movies = []

    for user in users_evaluated:
        # get movies user liked
        user_row = train_data.loc[user]
        liked_movies = list(user_row[user_row == 1].index)
        liked_lookup = set(liked_movies)

        # get top k most similar users (one extra in case the user itself is among them)
        top_k = list(user_sim.loc[user].sort_values(ascending=False)[:k_user+1].index)

        if user in top_k:
            top_k.remove(user)
        else:
            top_k = top_k[:k_user]

        # get possible movie recommendations; a dict keeps first-seen order while
        # making the "already collected" check O(1)
        candidates = {}

        for top_user in top_k:
            neighbor_row = train_data.loc[top_user]

            for movie in list(neighbor_row[neighbor_row == 1].index):
                if movie not in liked_lookup:
                    candidates.setdefault(movie, None)

        possible_rec = list(candidates)

        # Nothing to rank; np.argpartition would raise on an empty score list.
        if not possible_rec:
            continue

        # get score of possible recommendations
        if liked_movies:
            # One label-based lookup for all candidates instead of one per candidate.
            cf_scores = movie_sim.loc[possible_rec, liked_movies].max(axis=1).to_numpy()
        else:
            # With no liked movies the max over an empty selection is NaN; use 0 instead.
            cf_scores = np.zeros(len(possible_rec))

        content_scores = np.array([content_sim[user][movie] for movie in possible_rec], dtype=float)
        rec_scores = weight*cf_scores + (1-weight)*content_scores

        # get top n score index; never ask for more items than there are candidates
        n_top = min(n_movie, len(possible_rec))
        if n_top <= 0:
            continue
        top_index = np.argpartition(rec_scores, -n_top)[-n_top:]

        # get top movie recommendations
        for index in top_index:
            recommended_movies.append((user, possible_rec[index]))

    return recommended_movies

In [24]:
# Wrap the training matrix in a DataFrame; rows and columns get default 0..n-1 labels.
train_df = pd.DataFrame(train_matrix)

# Movie x movie co-occurrence counts (number of users who liked both movies).
cn_movie = common_neighbors(train_df, by='movie')
# User x user Jaccard coefficient of their liked-movie sets.
jc_user = jacard_coefﬁcient(train_df, by='user')

In [19]:
# 10 neighbours, 5 recommendations per user, all users.
# weight=1: the (1 - weight) content term is multiplied by zero, so only the co-occurrence term ranks candidates.
recommend_list = get_recommendations(train_df, jc_user, cn_movie, 10, 5, range(train_matrix.shape[0]), weight=1)
get_metrics(testing, recommend_list)

Precision : 0.24360655737704917
Recall : 0.04012962462867945
F1 Score : 0.0689079527011361


(0.24360655737704917, 0.04012962462867945, 0.0689079527011361)

In [20]:
# Same setup with weight=.75: 0.75 x co-occurrence term + 0.25 x content term.
recommend_list = get_recommendations(train_df, jc_user, cn_movie, 10, 5, range(train_matrix.shape[0]), weight=.75)
get_metrics(testing, recommend_list)

Precision : 0.24459016393442623
Recall : 0.04029165541452876
F1 Score : 0.0691861813123116


(0.24459016393442623, 0.04029165541452876, 0.0691861813123116)

In [21]:
# Same setup with weight=.5: both terms weighted equally.
recommend_list = get_recommendations(train_df, jc_user, cn_movie, 10, 5, range(train_matrix.shape[0]), weight=.5)
get_metrics(testing, recommend_list)

Precision : 0.24459016393442623
Recall : 0.04029165541452876
F1 Score : 0.0691861813123116


(0.24459016393442623, 0.04029165541452876, 0.0691861813123116)

In [22]:
# Same setup with weight=.3: 0.3 x co-occurrence term + 0.7 x content term.
recommend_list = get_recommendations(train_df, jc_user, cn_movie, 10, 5, range(train_matrix.shape[0]), weight=.3)
get_metrics(testing, recommend_list)

Precision : 0.24524590163934426
Recall : 0.0403996759384283
F1 Score : 0.0693716670530953


(0.24524590163934426, 0.0403996759384283, 0.0693716670530953)

In [23]:
# Same setup with weight=.15: 0.15 x co-occurrence term + 0.85 x content term.
recommend_list = get_recommendations(train_df, jc_user, cn_movie, 10, 5, range(train_matrix.shape[0]), weight=.15)
get_metrics(testing, recommend_list)

Precision : 0.24622950819672132
Recall : 0.04056170672427761
F1 Score : 0.06964989566427081


(0.24622950819672132, 0.04056170672427761, 0.06964989566427081)

## Single User

In [None]:
def get_recommendations_user(train_data, user_sim, movie_sim, k_user, n_movie, users_evaluated, weight=.15):
    """Recommend movies for the given users, with a default ``weight`` of 0.15.

    The body used to be a line-for-line copy of ``get_recommendations``; it now
    delegates to that function so the two cannot drift apart. Parameters and the
    returned list of ``(user, movie)`` tuples are exactly those of
    ``get_recommendations``; only the default of ``weight`` differs.
    """
    return get_recommendations(train_data, user_sim, movie_sim, k_user, n_movie,
                               users_evaluated, weight=weight)