In [22]:
import pandas as pd
import numpy as np
import random

In [23]:
rating_df = pd.read_csv('data/user_rating_pt.csv')
rating_df.columns = rating_df.columns.astype(int)

In [24]:
#filter if a user liked a movie with by a certain rating.
#Values will be 1 if they like a movie, 0 if they don't or have not seen the movie
rating_treshold = 3.5

rating_df[rating_df < rating_treshold] = 0
rating_df[rating_df >= rating_treshold] = 1
rating_df.columns = range(len(rating_df.columns))

rating_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,...,0.0,0,0,0,0.0,0,0.0,0.0,0.0,0


In [19]:
#get users with with one or more likes
user_count = rating_df.T.sum()
user_list = list(user_count[user_count > 0].index)

#get movies with one or more likes
movie_count = rating_df.sum()
movie_list = list(movie_count[movie_count > 0].index)

#filter movies
rating_df = rating_df.loc[user_list,movie_list]

#set index and columns to index numbers
rating_df.columns = range(len(rating_df.columns))
rating_df.index = range(len(rating_df.index))

#get user being evaluated. Users need to have a atleast a certain number of ratings
user_count = rating_df.T.sum()
evaluation_list = list(user_count[user_count >= 15].index)

In [20]:
test_set = []

#get indices in where a user liked a movie
for user in evaluation_list:
    liked_index = [] 
    
    for movie in rating_df.columns:
        if rating_df.loc[user, movie] == 1:
            liked_index.append((user, movie))

    #get test data
    test_length = int(len(liked_index) * .3)
    random.shuffle(liked_index)
    test_set = test_set + liked_index[:test_length]

#get training data
train_data = rating_df.copy()

for row, col in test_set:
    train_data.loc[row, col] = 0

In [6]:
user_sets = []
for index, row in rating_df.iterrows():
    user_sets.append(set(row[row == 1].index))

In [7]:
movie_sets = []
for index, row in rating_df.T.iterrows():
    movie_sets.append(set(row[row == 1].index))

In [65]:
def common_neighbors(dataset, by='user'):
    #since values are either 1 or 0, the dot product of two users/movies 
    #is used to find the number of common neighbors 
    if by == 'user':
        common_matrix = np.dot(dataset, dataset.T)
    elif by == 'movie':
        common_matrix = np.dot(dataset.T, dataset)
    
    return pd.DataFrame(common_matrix)


In [66]:
def jacard_coefﬁcient(dataset, by='user'):
    if by == 'user':
        jc = common_neighbors(dataset, by='user')
        sum_matrix = dataset.T.sum()      
        
    elif by == 'movie':
        jc = common_neighbors(dataset, by='movie') 
        sum_matrix = dataset.sum()
        sum_matrix.index = range(len(sum_matrix))
        
    size = len(jc)
    ones_matrix = pd.DataFrame(np.ones((size,size)))
    
    #get size of the union of two users or movies
    union_count = (sum_matrix * ones_matrix).T + (sum_matrix * ones_matrix) - jc
    
    jc = jc/union_count
    
    jc = jc.fillna(0)
    
    return jc

In [67]:
def preferential_attachment(dataset, by='user'):
    if by == 'user':
        sum_matrix = dataset.T.sum()
    elif by == 'movie':
        sum_matrix = dataset.sum()
        sum_matrix.index = range(len(sum_matrix))
        
    ones_matrix = pd.DataFrame(np.ones((len(sum_matrix),len(sum_matrix))))
    pa = sum_matrix * ones_matrix * (sum_matrix * ones_matrix).T

    return pa

In [57]:
def Adamic_Adar(dataset, by='user'):
    if by == 'user':
        dataset=dataset.T
        
    element_size = dataset.sum()
    element_size[element_size == 0] = 1

    log_size = np.log(element_size)
    log_size[log_size == 0] = 1

    inv_log = 1/log_size

    weighted_elem = inv_log*rating_df
    sim_matrix = np.dot(rating_df.T, weighted_elem)
    
    return sim_matrix

In [163]:
def TSUISIMCF(train_data, user_sim, movie_sim, k_user, n_movie, users_evaluated):
    #list of recommeded movies to users
    recommeded_movies = []
    
    for user in users_evaluated:
        #get movies user liked
        movies = train_data.loc[user]
        liked_movies = list(movies[movies == 1].index)
       
        #get top k most similar users
        top_k = list(user_sim.loc[user].sort_values(ascending=False)[:k_user+1].index)
        
        if user in top_k:
            top_k.remove(user)
        else:
            top_k = top_k[:k_user]
        
        #get possible movie recommendations
        possible_rec = []
        
        for top_user in top_k:
            movies = train_data.loc[top_user]
            user_likes = list(movies[movies == 1].index)
                                
            for movie in user_likes:
                if (movie not in liked_movies) and (movie not in possible_rec):
                    possible_rec.append(movie)
        
        #get score of possible recommendations
        rec_scores = []
        
        for movie in possible_rec:
            score = movie_sim.loc[movie, liked_movies].max()
            rec_scores.append(score)
        
        #get top n score index
        top_index = np.argpartition(rec_scores,-n_movie)[-n_movie:]
        
        #get top movie recommendations
        for index in top_index:
            recommeded_movies.append((user, possible_rec[index]))
            
    return recommeded_movies

In [198]:
def get_metrics(test_set, recommend_list):
    true_positive = 0
    false_positive = 0
    false_negative = 0

    for item in recommend_list:
        if item in test_set:
            true_positive += 1
        else:
            false_positive += 1

    for item in test_set:
        if item not in recommend_list:
            false_negative+=1

    precision = true_positive/(true_positive + false_positive)
    recall = true_positive/(true_positive + false_negative)
    F1_score = 2*(precision*recall)/(precision + recall)

    print("Precision :", precision)
    print("Recall :", recall)
    print("F1 Score :", F1_score)
    
    return precision, recall, F1_score

In [68]:
#get similarities
cn_user = common_neighbors(train_data, by='user')
cn_movie = common_neighbors(train_data, by='movie')

jc_user = jacard_coefﬁcient(train_data, by='user')
jc_movie = jacard_coefﬁcient(train_data, by='movie')

pa_user = preferential_attachment(train_data, by='user')
pa_movie = preferential_attachment(train_data, by='movie')

In [164]:
recommend_list = TSUISIMCF(train_data, cn_user, cn_movie, 10, 5, evaluation_list)

In [200]:
results = get_metrics(test_set, recommend_list)

Precision : 0.26243194192377495
Recall : 0.039984514987280166
F1 Score : 0.06939578634160387


In [201]:
#cn_user and jc_movie
recommend_list = TSUISIMCF(train_data, cn_user, jc_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.08166969147005444
Recall : 0.012443313792722044
F1 Score : 0.021596199068963862


In [202]:
#cn_user and pa_movie
recommend_list = TSUISIMCF(train_data, cn_user, pa_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.22323049001814882
Recall : 0.034011724366773584
F1 Score : 0.05902961078850122


In [203]:
#jc_user and cn_movie
recommend_list = TSUISIMCF(train_data, jc_user, cn_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.2747731397459165
Recall : 0.041864837960402614
F1 Score : 0.07265921197869175


In [204]:
#jc_user and jc_movie
recommend_list = TSUISIMCF(train_data, jc_user, jc_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.16878402903811252
Recall : 0.025716181838292225
F1 Score : 0.04463214474252532


In [205]:
#jc_user and pa_movie
recommend_list = TSUISIMCF(train_data, jc_user, pa_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.24283121597096188
Recall : 0.036998119677026875
F1 Score : 0.06421269856505256


In [206]:
#pa_user and cn_movie
recommend_list = TSUISIMCF(train_data, pa_user, cn_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.2620689655172414
Recall : 0.03992921137042363
F1 Score : 0.06929980323463071


In [207]:
#pa_user and jc_movie
recommend_list = TSUISIMCF(train_data, pa_user, jc_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.08421052631578947
Recall : 0.01283043911071784
F1 Score : 0.02226808081777607


In [208]:
#pa_user and pa_movie
recommend_list = TSUISIMCF(train_data, pa_user, pa_movie, 10, 5, evaluation_list)
results = get_metrics(test_set, recommend_list)

Precision : 0.22250453720508168
Recall : 0.0339011171330605
F1 Score : 0.058837644574554884


In [16]:
rating_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7353,7354,7355,7356,7357,7358,7359,7360,7361,7362
0,1.0,0.0,1.0,0.0,1.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
4,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
604,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
605,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
606,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0
607,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,...,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0


In [48]:
element_size = rating_df.sum()
element_size[element_size == 0] = 1

log_size = np.log(element_size)
log_size[log_size == 0] = 1

inv_log = 1/log_size

weighted_elem = inv_log*rating_df

In [52]:
sim_matrix = np.dot(rating_df.T, weighted_elem)

In [55]:
sim_matrix.shape

(9724, 9724)

In [58]:
np.dot(rating_df.T, weighted_elem)

array([[32.31526871,  7.54097572,  4.46500584, ...,  0.        ,
         0.        ,  0.        ],
       [ 6.07135351, 14.83869416,  1.59464494, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.74190159,  1.21628641,  7.33536674, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  1.        ]])

In [60]:
np.dot(weighted_elem.T, rating_df)

(9724, 9724)

In [64]:
sim_matrix[25][4]

0.3898712452512801

In [80]:
import time

In [85]:
t = time.time()

for user_ind1 in range(100):
    for user_ind2 in range(user_ind1+1, 100):
        sim_score = sum(rating_df.T[user_ind1]*rating_df.T[user_ind2]*inv_log)
        
time.time() - t

145.39290404319763

In [79]:
sum(rating_df.T[10]*rating_df.T[13]*inv_log)

0.753274874708207

In [74]:
inv_log

0       0.195850
1       0.243257
2       0.318929
3       1.000000
4       0.389871
          ...   
9719    1.000000
9720    1.000000
9721    1.000000
9722    1.000000
9723    1.000000
Length: 9724, dtype: float64

In [86]:
le_input = np.array([
    [0, 0, 1],
    [0, 1, 0]
])

le_input

array([[0, 0, 1],
       [0, 1, 0]])

In [88]:
ra = np.array(rating_df)
a = np.einsum('ik,jk->ijk', le_input, le_input)

In [89]:
a.shape

(2, 2, 3)

In [93]:
ra = np.array(rating_df)
mult_ = np.einsum('ik,jk->ijk', rating_df, rating_df)

In [102]:
mult_[0][4]

(9724,)

In [104]:
sum(rating_df[0]*rating_df[4])

8.0

In [108]:
t = time.time()

for user_ind1 in range(610):
    for user_ind2 in range(user_ind1+1, 610):
        sim_score = sum(mult_[user_ind1][user_ind2]*inv_log)
        
time.time() - t

180.0629448890686

In [None]:
mult_*inv_log