# Calculation of the intra-list similarity

Input: 
+ Top10 recommendation as ordered list
+ matrix with cosine similarity between features

Calculation:
+ collect the cosine similarity of every pair of distinct movies in the list (the matrix is symmetric, so each pair is counted once)
+ calculate the mean of these pairwise cosine similarities

Output:
+ one value between 0 and 1 for the list

In [1]:
import pandas as pd
import numpy as np
from numpy import load



## Input

In [2]:
# dataframe with cosine similarities between each movie:
# one row per movie (identified by the "movieId" column) and one column per movieId
df_cosine_sim = pd.read_csv("../data/cos_sim_matrix.csv")

In [3]:
# preview the first rows of the similarity table
df_cosine_sim.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,175475,175569,175577,175585,175693,175705,175707,175743,175781,176051
0,1,1.0,0.0339,0.010186,0.010879,0.0,0.0,0.012142,0.046066,0.0,...,0.015958,0.0,0.016464,0.0,0.018334,0.014679,0.008266,0.025425,0.052926,0.0
1,2,0.0339,1.0,0.020544,0.0,0.010148,0.030094,0.0,0.009292,0.100504,...,0.0,0.042701,0.0,0.0,0.0,0.0,0.008336,0.0,0.0,0.0
2,3,0.010186,0.020544,1.0,0.013186,0.0,0.0,0.0,0.011167,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.010879,0.0,0.013186,1.0,0.026053,0.019316,0.031439,0.023855,0.0,...,0.020659,0.0,0.04263,0.01472,0.023736,0.019004,0.010701,0.032915,0.068519,0.013027
4,5,0.0,0.010148,0.0,0.026053,1.0,0.017865,0.014539,0.0,0.073432,...,0.0,0.0,0.019714,0.0,0.0,0.017576,0.0,0.0,0.0,0.024096


In [4]:
# preview the last rows of the similarity table
df_cosine_sim.tail()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,175475,175569,175577,175585,175693,175705,175707,175743,175781,176051
9660,175705,0.014679,0.0,0.0,0.019004,0.017576,0.013031,0.0,0.0,0.0,...,0.027875,0.0,0.0,0.0,0.0,1.0,0.014438,0.044412,0.0,0.0
9661,175707,0.008266,0.008336,0.0,0.010701,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.011184,0.0,0.014438,1.0,0.0,0.0,0.0
9662,175743,0.025425,0.0,0.0,0.032915,0.0,0.0,0.036736,0.0,0.0,...,0.04828,0.0,0.049814,0.0,0.05547,0.044412,0.0,1.0,0.160128,0.0
9663,175781,0.052926,0.0,0.0,0.068519,0.0,0.0,0.076472,0.0,0.0,...,0.100504,0.0,0.103695,0.0,0.11547,0.0,0.0,0.160128,1.0,0.0
9664,176051,0.0,0.0,0.0,0.013027,0.024096,0.008932,0.0,0.022063,0.036716,...,0.0,0.012674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# look up a single similarity value: the row is addressed by the integer
# row label 1, the column by the string label "1"
df_cosine_sim.at[1, "1"]

0.0338995224120614

### Set the index of the df_cosine_sim to the movieId

In [6]:
# set the index of the df_cosine_sim to the movieId, so that rows can be
# selected by movieId with .loc (the column labels are left unchanged)
df_cosine_sim = df_cosine_sim.set_index("movieId")
# show the last rows to check the new index
df_cosine_sim.tail()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,175475,175569,175577,175585,175693,175705,175707,175743,175781,176051
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
175705,0.014679,0.0,0.0,0.019004,0.017576,0.013031,0.0,0.0,0.0,0.0,...,0.027875,0.0,0.0,0.0,0.0,1.0,0.014438,0.044412,0.0,0.0
175707,0.008266,0.008336,0.0,0.010701,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.011184,0.0,0.014438,1.0,0.0,0.0,0.0
175743,0.025425,0.0,0.0,0.032915,0.0,0.0,0.036736,0.0,0.0,0.0,...,0.04828,0.0,0.049814,0.0,0.05547,0.044412,0.0,1.0,0.160128,0.0
175781,0.052926,0.0,0.0,0.068519,0.0,0.0,0.076472,0.0,0.0,0.0,...,0.100504,0.0,0.103695,0.0,0.11547,0.0,0.0,0.160128,1.0,0.0
176051,0.0,0.0,0.0,0.013027,0.024096,0.008932,0.0,0.022063,0.036716,0.02229,...,0.0,0.012674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Load data from the .npz file

In [7]:
# load dict of arrays; the object returned for an .npz archive keeps the file
# open, so use it as a context manager to close the handle after reading
with load('../data/cosine_sim.npz') as npz_file:
    # extract the first array (read fully into memory before the file closes)
    cosine_sim = npz_file['arr_0']
# print the array
print(cosine_sim)

[[1.         0.03389952 0.01018554 ... 0.02542464 0.05292561 0.        ]
 [0.03389952 1.         0.02054445 ... 0.         0.         0.        ]
 [0.01018554 0.02054445 1.         ... 0.         0.         0.        ]
 ...
 [0.02542464 0.         0.         ... 1.         0.16012815 0.        ]
 [0.05292561 0.         0.         ... 0.16012815 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [8]:
# dimensions of the similarity array as (rows, columns)
cosine_sim.shape

(9665, 9665)

### Recommendations
Each recommendation is given as a list of 10 movieIds.

In [9]:
# lists with recommendations (each holds 10 movieIds)
pred_list1 = [356,  318,  296,  593, 2571,  260,  480,  110,  589,  527]
pred_list2 = [3942, 147250, 115122, 86237, 1151, 146662, 114265, 146684, 3851, 5416]
# degenerate case: the same movieId repeated ten times
pred_list4 = [2348, 2348, 2348, 2348, 2348, 2348, 2348, 2348, 2348, 2348]

## Function to calculate the intra-list similarities

Pseudocode:

Task 1: get cos sim for each movie to each other movie in the recommendation list
    + e.g. movie 1 to movie 2,3,4,5,6,7,8,9,10
    + movie 2 to movie 1,3,4,5,6,7,8,9,10

Task 2: write all cos sim in an array

Task 3: calculate the mean of the array

Task 4: return the mean as intra-list similarity

In [21]:
def intra_list_similarity(reco_list, df_cosine_sim):
    """Return the intra-list similarity of a recommendation list.

    The intra-list similarity is the mean cosine similarity over all pairs
    of positions in ``reco_list``; each pair is counted once.

    Parameters
    ----------
    reco_list : sequence of int
        movieIds of the recommended items. They are looked up as given in
        the index of ``df_cosine_sim`` and, converted to ``str``, in its
        columns.
    df_cosine_sim : pandas.DataFrame
        Item-by-item cosine similarity table whose index holds the movieIds
        and whose column labels are the movieIds as strings.

    Returns
    -------
    float
        Mean of the pairwise cosine similarities.

    Raises
    ------
    ValueError
        If ``reco_list`` has fewer than two entries (no pair to average).
    KeyError
        If a movieId is missing from the index or the columns.
    """
    if len(reco_list) < 2:
        raise ValueError("reco_list must contain at least two items")

    # the column labels are strings, the index labels are the ids themselves
    column_labels = [str(movie_id) for movie_id in reco_list]

    # square sub-matrix: recommended items x recommended items
    similarity = df_cosine_sim.loc[reco_list, column_labels].to_numpy()

    # get indices for upper right triangle w/o diagonal, so that
    # self-similarities are excluded and each pair is counted once
    upper_right = np.triu_indices(similarity.shape[0], k=1)

    # calculate average similarity score of all recommended items in list
    ils = np.mean(similarity[upper_right])
    return ils

In [22]:
def intralist_similarity(list_of_movieIds, df_cos_sim=df_cosine_sim):
    """Return the mean pairwise cosine similarity of a list of movies.

    ``list_of_movieIds`` is used as given to select rows and, converted to
    strings, to select columns of ``df_cos_sim``. ``df_cos_sim`` defaults to
    the module-level ``df_cosine_sim`` as it exists when this function is
    defined. Only the entries strictly above the diagonal of the resulting
    square matrix enter the mean.
    """
    column_labels = list(map(str, list_of_movieIds))

    # restrict to the listed columns first, then to the listed rows
    pairwise = df_cos_sim[column_labels].loc[list_of_movieIds].to_numpy()

    # positions strictly above the diagonal
    above_diagonal = np.triu_indices(pairwise.shape[0], k=1)
    return np.mean(pairwise[above_diagonal])

In [23]:
# intra-list similarity of the first recommendation list
intralist_similarity(pred_list1)

0.018136220018403867