In [1]:
import pandas as pd
import numpy as np

from recoxplainer.evaluator import Splitter, Evaluator
from recoxplainer.config import cfg
from recoxplainer.data_reader import DataReader 
from recoxplainer.explain import KNNPostHocExplainer
from recoxplainer.evaluator import ExplanationEvaluator

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings = pd.read_csv('datasets/ml-1m/ratings.csv', sep=',', encoding='latin-1')
users = pd.read_csv('datasets/ml-1m/users.csv', sep=',', encoding='latin-1')
movies = pd.read_csv('datasets/ml-1m/movies.csv', sep=',',encoding='latin-1')

In [3]:
data = DataReader(**cfg.ml1m)
data.make_consecutive_ids_in_dataset()
data.binarize(binary_threshold=1)
sp = Splitter()
train, test = sp.split_leave_n_out(data, frac=0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
users.head()

Unnamed: 0,origin_uid,userId,gender,age,occupation,zipcode
0,1,0,F,1,10,48067
1,2,1,M,56,16,70072
2,3,2,M,25,15,55117
3,4,3,M,45,7,2460
4,5,4,M,25,20,55455


In [5]:
movies.head()

Unnamed: 0,origin_iid,itemId,title,genre
0,1193,0,One Flew Over the Cuckoo's Nest (1975),Drama
1,661,1,James and the Giant Peach (1996),Animation|Children's|Musical
2,914,2,My Fair Lady (1964),Musical|Romance
3,3408,3,Erin Brockovich (2000),Drama
4,2355,4,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [6]:
ratings.head()

Unnamed: 0,origin_uid,origin_iid,rating,timestamp,userId,itemId
0,1,1193,5,978300760,0,0
1,1,661,3,978302109,0,1
2,1,914,3,978301968,0,2
3,1,3408,4,978300275,0,3
4,1,2355,5,978824291,0,4


In [7]:
genre_labels = set()
for s in movies['genre'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))

# Function that counts the number of times each of the genre keywords appear
def count_word(dataset, ref_col, census):
    keyword_count = dict()
    for s in census: 
        keyword_count[s] = 0
    for census_keywords in dataset[ref_col].str.split('|'):        
        if type(census_keywords) == float and pd.isnull(census_keywords): 
            continue        
        for s in [s for s in census_keywords if s in census]: 
            if pd.notnull(s): 
                keyword_count[s] += 1
    #______________________________________________________________________
    # convert the dictionary in a list to sort the keywords by frequency
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

# Calling this function gives access to a list of genre keywords which are sorted by decreasing frequency
keyword_occurences, dum = count_word(movies, 'genre', genre_labels)
keyword_occurences[:5]

[['Drama', 1493],
 ['Comedy', 1163],
 ['Action', 495],
 ['Thriller', 485],
 ['Romance', 459]]

In [8]:
# Break up the big genre string into a string array
movies['genre'] = movies['genre'].str.split('|')
# Convert genres to string value
movies['genre'] = movies['genre'].fillna("").astype('str')

In [9]:
movies.head()

Unnamed: 0,origin_iid,itemId,title,genre
0,1193,0,One Flew Over the Cuckoo's Nest (1975),['Drama']
1,661,1,James and the Giant Peach (1996),"['Animation', ""Children's"", 'Musical']"
2,914,2,My Fair Lady (1964),"['Musical', 'Romance']"
3,3408,3,Erin Brockovich (2000),['Drama']
4,2355,4,"Bug's Life, A (1998)","['Animation', ""Children's"", 'Comedy']"


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genre']) #genres are the corpus
tfidf_matrix

<3706x127 sparse matrix of type '<class 'numpy.float64'>'
	with 9314 stored elements in Compressed Sparse Row format>

In [11]:
tfidf_matrix.shape

(3706, 127)

In [12]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[:4, :4]

array([[1.        , 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.22773644, 0.        ],
       [0.        , 0.22773644, 1.        , 0.        ],
       [1.        , 0.        , 0.        , 1.        ]])

In [13]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [14]:
indices

title
One Flew Over the Cuckoo's Nest (1975)            0
James and the Giant Peach (1996)                  1
My Fair Lady (1964)                               2
Erin Brockovich (2000)                            3
Bug's Life, A (1998)                              4
                                               ... 
Modulations (1998)                             3701
Broken Vessels (1998)                          3702
White Boys (1999)                              3703
One Little Indian (1973)                       3704
Five Wives, Three Secretaries and Me (1998)    3705
Length: 3706, dtype: int64

In [15]:
#list(enumerate(cosine_sim[27]))
#sim_score = list(enumerate(cosine_sim[27])) #Titanic
#sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
#sim_score = sim_score[1:21]
#movie_indices = [i[0] for i in sim_score]

In [16]:
#titles.iloc[movie_indices]

In [17]:
# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [18]:
#result = genre_recommendations('Titanic (1997)').head(10)

In [19]:
#result

In [20]:
#recommendations_df = pd.DataFrame(result)

In [21]:
#recommendations_df

In [22]:
#recommendations_array = np.asarray(recommendations)

In [23]:
#recommendations_array 

# Getting user ratings in an array

In [24]:
#usersList = ratings.groupby(by='userId')

In [25]:
#user_ratings = usersList.get_group(7).itemId.values #give it userID

In [26]:
#user_ratings

In [27]:
#len(user_ratings)

In [28]:
#user_ratings_df = ratings.query("userId == 7")

In [29]:
#user_ratings_df

# Getting similar items

In [30]:
#knn_items_dict = {}

In [31]:
#num_items = int(ratings[['itemId']].nunique())
#num_users = int(ratings[['userId']].nunique())

In [32]:
#generating knn items dict for all movies
#ds = np.zeros((num_items, num_users))
#ds[ratings.itemId, ratings.userId] = ratings.rating
#ds = sparse.csr_matrix(ds)
#sim_matrix = cosine_similarity(ds)
#min_val = sim_matrix.min() - 1

#for i in range(num_items):
#            sim_matrix[i, i] = min_val
#            knn_to_item_i = (-sim_matrix[i, :]).argsort()[:10]
#            knn_items_dict[i] = knn_to_item_i

# looping to get explnations for all recommendations

In [33]:
#explanations = []
#for x in recommendations_array:
#    title = x[0]
#    recommendedMovie = movies.loc[movies.title == title]
#    #print(recommendedMovie)
#    #print(recommendedMovie.origin_iid)
#    rec_Origin_id = int(recommendedMovie.origin_iid)
#    sim_items = knn_items_dict[rec_Origin_id]
#    #print(sim_items)
#    explanation =  set(sim_items) & set(user_ratings)
#    #print(explanation)
#    explanations.append(explanation)
#print(explanations)
#recommendations_df['explanations'] = explanations

# Getting similar items

In [34]:
#knn_items_dict = {}

In [35]:
#num_items = int(ratings[['itemId']].nunique())
#num_users = int(ratings[['userId']].nunique())

In [36]:
#generating knn items dict for all movies
#ds = np.zeros((num_items, num_users))
#ds[ratings.itemId, ratings.userId] = ratings.rating
#ds = sparse.csr_matrix(ds)
#sim_matrix = cosine_similarity(ds)
#min_val = sim_matrix.min() - 1

#for i in range(num_items):
#            sim_matrix[i, i] = min_val
#            knn_to_item_i = (-sim_matrix[i, :]).argsort()[:10]
#            knn_items_dict[i] = knn_to_item_i

In [37]:
#recommendations_df

# Evaluating model fidelity per one user

In [38]:
#expl = recommendations_df[[len(x) > 0 for x in recommendations_df.explanations]]
#fidelity = expl['title'].count()/10

In [39]:
#fidelity

# Setting up test example and getting id's

In [40]:
#movies.query("title == 'Titanic (1997)'")

In [41]:
#usersWhoLikeTitanic = ratings.query("itemId == 27")

In [42]:
#usersWhoLikeTitanic = usersWhoLikeTitanic.query("rating == 5")

In [43]:
#usersWhoLikeTitanic #users who watched titanic and gave it 5 stars (389)

# Choose origin uid = 8 and userId = 7 and origin iid = 1721 and itemId = 27

In [44]:
#movies.query("title == 'Toy Story (1995)'")

In [45]:
#toyStoryRatings = ratings.query("origin_iid == 1")

In [46]:
#toyStoryRatings = toyStoryRatings.query("rating == 5")

In [47]:
#toyStoryRatings

# Choose origin uid = 1 and userId = 0 and origin iid = 1 and itemId = 40

# Automation for all users

In [48]:
moviesUsedForRecommendations = []
usersList = ratings.groupby(by='userId')

In [49]:
#for loop to fill moviesUsedForRecommendations with the movie that will be used for each user to recommend with
for x in range(0, 6040):
    allUserRatings = pd.DataFrame(usersList.get_group(x))
    allUserRatings = allUserRatings.sort_values(['rating'], ascending=[False])
#    allUserRatings = allUserRatings.query("rating >= 4")
    itemUsedForRecommendation = allUserRatings.head(1)
    moviesUsedForRecommendations.append(itemUsedForRecommendation)

In [50]:
#moviesUsedForRecommendations[3597]

In [51]:
#movieName = movies.loc[2560].title
#print(movieName)
#result = genre_recommendations(movieName).head(10)
#print(result)

In [52]:
#loop on moviesUsedForRecommendations and recommend for each user 10 movies & explain
ids = []
recommendations = []
for x in range (0,6040):
#    print(x)
    userId = x
    movieId = int(moviesUsedForRecommendations[x].itemId)
#    print(movieId)
    movieName = movies.loc[movieId].title
#    print(movieName)
    result = genre_recommendations(movieName).head(10)
#    print(result)
    recommendations_df = pd.DataFrame(result)
#    print(recommendations_df)
    recommendations_array = np.asarray(recommendations_df)
#    print(recommendations_array)
    for i in range (len(recommendations_array)):
        ids.append(x)
        recommendations.append(recommendations_array[i])
allUsersRecommendations_df = pd.DataFrame(list(zip(ids, recommendations)),
            columns =['userId', 'movie'])

In [53]:
allUsersRecommendations_df

Unnamed: 0,userId,movie
0,0,[Erin Brockovich (2000)]
1,0,[Miracle on 34th Street (1947)]
2,0,[Awakenings (1990)]
3,0,[Ponette (1996)]
4,0,"[Girl, Interrupted (1999)]"
...,...,...
60395,6039,"[Last Days of Disco, The (1998)]"
60396,6039,[Apollo 13 (1995)]
60397,6039,[Rain Man (1988)]
60398,6039,[Driving Miss Daisy (1989)]


In [54]:
recommendations[0][0]

'Erin Brockovich (2000)'

# Getting similar items

In [55]:
knn_items_dict = {}

In [56]:
num_items = int(ratings[['itemId']].nunique())
num_users = int(ratings[['userId']].nunique())

In [57]:
num_items

3706

In [58]:
#generating knn items dict for all movies
ds = np.zeros((num_items, num_users))
ds[ratings.itemId, ratings.userId] = ratings.rating
ds = sparse.csr_matrix(ds)
sim_matrix = cosine_similarity(ds)
min_val = sim_matrix.min() - 1

for i in range(num_items):
            sim_matrix[i, i] = min_val
            knn_to_item_i = (-sim_matrix[i, :]).argsort()[:10]
            knn_items_dict[i] = knn_to_item_i

In [59]:
sim_matrix.shape

(3706, 3706)

In [60]:
movies.query('title == "Erin Brockovich (2000)"')

Unnamed: 0,origin_iid,itemId,title,genre
3,3408,3,Erin Brockovich (2000),['Drama']


In [61]:
knn_items_dict[3]

array([ 68, 104,  38, 134, 305, 541, 444, 499, 406, 519])

In [62]:
movies.query('itemId == 38')

Unnamed: 0,origin_iid,itemId,title,genre
38,2762,38,"Sixth Sense, The (1999)",['Thriller']


In [63]:
ratings.query('userId == 0')

Unnamed: 0,origin_uid,origin_iid,rating,timestamp,userId,itemId
0,1,1193,5,978300760,0,0
1,1,661,3,978302109,0,1
2,1,914,3,978301968,0,2
3,1,3408,4,978300275,0,3
4,1,2355,5,978824291,0,4
5,1,1197,3,978302268,0,5
6,1,1287,5,978302039,0,6
7,1,2804,5,978300719,0,7
8,1,594,4,978302268,0,8
9,1,919,4,978301368,0,9


# Getting user ratings in an array

In [64]:
usersList = ratings.groupby(by='userId')
pointer = 0
explanations = []
for x in range (0,6040):
    user_ratings = usersList.get_group(x).itemId.values #give it userID
    counter = 0
    while counter < 10:
        title = recommendations[pointer][0]
        recommendedMovie = movies.loc[movies.title == title]
        #print(recommendedMovie)
        #rec_Origin_id = int(recommendedMovie.origin_iid)
        rec_item_id = int(recommendedMovie.itemId)
        #print(rec_item_id)
        sim_items = knn_items_dict[rec_item_id]
        explanation =  set(sim_items) & set(user_ratings)
        explanations.append(explanation)
        pointer = pointer + 1
        counter = counter + 1
allUsersRecommendations_df['explanations'] = explanations

In [65]:
allUsersRecommendations_df

Unnamed: 0,userId,movie,explanations
0,0,[Erin Brockovich (2000)],{38}
1,0,[Miracle on 34th Street (1947)],"{8, 9, 2, 42}"
2,0,[Awakenings (1990)],"{52, 39}"
3,0,[Ponette (1996)],{}
4,0,"[Girl, Interrupted (1999)]",{}
...,...,...,...
60395,6039,"[Last Days of Disco, The (1998)]","{373, 2491, 284, 365}"
60396,6039,[Apollo 13 (1995)],"{128, 164, 167, 520, 280}"
60397,6039,[Rain Man (1988)],"{547, 105, 42, 52, 22, 26}"
60398,6039,[Driving Miss Daisy (1989)],"{41, 428, 142, 1586, 52, 1049}"


# Model Fidelity Calculation

In [66]:
expl = allUsersRecommendations_df[[len(x) > 0 for x in allUsersRecommendations_df.explanations]]
fidelity = expl.groupby('userId')['movie'].count() / 10
modelFidelity = sum(fidelity)/6040

In [67]:
modelFidelity

0.6083940397351044