In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from recoxplainer.evaluator import Splitter, Evaluator
from recoxplainer.config import cfg
from recoxplainer.data_reader import DataReader 
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
ratings_df = pd.read_csv('datasets/ml-1m/ratings.csv', sep=',', encoding='latin-1')
movies_df = pd.read_csv('datasets/ml-1m/movies.csv', sep=',',encoding='latin-1')

In [3]:
data = DataReader(**cfg.ml1m)
data.make_consecutive_ids_in_dataset()
data.binarize(binary_threshold=1)
sp = Splitter()
train, test = sp.split_leave_n_out(data, frac=0.1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [4]:
movies_df.head()

Unnamed: 0,origin_iid,itemId,title,genre
0,1193,0,One Flew Over the Cuckoo's Nest (1975),Drama
1,661,1,James and the Giant Peach (1996),Animation|Children's|Musical
2,914,2,My Fair Lady (1964),Musical|Romance
3,3408,3,Erin Brockovich (2000),Drama
4,2355,4,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [5]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies_df['year']

0       (1975)
1       (1996)
2       (1964)
3       (2000)
4       (1998)
         ...  
3701    (1998)
3702    (1998)
3703    (1999)
3704    (1973)
3705    (1998)
Name: year, Length: 3706, dtype: object

In [6]:
#Removing paranthesis
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
movies_df['year']

0       1975
1       1996
2       1964
3       2000
4       1998
        ... 
3701    1998
3702    1998
3703    1999
3704    1973
3705    1998
Name: year, Length: 3706, dtype: object

In [7]:
movies_df['title']

0            One Flew Over the Cuckoo's Nest (1975)
1                  James and the Giant Peach (1996)
2                               My Fair Lady (1964)
3                            Erin Brockovich (2000)
4                              Bug's Life, A (1998)
                           ...                     
3701                             Modulations (1998)
3702                          Broken Vessels (1998)
3703                              White Boys (1999)
3704                       One Little Indian (1973)
3705    Five Wives, Three Secretaries and Me (1998)
Name: title, Length: 3706, dtype: object

In [8]:
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

In [9]:
movies_df.head()

Unnamed: 0,origin_iid,itemId,title,genre,year
0,1193,0,One Flew Over the Cuckoo's Nest (1975),Drama,1975
1,661,1,James and the Giant Peach (1996),Animation|Children's|Musical,1996
2,914,2,My Fair Lady (1964),Musical|Romance,1964
3,3408,3,Erin Brockovich (2000),Drama,2000
4,2355,4,"Bug's Life, A (1998)",Animation|Children's|Comedy,1998


In [10]:
#Every genre is separated by a | so we simply have to call the split function on |
movies_df['genre'] = movies_df.genre.str.split('|')
movies_df.head()

Unnamed: 0,origin_iid,itemId,title,genre,year
0,1193,0,One Flew Over the Cuckoo's Nest (1975),[Drama],1975
1,661,1,James and the Giant Peach (1996),"[Animation, Children's, Musical]",1996
2,914,2,My Fair Lady (1964),"[Musical, Romance]",1964
3,3408,3,Erin Brockovich (2000),[Drama],2000
4,2355,4,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1998


In [11]:
#Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_df = movies_df.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_df.iterrows():
    for genre in row['genre']:
        moviesWithGenres_df.at[index, genre] = 1
        
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,origin_iid,itemId,title,genre,year,Drama,Animation,Children's,Musical,Romance,...,Fantasy,Sci-Fi,War,Thriller,Crime,Mystery,Western,Horror,Film-Noir,Documentary
0,1193,0,One Flew Over the Cuckoo's Nest (1975),[Drama],1975,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,661,1,James and the Giant Peach (1996),"[Animation, Children's, Musical]",1996,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,914,2,My Fair Lady (1964),"[Musical, Romance]",1964,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3408,3,Erin Brockovich (2000),[Drama],2000,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2355,4,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",1998,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
ratings_df.head()

Unnamed: 0,origin_uid,origin_iid,rating,timestamp,userId,itemId
0,1,1193,5,978300760,0,0
1,1,661,3,978302109,0,1
2,1,914,3,978301968,0,2
3,1,3408,4,978300275,0,3
4,1,2355,5,978824291,0,4


In [13]:
#Drop removes a specified row or column from a dataframe
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

Unnamed: 0,origin_uid,origin_iid,rating,userId,itemId
0,1,1193,5,0,0
1,1,661,3,0,1
2,1,914,3,0,2
3,1,3408,4,0,3
4,1,2355,5,0,4


# Getting 10 Recommendations for all users

In [14]:
usersList = ratings_df.groupby(by='userId')
ids = []
recommendationsList = []
for x in range (0,6040):
    user = usersList.get_group(x)
    inputTitle = movies_df[movies_df['itemId'].isin(user['itemId'].tolist())]
    user = pd.merge(user, inputTitle)
    user = user.drop('genre', 1).drop('year', 1)
    hotEnc = moviesWithGenres_df[moviesWithGenres_df['itemId'].isin(user['itemId'].tolist())]
    hotEnc = hotEnc.reset_index(drop=True)
    userGenre = hotEnc.drop('origin_iid', 1).drop('itemId', 1).drop('title', 1).drop('genre', 1).drop('year', 1)
    profile = userGenre.transpose().dot(user['rating'])
    allMoviesGenreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['itemId'])
    allMoviesGenreTable = allMoviesGenreTable.drop('origin_iid', 1).drop('itemId', 1).drop('title', 1).drop('genre', 1).drop('year', 1)
    recommendations_df = ((allMoviesGenreTable*profile).sum(axis=1))/(profile.sum())
    recommendations_df = recommendations_df.sort_values(ascending=False)
    recommendations = movies_df.loc[movies_df['itemId'].isin(recommendations_df.head(10).keys())]
    recommendations_array = np.asarray(recommendations)
    for i in range (len(recommendations_array)):
        ids.append(x)
        recommendationsList.append(recommendations_array[i][2])
allUsersRecommendations_df = pd.DataFrame(list(zip(ids, recommendationsList)), columns =['userId', 'movie'])
        

In [15]:
allUsersRecommendations_df.head(10)

Unnamed: 0,userId,movie
0,0,"Wizard of Oz, The (1939)"
1,0,Pocahontas (1995)
2,0,Hercules (1997)
3,0,Aladdin (1992)
4,0,"Little Mermaid, The (1989)"
5,0,Watership Down (1978)
6,0,"Jungle Book, The (1967)"
7,0,Lady and the Tramp (1955)
8,0,Space Jam (1996)
9,0,Steamboat Willie (1940)


# Explanations using KNN

# Getting similar items to all movies in a dict

In [16]:
knn_items_dict = {}
num_items = int(ratings_df[['itemId']].nunique())
num_users = int(ratings_df[['userId']].nunique())

In [17]:
#generating knn items dict for all movies
ds = np.zeros((num_items, num_users))
ds[ratings_df.itemId, ratings_df.userId] = ratings_df.rating
ds = sparse.csr_matrix(ds)
sim_matrix = cosine_similarity(ds)
min_val = sim_matrix.min() - 1

for i in range(num_items):
            sim_matrix[i, i] = min_val
            knn_to_item_i = (-sim_matrix[i, :]).argsort()[:10]
            knn_items_dict[i] = knn_to_item_i

# Getting user ratings in an array

In [18]:
usersList = ratings_df.groupby(by='userId')
pointer = 0
explanations = []
for x in range (0,6040):
    user_ratings = usersList.get_group(x).itemId.values #give it userID
    counter = 0
    while counter < 10:
        title = recommendationsList[pointer]
        #print(title)
        recommendedMovie = movies_df.loc[movies_df.title == title]
        #print(recommendedMovie)
        #rec_Origin_id = int(recommendedMovie.origin_iid)
        rec_item_id = int(recommendedMovie.itemId)
        #print(rec_item_id)
        sim_items = knn_items_dict[rec_item_id]
        explanation =  set(sim_items) & set(user_ratings)
        explanations.append(explanation)
        pointer = pointer + 1
        counter = counter + 1
allUsersRecommendations_df['explanations'] = explanations

In [19]:
allUsersRecommendations_df

Unnamed: 0,userId,movie,explanations
0,0,"Wizard of Oz, The (1939)","{8, 26, 44, 45}"
1,0,Pocahontas (1995),"{32, 33, 34, 35, 37, 10, 46}"
2,0,Hercules (1997),"{33, 34, 35, 37, 10, 16, 25}"
3,0,Aladdin (1992),"{4, 37, 40, 8, 10, 45}"
4,0,"Little Mermaid, The (1989)","{33, 37, 10, 45, 46}"
...,...,...,...
60395,6039,"Purple Rose of Cairo, The (1985)","{392, 170, 1962, 1292, 1651, 183, 797}"
60396,6039,Brassed Off (1996),"{176, 382}"
60397,6039,Twelfth Night (1996),"{417, 382, 221, 361}"
60398,6039,Best Men (1997),{}


In [20]:
allUsersRecommendations_df.query('userId == 897')

Unnamed: 0,userId,movie,explanations
8970,897,"Bodyguard, The (1992)",{}
8971,897,Starman (1984),{}
8972,897,Star Wars: Episode V - The Empire Strikes Back...,"{64, 44, 127}"
8973,897,"Devil's Own, The (1997)",{}
8974,897,Excalibur (1981),{124}
8975,897,First Knight (1995),{}
8976,897,Deep Impact (1998),{432}
8977,897,Breathless (1983),{}
8978,897,Runaway Train (1985),{}
8979,897,Diva (1981),{}


In [21]:
movies_df.query('itemId == 797')

Unnamed: 0,origin_iid,itemId,title,genre,year
797,1288,797,This Is Spinal Tap (1984),"[Comedy, Drama, Musical]",1984


In [22]:
movies_df.query('title == "Purple Rose of Cairo, The (1985)"')

Unnamed: 0,origin_iid,itemId,title,genre,year
1492,2065,1492,"Purple Rose of Cairo, The (1985)","[Comedy, Drama, Romance]",1985


In [23]:
ratings_df.query('userId == 6039').query('itemId == 392')

Unnamed: 0,origin_uid,origin_iid,rating,userId,itemId
999901,6040,3072,3,6039,392


# Model Fidelity Calculation

In [24]:
expl = allUsersRecommendations_df[[len(x) > 0 for x in allUsersRecommendations_df.explanations]]
fidelity = expl.groupby('userId')['movie'].count() / 10
modelFidelity = sum(fidelity)/6040

In [25]:
modelFidelity

0.5736754966887422