# Learn Sparse Arrays by Building a Movie Recommender


We will build a movie recommendation system to learn scipy's sparse array handling.

In [0]:
import numpy as np
import pandas as pd
from urllib.request import urlopen


## Data Files
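These pickled files were pre-processed from the MovieLens `ml-latest` dataset published by GroupLens (README: http://files.grouplens.org/datasets/movielens/ml-latest-README.html).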

In [0]:
# read pickled data files
#http://files.grouplens.org/datasets/movielens/ml-latest-README.html
path = 'https://focods.s3.us-east-2.amazonaws.com/movies.pcl'
# SECURITY NOTE: unpickling can execute arbitrary code -- only load pickles
# from a source you trust.
# The context manager closes the HTTP response as soon as the frame is read.
with urlopen(path) as response:
  movies = pd.read_pickle(response, compression=None)


In [3]:
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
len(movies)

58098

In [0]:
#Ratings: (takes about 20 seconds)
path = 'https://focods.s3.us-east-2.amazonaws.com/ratings.pcl'
# SECURITY NOTE: unpickling can execute arbitrary code -- only load pickles
# from a source you trust.
# The context manager closes the HTTP response as soon as the frame is read.
with urlopen(path) as response:
  ratings = pd.read_pickle(response, compression=None)


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,2009-10-27 21:00:21
1,1,481,3.5,2009-10-27 21:04:16
2,1,1091,1.5,2009-10-27 21:04:31
3,1,1257,4.5,2009-10-27 21:04:20
4,1,1449,4.5,2009-10-27 21:01:04


In [7]:
len(ratings)

27753444

In [0]:
# keep only the rows whose rating is strictly above 3.0 -- these count as "good"
good_ratings = ratings[ratings['rating'] > 3.0]

In [9]:
len(good_ratings)

17243755

## Make Users x Movies Sparse Array

In [0]:
from scipy.sparse import csr_matrix, coo_matrix

In [0]:
# Users x movies indicator matrix: entry (u, m) is 1 when user u gave movie m
# a good rating.  Raw userId / movieId values are used directly as row / column
# positions, so a movie's column number IS its movieId.
# np.ones allocates the data vector directly instead of first building a
# Python list with one element per rating.
# NOTE: if the same (userId, movieId) pair appeared more than once, the
# constructor would sum the duplicates and the entry would exceed 1.
cust_ratings = csr_matrix(
    (np.ones(len(good_ratings), dtype=np.int32),
     (good_ratings.userId, good_ratings.movieId)),
    dtype=np.int32)


In [12]:
cust_ratings

<283229x193887 sparse matrix of type '<class 'numpy.int32'>'
	with 17243755 stored elements in Compressed Sparse Row format>

## Create Co-occurrence Matrix

In [0]:
# (movies x users) times (users x movies) -> movies x movies co-occurrence counts
# takes about 1 minute to run
co_occurrence = cust_ratings.T @ cust_ratings

The shape of the `co_occurrence` matrix is movies x movies. **The cell `co_occurrence[i, j]` is the number of users who reviewed (and liked, i.e. rated above 3.0) both movie i AND movie j.**

In [14]:
co_occurrence

<193887x193887 sparse matrix of type '<class 'numpy.int32'>'
	with 321428159 stored elements in Compressed Sparse Column format>

In [15]:
# will need these for later for Jaccard scoring
# total reviews of each movie is main diagonal of co_occurrence matrix
# (entry [i, i] counts the users that have a stored rating for movie i; only
# the "good" ratings were put into the matrix, so these are good-rating counts)
# NOTE: this must be captured while the diagonal still holds the counts, i.e.
# before co_occurrence.setdiag(0) is applied to the matrix.
tot_reviews = co_occurrence.diagonal()
# one entry per column of the matrix, indexed by movieId
tot_reviews.shape


(193887,)

In [16]:
# zero out the main diagonal, so that recommendations aren't self-referential
# NOTE: this mutates co_occurrence in place; the per-movie totals must already
# have been read off the diagonal, because they cannot be recovered afterwards.
# scipy may emit a SparseEfficiencyWarning here, since writing the diagonal
# can change the sparsity structure.
co_occurrence.setdiag(0)
# just to be tidy, eliminate any zeros introduced above
# (the zeros written by setdiag are stored explicitly until they are removed)
co_occurrence.eliminate_zeros()

  self._set_arrayXarray(i, j, x)


## Jaccard Calculations

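Covered later: the Jaccard normalization is applied inside `get_recommendations` below. For movies $i$ and $j$ the score is $J(i, j) = \dfrac{n_{ij}}{n_i + n_j - n_{ij}}$, where $n_{ij}$ is `co_occurrence[i, j]` and $n_i$, $n_j$ are the per-movie totals stored in `tot_reviews`.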

## Some Helper Routines

In [0]:
def get_movieId(moviename):
  """
  return the movie id given the title/moviename

  The title must match an entry of ``movies.title`` exactly (including the
  year suffix, e.g. 'Toy Story (1995)').

  Parameters
  ----------
  moviename : str
    exact movie title to look up

  Returns
  -------
  the index label (movieId) of the first row of ``movies`` with that title

  Raises
  ------
  ValueError
    if no movie has that title (ValueError is a subclass of Exception, so
    callers that catch Exception keep working)
  """
  # index labels (movieIds) of every row whose title equals the request
  mvindex = movies.index[(movies['title'] == moviename).to_numpy()]
  if len(mvindex) == 0:
    raise ValueError(f'Invalid movie name: {moviename}')
  return mvindex[0] # just the first one in case there were multiples
  

In [0]:
#run this cell to test invalid input
#get_movieId('jdkjkj')

In [19]:
get_movieId('Father of the Bride Part II (1995)')

5

In [0]:
def find_movies(title_regexp):
  """
  Look up movie titles with a case-insensitive regular expression.

  The pattern is tested with ``Series.str.match``, which only matches from
  the START of each title; wrap the pattern in ``.*`` to search anywhere
  inside the title.

  Returns the matching titles as a Series indexed like ``movies``.
  """
  titles = movies.title
  hits = titles.str.match(title_regexp, case=False)
  return titles[hits]

In [21]:
find_movies('.*toy story.*')

movieId
1                                  Toy Story (1995)
3114                             Toy Story 2 (1999)
78499                            Toy Story 3 (2010)
106022                   Toy Story of Terror (2013)
115875    Toy Story Toons: Hawaiian Vacation (2011)
115879            Toy Story Toons: Small Fry (2011)
120468      Toy Story Toons: Partysaurus Rex (2012)
120474            Toy Story That Time Forgot (2014)
Name: title, dtype: object

## Step-by-Step Recommender

In [0]:
#get the movieId for the movie for which we'll make recommendations
fob = get_movieId('Father of the Bride Part II (1995)')

# what other movies did the people who watched FOB watch?
# this is the ROW in the co_occurrence matrix corresponding to the movieId
# (row/column positions equal movieIds because the users x movies matrix was
# built with movieId as its column index)
# getrow is documented to return a 1 x n CSR row vector, which is what makes
# its .indices the column numbers of the stored entries.
fob_cohorts = co_occurrence.getrow(fob)

In [0]:
# the movieIds of the cohorts
# (only movies with a stored, non-zero co-occurrence count appear here)
fob_cohort_ids = fob_cohorts.indices #i.e. the column indexes which are movieIds

# the number of times each cohort was watched (reviewed actually)
# .data is aligned element-for-element with .indices above
fob_cohort_vals = fob_cohorts.data

In [0]:
#get the top 5 cohort ids

# positions that would arrange the co-occurrence counts from smallest to largest
fob_cohort_sort = np.argsort(fob_cohort_vals)

# start at the final position and step backwards, stopping after 5 entries
slicer = slice(-1, -6, -1)


# pick out the movieIds and counts sitting at those 5 positions (largest first)
top5_ids  = np.take(fob_cohort_ids,  fob_cohort_sort[slicer])
top5_vals = np.take(fob_cohort_vals, fob_cohort_sort[slicer])

In [27]:
top5_ids

array([  1, 356, 780,  62, 736], dtype=int32)

In [28]:
top5_vals

array([2392, 2235, 2198, 1920, 1866], dtype=int32)

In [29]:
# look up the titles assoc'd with the top 5 ids
movies.loc[top5_ids].title

movieId
1                          Toy Story (1995)
356                     Forrest Gump (1994)
780    Independence Day (a.k.a. ID4) (1996)
62                Mr. Holland's Opus (1995)
736                          Twister (1996)
Name: title, dtype: object

## Function to do it All

In [0]:
def get_recommendations(moviename, nrec=5, Jaccard=True):
  """
  Returns at most nrec movie recommendations given an input movie name
  using Jaccard normalization (default) or just number of co-occurrences

  Parameters
  ----------
  moviename : str
    exact movie title; resolved to a movieId with get_movieId
  nrec : int
    maximum number of recommendations to return, must be >= 0
  Jaccard : bool
    True  -> rank by n_ij / (n_i + n_j - n_ij), column named 'Jaccard'
    False -> rank by the raw co-occurrence count n_ij, column named 'Score'

  Returns
  -------
  DataFrame indexed by movieId with the columns 'title' and the score
  column, ordered from best to worst.  Fewer than nrec rows come back when
  the movie has fewer co-occurring movies.

  Raises
  ------
  ValueError
    if nrec is negative (a negative value would otherwise wrap around in the
    slice and silently return the wrong rows)
  Whatever get_movieId raises when the title is unknown.
  """
  if nrec < 0:
    raise ValueError(f'nrec must be non-negative, got {nrec}')

  # which movie?
  movieId = get_movieId(moviename)

  #get the cohorts of this movie
  #(number of times users rated each movie AND this movieId)
  cohorts = co_occurrence.getrow(movieId)

  #get the movieIds and # times reviewed for each cohort
  cohort_ids = cohorts.indices
  cohort_n   = cohorts.data

  if Jaccard:
    # normalize by Jaccard measure: intersection / union of the two audiences
    nreviews = tot_reviews[movieId]
    scores = cohort_n/(nreviews+tot_reviews[cohort_ids]-cohort_n)
    colname = 'Jaccard'
  else:
    # no normalization, just use the review counts
    scores = cohort_n
    colname = 'Score'

  # argsort() only sorts ascending, so reverse it and keep the first nrec
  # positions: these are the highest scores, best first
  top = scores.argsort()[::-1][:nrec]
  top_ids = cohort_ids[top]

  #create a dataframe to contain the results
  ret_df = pd.DataFrame({'title': movies.loc[top_ids].title,
                         colname: scores[top],
                         'movieId': top_ids}).set_index('movieId')

  #ship it!
  return ret_df


In [31]:
mname = 'Father of the Bride Part II (1995)'
get_recommendations(mname, nrec=10)

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3,Grumpier Old Men (1995),0.138736
7,Sabrina (1995),0.116772
62,Mr. Holland's Opus (1995),0.10659
736,Twister (1996),0.097233
494,Executive Decision (1996),0.091037
317,"Santa Clause, The (1994)",0.089347
788,"Nutty Professor, The (1996)",0.087102
376,"River Wild, The (1994)",0.085064
802,Phenomenon (1996),0.084343
6944,Father of the Bride (1991),0.08417


In [32]:
mname = 'Father of the Bride Part II (1995)'
get_recommendations(mname, nrec=10, Jaccard=False)

Unnamed: 0_level_0,title,Score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),2392
356,Forrest Gump (1994),2235
780,Independence Day (a.k.a. ID4) (1996),2198
62,Mr. Holland's Opus (1995),1920
736,Twister (1996),1866
318,"Shawshank Redemption, The (1994)",1842
150,Apollo 13 (1995),1812
260,Star Wars: Episode IV - A New Hope (1977),1812
500,Mrs. Doubtfire (1993),1785
733,"Rock, The (1996)",1742


In [33]:
get_recommendations('Star Wars: Episode IV - A New Hope (1977)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1196,Star Wars: Episode V - The Empire Strikes Back...,0.598249
1210,Star Wars: Episode VI - Return of the Jedi (1983),0.533106
1198,Raiders of the Lost Ark (Indiana Jones and the...,0.438439
2571,"Matrix, The (1999)",0.38548
1291,Indiana Jones and the Last Crusade (1989),0.333655


## Some Notable Recommendations

In [34]:
get_recommendations('The Shape of Water (2017)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
177615,Lady Bird (2017),0.249284
178061,"I, Tonya (2017)",0.222472
177593,"Three Billboards Outside Ebbing, Missouri (2017)",0.220504
179819,Star Wars: The Last Jedi (2017),0.191381
122906,Black Panther (2017),0.186115


In [35]:
get_recommendations('Pulp Fiction (1994)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
593,"Silence of the Lambs, The (1991)",0.44182
318,"Shawshank Redemption, The (1994)",0.414455
50,"Usual Suspects, The (1995)",0.406862
47,Seven (a.k.a. Se7en) (1995),0.384332
2959,Fight Club (1999),0.381126


In [36]:
find_movies('friday')

movieId
69                                            Friday (1995)
1974                                 Friday the 13th (1980)
1975                          Friday the 13th Part 2 (1981)
1976                      Friday the 13th Part 3: 3D (1982)
1977      Friday the 13th Part IV: The Final Chapter (1984)
1978         Friday the 13th Part V: A New Beginning (1985)
1979            Friday the 13th Part VI: Jason Lives (1986)
1980         Friday the 13th Part VII: The New Blood (1988)
1981      Friday the 13th Part VIII: Jason Takes Manhatt...
5874                               Friday After Next (2002)
7880                    Friday Night (Vendredi Soir) (2002)
8937                             Friday Night Lights (2004)
66783                                Friday the 13th (2009)
120408                                 Friday Foster (1975)
161157                            Friday (Pyatnitsa) (2016)
Name: title, dtype: object

In [37]:
get_recommendations('Friday the 13th (2009)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
70994,Halloween II (2009),0.152709
65601,My Bloody Valentine 3-D (2009),0.148718
54785,Halloween (2007),0.133787
77798,"Nightmare on Elm Street, A (2010)",0.128788
51937,"Hills Have Eyes II, The (2007)",0.123506


In [38]:
get_recommendations('Friday the 13th (2009)', Jaccard=False)

Unnamed: 0_level_0,title,Score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
58559,"Dark Knight, The (2008)",101
593,"Silence of the Lambs, The (1991)",95
2959,Fight Club (1999),90
2571,"Matrix, The (1999)",88
296,Pulp Fiction (1994),86


In [39]:
find_movies('Repo Man')

movieId
1965    Repo Man (1984)
Name: title, dtype: object

In [40]:
get_recommendations('Repo Man (1984)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1077,Sleeper (1973),0.142954
2968,Time Bandits (1981),0.135888
2289,"Player, The (1992)",0.132812
2064,Roger & Me (1989),0.132747
1199,Brazil (1985),0.130374
