In [0]:
import numpy as np
import pandas as pd
from urllib.request import urlopen


In [0]:
# read pickled data files
#http://files.grouplens.org/datasets/movielens/ml-latest-README.html
path = 'https://focods.s3.us-east-2.amazonaws.com/movies.pcl'
movies = pd.read_pickle(urlopen(path), compression=None)


In [0]:
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [0]:
#Ratings: (takes about 20 seconds)
path = 'https://focods.s3.us-east-2.amazonaws.com/ratings.pcl'
ratings = pd.read_pickle(urlopen(path), compression=None)


In [0]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,2009-10-27 21:00:21
1,1,481,3.5,2009-10-27 21:04:16
2,1,1091,1.5,2009-10-27 21:04:31
3,1,1257,4.5,2009-10-27 21:04:20
4,1,1449,4.5,2009-10-27 21:01:04


In [0]:
#just the good ratings
good_ratings = ratings.query('rating > 3.0')

In [0]:
from scipy.sparse import csr_matrix, coo_matrix

In [0]:
cust_ratings = coo_matrix(([1]*len(good_ratings), (good_ratings.userId, good_ratings.movieId)),dtype=np.int32).tocsr()


In [0]:
cust_ratings

<283229x193887 sparse matrix of type '<class 'numpy.int32'>'
	with 17243755 stored elements in Compressed Sparse Row format>

In [0]:
# takes about 1 minute to run
co_occurrence = cust_ratings.T.dot(cust_ratings)

In [0]:
co_occurrence

<193887x193887 sparse matrix of type '<class 'numpy.int32'>'
	with 321428159 stored elements in Compressed Sparse Column format>

In [0]:
co_occurrence.max()

89472

In [0]:
def get_movieId(moviename):
  """
  return the movie id given the title/moviename
  """
  mvindex = movies.query('title == @moviename').index
  if len(mvindex) == 0:
    raise Exception(f'Invalid movie name: {moviename}')
  return mvindex[0] # just the first one in case there were multiples
  

In [0]:
get_movieId('jdkjkj')

Exception: ignored

In [0]:
get_movieId('Father of the Bride Part II (1995)')

5

In [0]:
fob = co_occurrence.getrow(5)

In [0]:
fob

<1x193887 sparse matrix of type '<class 'numpy.int64'>'
	with 17942 stored elements in Compressed Sparse Row format>

In [0]:
fob.sorted_indices()

<1x193887 sparse matrix of type '<class 'numpy.int64'>'
	with 17942 stored elements in Compressed Sparse Row format>

In [0]:
fob.shape

(1, 193887)

In [0]:
fob.max()

5069

In [0]:
#mname = 'Father of the Bride Part II (1995)'
mname = 'kjdkjkjf'
zz=movies.query('title == @mname')

In [0]:
def get_recommendations(moviename, nrec=5):
  """
  Returns at most nrec movie recommenations given an input movie name
  """
  
  movieId = get_movieId(moviename)
  
  #get the cohorts of this movie 
  #(number of times users rated each movie AND this movieId)
  cohorts = co_occurrence.getrow(movieId)
  
  #get the movieIds and # times reviewed for each cohort
  cohort_ids = cohorts.indices
  cohort_n   = cohorts.data
  
  # normalize by Jaccard measure (future)
  cohort_jaccard = cohort_n
  
  #sort by Jaccard measure
  cohort_sort = cohort_jaccard.argsort() #note, only sorts ascending

  #slicer for last nrec elements in reverse order
  last_n_slice = slice(-1, -1*(nrec+1), -1)
  
  #get the ids and jaccards of the last(highest) nrec elements
  last_n_ids =     cohort_ids[cohort_sort[last_n_slice]]
  last_n_jaccard = cohort_n[cohort_sort[last_n_slice]]
  
  #create a dataframe to contain the results
  ret_df = pd.DataFrame({'title': movies.loc[last_n_ids].title,
                         'Jaccard':last_n_jaccard,
                         'movieId':last_n_ids}).set_index('movieId')
  
  #ship it!
  return ret_df


In [0]:
mname = 'Father of the Bride Part II (1995)'
get_recommendations(mname, nrec=10)

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5,Father of the Bride Part II (1995),5069
1,Toy Story (1995),2392
356,Forrest Gump (1994),2235
780,Independence Day (a.k.a. ID4) (1996),2198
62,Mr. Holland's Opus (1995),1920
736,Twister (1996),1866
318,"Shawshank Redemption, The (1994)",1842
260,Star Wars: Episode IV - A New Hope (1977),1812
150,Apollo 13 (1995),1812
500,Mrs. Doubtfire (1993),1785


In [0]:
get_recommendations('Star Wars: Episode IV - A New Hope (1977)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
260,Star Wars: Episode IV - A New Hope (1977),67057
1196,Star Wars: Episode V - The Empire Strikes Back...,45583
1210,Star Wars: Episode VI - Return of the Jedi (1983),41191
2571,"Matrix, The (1999)",38553
1198,Raiders of the Lost Ark (Indiana Jones and the...,36778


In [0]:
fob=5 #index of father of the bride
fob_cohorts = co_occurrence.getrow(fob)

In [0]:
fob_cohort_ids = fob_co_horts.indices
fob_cohort_vals = fob_cohorts.data

In [0]:
#get the top 5 cohort ids
slicer = slice(-1,-6,-1) # slicer for the last 5 elements in reverse order
fob_cohort_sort = fob_cohort_vals.argsort()
top5_ids = fob_cohort_ids[fob_cohort_sort[slicer]]
top5_vals = fob_cohort_vals[fob_cohort_sort[slicer]]

In [0]:
top5_ids

array([  5,   1, 356, 780,  62], dtype=int32)

In [0]:
top5_vals

array([5069, 2392, 2235, 2198, 1920], dtype=int32)

In [0]:
movies.loc[top5_ids].title

movieId
5        Father of the Bride Part II (1995)
1                          Toy Story (1995)
356                     Forrest Gump (1994)
780    Independence Day (a.k.a. ID4) (1996)
62                Mr. Holland's Opus (1995)
Name: title, dtype: object

In [0]:
def find_movies(title_regexp):
  return movies.title[movies.title.str.match(title_regexp, case=False)]

In [0]:
find_movies('.*toy.*')

movieId
1                                          Toy Story (1995)
2017                                Babes in Toyland (1961)
2253                                            Toys (1992)
2480                  Dry Cleaning (Nettoyage à sec) (1997)
3086                                Babes in Toyland (1934)
3114                                     Toy Story 2 (1999)
4929                                        Toy, The (1982)
5843                                    Toy Soldiers (1991)
26033              Giants and Toys (Kyojin to gangu) (1958)
78062     Puppet Master vs. Demonic Toys (Puppet Master ...
78499                                    Toy Story 3 (2010)
80141                             Christmas Toy, The (1986)
81981                            Toy, The (Le jouet) (1976)
82698                                   Demonic Toys (1992)
83439                              Toys in the Attic (1963)
90252           Devil Times Five (a.k.a. Peopletoys) (1974)
93574     Resurrect Dead: The My

In [0]:
yy = find_movies('.*water.*')

In [0]:
yy

movieId
208                                       Waterworld (1995)
265       Like Water for Chocolate (Como agua para choco...
1565                                Head Above Water (1996)
1945                               On the Waterfront (1954)
2138                                  Watership Down (1978)
2335                                   Waterboy, The (1998)
3631                               It's in the Water (1998)
4988                              White Water Summer (1987)
5023                                 Waterdance, The (1992)
5338                         Full Moon in Blue Water (1988)
5351      Warm Water Under a Red Bridge (Akai hashi no s...
5808                            Weight of Water, The (2000)
5896                              New Waterford Girl (1999)
6858               Knife in the Water (Nóz w wodzie) (1962)
6972                           Watermelon Woman, The (1996)
7814                                        Waterloo (1970)
8019        Dark Water (Honogura

In [0]:
get_recommendations('The Shape of Water (2017)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
180031,The Shape of Water (2017),1052
79132,Inception (2010),802
296,Pulp Fiction (1994),782
58559,"Dark Knight, The (2008)",775
2959,Fight Club (1999),770


In [0]:
get_recommendations('Pulp Fiction (1994)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
296,Pulp Fiction (1994),76363
318,"Shawshank Redemption, The (1994)",48592
593,"Silence of the Lambs, The (1991)",46282
356,Forrest Gump (1994),40312
50,"Usual Suspects, The (1995)",38088


In [0]:
find_movies('friday')

movieId
69                                            Friday (1995)
1974                                 Friday the 13th (1980)
1975                          Friday the 13th Part 2 (1981)
1976                      Friday the 13th Part 3: 3D (1982)
1977      Friday the 13th Part IV: The Final Chapter (1984)
1978         Friday the 13th Part V: A New Beginning (1985)
1979            Friday the 13th Part VI: Jason Lives (1986)
1980         Friday the 13th Part VII: The New Blood (1988)
1981      Friday the 13th Part VIII: Jason Takes Manhatt...
5874                               Friday After Next (2002)
7880                    Friday Night (Vendredi Soir) (2002)
8937                             Friday Night Lights (2004)
66783                                Friday the 13th (2009)
120408                                 Friday Foster (1975)
161157                            Friday (Pyatnitsa) (2016)
Name: title, dtype: object

In [0]:
get_recommendations('Friday the 13th (1980)')

Unnamed: 0_level_0,title,Jaccard
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1974,Friday the 13th (1980),1840
593,"Silence of the Lambs, The (1991)",1180
1258,"Shining, The (1980)",1110
1214,Alien (1979),1072
260,Star Wars: Episode IV - A New Hope (1977),1070


In [0]:
# Fetch a single <1MB file using the raw GitHub URL.
!curl --remote-name \
     -H 'Accept: application/vnd.github.v3.raw' \
     --location https://api.github.com/repos/jakevdp/PythonDataScienceHandbook/contents/notebooks/data/california_cities.csv

In [0]:
diag = co_occurrence.diagonal()

In [0]:
diag

array([    0, 50986, 12348, ...,     0,     0,     1], dtype=int32)

In [0]:
len(diag)

193887

In [0]:
diag[5]

5069