# Data exploration of the MovieLens 100k dataset
Dataset: https://grouplens.org/datasets/movielens/

In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns

# Directory containing the MovieLens CSV files
dir_path = './ml-latest-small/'

# Full path of each CSV that is read below (tags.csv is not loaded here)
movies_csv, ratings_csv, links_csv = (
    dir_path + 'movies.csv',
    dir_path + 'ratings.csv',
    dir_path + 'links.csv',
)

# One DataFrame per CSV, read in the order movies -> ratings -> links
df_movie, df_rating, df_link = map(pd.read_csv, (movies_csv, ratings_csv, links_csv))

In [7]:
# Preview the first five rows of the movie table
df_movie.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first five rows of the rating table
df_rating.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# Prepare the dataset for training

In [10]:
from UTILS.CFData import CFData
from UTILS.CFModel import CFModel
from surprise import SVD

from IPython.display import display, HTML, Image

In [11]:
# Load rating data into the CFData class.
# Ratings are reduced to three columns and renamed to userID / itemID / rating.
df_data = df_rating[['userId', 'movieId', 'rating']].rename(
    index=str,
    columns={'userId': 'userID', 'movieId': 'itemID', 'rating': 'rating'},
)

# Item id <-> movie title lookup table, renamed to itemID / itemName.
df_id_name_table = df_movie[['movieId', 'title']].rename(
    index=str,
    columns={'movieId': 'itemID', 'title': 'itemName'},
)

# NOTE(review): (0.5, 5) is presumably the rating scale expected by CFData -- confirm in UTILS.CFData
data_movie = CFData(df_data, None, df_id_name_table, (0.5, 5))

# Model-based collaborative filtering by Funk-SVD

In [12]:
# Keyword arguments handed to CFModel together with the SVD algorithm class
svd_params = {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 30}
model_svd = CFModel(SVD, **svd_params)

# Train on the training set exposed by the CFData object
model_svd.fit(data_movie.trainset)

# Not read by any of the cells shown here
test_ratio = None

### Utility function

In [13]:
from UTILS.TMDBPoster import TMDBPoster
class ShowRecommendedMovies:
    """
    Helpers that display poster images for a selected movie and its top-k recommendations.

    All methods are classmethods, so no instance is required. They rely on names defined
    elsewhere in the notebook: `display` and `HTML` (IPython.display), `TMDBPoster`, and the
    `df_link` DataFrame (columns 'movieId' and 'imdbId').
    """

    @classmethod
    def get_image_from_url_list(cls, images, header=None, width="100%"):
        """
        Read in a list of image sources and display the images side by side in an HTML table
        :param images: List. List of image src strings, one table cell per image
        :param header: List of str, or a single str. Optional heading cell(s) shown in a row above the images
        :param width: Str or int. CSS width of the table; an int is interpreted as pixels
        :return: None. Only displays the images
        """
        if isinstance(width, int):
            width = "{}px".format(width)
        # A bare string would otherwise be iterated character by character,
        # producing one heading cell per letter.
        if isinstance(header, str):
            header = [header]

        html = ["<table style='width:{}'><tr>".format(width)]
        if header is not None:
            html.extend("<th>{}</th>".format(h) for h in header)
            html.append("</tr><tr>")
        html.extend("<td><img src='{}' /></td>".format(image) for image in images)
        html.append("</tr></table>")
        display(HTML(''.join(html)))

    @classmethod
    def _get_poster_url(cls, poster_client, movie_id):
        """
        Return the first poster URL reported by `poster_client` for one movie
        :param poster_client: TMDBPoster. Object whose get_poster_urls() is queried
        :param movie_id: Movie id as stored in the 'movieId' column of df_link
        :return: First element of the result of get_poster_urls()
        """
        # The movie id is translated to its imdb id, formatted as 'tt' + zero-padded 7 digits.
        imdb_id = int(df_link[df_link['movieId'] == movie_id].iloc[0]['imdbId'])
        return poster_client.get_poster_urls('tt{:07}'.format(imdb_id))[0]

    @classmethod
    def show_recommended_movies(cls, movie_name, cf_data_movie, cf_model, k):
        """
        Read in a movie name. Use the CFData data and trained CFModel to show top-k recommended movie posters
        Poster image URLs are obtained through TMDBPoster
        :param movie_name: Str. Input movie name
        :param cf_data_movie: CFData. Movie data the model is trained on. Used to do movie name and id conversion
        :param cf_model: CFModel. Trained model. Used to output top-k similar movies
        :param k: Int. Recommend k movies
        :return: None. Only displays text and images
        """
        # Convert the user-selected movie name to a movie id, then obtain the top-k similar movies
        movie_item_id = cf_data_movie.convert_name_to_id(movie_name)
        movie_neighbor_name = [cf_data_movie.convert_id_to_name(i)
                               for i in cf_model.get_similar_item(movie_item_id, k)]
        # The names are mapped back through convert_name_to_id; those ids are used to index df_link
        movie_neighbor_id = [cf_data_movie.convert_name_to_id(name) for name in movie_neighbor_name]

        # Poster of the selected movie
        get_poster = TMDBPoster()
        input_poster_url = cls._get_poster_url(get_poster, movie_item_id)

        # Single-argument print(...) behaves the same on Python 2 and Python 3
        print("Movie you select is '%s'" % (movie_name,))
        cls.get_image_from_url_list([input_poster_url], width="10%")

        # Posters of the top-k recommended movies
        poster_urls = [cls._get_poster_url(get_poster, i) for i in movie_neighbor_id]
        print("Based on '%s', we recommend %d movies below:" % (movie_name, k))
        cls.get_image_from_url_list(poster_urls)

def get_most_rated_movie(df_movie_in, df_rating_in, n_output):
    """
    Return the titles of the movies that received the most ratings
    :param df_movie_in: DataFrame. Movie table with at least 'movieId' and 'title' columns
    :param df_rating_in: DataFrame. Rating table with at least 'movieId' and 'rating' columns
    :param n_output: Int. Maximum number of titles to return
    :return: pandas Index. Up to n_output titles, ordered from most rated to least rated
    """
    # Only the arguments are read, so the function works on any pair of frames.
    # Movies sharing the same title are counted together because the grouping key is 'title'.
    rating_count_by_title = (
        pd.merge(df_movie_in, df_rating_in, on='movieId', how='inner')
        .groupby('title')['rating']
        .count()
    )
    return rating_count_by_title.sort_values(ascending=False).index[:n_output]

# Movie recommendation based on a user-selected movie

In [24]:
import ipywidgets as widgets
from IPython.display import Javascript, display
# Create a top-50 most rated movie list
movie_list_top_50 = get_most_rated_movie(df_movie, df_rating, 50)

# Single-argument print(...) behaves the same on Python 2 and Python 3
print("Please select a movie and let us recommend top 10 movies to you.")

# Pre-select 'Forrest Gump (1994)' when it is among the options; otherwise fall back to the
# most rated title so Dropdown is not handed a value that is missing from its options.
default_movie = 'Forrest Gump (1994)'
if default_movie not in movie_list_top_50:
    default_movie = movie_list_top_50[0]
selected_movie_name = widgets.Dropdown(options=movie_list_top_50, value=default_movie, description='Movie:')


def run_next_cell(ev):
    """
    Button callback that asks the notebook front end to execute cells via Javascript
    The script calls IPython.notebook.execute_cell_range from the selected cell index + 1
    to the selected cell index + 2 (presumably just the cell right below -- confirm against the front end API)
    :param ev: Argument supplied by the button's on_click machinery. Unused
    :return: None
    """
    display(Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, \
                      IPython.notebook.get_selected_index()+2)'))


button = widgets.Button(description="Top-10 movies for you")
button.on_click(run_next_cell)
# Last expression of the cell: renders the dropdown above the button
widgets.VBox([selected_movie_name, button])

Please select a movie and let us recommend top 10 movies to you.


VkJveChjaGlsZHJlbj0oRHJvcGRvd24oZGVzY3JpcHRpb249dSdNb3ZpZTonLCBvcHRpb25zPSgnRm9ycmVzdCBHdW1wICgxOTk0KScsICdQdWxwIEZpY3Rpb24gKDE5OTQpJywgJ1NoYXdzaGHigKY=


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# Read the title currently chosen in the dropdown and show its top-10 recommendations
movie_name = selected_movie_name.value
ShowRecommendedMovies.show_recommended_movies(
    movie_name=movie_name,
    cf_data_movie=data_movie,
    cf_model=model_svd,
    k=10,
)

Movie you select is 'Men in Black (a.k.a. MIB) (1997)'


Based on 'Men in Black (a.k.a. MIB) (1997)', we recommend 10 movies below:
