In [132]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import os
import pandas as pd
import numpy as np
import re
import ipywidgets as widgets
from ipywidgets import *
from IPython.display import display
from zipfile import ZipFile
from fuzzywuzzy import fuzz
from itertools import chain
from io import BytesIO
from urllib.request import urlopen

In [133]:
# Read data from dataset online directly
DATASET_URL = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'


def _read_csv_member(archive, member):
    """Read one CSV member of an open ZipFile into a DataFrame.

    The member's file handle is closed as soon as pandas has parsed it.
    """
    with archive.open(member) as handle:
        return pd.read_csv(handle)


# The whole archive is buffered in memory. The context managers make sure the
# HTTP response and the archive are closed once the frames are loaded, instead
# of leaving them open for the lifetime of the kernel.
with urlopen(DATASET_URL) as resp:
    archive_bytes = BytesIO(resp.read())

with ZipFile(archive_bytes) as zf:
    ratings = _read_csv_member(zf, 'ml-25m/ratings.csv')
    movies = _read_csv_member(zf, 'ml-25m/movies.csv')
    tags = _read_csv_member(zf, 'ml-25m/tags.csv')
    links = _read_csv_member(zf, 'ml-25m/links.csv')
    genome_scores = _read_csv_member(zf, 'ml-25m/genome-scores.csv')

In [134]:
def clean_title(title):
    """Return `title` with every character removed except ASCII letters, digits and spaces."""
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [135]:
# Normalised copy of each title, used as the text corpus for the title search.
movies["clean_title"] = movies["title"].map(clean_title)

In [136]:
# Fit a TF-IDF model over unigrams and bigrams of the cleaned titles and keep
# the resulting document-term matrix for similarity look-ups.
title_corpus = movies["clean_title"]
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(title_corpus)

In [137]:
# Search engine that returns closest title
def search(title, top_n=5):
    """Return the rows of `movies` whose titles are most similar to `title`.

    The query is cleaned with `clean_title`, vectorised with the fitted
    `vectorizer`, and compared against `tfidf` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies`, ordered from most to least similar,
        so row 0 is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more matches than there are titles (argpartition would raise).
    k = min(top_n, similarity.shape[0])
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last k entries are the k largest;
    # their relative order is undefined, so rank them explicitly.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[ranked]

In [138]:
# Displays title search engine as widget
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh `movie_list` with search results for the text box's new value.

    `data` is the change dictionary passed by `observe`; only its "new" entry
    is read. The previous output is always cleared, and a search is run only
    when the text is longer than three characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 3:
            return
        display(search(typed))


movie_input.observe(on_type, names='value')

# Standalone search widget is left disabled.
#display(movie_input, movie_list)

In [139]:
# Seed movie for the exploratory cells that follow. `movie_id` has to be
# defined here: without it this cell raises NameError on a fresh kernel.
# 89745 is the id used by the commented-out example call of
# find_similar_movies further down (noted there as The Avengers).
movie_id = 89745
movie = movies[movies["movieId"] == movie_id]

In [140]:
# Collaborative filtering based on other users' ratings
liked_mask = ratings["rating"] > 3

# Users who rated the seed movie above 3.
similar_users = ratings.loc[(ratings["movieId"] == movie_id) & liked_mask, "userId"].unique()

# Fraction of those users who also rated each movie above 3; keep movies
# liked by more than 10% of them.
similar_user_recs = ratings.loc[ratings["userId"].isin(similar_users) & liked_mask, "movieId"]
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .10]

# Same fraction computed over every user who rated any of those movies above 3.
all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & liked_mask]

all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [141]:
# Produces recommendation score with users who have similar taste in movies
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

# score = share among similar users divided by share among all users,
# sorted with the highest score first.
rec_percentages = rec_percentages.assign(
    score=rec_percentages["similar"] / rec_percentages["all"]
).sort_values("score", ascending=False)
#rec_percentages.head(10).merge(movies, left_index=True, right_on = "movieId")

In [142]:
# Genre-based filtering additions
# Create unique list of genres
# Placeholder MovieLens uses for movies without genre information.
# NOTE(review): label taken from the MovieLens README - confirm against the data.
NO_GENRE_LABEL = "(no genres listed)"

# Each `genres` value is a '|'-separated list; collect the distinct labels.
# The placeholder is removed by value rather than by position, so a real genre
# is never dropped if the placeholder is absent or does not sort first.
unique_genres = sorted(
    {
        genre
        for genre_string in movies["genres"].unique().tolist()
        for genre in genre_string.split('|')
    }
    - {NO_GENRE_LABEL}
)
unique_genres.append("All Genres")

In [143]:
# Year filtering additions
# Parse the year from a movie title
def find_year(title):
    """Return the four-digit year found in parentheses in `title`, or None.

    Every parenthesised group is inspected, not just the first, so titles
    with an alternate title or another leading group still yield their year,
    e.g. "(500) Days of Summer (2009)" -> "2009". When several four-digit
    groups are present the last one is returned.

    Parameters
    ----------
    title : str
        Movie title, e.g. "Toy Story (1995)".

    Returns
    -------
    str or None
        The year as a string, or None when `title` is not a string or
        contains no "(dddd)" group.
    """
    if not isinstance(title, str):
        return None
    # [0-9] instead of \d keeps the match to ASCII digits.
    years = re.findall(r"\(([0-9]{4})\)", title)
    return years[-1] if years else None


In [144]:
# Release year parsed from each title (None where find_year finds none).
movies["year"] = movies["title"].map(find_year)

In [145]:
# Create unique list of years
# dropna() discards titles without a parsed year. Unlike list.remove(None) it
# does not raise when every title has a year.
unique_years = movies["year"].dropna().unique().tolist()
unique_years.append("All Years")

In [146]:
# Movie recommender with genre and year filters
def find_similar_movies(movie_id, genre_filter, year_filter):
    """Recommend up to ten movies favoured by users who liked `movie_id`.

    Users who rated `movie_id` above 4 are treated as "similar users". For
    every movie that more than 1% of them rated above 3, the score is that
    share divided by the share of all users (of those movies) who rated the
    movie above 2. Higher scores mean the movie is liked more by the similar
    users than by users in general.

    Parameters
    ----------
    movie_id : int
        `movieId` of the seed movie.
    genre_filter : str
        Genre that must appear in a movie's `genres` value, or "All Genres"
        for no genre filtering.
    year_filter : str
        Year that must appear in a movie's `year` value, or "All Years" for
        no year filtering.

    Returns
    -------
    pandas.DataFrame
        At most ten rows, highest score first, with columns
        `movie_id`, `score`, `title`, `genres`, `year`.
    """
    # Users who rated the seed movie above 4.
    seed_fans = (ratings["movieId"] == movie_id) & (ratings["rating"] > 4)
    similar_users = ratings.loc[seed_fans, "userId"].unique()

    # Share of similar users who rated each movie above 3; keep movies above 1%.
    liked_by_similar = ratings["userId"].isin(similar_users) & (ratings["rating"] > 3)
    similar_user_recs = ratings.loc[liked_by_similar, "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .01]

    # Share of all users (who rated any candidate above 2) per candidate movie.
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 2)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Genre and year filters
    # Keep `movieId` explicitly so the returned id really is the movie id and
    # not whatever index the merge result happens to carry.
    recs = rec_percentages.merge(movies, left_index=True, right_on="movieId")[
        ["movieId", "score", "title", "genres", "year"]
    ]

    if genre_filter != "All Genres":
        # Literal substring match: the filter value is a label, not a pattern.
        recs = recs[recs["genres"].str.contains(genre_filter, regex=False)]
        # '|' must be replaced literally; as a regex it is an empty alternation
        # that matches between every character. assign() also avoids writing
        # to a filtered slice. The separator is only prettified when a genre
        # filter is active.
        recs = recs.assign(genres=recs["genres"].str.replace('|', ', ', regex=False))

    if year_filter != "All Years":
        recs = recs[recs["year"].str.contains(year_filter, na=False, regex=False)]

    output = (
        recs.head(10)
        .rename(columns={"movieId": "movie_id"})
        .reset_index(drop=True)
    )

    return output

# find_similar_movies(89745, 'Action', '2012') # Test the function with The Avengers and an added genre filter

In [147]:
# Input controls for the recommender: two drop-downs with alphabetically
# sorted choices and a free-text title box.
genre_options = sorted(unique_genres)
year_options = sorted(unique_years)

w_genre = widgets.Dropdown(options=genre_options, description='Genre:', value='All Genres')

w_year = widgets.Dropdown(options=year_options, description='Release Year:', value='All Years')

w_title = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

def movie_recommender(genre, year, title):
    """Display recommendations for the first title match, filtered by genre and year.

    Nothing is displayed unless `title` is longer than three characters.
    The seed movie is the first row returned by `search(title)`.
    """
    if len(title) <= 3:
        return
    first_match = search(title).iloc[0]
    recommendations = find_similar_movies(first_match["movieId"], genre, year)
    display(recommendations)

# Bind the three controls to movie_recommender; it re-runs whenever one changes.
w_box = widgets.interactive(
    movie_recommender,
    genre=w_genre,
    year=w_year,
    title=w_title,
)

display(w_box)

interactive(children=(Dropdown(description='Genre:', index=2, options=('Action', 'Adventure', 'All Genres', 'A…