# Movie Recommendation AI

In [None]:
import numpy as np
import pandas as pd
import re


In [None]:
from google.colab import drive 
# Mount Google Drive inside the Colab runtime at /content/drive so the dataset
# files stored there can be read like local files.
drive.mount ('/content/drive')
# List the shared project folder to confirm the mount worked and the ml-25m data folder is present.
!ls '/content/drive/My Drive/AI Group/'  

Mounted at /content/drive
 Ashesi_IntroToAI_PosterTemplate.pptx
'Copy of RECOMMENDATION.ipynb'
'Copy of video1319237603.mp4'
 ml-25m
'MOVIE RECOMMENDATION SYSTEM.pptx'
'MOVIE RECOMMENDATION SYSTEM - TECHNICAL REPORT.gdoc'
'MOVIE RECOMMENDATION SYSTEM - TECHNICAL REPORT.pdf'
'Presentation Notes.gdoc'
'PRESENTATION SCRIPT.gdoc'
'Project Proposal.docx'
'Project Update.gdoc'
'Recommendation AI.pptx'
 RECOMMENDATION.ipynb
'Untitled presentation.gslides'
 video1319237603.mp4


In [None]:
# Load the movie catalogue from the ml-25m folder (presumably the MovieLens 25M
# dataset); the displayed output shows the columns movieId, title and genres.
movies = pd.read_csv('/content/drive/My Drive/AI Group/ml-25m/movies.csv')

In [None]:
# Bare expression: renders the catalogue as this cell's output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
# Load the user ratings table from the same ml-25m folder; it is what the
# recommendation cells further down filter on (userId, movieId, rating).
ratings = pd.read_csv('/content/drive/My Drive/AI Group/ml-25m/ratings.csv')

# Cleaning movie titles with regex

In [None]:
def Movie_title(title):
    """Return `title` with every character removed that is not an ASCII
    letter, a digit or a space.

    Punctuation such as parentheses, commas, colons and hyphens is dropped
    without inserting a replacement, so "Ant-Man (2015)" becomes
    "AntMan 2015".
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [None]:
# Store a cleaned copy of every title (punctuation stripped by Movie_title) in
# a new "Movie_title" column; the TF-IDF index below is built from this column.
movies["Movie_title"] = movies["title"].apply(Movie_title)

In [None]:
# Bare expression: renders the catalogue again, now including the Movie_title column.
movies

Unnamed: 0,movieId,title,genres,Movie_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


# Creating a tfidf matrix


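Matrix factorization algorithms work by decomposing the user-item interaction matrix into the product of two lower-dimensional rectangular matrices.

Matrix factorization is a collaborative filtering method for finding relationships between items and users. Latent features (the associations captured by the user and movie matrices) are learned and then used to measure similarity and make predictions based on both item and user entities.

Note that the cell below does not factorize a ratings matrix: it builds a TF-IDF matrix over the cleaned movie titles (single words and two-word sequences), which is used only to match a typed search term against the titles.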

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,2): index single words as well as two-word sequences, so a
# query can match on word pairs and not only on individual words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix,
# one row per entry of movies["Movie_title"].
tfidf = vectorizer.fit_transform(movies["Movie_title"])

# Creating a search function

This function computes the similarity between a search term that we enter and every movie title in the dataset, and returns the closest matches. We use cosine similarity, which is implemented in scikit-learn.



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search(title, top_n=5):
    """Return the movies whose titles best match a free-text query.

    The query is cleaned with ``Movie_title`` (the same cleaning applied to the
    indexed titles), vectorised with the fitted ``vectorizer`` and compared
    with every row of ``tfidf`` using cosine similarity.

    Args:
        title: Raw search text entered by the user.
        top_n: Maximum number of matches to return. Defaults to 5.

    Returns:
        The rows of ``movies`` for the ``top_n`` most similar titles, ordered
        from most to least similar (row 0 is the best match). Fewer rows are
        returned when the catalogue holds fewer than ``top_n`` titles.
    """
    title = Movie_title(title)
    # Turn the title that the user has entered into a TF-IDF vector that uses
    # the same vocabulary as the indexed titles.
    query_vec = vectorizer.transform([title])
    # Similarity between the search term and every title in the dataset.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more matches than there are titles; argpartition raises
    # when kth is out of bounds.
    top_n = max(0, min(top_n, similarity.size))
    if top_n == 0:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores; it does not order them. Rank the candidates explicitly
    # so that the first returned row really is the best match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    return movies.iloc[candidates[ranking]]

# Building a search box

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(
    description='Movie Title:',
    disabled=False
)
# Output area that holds the current search results.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the text box content changes.

    Args:
        data: Change notification passed by the widget; ``data["new"]`` is the
            current text of the box.
    """
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        # A blank query contains no terms, so every title would score 0 and the
        # rows shown would be arbitrary. Leave the output area empty instead.
        if not title.strip():
            return
        display(search(title))

# Invoke on_type only for changes to the widget's `value` trait.
movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [None]:
# NOTE(review): the same ratings.csv was already read into `ratings` in an
# earlier cell; this line reads the identical file again.
ratings = pd.read_csv('/content/drive/My Drive/AI Group/ml-25m/ratings.csv')

In [None]:
# Show the last ten rows to check the columns: userId, movieId, rating, timestamp.
ratings.tail(10)


Unnamed: 0,userId,movieId,rating,timestamp
25000085,162541,8983,4.5,1240953211
25000086,162541,31658,4.5,1240953287
25000087,162541,33794,4.0,1240951792
25000088,162541,41566,4.0,1240952749
25000089,162541,45517,4.5,1240953353
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434
25000094,162541,63876,5.0,1240952515


In [None]:
# Worked example: the movie to base recommendations on. In the results table
# further down, id 89745 is "Avengers, The (2012)".
movie_id = 89745
# Catalogue row of that movie.
# NOTE(review): `movie` is not referenced by any later cell visible in this notebook.
movie = movies[movies["movieId"] == movie_id]

In [None]:
# Ids of the users who rated the chosen movie above 4; they are treated as
# users whose taste is similar to ours.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [None]:
# movieId of every rating above 4 given by those similar users: one entry per
# rating, so a movie appears once for each similar user who liked it
# (presumably a user rates a given movie at most once; verify against the data).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]


In [None]:
# Share of the similar users who liked each movie.
# NOTE(review): this cell overwrites its own input, so running it a second time
# without re-running the previous cell would count the shares instead of the ids.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only the movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [None]:
# Every rating above 4, from any user, for one of the candidate movies kept above.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]


In [None]:
# Share of those users (everyone who liked at least one candidate movie) who
# liked each candidate; this is the baseline popularity to compare against.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())


In [None]:
# Put the two shares side by side, aligned on movieId (the index of both Series).
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
# First column: share among similar users; second column: share among all users.
rec_percentages.columns = ["similar", "all"]

In [None]:
# Score = how many times more popular a movie is among similar users than among
# users in general; a high score means the movie is specific to this audience
# rather than simply popular with everyone.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [None]:
# Best-scoring movies first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [None]:
# Attach title and genres to the ten best-scoring movies by matching the
# movieId index of rec_percentages against the movieId column of movies.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,Movie_title
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
20513,0.103711,0.005289,19.610199,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,Thor The Dark World 2013
25058,0.241054,0.012367,19.49177,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
19678,0.216534,0.012119,17.867419,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
16725,0.215043,0.012052,17.843074,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
16312,0.175447,0.010142,17.299824,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
21348,0.287608,0.016737,17.183667,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
25071,0.214049,0.012856,16.649399,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
25061,0.136017,0.008573,15.865628,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
14628,0.242876,0.015517,15.651921,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,Iron Man 2 2010


In [None]:

def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who liked `movie_id`.

    Steps:
      1. Find the users who rated `movie_id` above `min_rating`.
      2. For each movie, compute the share of those users who also rated it
         above `min_rating`, and keep movies whose share exceeds `min_share`.
      3. For the kept movies, compute the share among all users who liked at
         least one of them.
      4. Score each movie as (share among similar users) / (share among all
         users), so movies that are merely popular with everyone rank lower.

    Reads the module-level `ratings` (userId, movieId, rating) and `movies`
    (movieId, title, genres) DataFrames.

    Args:
        movie_id: movieId of the movie to base the recommendations on.
        min_rating: A rating counts as "liked" when it is strictly greater
            than this value. Defaults to 4.
        min_share: Minimum share of similar users (exclusive) that must have
            liked a movie for it to be considered. Defaults to 0.10.
        top_n: Maximum number of recommendations to return. Defaults to 10.

    Returns:
        DataFrame with columns "score", "title" and "genres", ordered by
        descending score. It is empty when no user rated `movie_id` above
        `min_rating`. The movie itself is normally part of the result, since
        every similar user liked it.
    """
    # Filter the ratings table once; the same "liked" condition is needed for
    # the similar users, their other movies, and the all-users baseline.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: there is nothing to base recommendations on,
        # and the shares below would be divided by zero.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who liked each movie.
    liked_by_similar = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = liked_by_similar.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Baseline: share of all users (who liked any candidate) who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # Align both shares on movieId and rank by their ratio.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [81]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title to base the recommendations on, and the output area
# that shows them.
movie_name_input = widgets.Text(description='Movie title:', disabled=False)
recommendation_list = widgets.Output()

# NOTE: this redefines the name `on_type` used by the search-box cell above.
# That widget keeps working because it holds a reference to its own callback.
def on_type(data):
    """Show recommendations for the best title match of the typed text.

    Args:
        data: Change notification passed by the widget; ``data["new"]`` is the
            current text of the box.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Wait until more than five characters have been typed before running
        # the search and the recommendation step.
        if len(query) <= 5:
            return
        best_match = search(query).iloc[0]
        display(find_similar_movies(best_match["movieId"]))

# Invoke on_type only for changes to the widget's `value` trait.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)


Text(value='', description='Movie title:')

Output()