In [50]:
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import train_test_split
import numpy as np

### **Reading the datasets**

In [2]:
movies_df = pd.read_csv("ml-20m/movies.csv")
ratings_df = pd.read_csv("ml-20m/ratings.csv")
tags_df = pd.read_csv("ml-20m/tags.csv")

In [3]:
movies_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676


In [5]:
tags_df.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078


### **Preprocessing**

In [6]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","",title)

In [7]:
def seperate_genres(genre):
    return " ".join(genre.split("|"))

In [8]:
movies_df['title'] = movies_df['title'].apply(clean_title)

In [9]:
movies_df['genres'] = movies_df['genres'].apply(seperate_genres)

In [10]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,Adventure Animation Children Comedy Fantasy
1,2,Jumanji 1995,Adventure Children Fantasy
2,3,Grumpier Old Men 1995,Comedy Romance


### **Popularity Based Recommendation Function**

In [11]:
#This function is only gonna return n number of popular movies when it is called.
#The logic is gonna be finding the highest rating movies, with highest count of ratings.

def popularity_based(top_n = 5, ratings_df = ratings_df, movies_df = movies_df,
                     purpose = "popularity", movie_name = "None",
                     return_df = False):
    #Mean rating for each movie
    mean_ratings = ratings_df[['movieId','rating']].groupby('movieId',as_index=False).apply("mean")
    
    #Rating counts
    count_ratings = ratings_df['movieId'].value_counts()

    #Merging two tables
    merged_ratings = pd.merge(mean_ratings, count_ratings, on ='movieId')

    #Merging with movies table
    final_table = pd.merge(movies_df,merged_ratings, on = 'movieId')
    
    #Returning the movies with more than 4.0 rating and has the highest rating count.
    popular_n = final_table[(final_table['rating'] >= 4)].sort_values('count',ascending=False).head(top_n)

    #Also returning high rating movies which have 
    #less then %10 rating count of the highest amount of times rated movie.
    max_rating_count = final_table['count'].max()
    less_known_n = final_table[(max_rating_count*0.01 <= final_table['count']) &(final_table['count'] <= max_rating_count*0.10)].sort_values('rating',ascending=False).head(int(top_n/2))

    if return_df == False:
        if purpose == "content":
            print(f"Top {top_n} movies similar with {movie_name}:")
        else:
            print(f"Top {top_n} movies in the world:")
        for i,item in enumerate(popular_n['title']):
            print(f"{i+1}: {item}")
        
        print(f"\nYou may also like these hidden gems:")
        i = 0
        for i,item in enumerate(less_known_n['title']):
            print(f"{i+1}: {item}")
        
    else:
        return popular_n

In [12]:
#Testing the function
popularity_based(4)

Top 4 movies in the world:
1: Pulp Fiction 1994
2: Forrest Gump 1994
3: Shawshank Redemption The 1994
4: Silence of the Lambs The 1991

You may also like these hidden gems:
1: Band of Brothers 2001
2: Sunset Blvd aka Sunset Boulevard 1950


In [13]:
popularity_based(4,return_df = True)

Unnamed: 0,movieId,title,genres,rating,count
293,296,Pulp Fiction 1994,Comedy Crime Drama Thriller,4.174231,67310
352,356,Forrest Gump 1994,Comedy Drama Romance War,4.029,66172
315,318,Shawshank Redemption The 1994,Crime Drama,4.44699,63366
587,593,Silence of the Lambs The 1991,Crime Horror Thriller,4.177057,63299


### **Content Filter**

In [14]:
#Content filtering recommendation function will take movie name as an input
#and will return most popular movies with similar genres.

In [15]:
vectorizer_name = TfidfVectorizer()
vectorizer_genre = TfidfVectorizer()

In [16]:
tfidf_name =  vectorizer_name.fit_transform(movies_df['title'])
tfidf_genre = vectorizer_genre.fit_transform(movies_df['genres'])

In [17]:
#This function takes a movie name as an input and 
#returns most similar movie name.
def title_searcher(input_word, top_n = 1):
    returned_vector = vectorizer_name.transform([input_word])
    cosine_similarities = cosine_similarity(returned_vector, tfidf_name)
    most_similar_indexes = np.argsort(cosine_similarities[0])[-1*top_n:][::-1]
    return movies_df.iloc[most_similar_indexes].reset_index(drop=True)

In [18]:
#Testing the function
title_searcher("potter harry phoenix",3)

Unnamed: 0,movieId,title,genres
0,54001,Harry Potter and the Order of the Phoenix 2007,Adventure Drama Fantasy IMAX
1,4896,Harry Potter and the Sorcerers Stone aka Harry...,Adventure Children Fantasy
2,131168,Phoenix 2014,Drama


In [19]:
#This function takes genre names as an input and 
#retrieves most related movies with the given genres
#with over %60 of similarity.
def genre_searcher(input_word):
    returned_vector = vectorizer_genre.transform([input_word])
    cosine_similarities = cosine_similarity(returned_vector,tfidf_genre)
    most_similar_indexes = np.where(cosine_similarities[0] > 0.6)[0]
    sorted_indexes = np.argsort(cosine_similarities[0])[::-1]
    return movies_df.iloc[sorted_indexes[most_similar_indexes]].reset_index(drop = True)

In [20]:
#Testing the function
genre_searcher("action drama comedy").head(10)

Unnamed: 0,movieId,title,genres
0,85401,Super 2010,Action Comedy Drama
1,438,Cowboy Way The 1994,Action Comedy Drama
2,2170,Wrongfully Accused 1998,Action Comedy
3,92698,Israeli Intelligence Hamosad Hasagur 2007,Action Comedy
4,99478,FDR American Badass 2012,Action Comedy
5,32551,Speedy 1928,Action Comedy
6,106064,Race 2008,Action Comedy
7,119029,Hit Squad 1976,Action Comedy
8,119035,Swindle 1977,Action Comedy
9,119039,The Gang That Sold America 1979,Action Comedy


In [21]:
#The main function
def content_based(input_word, top_n = 5, return_df = False):
    movie_name = title_searcher(input_word)
    genre = movie_name['genres'].iloc[0]
    similar_movies = genre_searcher(genre)

    #Now we we can use similar_movies dataframe on our very first function
    #to find out which one of these movies are the most popular ones
    
    return popularity_based(top_n = top_n, movies_df = similar_movies, 
                     purpose= "content", movie_name=movie_name['title'].iloc[0],
                     return_df = return_df)

In [51]:
#Testing the function
content_based("rings fellowship",5)

Top 5 movies similar with Lord of the Rings The Fellowship of the Ring The 2001:
1: Godfather The 1972
2: Good Will Hunting 1997
3: Taxi Driver 1976
4: Full Metal Jacket 1987
5: Citizen Kane 1941

You may also like these hidden gems:
1: Rififi Du rififi chez les hommes 1955
2: Matewan 1987


In [23]:
content_based("lord rings fellowship",5, return_df= True)

Unnamed: 0,movieId,title,genres,rating,count
256,858,Godfather The 1972,Crime Drama,4.364732,41355
283,1704,Good Will Hunting 1997,Drama Romance,4.032517,28324
259,111,Taxi Driver 1976,Crime Drama Thriller,4.110576,24481
292,1222,Full Metal Jacket 1987,Drama War,4.03318,21926
289,923,Citizen Kane 1941,Drama Mystery,4.130443,17774


### **Collaborative Filter**

In [24]:
#This function takes a movie name as an input and then returns related movies.
#Related movies will be decided by which users liked this movie and what other movies
#commonly liked by these users.

In [25]:
def collaborative(input_word, top_n = 5 , ratings_df = ratings_df, movies_df = movies_df,
                  return_df  = False):
    #First we collect the movie name and movie id.
    movie_name = title_searcher(input_word)['title'].iloc[0]
    movie_id = title_searcher(input_word)['movieId'].iloc[0]
    
    #Finding the users who liked this movie
    user_ids = ratings_df[(ratings_df['movieId'] == movie_id) & 
                          (ratings_df['rating'] >=  4.5)]['userId'].tolist()

    #Movies commonly liked by these users
    common_movies = ratings_df[(ratings_df['userId'].isin(user_ids)) & 
                               (ratings_df['rating'] >= 4.5) & 
                               (ratings_df['movieId'] != movie_id)]
    #Find out which one of them are liked by at least %25 of these users
    common_movies = pd.DataFrame(common_movies['movieId'].value_counts()).reset_index()
    common_movies = common_movies[common_movies['count'] > 0.25*len(user_ids)]
    
    #Merging tables to get movie names and movie ratings
    mean_ratings = ratings_df[['movieId','rating']].groupby('movieId',as_index=False).apply("mean")
    common_movies = pd.merge(common_movies,movies_df, on='movieId')
    common_movies = pd.merge(common_movies,mean_ratings, on='movieId')

    #Filtering only the top_n movies
    common_movies = common_movies.head(top_n)

    if return_df == False:
        print(f"People who liked {movie_name} liked these movies too.")
        for i,movie in enumerate(common_movies['title']):
            print(f"{i+1}: {movie}")
    else:
        return common_movies

    

In [52]:
#Testing the function
collaborative("dark knight returns")

People who liked Batman The Dark Knight Returns Part 1 2012 liked these movies too.
1: Dark Knight The 2008
2: Matrix The 1999
3: Dark Knight Rises The 2012
4: Batman The Dark Knight Returns Part 2 2013
5: Batman Begins 2005


### **Matrix Factorization**

In [27]:
#Loading and preprocessing data.
reader = Reader(rating_scale= (1,5))
size = 1.0
data = Dataset.load_from_df(ratings_df[['userId','movieId','rating']].head(int(len(ratings_df)*size)), reader)

#Train test splitting
train_set, test_set = train_test_split(data, test_size = 0.2, random_state= 42)

In [28]:
#Using the SVD (Singular Value Decomposition)
model = SVD(random_state= 42)
model.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd3a10bfe20>

In [29]:
#accuracy of the model
predictions = model.test(test_set)
mse = accuracy.mse(predictions)

MSE: 0.6186


In [36]:
#Score could be increased with some parameter tuning but it takes around 2 hours to compute one model :)
#I believe that the main idea here is creating the certain algorithms.

In [37]:
#Main function
def matrix_factorization(user_id = 1, top_n = 5, movies_df = movies_df):
    recommanedation_for_user = [(movie_id, model.predict(user_id,movie_id).est) for movie_id in movies_df['movieId']]
    recommanedation_for_user = pd.DataFrame(recommanedation_for_user, columns=['movieId','estimation']).sort_values(by='estimation', ascending= False).head(top_n)
    recommanedation_for_user = pd.merge(recommanedation_for_user,movies_df, on='movieId')
    
    print(f"Top {top_n} recommendation for User '{user_id}':")
    for i,movie in enumerate(recommanedation_for_user['title']):
        print(f"{i+1}: {movie}")

In [38]:
#Testing the function
matrix_factorization(user_id= 15)

Top 5 recommendation for User '15':
1: Day of the Doctor The 2013
2: Frozen Planet 2011
3: Come and See Idi i smotri 1985
4: Schindlers List 1993
5: City Lights 1931


In [59]:
matrix_factorization(user_id= 42, top_n = 7)

Top 7 recommendation for User '42':
1: Interstellar 2014
2: Forrest Gump 1994
3: Braveheart 1995
4: Matrix The 1999
5: Shawshank Redemption The 1994
6: Green Mile The 1999
7: Gladiator 2000


In [69]:
matrix_factorization(user_id = 1904, top_n= 5)

Top 5 recommendation for User '1904':
1: Lord of the Rings The Return of the King The 2003
2: Lord of the Rings The Fellowship of the Ring The 2001
3: Lord of the Rings The Two Towers The 2002
4: Star Wars Episode IV  A New Hope 1977
5: Star Wars Episode V  The Empire Strikes Back 1980


### **Hybrid Model**

In [40]:
#In the hybrid model we are gonna combine content based and collaborative recommendation algorithms.
def hybrid_model(input_word, top_n = 5, return_df = False):
    #Getting both collaborative and content based recommendations and taking top movies from each table.
    df_collaborative = collaborative(input_word = input_word, top_n= top_n, return_df= True)
    df_content_based = content_based(input_word = input_word, top_n= top_n, return_df= True)

    i = 1
    merged_df = []
    #Getting first top_n /2 rows of two tables while dropping the duplicates
    #until it reaches the length of top_n
    while len(merged_df) < top_n:
        merged_df = pd.concat([df_content_based.head(int(top_n/2 + i)),df_collaborative.head(int(top_n/2 +i))], axis = 0)
        i+=1
        merged_df = merged_df.drop_duplicates(keep = 'first')[['title', 'rating']]
    
    #Limiting the length of the final dataframe with top_n.
    merged_df = merged_df.head(top_n).reset_index(drop = True)
    movie_name = title_searcher(input_word)['title'].iloc[0]
    if return_df == False:
        print(f"Top {top_n} movie recommendation for {movie_name}:")
        for i,movie in enumerate(merged_df['title']):
            print(f"{i+1}: {movie}")
    else:
        return merged_df


In [42]:
hybrid_model("matrix", 5)

Top 5 movie recommendation for Matrix The 1999:
1: Forrest Gump 1994
2: Braveheart 1995
3: Schindlers List 1993
4: Shawshank Redemption The 1994
5: Fight Club 1999


In [49]:
hybrid_model("wolf of wall street", 5)

Top 5 movie recommendation for Wolf of Wall Street The 2013:
1: Star Wars Episode V  The Empire Strikes Back 1980
2: Saving Private Ryan 1998
3: Lord of the Rings The Return of the King The 2003
4: Shawshank Redemption The 1994
5: Inception 2010


In [58]:
hybrid_model("saw")

Top 5 movie recommendation for Saw V 2008:
1: Silence of the Lambs The 1991
2: Star Wars Episode VI  Return of the Jedi 1983
3: Fargo 1996
4: Saw 2004
5: Saw IV 2007
