In [1]:
#Dan Clayton
#DSC-630
#Exercise 10.2--Recommender System

In [2]:
#Import some libraries that we will need
import pandas as pd
import numpy as np
from sys import exit
#Using fuzzywuzzy for fuzzy matching when looking for movie titles
from fuzzywuzzy import fuzz

In [3]:
#Import the data sets
df_tags = pd.read_csv('tags.csv')
df_ratings = pd.read_csv('ratings.csv')
df_movies = pd.read_csv('movies.csv')

In [4]:
#Check out the data that we are working with
print('movies:\n',df_movies.head(5))
print('\nratings:\n',df_ratings.head(5))
print('\ntags:\n',df_tags.head(5))

movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

ratings:
    userId  movieId  rating   timestamp
0       1      296     5.0  1147880044
1       1      306     3.5  1147868817
2       1      307     5.0  1147868828
3       1      665     5.0  1147878820
4       1      899     3.5  1147868510

tags:
    userId  movieId               tag   timestamp
0       3      260           classic  1439472355
1       3      260            sci-fi  1439472

In [5]:
#Function to populate the movie years as numbers--populates 0 where movie year is missing
def pop_year(last4):
    if last4.isnumeric():
        return int(last4)
    else:
        return 0

#Split out the movie year from the title
df_movies['year'] = df_movies['title'].str[-6:]

#Remove the parenthesis from the movie year
df_movies['year'] = df_movies['year'].str.replace('(','', regex=False)
df_movies['year'] = df_movies['year'].str.replace(')','', regex=False)

#Process the movie years
df_movies['year'] = df_movies['year'].apply(pop_year)

#Convert year to number
#df_movies['year'] = df_movies['year'].astype(int)

#Remove the year from the movie title
df_movies['title'] = df_movies['title'].str[:-7]

#Remove Roman numerals from movie titles
df_movies['title'].replace(' II ',' 2 ',inplace=True)
df_movies['title'].replace(' III ',' 3 ',inplace=True)
df_movies['title'].replace(' IV ',' 4 ',inplace=True)
df_movies['title'].replace(' V ',' 5 ',inplace=True)
df_movies['title'].replace(' VI ',' 6 ',inplace=True)
df_movies['title'].replace(' VII ',' 7 ',inplace=True)

#Capitalize all movie titles for easier searching later on
df_movies['title'] = df_movies['title'].str.upper()

In [6]:
#Create movie title and year variables to be populated by user
title = ''
year = ''

user_prompt = 'Please enter a movie title.\nTo find your movie more quickly, please enter a movie title and year separated by a comma.  \nFor example, enter "Terminator, 1984"\n'

while title == '':
    #Promt the user to enter a movie title
    #a = input('Please enter a movie title and year separated by a comma.  \nFor example, enter "Terminator, 1984"\n')
    a = input(user_prompt)

    #Count how many commas were entered--you should only have one
    if a.count(',') > 1:
        print('Please omit any commas from the movie title and only use a comma to separate the title from the year.')
    else:
        if a.count(',') == 0:
            #Only title was provided
            title = a.split(',')[0].strip()
        else:
            #Both title and year provided
            title = a.split(',')[0].strip()
            year = a.split(',')[1].strip()

Please enter a movie title.
To find your movie more quickly, please enter a movie title and year separated by a comma.  
For example, enter "Terminator, 1984"
predator


In [7]:
#Attempt to locate the movie title that was entered

#Show the movie year--useful for debugging and an FYI for the user
if year == '':
    print('No movie year specified\n')
else:
    print(f'Movied year:{year}\n')
    

#Start by narrowing down the list of movies by the year entered, if a year was entered
if year != '':    
    search_df = df_movies[df_movies['year']==int(year)]
else:
    search_df = df_movies

#If there are no movies for this year, then prompt the user to try again
if len(search_df.index) == 0:
    print('No movies were found for this year, please try again')
else:
    #Attempt to locate the movie title in the list of titles--using fuzzyway fuzzy matching algorythm
    #https://www.datacamp.com/tutorial/fuzzy-string-python
    ratio = []#List of title's Levenshtein ratios--100 implies a perfect match, 0 implies a perfect non-match
    print(f'Serching for {title}, please be patient.')
    #Loop through movie titles looking for the best match
    for mt in search_df['title'].to_list():
        ratio.append(fuzz.ratio(title.lower(),mt.lower()))
    
    #Check to see if you found any values
    if max(ratio) == 0:
        print(f'No matches were found for the title {title}')
    else:
        print('Please select the best match from the titles below:\n')
        #Don't want to return more than 5 values, or more than the number of non-0 scores from the search algorythm
        max_returns = min(5,len(set(ratio) - set([0])))
        #print('Max Returns', max_returns)
        #Get the top n matches from the calcualted ratios--this is a list of the indexes for those matches
        
        #Find the top 5 matches
        top_5 = list(set(ratio))[-5:]
        
        #Reverse the top_5 order
        top_5.sort(reverse=True)
        
        #Used to store title names
        top_5_titles = []
        
        #Store the top 5 title names
        for i in top_5:
            #print(search_df['title'].to_list()[ratio.index(i)])
            top_5_titles.append(search_df['title'].to_list()[ratio.index(i)])
        
        #Subset a dataframe with just the top 5 titles for the user to choose from
        top5_df = search_df[search_df['title'].isin(top_5_titles)].head(5)
        top5_df.reset_index(drop=True, inplace=True)
        
        #Display the top 5 search results for the user to choose from
        print(top5_df[['title','year','genres']])
        
        #Allow the user to select one of the top 5 search results
        final_selection = ''
        
        #Loop until the user enters something
        while final_selection == '':
            final_selection = input('\nPlease enter the index for your movie from the list below.  To quit, type "Quit"\n')
            if final_selection.lower() == 'quit':
                print("Terminating program per user request")
                exit()
            #Make sure a blank value wasn't entered
            if not final_selection.isnumeric():
                #No final answer given--prompt user to try again
                print("Not a valid selection--Please try again.")
                final_selection = ''
            else:
                #A number was entered--check to see if it is within range
                if int(final_selection) > max_returns:
                    #User has elected to quit--so quit
                    print("Not a valid selection--Please try again.")
                    final_selection = ''
                else:
                    #Good number passed--define the chosen movie as a data series for later reference
                    chosen_df = top5_df.iloc[int(final_selection)]

No movie year specified

Serching for predator, please be patient.
Please select the best match from the titles below:

        title  year                  genres
0    PREDATOR  1987  Action|Sci-Fi|Thriller
1  PREDATOR 2  1990  Action|Sci-Fi|Thriller
2     CREATOR  1985    Comedy|Drama|Romance
3   PREDATORS  2010  Action|Sci-Fi|Thriller
4    OPERATOR  2015   Action|Drama|Thriller

Please enter the index for your movie from the list below.  To quit, type "Quit"
0


In [8]:
#Identify users with the best reviews for this title
chosen_ratings_df = df_ratings[df_ratings['movieId']==chosen_df['movieId']]

#Sort by rating
chosen_ratings_df = chosen_ratings_df.sort_values(by=['rating'], ascending=False)

#Build the dataframe that will hold all of the top reviews for like-minded viewers
#top_reviews = pd.DataFrame
top_reviews = df_ratings.head(1)

#Copy the columns from the reviews dataframe
#top_reviews.columns = df_ratings.columns

#Choose either the top 10 ratings, or the top n number (if there are fewer than 10)
top_num = min(5,len(chosen_ratings_df.index))-1

#Make sure that you get enough other movies to show--might need to add more reviewers if you don't have 10 or more other movies to recommend
while len(top_reviews['movieId'].unique()) < 10:
    #Increment the number of user reviews by 1
    top_num += 1
    
    #Make a list of the top reviewed user IDs for later use
    top_users = chosen_ratings_df.head(top_num)['userId'].to_list()
    
    #Get all the reviews for the top users (those users who reviewd the selected film positively)
    top_reviews = df_ratings[df_ratings['userId'].isin(top_users)]
    
    #If there aren't 10 or more unique movies in this set of reviewers then this will loop and increase the number of reviewers by 1

#Split the genres for this movie into a list for later use
genres = chosen_df['genres'].split('|')

In [9]:
#Score the movie by average rating and how closely the genres match the chosen movie genre

#Find the average rating for this movie accross the selected reviewer population
avg_df = top_reviews.groupby('movieId')['rating'].mean()

#Convert back to dataframe
avg_df = avg_df.to_frame()

#Copy the MovieId so that you can build an index
avg_df['movieId'] = avg_df.index

#Remove the chosen movie from the list of movies to recommend (don't want to recommend the same movie)
avg_df = avg_df[avg_df['movieId'] != chosen_df['movieId']]

#rebuild an index
avg_df.reset_index(drop=True, inplace=True)

#Join the movie genres
merged_df = pd.merge(avg_df,df_movies[['movieId','genres']], on='movieId', how='left')

#list for genre scores
genre_scores = []
#Calculate the genre scores (0% means no genre matches, 100% means all genre matches)
for i in merged_df['genres']:
    genre_scores.append(round(1-len(set(genres) - set(i.split('|')))/len(genres),2))

#Add the genres scores to the dataframe column
avg_df['genre_score'] = genre_scores

#Combine the scores using an arbitrary rating
genre_weight = .50
avg_df['total_score'] = avg_df['genre_score'] * genre_weight + avg_df['rating'] * (1-genre_weight)
#avg_df

In [10]:
#Build the movie recommendations and let the user know what they are

#chosen_ratings_df = chosen_ratings_df.sort_values(by=['rating'], ascending=False)

#Sort the scored DF by total score
avg_df = avg_df.sort_values(by=['total_score'], ascending=False)

#Buld a list with the top 10 movie recommendations
top10 = avg_df['movieId'].head(10)

#Convert back to dataframe
top10 = top10.to_frame()

#rebuild an index
top10.reset_index(drop=True, inplace=True)

#Add the movie names, years, and genres to the top_10 df
#top10.

#Join the movie genres
top10 = pd.merge(top10,df_movies[['movieId','title', 'year','genres']], on='movieId', how='left')

#Tell the user what you chose for them
chosen = chosen_df['title']
print(f'Based on your selection of the movie "{chosen}", we think you might also like one of the following movies:\n')
top10[['title','year','genres']]

Based on your selection of the movie "PREDATOR", we think you might also like one of the following movies:



Unnamed: 0,title,year,genres
0,"MATRIX REVOLUTIONS, THE",2003,Action|Adventure|Sci-Fi|Thriller|IMAX
1,JURASSIC WORLD,2015,Action|Adventure|Drama|Sci-Fi|Thriller
2,RIDDICK,2013,Action|Sci-Fi|Thriller|IMAX
3,BLADE RUNNER,1982,Action|Sci-Fi|Thriller
4,RISE OF THE PLANET OF THE APES,2011,Action|Drama|Sci-Fi|Thriller
5,"TERMINATOR, THE",1984,Action|Sci-Fi|Thriller
6,CHRONICLE,2012,Action|Sci-Fi|Thriller
7,"ONE, THE",2001,Action|Sci-Fi|Thriller
8,PAYCHECK,2003,Action|Sci-Fi|Thriller
9,"ISLAND, THE",2005,Action|Sci-Fi|Thriller
