In [103]:
import pandas as pd
import numpy as np
import scipy 
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [174]:
# Sample (title, rating) pairs used to exercise the recommender below.
# Titles follow the "Name (Year)" spelling used for the movie_matrix columns.
test_ratings = [
    ('Lethal Weapon (1987)', 4),
    ('Natural Born Killers (1994)', 3),
    ('Inception (2010)', 0),
    ('Heat (1995)', 0),
    ('Finding Nemo (2003)', 3),
    ('Office Space (1999)', 1),
    ('Home Alone (1990)', 0),
    ('High Fidelity (2000)', 2),
    ('Donnie Darko (2001)', 3),
    ('Lion King, The (1994)', 2),
]

In [47]:
# Load the MovieLens ratings and movie-metadata tables.
# Forward slashes keep these relative paths valid on Windows, macOS and Linux;
# the previous backslash-separated form only resolved on Windows.
movie_ratings = pd.read_csv('data/movielens/ratings.csv')
movies = pd.read_csv('data/movielens/movies.csv')

In [31]:
# Peek at the first rows of the raw ratings table (userId, movieId, rating, timestamp).
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [50]:
# Attach the movie metadata to every rating row by joining on the shared movieId key.
df = movie_ratings.merge(movies, on='movieId')

In [57]:
# Wide table: one row per userId, one column per title, each cell holding the rating.
# User/title pairs with no rating come out as NaN.
movie_matrix = df.pivot_table(values='rating', index='userId', columns='title')

In [95]:
# Per-title summary indexed by title: mean rating and how many ratings it received.
ratings = (
    df.groupby('title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'rating', 'count': 'num_ratings'})
)

In [97]:
# Preview the per-title summary (mean rating and rating count).
ratings.head()

Unnamed: 0_level_0,rating,num_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""Great Performances"" Cats (1998)",1.75,2
$9.99 (2008),3.833333,3
'Hellboy': The Seeds of Creation (2004),2.0,1
'Neath the Arizona Skies (1934),0.5,1
'Round Midnight (1986),2.25,2


In [101]:
# Replace every missing user/title rating with 0 so the matrix contains no NaN.
# NOTE(review): a 0 is then indistinguishable from a real rating in later correlations.
movie_matrix = movie_matrix.fillna(0)

In [109]:
# Pairwise cosine similarity between the ROWS of movie_matrix. The pivot above uses
# userId as its index, so despite the variable names this is a user-to-user similarity matrix.
# `.values` replaces the deprecated DataFrame.as_matrix(), which newer pandas releases removed.
movie_similarity = 1 - pairwise_distances(movie_matrix.values, metric="cosine")
np.fill_diagonal(movie_similarity, 0)  # Filling diagonals with 0s for future use when sorting is done
ratings_matrix = pd.DataFrame(movie_similarity)
ratings_matrix.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,0.0,0.0,0.0,0.074482,0.016818,0.0,0.083884,0.0,0.012843,0.0,...,0.0,0.0,0.014481,0.043719,0.0,0.0,0.0,0.062917,0.0,0.017466
1,0.0,0.0,0.124295,0.118821,0.103646,0.0,0.212985,0.11319,0.113333,0.043213,...,0.477306,0.063202,0.077784,0.164162,0.466281,0.425462,0.084646,0.02414,0.170595,0.113175
2,0.0,0.124295,0.0,0.08164,0.151531,0.060691,0.154714,0.249781,0.134475,0.114672,...,0.161205,0.064198,0.176222,0.158357,0.177098,0.124562,0.124911,0.080984,0.136606,0.170193
3,0.074482,0.118821,0.08164,0.0,0.130649,0.079648,0.319745,0.191013,0.030417,0.137186,...,0.114319,0.047228,0.136647,0.25403,0.121905,0.088735,0.068483,0.104309,0.054512,0.211609
4,0.016818,0.103646,0.151531,0.130649,0.0,0.063796,0.095888,0.165712,0.086616,0.03237,...,0.191029,0.021142,0.146246,0.224245,0.139721,0.058252,0.042926,0.038358,0.062642,0.225086


In [136]:
# Earlier single-title draft of recommender(), wrapped in a string literal so it never runs.
# It is kept for reference only; the working implementation is the `def recommender` cell.
'''
def recommender(users_ratings, all_movies):
    
    lw = movie_matrix['Lethal Weapon (1987)']
    similar_to_lw = movie_matrix.corrwith(lw)
    corr_to_lw = pd.DataFrame(similar_to_lw, columns=['Correlation'])
    corr_to_lw = corr_to_lw.join(ratings.num_ratings)
    recommendation = corr_to_lw.sort_values('Correlation', ascending=False).iloc[1].name
'''

In [325]:
def recommender(users_ratings, all_movies):
    """Print one recommendation for every movie the user rated 3 or higher.

    For each liked movie, every column of ``all_movies`` is correlated with that
    movie's column, and the most correlated title the user has not already rated
    is recommended.

    Parameters
    ----------
    users_ratings : iterable of (title, rating) pairs
        The user's ratings. Only titles rated >= 3 seed a recommendation, but every
        rated title is excluded from the suggestions.
    all_movies : pandas.DataFrame
        Rating matrix whose columns are movie titles.

    Returns
    -------
    None
        The list of 'Because of movie: ... we recommend: ...' strings is printed.

    Notes
    -----
    A liked title that is not a column of ``all_movies`` is skipped, as is a liked
    title for which no unrated candidate with a defined correlation exists, so
    neither case raises.
    """
    # A set gives O(1) membership checks while walking the ranked candidates.
    rated_titles = {title for title, _ in users_ratings}
    # recommends similar movies for movies the user gave 3 or higher to
    liked_titles = [title for title, rating in users_ratings if rating >= 3]

    recommended_movies = []
    for liked_title in liked_titles:
        if liked_title not in all_movies.columns:
            # Nothing to correlate against for a title the matrix does not contain.
            continue
        correlations = all_movies.corrwith(all_movies[liked_title])
        # Sort once per liked movie (previously re-sorted on every candidate check);
        # NaN correlations sort last and are dropped because they carry no signal.
        ranked = correlations.sort_values(ascending=False).dropna()
        # First candidate that is neither the movie itself nor already rated by the user.
        recommendation = next(
            (title for title in ranked.index if title not in rated_titles),
            None,
        )
        if recommendation is None:
            continue
        recommended_movies.append(
            'Because of movie: ' + liked_title + ' we recommend: ' + recommendation
        )
    print(recommended_movies)

In [176]:
# Run the recommender on the sample ratings against the full user x title rating matrix.
recommender(test_ratings, movie_matrix)

['Because of movie: Lethal Weapon (1987) we recommend: Lethal Weapon 2 (1989)',
 'Because of movie: Natural Born Killers (1994) we recommend: Crow, The (1994)',
 'Because of movie: Finding Nemo (2003) we recommend: Incredibles, The (2004)',
 'Because of movie: Donnie Darko (2001) we recommend: Eternal Sunshine of the Spotless Mind (2004)']

# 2. Movie Search Engine

Now that you have a recommendation engine, you need to provide a way for users to find movies to rate. You will need to create a function that takes in a search parameter and returns a ranked list of movies that best matches the input. For this data set, there are fewer than 10,000 movies and you only need to worry about searching the titles for those movies. Therefore, we do not need to worry as much about coming up with an optimal solution that scales for larger datasets.

When returning candidate movie titles, you will want to return the titles that match the search input with the highest probability. Consider dividing up the titles and the user input into n-grams, but instead of using n-grams of words, the n-grams are characters in the string.

For example, the title Batman contains the bigrams [‘ba’, ‘at’, ‘tm’, ‘ma’, ‘an’]. You could then match that input title to the titles that contain those bigrams with the highest probability. Find a search method that generally returns correct recommendations based on the search input.

Enter name dan


In [226]:
import re
from difflib import SequenceMatcher


In [227]:
def ngrams(string, n=3):
    """Split ``string`` into its overlapping character n-grams.

    Commas, hyphens, periods, slashes and all whitespace are stripped first, so
    the n-grams run across word boundaries. Case is preserved. A cleaned string
    shorter than ``n`` yields an empty list.
    """
    cleaned = re.sub(r'[,-./]|\s',r'', string)
    if n < 1:
        return []
    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]
def similar(a, b):
    """Return difflib's similarity ratio between sequences ``a`` and ``b`` (1.0 means identical)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [259]:
def search_engine(user_input, values, top_n=5):
    """Return the titles in ``values.index`` that best match ``user_input``.

    Each title is scored by comparing its character n-grams with those of the
    query (via ``ngrams`` and ``similar``); the highest-scoring titles come first.
    Matching is case-sensitive because ``ngrams`` preserves case.

    Parameters
    ----------
    user_input : str
        The search text typed by the user.
    values : pandas.DataFrame or pandas.Series
        Object whose index holds the candidate titles; only the index is read.
    top_n : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, best match first.
    """
    # The query's n-grams do not depend on the candidate, so compute them once.
    query_grams = ngrams(user_input)
    similarity_scores = [
        (title, similar(query_grams, ngrams(title)))
        for title in values.index.tolist()
    ]
    scored_similar = pd.DataFrame(similarity_scores, columns=['name', 'score'])
    best_first = scored_similar.sort_values('score', ascending=False)
    return best_first['name'].head(top_n).tolist()

In [261]:
# Example query; only the index of `ratings` (the movie titles) is searched.
search_engine('Batman', ratings)

['Batman (1989)',
 'Batman (1966)',
 'Batman & Robin (1997)',
 'Batman Begins (2005)',
 'Batman Returns (1992)']

# 3. Movie Recommendation Application

In this part, you will create an interactive movie recommendation application by combining the movie recommendation engine and the movie search engine. To accomplish this, you will create a simple command line application. Upon starting the application, you should ask the user to find a movie to rate. It should return a list of numbered movies or an “I don’t see what I’m looking for” option.



In [328]:
def rec_app(movie_list):
    """Interactive flow: search for a movie, rate it, and print a recommendation.

    The user types a search string, picks one of the numbered matches (or the final
    "I don't see what I'm looking for" option), and rates the chosen movie. The
    rating is passed to ``recommender`` together with the module-level
    ``movie_matrix``.

    Parameters
    ----------
    movie_list : pandas.DataFrame or pandas.Series
        Object whose index holds the searchable titles (forwarded to ``search_engine``).

    Returns
    -------
    None
        All interaction happens through ``input`` and ``print``.
    """
    find_movie = input('Enter movie ')
    possible_movies = search_engine(find_movie, movie_list)
    for number, title in enumerate(possible_movies, start=1):
        print('Press ', number, 'For', title)
    # The opt-out entry always follows the last match instead of assuming five results.
    none_option = len(possible_movies) + 1
    print('Press', none_option, 'For I don\'t see what I\'m looking for')

    try:
        choice = int(input('What number:'))
    except ValueError:
        print('Please enter one of the numbers listed above.')
        return
    # Reject the opt-out entry and anything out of range. Without the lower bound,
    # 0 or a negative number would silently index from the end of the list.
    if not 1 <= choice <= len(possible_movies):
        return

    # Ask for a rating only once a real movie has been chosen.
    try:
        rating = int(input('What did you think of it 1-5:'))
    except ValueError:
        print('Please enter a whole number from 1 to 5.')
        return

    title = possible_movies[choice - 1]
    recommender([(title, rating)], movie_matrix)
        

In [332]:
# Interactive session: the prompts and the typed answers appear in the output below.
rec_app(ratings)

Enter movie Lethal Weapon
Press  1 For Lethal Weapon (1987)
Press  2 For Lethal Weapon 2 (1989)
Press  3 For Lethal Weapon 3 (1992)
Press  4 For Lethal Weapon 4 (1998)
Press  5 For Legendary Weapons of China (1982)
Press 6 For I don't see what I'm looking for
What number:1
What did you think of it 1-5:5
['Because of movie: Lethal Weapon (1987) we recommend: Lethal Weapon 2 (1989)']


In [333]:
# Second interactive session, this time searching for a different title.
rec_app(ratings)

Enter movie Natural Born Killers
Press  1 For Natural Born Killers (1994)
Press  2 For Killers (2010)
Press  3 For Killers, The (1964)
Press  4 For Natural, The (1984)
Press  5 For Killers, The (1946)
Press 6 For I don't see what I'm looking for
What number:1
What did you think of it 1-5:5
['Because of movie: Natural Born Killers (1994) we recommend: Crow, The (1994)']
