In [1]:
import pandas 
import numpy
import matplotlib.pyplot 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from fuzzywuzzy import fuzz

In [2]:
pandas.set_option('display.max_colwidth', None)
columns_name=['user_id','movie_id','rating','timestamp']
df = pandas.read_csv("../data/ml-100k/u.data",sep="\t", names=columns_name)
df.head()
df_movies =  pandas.read_csv("../data/ml-100k/u.data",sep="\t", names=columns_name)

In [3]:
pandas.set_option('display.max_colwidth', None)
columns_name = ['movie_id','title','release_date','video_release_date','imdb_url','unknown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']
df_movies = pandas.read_csv("../data/ml-100k/u.item",sep="|", names=columns_name)
print(len(df_movies))
df_movies.head()
df_movies_new = df_movies[['movie_id', 'title']].copy()
df_movies_new.head()

1682


Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# knn and collaborative filtering
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# pivot and create movie-user matrix
movie_user_mat = df.pivot(index='movie_id', columns='user_id', values='rating').fillna(0)

# create mapper from movie title to index
movie_to_idx = {
    movie: i for i, movie in 
    enumerate(list(df_movies_new.set_index('movie_id').loc[movie_user_mat.index].title))
}
# transform matrix to scipy sparse matrix
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [5]:
# fitting the model
# define model
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit
model_knn.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [6]:
def fuzzy_matching(mapper, fav_movie, verbose=True):
    """
    return the closest match via fuzzy ratio. If no match found, return None
    
    Parameters:  
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    verbose: bool, print log if True

    Return: index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        print('Oops! No match is found')
        return
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
    return match_tuple[0][1]



def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    return top n similar movie recommendations based on user's input movie

    Parameters
    ----------
    model_knn: sklearn model, knn model
    data: movie-user matrix
    mapper: dict, map movie title name to index of the movie in data
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations

    Return
    ------
    list of top n similar movie recommendations
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_recommendations+1)
    # get list of raw idx of recommendations
    raw_recommends = \
        sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_mapper[idx], dist))

In [8]:
my_favorite = 'Toy Story'

make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    fav_movie=my_favorite,
    mapper=movie_to_idx,
    n_recommendations=10)

You have input movie: Toy Story
Found possible matches in our database: ['Toy Story (1995)']

Recommendation system start to make inference
......

Recommendations for Toy Story:
1: Raiders of the Lost Ark (1981), with distance of 0.3776175050042344
2: Jerry Maguire (1996), with distance of 0.37592529851886347
3: Fargo (1996), with distance of 0.36939923923861095
4: Star Trek: First Contact (1996), with distance of 0.3632726854037742
5: Willy Wonka and the Chocolate Factory (1971), with distance of 0.3618423665130154
6: Mission: Impossible (1996), with distance of 0.3586782396588516
7: Rock, The (1996), with distance of 0.33544521129158
8: Independence Day (ID4) (1996), with distance of 0.31021439592414524
9: Return of the Jedi (1983), with distance of 0.30007502870792213
10: Star Wars (1977), with distance of 0.26542794398902425
