In [1]:
#import libraries
import pandas as pd
import numpy as np
import random
from sklearn.decomposition import NMF
import pickle

In [6]:
# Read the three CSV files of the ml-latest-small directory in one pass;
# the frames are bound in the same order as the paths are listed.
ratings, movies, tags = map(
    pd.read_csv,
    (
        'ml-latest-small/ratings.csv',
        'ml-latest-small/movies.csv',
        'ml-latest-small/tags.csv',
    ),
)

In [7]:
def input_cleanup(df, filter_name, user_count_threshold=600,
                  movie_count_threshold=9, neutral_rating=3):
    '''
    Return a copy of ``df`` in which the ratings selected by ``filter_name``
    are overwritten with ``neutral_rating`` (3 by default) to neutralize them.

    Supported ``filter_name`` values:
    1. 'top25_users_neutralize': ratings given by users who submitted more than
                                 ``user_count_threshold`` ratings
    2. 'bottom75_users_neutralize': ratings given by all remaining users (at most
                                 ``user_count_threshold`` ratings)
    3. 'top25_voted_neutralize': ratings of movies that received more than
                                 ``movie_count_threshold`` ratings **note: selected by
                                 the number of ratings received, not by the rating value**
    4. 'rate4_5_neutralize': ratings of movies whose mean rating is exactly 4 or exactly 5
    5. 'combo': combination of 2 and 3
    Any other ``filter_name`` returns an unmodified copy of ``df``.

    The "25"/"75" in the filter names are labels only: the groups are split by the
    fixed count thresholds above, not by percentiles computed from ``df``.

    Parameters
    ----------
    df : DataFrame with at least the columns 'userId', 'movieId' and 'rating'.
    filter_name : str, one of the names listed above.
    user_count_threshold : number of ratings separating "top" from "bottom" users.
    movie_count_threshold : number of ratings a movie must exceed to count as "top voted".
    neutral_rating : value written over the selected ratings.

    Returns
    -------
    DataFrame with the same rows, index and column order as ``df``; ``df`` itself
    is not modified.
    '''
    cleaned = df.copy()

    # per-user and per-movie statistics used by the filters
    ratings_per_user = cleaned.groupby('userId')['rating'].count()
    ratings_per_movie = cleaned.groupby('movieId')['rating'].count()
    mean_per_movie = cleaned.groupby('movieId')['rating'].mean()

    # The two user groups are exact complements: a user sitting right on the
    # threshold belongs to the "bottom" group instead of to neither group.
    heavy_users = ratings_per_user[ratings_per_user > user_count_threshold].index
    light_users = ratings_per_user[ratings_per_user <= user_count_threshold].index
    popular_movies = ratings_per_movie[ratings_per_movie > movie_count_threshold].index
    rated_4_or_5 = mean_per_movie[mean_per_movie.isin([4, 5])].index

    # Row-level boolean masks: no set_index/reset_index round trip, so the column
    # order and the index of the input are preserved for every filter.
    from_heavy_user = cleaned['userId'].isin(heavy_users)
    from_light_user = cleaned['userId'].isin(light_users)
    of_popular_movie = cleaned['movieId'].isin(popular_movies)
    of_4_or_5_movie = cleaned['movieId'].isin(rated_4_or_5)

    masks = {
        'top25_users_neutralize': from_heavy_user,        #1
        'bottom75_users_neutralize': from_light_user,     #2
        'top25_voted_neutralize': of_popular_movie,       #3
        'rate4_5_neutralize': of_4_or_5_movie,            #4
        'combo': from_light_user | of_popular_movie,      #5
    }

    selected = masks.get(filter_name)
    if selected is not None:
        cleaned.loc[selected.to_numpy(), 'rating'] = neutral_rating
    return cleaned

In [8]:
# Baseline: an untouched copy of the ratings.
ratings_default = ratings.copy()

# One neutralized variant per cleanup filter, unpacked in the order listed.
ratings1, ratings2, ratings3, ratings4, ratings5 = (
    input_cleanup(ratings, cleanup_filter)
    for cleanup_filter in (
        'top25_users_neutralize',
        'bottom75_users_neutralize',
        'top25_voted_neutralize',
        'rate4_5_neutralize',
        'combo',
    )
)

In [9]:
def nmf_model(df, n_components=20, random_state=42, fill_value=3):
    '''
    Build an NMF model from a long-format ratings table. Function works as:
    1. reformats the input data so movieIds are the columns, userIds are the index and
       the values are the ratings; missing user/movie pairs are filled with
       ``fill_value`` (3, i.e. neutral, by default)
    2. saves the number of movies from (1) for the recommender function
    3. saves the movieId labels from (1) for the recommender function
    4. converts the table from (1) to an array
    5. fits an NMF model with ``n_components`` components on the array from (4)
    6. extracts the component-by-movie matrix from the model for the recommender function

    Parameters
    ----------
    df : DataFrame with 'userId', 'movieId' and 'rating' columns.
    n_components : number of NMF components.
    random_state : seed forwarded to NMF so that repeated runs produce the same
                   factorization; pass None to leave the estimator unseeded.
    fill_value : rating used for user/movie pairs that have no rating.

    Returns
    -------
    (num_movies, movieId, model, components) where ``movieId`` is the column index of
    the pivoted table and ``components`` is ``model.components_``.
    '''
    #1
    user_movie = df.pivot(index='userId', columns='movieId', values='rating').fillna(fill_value)
    #2, #3
    num_movies = user_movie.shape[1]
    movieId = user_movie.columns
    #4, #5 - keyword arguments keep the call explicit; the seed makes the fit repeatable
    model = NMF(n_components=n_components, random_state=random_state)
    model.fit(user_movie.to_numpy())
    #6
    return num_movies, movieId, model, model.components_

In [10]:
# Fit the model on the unmodified ratings and persist it to disk.
num_movies_default, movieId_default, model_default, component_default=nmf_model(ratings_default)
# The context manager closes the file handle as soon as the pickle is written.
with open('model_default', 'wb') as model_file:
    pickle.dump(model_default, model_file)



In [11]:
# Fit the model on ratings1 (the 'top25_users_neutralize' variant) and persist it to disk.
num_movies1, movieId1, model1, component1=nmf_model(ratings1)
# The context manager closes the file handle as soon as the pickle is written.
with open('model1', 'wb') as model_file:
    pickle.dump(model1, model_file)



In [12]:
# Fit the model on ratings2 (the 'bottom75_users_neutralize' variant) and persist it to disk.
num_movies2, movieId2, model2, component2=nmf_model(ratings2)
# The context manager closes the file handle as soon as the pickle is written.
with open('model2', 'wb') as model_file:
    pickle.dump(model2, model_file)



In [13]:
# Fit the model on ratings3 (the 'top25_voted_neutralize' variant) and persist it to disk.
num_movies3, movieId3, model3, component3=nmf_model(ratings3)
# The context manager closes the file handle as soon as the pickle is written.
with open('model3', 'wb') as model_file:
    pickle.dump(model3, model_file)



In [14]:
# Fit the model on ratings4 (the 'rate4_5_neutralize' variant) and persist it to disk.
num_movies4, movieId4, model4, component4=nmf_model(ratings4)
# The context manager closes the file handle as soon as the pickle is written.
with open('model4', 'wb') as model_file:
    pickle.dump(model4, model_file)



In [15]:
# Fit the model on ratings5 (the 'combo' variant) and persist it to disk.
num_movies5, movieId5, model5, component5=nmf_model(ratings5)
# The context manager closes the file handle as soon as the pickle is written.
with open('model5', 'wb') as model_file:
    pickle.dump(model5, model_file)



In [16]:
def movie_indeces(dfmovie, dftags):
    '''
    Build three movieId-keyed lookup dictionaries.

    Returns (title_by_id, genres_by_id, tag_by_id): the first two are read from
    ``dfmovie`` ('movieId', 'title', 'genres' columns), the third from ``dftags``
    ('movieId', 'tag' columns). When a movieId occurs on several rows, the value
    of the last such row is the one kept.
    '''
    movie_ids = dfmovie['movieId']
    title_by_id = {movie_id: title for movie_id, title in zip(movie_ids, dfmovie['title'])}
    genres_by_id = {movie_id: genres for movie_id, genres in zip(movie_ids, dfmovie['genres'])}
    tag_by_id = {movie_id: tag for movie_id, tag in zip(dftags['movieId'], dftags['tag'])}
    return title_by_id, genres_by_id, tag_by_id

In [17]:
# Build the movieId -> title / genres / tag lookup dictionaries used by the cells below.
movie_title_index, movie_genre_index, movie_tag_index=movie_indeces(movies, tags)

In [18]:
# Draw three titles to act as the new user's top picks. A privately seeded generator
# makes the draw identical on every Restart & Run All without touching the state of
# the global `random` module.
top_movie_names=random.Random(42).sample(list(movie_title_index.values()),3)
top_movie_names

['Ghost in the Shell (Kôkaku kidôtai) (1995)',
 'Dance of Reality, The (Danza de la realidad, La) (2013)',
 'Stealing Harvard (2002)']

In [19]:
def new_user(movie_array, movie_title_index, num_movies, movieId=None):
    '''
    Creates a new user array based on the user's top pick input.

    The result is a (1, num_movies) integer array filled with 3 (neutral) in which
    the positions of the titles in ``movie_array`` are set to 5.

    Parameters
    ----------
    movie_array : iterable of movie titles picked by the user.
    movie_title_index : dict mapping movieId -> title.
    num_movies : number of columns of the returned array.
    movieId : optional sequence of movieIds giving the column order of the rating
              matrix the profile is meant for (e.g. the movieId value returned by
              nmf_model). When given, each title is placed in the column of its
              movieId, which keeps the profile aligned with that matrix even if
              ``movie_title_index`` lists movies in another order or contains
              movies the matrix does not have. When omitted, the position of the
              title inside ``movie_title_index`` is used (original behaviour).

    Raises
    ------
    ValueError : a title is not in ``movie_title_index`` (or, with ``movieId``
                 given, its movie is not one of the matrix columns).
    IndexError : a resolved position is not smaller than ``num_movies``.
    '''
    # Resolve title -> column once instead of rebuilding a list for every title.
    # setdefault keeps the first occurrence of a duplicated title, like list.index.
    column_of_title = {}
    if movieId is None:
        for position, title in enumerate(movie_title_index.values()):
            column_of_title.setdefault(title, position)
    else:
        column_of_id = {movie_id: column for column, movie_id in enumerate(movieId)}
        for movie_id, title in movie_title_index.items():
            if movie_id in column_of_id:
                column_of_title.setdefault(title, column_of_id[movie_id])

    picked_columns = []
    for title in movie_array:
        if title not in column_of_title:
            raise ValueError('%r is not in list' % (title,))
        picked_columns.append(column_of_title[title])

    profile = np.full((1, num_movies), 3, dtype=int)
    profile[0, np.asarray(picked_columns, dtype=np.intp)] = 5
    return profile

In [20]:
# Build the rating profile of the new user from the sampled top picks.
# NOTE(review): this rebinds the name `new_user` from the function to its result, so
# running this cell a second time without re-running the definition cell fails; the
# cells below read the array under this same name.
new_user=new_user(top_movie_names, movie_title_index, num_movies_default)

In [21]:
def movie_recommender(new_user, model, component1, movieId, movie_genre_index, movie_tag_index, movie_title_index, exclude_titles=None):
    '''
    Takes the user profile array and calculates recommendations top to bottom by:
    1. calculating new weights for each film for that user using the NMF model
    2. looking up the genre and tag of each movieId
    3. replacing the movieId with the movie title (ids without a title are kept as is)
    4. removing the films the user already identified as top rated
    5. reordering the table so the top weighted recommendations appear first

    Parameters
    ----------
    new_user : array of shape (1, number of movies) holding the user's ratings.
    model : fitted model exposing ``transform``.
    component1 : component-by-movie matrix multiplied with the transformed profile.
    movieId : movieIds in the column order of ``component1``; any sequence works.
    movie_genre_index, movie_tag_index, movie_title_index : dicts keyed by movieId.
    exclude_titles : titles to leave out of the result. When omitted, the
                     notebook-level ``top_movie_names`` is used if it exists
                     (kept for existing callers); otherwise nothing is excluded.

    Returns
    -------
    DataFrame with columns 'movieId' (holding the title), 'weight', 'genre' and 'tag',
    sorted by descending weight; the index is the movie's column position.
    '''
    if exclude_titles is None:
        # Fallback for callers written before this parameter existed.
        exclude_titles = globals().get('top_movie_names', ())
    #1
    user_profile = model.transform(new_user)
    weights = np.dot(user_profile, component1)[0]
    #2 - built column by column, so movieId does not need to be a named Index
    movie_ids = pd.Series(np.asarray(movieId))
    recommendations = pd.DataFrame({
        'movieId': movie_ids,
        'weight': weights,
        'genre': movie_ids.map(movie_genre_index),
        'tag': movie_ids.map(movie_tag_index),
    })
    #3 - a direct lookup replaces the much slower DataFrame.replace with a large dict
    titles = movie_ids.map(movie_title_index)
    recommendations['movieId'] = titles.where(titles.notna(), movie_ids)
    #4
    already_picked = recommendations['movieId'].isin(list(exclude_titles))
    recommendations = recommendations[~already_picked]
    #5
    return recommendations.sort_values('weight', ascending=False)

In [22]:
# Baseline: model fitted on the ratings without any input cleanup.
recommendations_default = movie_recommender(
    new_user=new_user,
    model=model_default,
    component1=component_default,
    movieId=movieId_default,
    movie_genre_index=movie_genre_index,
    movie_tag_index=movie_tag_index,
    movie_title_index=movie_title_index,
)
recommendations_default

Unnamed: 0,movieId,weight,genre,tag
314,Forrest Gump (1994),3.240130,Comedy|Drama|Romance|War,touching
277,"Shawshank Redemption, The (1994)",3.210626,Crime|Drama,Morgan Freeman
1938,"Matrix, The (1999)",3.144053,Action|Sci-Fi|Thriller,post apocalyptic
224,Star Wars: Episode IV - A New Hope (1977),3.126993,Action|Adventure|Sci-Fi,space opera
2144,American Beauty (1999),3.102389,Drama|Romance,
...,...,...,...,...
1823,You've Got Mail (1998),2.935902,Comedy|Romance,remake
1234,I Know What You Did Last Summer (1997),2.935812,Horror|Mystery|Thriller,
396,Free Willy (1993),2.935705,Adventure|Children|Drama,
2034,"Blair Witch Project, The (1999)",2.934146,Drama|Horror|Thriller,video


In [23]:
# top25_users_neutralize: model fitted with the input of the top 25% of users neutralized.
recommendations1 = movie_recommender(
    new_user=new_user,
    model=model1,
    component1=component1,
    movieId=movieId_default,
    movie_genre_index=movie_genre_index,
    movie_tag_index=movie_tag_index,
    movie_title_index=movie_title_index,
)
recommendations1

Unnamed: 0,movieId,weight,genre,tag
314,Forrest Gump (1994),3.133092,Comedy|Drama|Romance|War,touching
1938,"Matrix, The (1999)",3.107961,Action|Sci-Fi|Thriller,post apocalyptic
277,"Shawshank Redemption, The (1994)",3.105887,Crime|Drama,Morgan Freeman
2144,American Beauty (1999),3.090772,Drama|Romance,
224,Star Wars: Episode IV - A New Hope (1977),3.089957,Action|Adventure|Sci-Fi,space opera
...,...,...,...,...
396,Free Willy (1993),2.949773,Adventure|Children|Drama,
315,Four Weddings and a Funeral (1994),2.942079,Comedy|Romance,wedding
835,E.T. the Extra-Terrestrial (1982),2.938713,Children|Drama|Sci-Fi,aliens
461,Schindler's List (1993),2.933607,Drama|War,holocaust


In [24]:
# bottom75_users_neutralize: model fitted with the input of the bottom 75% of users neutralized.
recommendations2 = movie_recommender(
    new_user=new_user,
    model=model2,
    component1=component2,
    movieId=movieId_default,
    movie_genre_index=movie_genre_index,
    movie_tag_index=movie_tag_index,
    movie_title_index=movie_title_index,
)
recommendations2

Unnamed: 0,movieId,weight,genre,tag
3539,Mulholland Drive (2001),3.009378,Crime|Drama|Film-Noir|Mystery|Thriller,
190,Clerks (1994),3.009015,Comedy,generation X
910,Star Wars: Episode VI - Return of the Jedi (1983),3.008937,Action|Adventure|Sci-Fi,space opera
867,Wallace & Gromit: The Wrong Trousers (1993),3.008644,Animation|Children|Comedy|Crime,Aardman
546,Mission: Impossible (1996),3.008563,Action|Adventure|Mystery|Thriller,based on a TV show
...,...,...,...,...
1109,McHale's Navy (1997),2.993337,Comedy|War,
607,Striptease (1996),2.993283,Comedy|Crime,
2028,Wild Wild West (1999),2.992848,Action|Comedy|Sci-Fi|Western,
656,Escape from L.A. (1996),2.991831,Action|Adventure|Sci-Fi|Thriller,


In [25]:
# top25_voted_neutralize: model fitted with the top 25% most-voted films neutralized
# (**note: selected by how much input they received, not by their rating**).
recommendations3 = movie_recommender(
    new_user=new_user,
    model=model3,
    component1=component3,
    movieId=movieId_default,
    movie_genre_index=movie_genre_index,
    movie_tag_index=movie_tag_index,
    movie_title_index=movie_title_index,
)
recommendations3

Unnamed: 0,movieId,weight,genre,tag
9600,"Three Billboards Outside Ebbing, Missouri (2017)",3.019269,Crime|Drama,
3639,Witness for the Prosecution (1957),3.018345,Drama|Mystery|Thriller,court
5100,Gladiator (1992),3.016744,Action|Drama,
713,"Affair to Remember, An (1957)",3.016680,Drama|Romance,
3559,Life as a House (2001),3.016540,Drama,
...,...,...,...,...
1014,Amityville II: The Possession (1982),2.986908,Horror,
1789,Psycho (1998),2.986308,Crime|Horror|Thriller,
271,Stuart Saves His Family (1995),2.983930,Comedy,
3799,Jason X (2002),2.983739,Horror|Sci-Fi|Thriller,


In [26]:
# rate4_5_neutralize: model fitted with the films rated 4 and 5 on average neutralized.
recommendations4 = movie_recommender(
    new_user=new_user,
    model=model4,
    component1=component4,
    movieId=movieId_default,
    movie_genre_index=movie_genre_index,
    movie_tag_index=movie_tag_index,
    movie_title_index=movie_title_index,
)
recommendations4

Unnamed: 0,movieId,weight,genre,tag
277,"Shawshank Redemption, The (1994)",3.188692,Crime|Drama,Morgan Freeman
314,Forrest Gump (1994),3.184232,Comedy|Drama|Romance|War,touching
1938,"Matrix, The (1999)",3.128016,Action|Sci-Fi|Thriller,post apocalyptic
224,Star Wars: Episode IV - A New Hope (1977),3.099447,Action|Adventure|Sci-Fi,space opera
2224,Fight Club (1999),3.093540,Action|Crime|Drama|Thriller,violent
...,...,...,...,...
1234,I Know What You Did Last Summer (1997),2.944500,Horror|Mystery|Thriller,
1485,Back to the Future Part II (1989),2.944333,Adventure|Comedy|Sci-Fi,time travel
2028,Wild Wild West (1999),2.939853,Action|Comedy|Sci-Fi|Western,
1823,You've Got Mail (1998),2.935016,Comedy|Romance,remake


In [27]:
# combo: model fitted with filters 2 and 3 applied together.
recommendations5 = movie_recommender(
    new_user=new_user,
    model=model5,
    component1=component5,
    movieId=movieId_default,
    movie_genre_index=movie_genre_index,
    movie_tag_index=movie_tag_index,
    movie_title_index=movie_title_index,
)
recommendations5

Unnamed: 0,movieId,weight,genre,tag
3304,Big Top Pee-Wee (1988),3.009667,Adventure|Children|Comedy,circus
1679,One Crazy Summer (1986),3.008653,Comedy,
3321,Ernest Saves Christmas (1988),3.008266,Children|Comedy,
530,Ed (1996),3.008055,Comedy,
1800,Pale Rider (1985),3.007867,Western,
...,...,...,...,...
2111,Teaching Mrs. Tingle (1999),2.993843,Comedy|Thriller,
1774,Red Sonja (1985),2.993715,Action|Adventure|Fantasy,
5303,Without a Paddle (2004),2.993510,Comedy,
4980,Look Who's Talking Too (1990),2.993312,Comedy|Romance,
