In [1]:
#import libraries
import pandas as pd
import numpy as np
import random
from sklearn.decomposition import NMF
import pickle
import json

In [2]:
# Load the three MovieLens "ml-latest-small" tables used by this notebook.
# The paths are relative to the notebook's working directory.
ratings, movies, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'ml-latest-small/ratings.csv',
        'ml-latest-small/movies.csv',
        'ml-latest-small/tags.csv',
    )
)

In [3]:
def input_cleanup(df, filter_name, user_threshold=600, movie_threshold=9):
    '''
    Return a copy of ``df`` in which the ratings selected by ``filter_name``
    are overwritten with 3 (treated as the neutral rating).

    Supported values of ``filter_name``:
    1. top25_users_neutralize: neutralizes every rating given by users with
       more than ``user_threshold`` ratings.
    2. bottom75_users_neutralize: neutralizes every rating given by users with
       at most ``user_threshold`` ratings (the complement of 1, so a user with
       exactly ``user_threshold`` ratings is no longer left out of both groups).
    3. top25_voted_neutralize: neutralizes every rating of movies that received
       more than ``movie_threshold`` ratings **note: this is about how many
       ratings a movie received, not how high they were**.
    4. rate4_5_neutralize: neutralizes every rating of movies whose mean rating
       is exactly 4 or exactly 5.
    5. combo: combination of 2 and 3.

    Any other ``filter_name`` returns an unmodified copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'rating'.
    filter_name : str
        One of the names listed above.
    user_threshold : int, default 600
        Ratings-per-user cut-off separating "top" from "bottom" users.
        NOTE(review): presumably meant to approximate the 75th percentile of
        ratings per user - confirm against the data.
    movie_threshold : int, default 9
        Ratings-per-movie cut-off for "top voted" movies.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order as ``df``; ``df``
        itself is never modified.
    '''
    NEUTRAL_RATING = 3
    cleaned = df.copy()

    # All statistics are derived from the untouched input and broadcast back to
    # one value per row, so they can be compared row-wise without re-indexing.
    def _ratings_per_user():
        return df.groupby('userId')['rating'].transform('count')

    def _ratings_per_movie():
        return df.groupby('movieId')['rating'].transform('count')

    def _mean_rating_per_movie():
        return df.groupby('movieId')['rating'].transform('mean')

    if filter_name == 'top25_users_neutralize':
        to_neutralize = _ratings_per_user() > user_threshold
    elif filter_name == 'bottom75_users_neutralize':
        to_neutralize = _ratings_per_user() <= user_threshold
    elif filter_name == 'top25_voted_neutralize':
        to_neutralize = _ratings_per_movie() > movie_threshold
    elif filter_name == 'rate4_5_neutralize':
        movie_mean = _mean_rating_per_movie()
        to_neutralize = (movie_mean == 4) | (movie_mean == 5)
    elif filter_name == 'combo':
        to_neutralize = (
            (_ratings_per_user() <= user_threshold)
            | (_ratings_per_movie() > movie_threshold)
        )
    else:
        # Unknown filter: hand back the plain copy.
        return cleaned

    # A positional boolean array avoids label alignment, so frames with a
    # non-unique index are handled as well.
    cleaned.loc[to_neutralize.to_numpy(), 'rating'] = NEUTRAL_RATING
    return cleaned

In [4]:
# Untouched baseline plus one neutralized variant per input_cleanup filter.
ratings_default = ratings.copy()
ratings1, ratings2, ratings3, ratings4, ratings5 = (
    input_cleanup(ratings, filter_name)
    for filter_name in (
        'top25_users_neutralize',
        'bottom75_users_neutralize',
        'top25_voted_neutralize',
        'rate4_5_neutralize',
        'combo',
    )
)

In [5]:
def nmf_model(df, n_components=20, random_state=42):
    '''
    Fit an NMF model on the user x movie rating matrix built from ``df``.

    Steps:
    1. pivots the input so userId is the index, movieId are the columns and
       the values are the ratings; missing ratings are filled with 3 (neutral)
    2. records the number of movies (columns) for the recommender function
    3. records the movieId labels of those columns for the recommender function
    4. converts the pivoted frame to a NumPy array
    5. fits an NMF model with ``n_components`` components on that array
    6. extracts the fitted ``components_`` matrix for the recommender function

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'userId', 'movieId' and 'rating'. ``DataFrame.pivot``
        raises if a (userId, movieId) pair occurs more than once.
    n_components : int, default 20
        Number of NMF components.
    random_state : int or None, default 42
        Seed forwarded to ``NMF`` so that repeated fits on the same data give
        the same model. Pass ``None`` for an unseeded fit.

    Returns
    -------
    tuple
        ``(num_movies, movieId, model, components, Rtrue)`` where ``Rtrue`` is
        the filled pivot table the model was fitted on.
    '''
    Rtrue = df.pivot(index='userId', columns='movieId', values='rating').fillna(3)
    movieId = Rtrue.columns
    num_movies = len(movieId)

    # Keyword arguments keep the call independent of NMF's positional order.
    model = NMF(n_components=n_components, random_state=random_state)
    model.fit(Rtrue.to_numpy())

    return num_movies, movieId, model, model.components_, Rtrue

In [6]:
# Fit the model on the ratings with the "bottom 75% users" neutralized.
num_movies2, movieId2, model2, component2, bottom75_user_neutral = nmf_model(ratings2)

# The context manager closes the file once the model is written; the original
# passed open(...) inline and never closed the handle.
with open('bottom75_user_neutral', 'wb') as model_file:
    pickle.dump(model2, model_file)



In [22]:
# Save the filled user x movie pivot table returned by nmf_model(ratings2) as CSV.
bottom75_user_neutral.to_csv('bottom75_input.csv')

In [23]:
# Fit the model on the ratings with the most-voted movies neutralized.
num_movies3, movieId3, model3, component3, top25_movies = nmf_model(ratings3)

# The context manager closes the file once the model is written; the original
# passed open(...) inline and never closed the handle.
with open('top25_movies', 'wb') as model_file:
    pickle.dump(model3, model_file)



In [24]:
# Save the filled user x movie pivot table returned by nmf_model(ratings3) as CSV.
top25_movies.to_csv('top25_movies.csv')

In [25]:
# Fit the model on the ratings cleaned with the 'combo' filter.
num_movies5, movieId5, model5, component5, combo = nmf_model(ratings5)

# The context manager closes the file once the model is written; the original
# passed open(...) inline and never closed the handle.
with open('combo', 'wb') as model_file:
    pickle.dump(model5, model_file)



In [26]:
# Save the filled user x movie pivot table returned by nmf_model(ratings5) as CSV.
combo.to_csv('combo_input.csv')

In [7]:
def movie_indeces(dfmovie, dftags):
    '''
    Build movieId -> value lookup dictionaries.

    Parameters
    ----------
    dfmovie : DataFrame with the columns 'movieId', 'title' and 'genres'.
    dftags : DataFrame with the columns 'movieId' and 'tag'.

    Returns
    -------
    (title_by_movie, genres_by_movie, tag_by_movie) : tuple of dict
        When a movieId occurs on several rows, the value from the last of
        those rows is kept, so a movie with several tags ends up with only
        its last tag.
    '''
    def _lookup(frame, value_column):
        # Later rows overwrite earlier ones for a repeated movieId.
        return dict(zip(frame['movieId'], frame[value_column]))

    return (
        _lookup(dfmovie, 'title'),
        _lookup(dfmovie, 'genres'),
        _lookup(dftags, 'tag'),
    )

In [8]:
# Build the movieId -> title / genres / tag lookup dictionaries from the loaded tables.
movie_title_index, movie_genre_index, movie_tag_index=movie_indeces(movies, tags)

In [31]:
# Write the movieId -> title lookup to disk as JSON.
# The json module writes non-string dictionary keys as strings.
with open('movie_title_index.json', 'w') as title_file:
    json.dump(movie_title_index, fp=title_file)

In [32]:
# Write the movieId -> genres lookup to disk as JSON.
# The json module writes non-string dictionary keys as strings.
with open('movie_genre_index.json', 'w') as genre_file:
    json.dump(movie_genre_index, fp=genre_file)

In [33]:
# Write the movieId -> tag lookup to disk as JSON.
# The json module writes non-string dictionary keys as strings.
with open('movie_tag_index.json', 'w') as tag_file:
    json.dump(movie_tag_index, fp=tag_file)

In [34]:
# Turn the epoch-second timestamps into datetimes and index by them so that
# between_time can select rows by time of day.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.set_index('timestamp', inplace=True)

# (start, end, label) windows, applied in this order. between_time includes
# both end points by default, so a rating stamped exactly on a shared boundary
# ends up with the label of the later window.
time_windows = (
    ('00:00', '04:00', 'cant sleep'),
    ('04:00', '07:00', 'early commuter'),
    ('07:00', '11:00', 'talk show'),
    ('11:00', '15:00', 'kiddy nap'),
    ('15:00', '18:00', 'afterschool'),
    ('18:00', '21:00', 'tv dinner'),
    ('21:00', '23:59:59', 'late night'),
)
for window_start, window_end, block_label in time_windows:
    stamps_in_window = ratings.between_time(window_start, window_end).index
    ratings.loc[stamps_in_window, 'time_block'] = block_label
ratings.reset_index(inplace=True)

# Count ratings per (movie, time block) and keep, for every movie, the block
# with the highest count.
block_counts = ratings.groupby('movieId')['time_block'].value_counts().unstack()
busiest_block = block_counts.idxmax(axis=1)
time_tag = pd.DataFrame(busiest_block, columns=['time_block'])
movie_time_index = dict(zip(time_tag.index, time_tag['time_block']))

In [36]:
# Write the movieId -> most frequent time block lookup to disk as JSON.
# The json module writes non-string dictionary keys as strings.
with open('movie_time_index.json', 'w') as time_file:
    json.dump(movie_time_index, fp=time_file)

## Explaining the time-of-day categorization

In [4]:
# First rows of the ratings table; timestamp is still an integer here.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Turn the epoch-second timestamps into datetimes and index by them so that
# between_time can select rows by time of day.
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.set_index('timestamp', inplace=True)

# (start, end, label) windows, applied in this order. between_time includes
# both end points by default, so a rating stamped exactly on a shared boundary
# ends up with the label of the later window.
time_windows = (
    ('00:00', '04:00', 'cant sleep'),
    ('04:00', '07:00', 'early commuter'),
    ('07:00', '11:00', 'talk show'),
    ('11:00', '15:00', 'kiddy nap'),
    ('15:00', '18:00', 'afterschool'),
    ('18:00', '21:00', 'tv dinner'),
    ('21:00', '23:59:59', 'late night'),
)
for window_start, window_end, block_label in time_windows:
    stamps_in_window = ratings.between_time(window_start, window_end).index
    ratings.loc[stamps_in_window, 'time_block'] = block_label
ratings.reset_index(inplace=True)

In [6]:
# Ratings after labelling: timestamp is now a datetime and each row carries a time_block.
ratings

Unnamed: 0,timestamp,userId,movieId,rating,time_block
0,2000-07-30 18:45:03,1,1,4.0,tv dinner
1,2000-07-30 18:20:47,1,3,4.0,tv dinner
2,2000-07-30 18:37:04,1,6,4.0,tv dinner
3,2000-07-30 19:03:35,1,47,5.0,tv dinner
4,2000-07-30 18:48:51,1,50,5.0,tv dinner
...,...,...,...,...,...
100831,2017-05-03 21:53:22,610,166534,4.0,late night
100832,2017-05-03 22:21:31,610,168248,5.0,late night
100833,2017-05-08 19:50:47,610,168250,5.0,tv dinner
100834,2017-05-03 21:19:12,610,168252,5.0,late night


In [8]:
# Number of ratings per movie (rows) and time block (columns); NaN where a
# movie has no rating in that block.
ratings.groupby('movieId')['time_block'].value_counts().unstack()

time_block,afterschool,cant sleep,early commuter,kiddy nap,late night,talk show,tv dinner
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,36.0,35.0,21.0,31.0,32.0,24.0,36.0
2,20.0,20.0,6.0,17.0,21.0,12.0,14.0
3,7.0,7.0,4.0,10.0,7.0,8.0,9.0
4,,,,4.0,1.0,,2.0
5,12.0,5.0,1.0,11.0,5.0,9.0,6.0
...,...,...,...,...,...,...,...
193581,,,,1.0,,,
193583,,,,1.0,,,
193585,,,,1.0,,,
193587,1.0,,,,,,


In [10]:
# For every movie keep the time block with the highest rating count. On a tie
# the first column wins, e.g. movie 1 has 36 ratings in both 'afterschool' and
# 'tv dinner' and is labelled 'afterschool'.
pd.DataFrame(ratings.groupby('movieId')['time_block'].value_counts().unstack().idxmax(axis=1))

Unnamed: 0_level_0,0
movieId,Unnamed: 1_level_1
1,afterschool
2,late night
3,kiddy nap
4,kiddy nap
5,afterschool
...,...
193581,kiddy nap
193583,kiddy nap
193585,kiddy nap
193587,afterschool
