In [6]:
#import libraries
import pandas as pd
import numpy as np
import random
from sklearn.decomposition import NMF
import pickle
import sqlalchemy

In [4]:
#import data
links = pd.read_csv('/Users/lauraeleonoremoritz/Downloads/ml-latest-small/links.csv')
movies = pd.read_csv('/Users/lauraeleonoremoritz/Downloads/ml-latest-small/movies.csv')
ratings = pd.read_csv('/Users/lauraeleonoremoritz/Downloads/ml-latest-small/ratings.csv')
tags = pd.read_csv('/Users/lauraeleonoremoritz/Downloads/ml-latest-small/tags.csv')

In [7]:
#connection to aws rds
conns = 'postgres://postgres:postgres@movierecommenderallspicearrays.co2ftlrmgfgj.eu-central-1.rds.amazonaws.com:5432/movies'
engine = sqlalchemy.create_engine(conns)

In [8]:
links.to_sql('links', engine)
movies.to_sql('movies', engine)
ratings.to_sql('ratings', engine)
tags.to_sql('tags', engine)

ValueError: Table 'ratings' already exists.

In [7]:
def input_cleanup(df, filter_name):
    '''
    Reassignes ratings to a 3 (to neuralize it) based on userId or movieId filter.
    1. top25_users_neutralize: removes effect of the top 25% that gave their input
    2. bottom75_users_neutralize: removes effect of the bottom 75% that gave their input
    3. top25_voted_neutralize: removes effect of top25% voted on films **note: not their 
                                        rating but ones that recieved the most input**
    4. rate4_5_neutralize: removes effect of films that were rated a 4 and 5 on average  
    5. combo: combination of 2 and 3
    '''
    df1=df.copy()
    
    #respective userid and movieId identifiers
    user_rate=pd.DataFrame(df1.groupby('userId')['rating'].count())
    title_rate_count=pd.DataFrame(df1.groupby('movieId')['rating'].count())
    title_rate_mean=pd.DataFrame(df1.groupby('movieId')['rating'].mean())
    
    #filters
    top25_user=user_rate[user_rate['rating']>600].index
    bottom75_user=user_rate[user_rate['rating']<600].index
    top25_movies_voted=title_rate_count[title_rate_count['rating']>9].index
    rate4_movies=title_rate_mean[title_rate_mean['rating']==4].index 
    rate5_movies=title_rate_mean[title_rate_mean['rating']==5].index
    
    #1
    if filter_name == 'top25_users_neutralize':
        df1.set_index('userId', inplace=True)
        df1.loc[top25_user,'rating']=3 
        df1.reset_index(inplace=True)
    #2
    if filter_name == 'bottom75_users_neutralize':
        df1.set_index('userId', inplace=True)
        df1.loc[bottom75_user,'rating']=3 
        df1.reset_index(inplace=True)
    #3
    if filter_name == 'top25_voted_neutralize':    
        df1.set_index('movieId', inplace=True)
        df1.loc[top25_movies_voted, 'rating']=3
        df1.reset_index(inplace=True)
    #4
    if filter_name == 'rate4_5_neutralize':
        df1.set_index('movieId', inplace=True)
        df1.loc[rate4_movies, 'rating']=3 
        df1.loc[rate5_movies, 'rating']=3 
        df1.reset_index(inplace=True)
    #5
    if filter_name == 'combo':
        df1.set_index('userId', inplace=True)
        df1.loc[bottom75_user,'rating']=3 
        df1.reset_index(inplace=True)
        df1.set_index('movieId', inplace=True)
        df1.loc[top25_movies_voted, 'rating']=3
        df1.reset_index(inplace=True)
    return df1            

In [8]:
ratings_default=ratings.copy()
ratings1=input_cleanup(ratings, 'top25_users_neutralize')
ratings2=input_cleanup(ratings, 'bottom75_users_neutralize')
ratings3=input_cleanup(ratings, 'top25_voted_neutralize')
ratings4=input_cleanup(ratings, 'rate4_5_neutralize')
ratings5=input_cleanup(ratings, 'combo')

In [9]:
def nmf_model(df, n_components=20):
    '''
    Build NMF model based on input data. Function works as:
    1. reformats inout data so movieId are the columns, userId is the index and the values are the ratings
    2. saves the number of movies from (1) for the recommender function
    3. saves the movieId number from (1) for recommender function
    4. converts object from (1) to an array
    5. builds an NMF model based on the array from (4) with 20 components and fills all empty values with 3 (neutral)
    6. extracts the movieId-rating mastrix from model for recommender function
    '''
    Rtrue=df.pivot(index='userId', columns='movieId', values='rating').fillna(3) 
    num_movies=Rtrue.shape[1]
    movieId=Rtrue.columns
    Rtrue=np.array(Rtrue)
    model=NMF(n_components)
    model.fit(Rtrue)
    component1=model.components_
    return num_movies, movieId, model, component1

In [10]:
num_movies_default, movieId_default, model_default, component_default=nmf_model(ratings_default)
pickle.dump(model_default, open('model_default', 'wb'))



In [11]:
num_movies1, movieId1, model1, component1=nmf_model(ratings1)
pickle.dump(model1, open('model1', 'wb'))



In [12]:
num_movies2, movieId2, model2, component2=nmf_model(ratings2)
pickle.dump(model2, open('model2', 'wb'))



In [13]:
num_movies3, movieId3, model3, component3=nmf_model(ratings3)
pickle.dump(model3, open('model3', 'wb'))



In [14]:
num_movies4, movieId4, model4, component4=nmf_model(ratings4)
pickle.dump(model4, open('model4', 'wb'))



In [15]:
num_movies5, movieId5, model5, component5=nmf_model(ratings5)
pickle.dump(model5, open('model5', 'wb'))

