In [1]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the rating table, using the first CSV column as the row index, and
# replace every missing rating with 0 in the same step.
# NOTE(review): after this fill an unrated movie is indistinguishable from a
# rating of 0 in all later computations - confirm that is intended.
ratings = pd.read_csv("toy_dataset.csv", index_col=0).fillna(0)
ratings

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


In [3]:
def standardize(row):
    """Mean-center a series of ratings and scale it by its value range.

    Computes ``(row - row.mean()) / (row.max() - row.min())``.

    Note: despite the parameter name, ``DataFrame.apply`` passes one *column*
    at a time by default, so in this notebook each call receives all ratings
    of a single movie.

    Parameters
    ----------
    row : pandas.Series
        Numeric ratings to rescale.

    Returns
    -------
    pandas.Series
        Rescaled values with the same index as ``row``. When every value in
        ``row`` is identical (range of 0) a series of zeros is returned
        instead of dividing by zero.
    """
    spread = row.max() - row.min()
    if spread == 0:
        # Constant input: there is no variation to scale by, and dividing by
        # zero would produce NaN/inf values that break the similarity step.
        return row * 0.0
    new_row = (row - row.mean()) / spread
    return new_row
# Rescale the ratings; DataFrame.apply hands standardize one column (movie) at a time.
ratings_std = ratings.apply(standardize)

# cosine_similarity compares rows, but the movies sit on the columns, so the
# table is transposed first to obtain a movie-by-movie similarity matrix.
item_similarity = cosine_similarity(ratings_std.transpose())

# Label both axes with the movie names so similarities can be looked up by title.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)
item_similarity_df

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


In [5]:
def get_recommended_movies(movie_name, user_rating, neutral_rating=2.5, similarity=None):
    """Score every movie by its similarity to one movie the user has rated.

    The similarity column of ``movie_name`` is multiplied by
    ``user_rating - neutral_rating``. A rating below the neutral point gives a
    negative weight, so movies similar to a disliked movie are pushed down the
    ranking while dissimilar ones are pushed up.

    Parameters
    ----------
    movie_name : str
        Column label of the rated movie in the similarity matrix.
    user_rating : float
        Rating the user gave to ``movie_name``.
    neutral_rating : float, optional
        Rating treated as neither like nor dislike. Defaults to 2.5.
    similarity : pandas.DataFrame, optional
        Movie-by-movie similarity matrix. Defaults to the notebook-level
        ``item_similarity_df``.

    Returns
    -------
    pandas.Series
        Recommendation scores indexed by movie, sorted from best to worst.
        These are relative scores (they can be negative), not percentages.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of the similarity matrix.
    """
    if similarity is None:
        # Fall back to the notebook-level matrix so existing two-argument calls keep working.
        similarity = item_similarity_df
    similar_score = similarity[movie_name] * (user_rating - neutral_rating)
    return similar_score.sort_values(ascending=False)  # from best to worst
# Example: scores produced by a low rating (1) for "romantic3".
print(get_recommended_movies(movie_name="romantic3", user_rating=1))

action1      1.371159
action2      1.265061
action3      1.203271
romantic2   -0.590909
romantic1   -1.085620
romantic3   -1.500000
Name: romantic3, dtype: float64


In [1]:
# (movie, rating) pairs for an example user who rated an action movie highly
# and two romantic movies poorly.
action_lover = [('action1', 5), ('romantic2', 1), ('romantic3', 1)]

# Build one row of scores per rated movie, then stack them with a single
# concat instead of growing a DataFrame inside the loop.
score_rows = [
    pd.DataFrame(get_recommended_movies(movie, rating)).T
    for movie, rating in action_lover
]
# pd.concat raises on an empty list, so keep the empty-frame result for that case.
recommended_movies = pd.concat(score_rows) if score_rows else pd.DataFrame()

# Persist the per-rating score rows as a tab-separated file.
recommended_movies.to_csv('Recommendations.txt', sep='\t')

# Sum the scores each movie received across all ratings; the movie with the
# highest total comes first. Displayed as the cell's output.
recommendation_ranking = recommended_movies.sum().sort_values(ascending=False)
recommendation_ranking

NameError: name 'pd' is not defined