In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import math
import collections

In [2]:
movies = pd.read_csv("movie.csv")
ratings = pd.read_csv("rating.csv")

### Normalizing user ratings
#### z-scores

In [3]:
user_stats = ratings.groupby('userId')['rating'].agg(['mean'])

ratings = pd.merge(ratings, user_stats, left_on='userId', right_index = True)

ratings = ratings.assign(norm_rating = lambda x: (x['rating'] - x['mean']))
ratings.drop(columns = ['mean'], inplace = True)

ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp,norm_rating
20000258,138493,68954,4.5,2009-11-13 15:42:00,0.327078
20000259,138493,69526,4.5,2009-12-03 18:31:48,0.327078
20000260,138493,69644,3.0,2009-12-07 18:10:57,-1.172922
20000261,138493,70286,5.0,2009-11-13 15:42:24,0.827078
20000262,138493,71619,2.5,2009-10-17 20:25:36,-1.672922


### One Hot Encoding Genres
##### 27,278 unique movies

In [4]:
movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
movies.head()

exploded_genres = movies['genres'].apply(pd.Series).stack().reset_index(level=1, drop=True)

# one hot encode
one_hot_encoded = pd.get_dummies(exploded_genres).groupby(level=0).sum()

# join dataframe
movies_encoded = pd.concat([movies.drop(columns=['genres']), one_hot_encoded], axis=1)

# unique movie and the genres that the movie falls under
movies_encoded.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


### Merge to get each user's movie rating with associated genres

In [5]:
# all the movies that a user has rated and their respective genre makeups
movies_with_genres = pd.merge(ratings[['userId', 'movieId', 'rating', 'norm_rating']], movies_encoded, left_on='movieId', right_index = True)

In [6]:
# extract userID and the genre cols
users_genre_counts = movies_with_genres.iloc[:, [1]]  
users_genre_counts = pd.concat([users_genre_counts, movies_with_genres.iloc[:, 6:]], axis=1)

# group by userID to get the total genres each user has watched
users_genres = users_genre_counts.groupby('userId').sum(numeric_only = True)

# add total column
users_genres['Total'] = ratings.groupby('userId')['movieId'].count()

# filter for only users that have rated at least 100 movies
users_genres_filtered = users_genres[users_genres['Total'] > 100]

users_genres.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Total
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,31,24,12,13,66,15,5,85,10,...,13,2,11,6,38,7,24,12,3,175
2,0,12,10,1,6,17,7,3,31,2,...,1,1,4,3,11,2,7,7,3,61
3,0,28,28,13,22,66,19,4,87,17,...,27,3,10,5,35,19,23,14,3,187
4,0,7,3,1,3,12,5,1,14,1,...,0,2,1,0,8,1,2,0,1,28
5,0,15,8,4,7,19,9,0,34,4,...,6,1,3,1,11,4,12,2,3,66


### Want to use the genre counts in the TF.IDF metric
#### this metric will be used to assess how much a user likes a given genre
**IDF: if user only likes 6 Westerns, that is still a high amount considering the popularity of the genre**

In [7]:
# total counts for each genre

not_genre = movies_encoded[['movieId', 'title']]
genres_only = movies_encoded.drop(columns=not_genre.columns)
genre_counts = genres_only.sum()

#### create IDF (inverse genre frequency)

In [8]:
def _inverse_genre_freqs():
    igf_array = np.zeros(len(genre_counts))
    movie_count = movies['movieId'].nunique()

    for i, value in enumerate(genre_counts.values):
      igf_array[i] = math.log2(movie_count/value)

    return igf_array

In [9]:
def create_genre_scores(genre_counts):
    igf_array = _inverse_genre_freqs()
    genres_matrix = genre_counts.iloc[:, :-1].to_numpy()
    
    uf_igf = genres_matrix * igf_array
    
    scaler = MinMaxScaler()

    scaled_genre_scores = np.vstack([scaler.fit_transform(row.reshape(-1, 1)).flatten() for row in uf_igf])

    return scaled_genre_scores

In [10]:
genre_scores = create_genre_scores(users_genres_filtered)

### Now we will use these genre scores to find similar users

In [11]:
def create_scores_for_new_user(userID):
    genre_score = create_genre_scores(users_genres[users_genres.index == userID])
    return genre_score

In [152]:
def find_similar_users(userID):
    index = np.where(users_genres.index == userID)
    
    user_array = create_scores_for_new_user(userID)
    
    user_array = user_array.reshape(1, -1)
    
    cos_similarities = []
    cosine = {}
    for index, row in enumerate(genre_scores):
        row = row.reshape(1, -1)
        cos_sim = cosine_similarity(user_array, row) 
        cosine[cos_sim[0][0]] = index
        
    sorted_users = sorted(cosine.items(), reverse = True)
    
    selected_cols = movies_with_genres[['userId', 'movieId', 'norm_rating']]
    current_user = selected_cols[selected_cols['userId'] == userID]
    
    selected_cols = selected_cols[selected_cols['userId'].isin([user[1] for user in sorted_users[:10]])]
        
    return selected_cols.append(current_user)

In [40]:
def _find_user_favorite_movies(userID, matrix):
    user_movies = matrix[userID]
    fave_movies = user_movies[user_movies.values > 0.0].sort_values(ascending = False)
    return fave_movies.index.values

In [39]:
def _find_unwatched(userID, matrix):
    unwatched_indexes = matrix.index[matrix[userID].isnull()]
    return unwatched_indexes.values

In [110]:
def find_sim_movies_to_favorites(userID, matrix):
    user_fave = _find_user_favorite_movies(userID, matrix)
    user_unwatched = _find_unwatched(userID, matrix)
    
    matrix.fillna(0, inplace = True)
    
    #num_observations = (matrix.astype(bool).sum(axis = 1).values) - 1
    #matrix = matrix[num_observations > 1]
    
    sim_scores = {}

    for movie in user_fave:
        for unwatched_movie in user_unwatched:
            cosine_sim = cosine_similarity(matrix.loc[movie].values.reshape(1, -1),
                                          matrix.loc[unwatched_movie].values.reshape(1,-1))
            if cosine_sim[0][0] > 0:
                sim_scores[cosine_sim[0][0]] = (movie, unwatched_movie)
                
    return collections.OrderedDict(sorted(sim_scores.items(), reverse = True))

In [17]:
def return_to_norm_ratings(matrix):
    for col in matrix.columns:
        condition = (matrix[col] != 0)
        mean_val = user_stats[user_stats.index == col].values[0][0]
        matrix.loc[condition, col] += mean_val
    return matrix

In [54]:
def predict_rating(userID, movieID, global_mean, matrix):
    rel_rating = user_stats[user_stats.index == userID] - global_mean
    
    predicted_rating = rel_rating + matrix.loc[movieID]['mean']
    predicted_rating = predicted_rating.values[0][0]
    
    if predicted_rating > 5:
        predicted_rating = 5
        
    return predicted_rating

In [81]:
def print_recommendations(n, sorted_predictions):
    for i in list(sorted_predictions.keys())[:n]:
        print(sorted_predictions[i][0], end = '')
        print(f', Predicted Rating: {i:.2f}')

In [153]:
def recommend(userID, n):
    #sim_users = find_similar_users(userID)
    
    #selected_cols = _k_most_similar(userID, sim_users)
    selected_cols = find_similar_users(userID)
    matrix = selected_cols.pivot_table(index='movieId',columns='userId', values='norm_rating')
    
    non_nan = matrix.notna().sum(axis = 1)
    matrix = matrix[non_nan > 1]
    
    most_sim_movies = find_sim_movies_to_favorites(userID, matrix)
    
    matrix = return_to_norm_ratings(matrix)
    
    row_mean = matrix[matrix != 0.0].mean(axis = 1)
    matrix['mean'] = row_mean

    global_mean = user_stats[user_stats.index.isin(matrix.columns)].values.mean()
    
    predictions = {}
    reco_array = np.array([])

    for unwatched_movie in most_sim_movies:
        reco = most_sim_movies[unwatched_movie][1]
        reco_array = np.append(reco_array, reco)

    for recommendation in reco_array:
        predictions[predict_rating(userID, recommendation, global_mean, matrix)] = movies[movies['movieId'] == recommendation]['title'].values
        
    sorted_predictions = collections.OrderedDict(sorted(predictions.items(), reverse = True))

    print_recommendations(n, sorted_predictions)

In [154]:
recommend(100, 10)

  return selected_cols.append(current_user)


2001: A Space Odyssey (1968), Predicted Rating: 5.00
Godfather, The (1972), Predicted Rating: 4.91
Wonder Boys (2000), Predicted Rating: 4.78
Matrix, The (1999), Predicted Rating: 4.66
Shrek (2001), Predicted Rating: 4.58
Sixth Sense, The (1999), Predicted Rating: 4.53
Seven (a.k.a. Se7en) (1995), Predicted Rating: 4.45
Lord of the Rings: The Two Towers, The (2002), Predicted Rating: 4.41
One Flew Over the Cuckoo's Nest (1975), Predicted Rating: 4.28
Minority Report (2002), Predicted Rating: 4.16
