# Movie Recommendation System

## import libraries & read in data

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import math
import collections

In [2]:
movies = pd.read_csv("movie.csv")
ratings = pd.read_csv("rating.csv")

## Data Transformation

### Normalizing User Ratings - Subtract Mean

In [3]:
user_stats = ratings.groupby('userId')['rating'].agg(['mean'])

ratings = pd.merge(ratings, user_stats, left_on='userId', right_index = True)

ratings = ratings.assign(norm_rating = lambda x: (x['rating'] - x['mean']))
ratings.drop(columns = ['mean'], inplace = True)

### One Hot Encode Genres
##### 27,278 unique movies

In [4]:
movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
movies.head()

exploded_genres = movies['genres'].apply(pd.Series).stack().reset_index(level=1, drop=True)

# one hot encode
one_hot_encoded = pd.get_dummies(exploded_genres).groupby(level=0).sum()

# join dataframe
movies_encoded = pd.concat([movies.drop(columns=['genres']), one_hot_encoded], axis=1)

In [5]:
movies_with_genres = pd.merge(ratings[['userId', 'movieId', 'rating', 'norm_rating']], movies_encoded, left_on='movieId', right_index = True)

### User-Genre Profiles

In [6]:
# extract userID and the genre cols
users_genre_counts = movies_with_genres.iloc[:, [1]]  
users_genre_counts = pd.concat([users_genre_counts, movies_with_genres.iloc[:, 6:]], axis=1)

# group by userID to get the total genres each user has watched
users_genres = users_genre_counts.groupby('userId').sum(numeric_only = True)

# add total column
users_genres['Total'] = ratings.groupby('userId')['movieId'].count()

# filter for only users that have rated at least 100 movies
users_genres_filtered = users_genres[users_genres['Total'] > 100]

## Create Genre Scores

### TF.IDF - Frequency of Genre * Frequency of that Genre in all Movies
#### scaled from 0 to 1

In [None]:
not_genre = movies_encoded[['movieId', 'title']]
genres_only = movies_encoded.drop(columns=not_genre.columns)
genre_counts = genres_only.sum()

In [8]:
def _inverse_genre_freqs():
    igf_array = np.zeros(len(genre_counts))
    movie_count = movies['movieId'].nunique()

    for i, value in enumerate(genre_counts.values):
      igf_array[i] = math.log2(movie_count/value)

    return igf_array

In [9]:
def create_genre_scores(genre_counts):
    igf_array = _inverse_genre_freqs()
    genres_matrix = genre_counts.iloc[:, :-1].to_numpy()
    
    uf_igf = genres_matrix * igf_array
    
    scaler = MinMaxScaler()

    scaled_genre_scores = np.vstack([scaler.fit_transform(row.reshape(-1, 1)).flatten() for row in uf_igf])

    return scaled_genre_scores

## Find Similar Users

In [10]:
# in the case where we are trying to recommend someone with < 100 movies watched
def _create_scores_for_new_user(userID):
    genre_score = create_genre_scores(users_genres[users_genres.index == userID])
    return genre_score

In [11]:
def find_similar_users(userID, genre_scores):
    index = np.where(users_genres.index == userID)
    
    user_array = _create_scores_for_new_user(userID)
    
    user_array = user_array.reshape(1, -1)
    
    cos_similarities = []
    cosine = {}
    for index, row in enumerate(genre_scores):
        row = row.reshape(1, -1)
        cos_sim = cosine_similarity(user_array, row) 
        cosine[cos_sim[0][0]] = index
        
    sorted_users = sorted(cosine.items(), reverse = True)
    
    selected_cols = movies_with_genres[['userId', 'movieId', 'norm_rating']]
    current_user = selected_cols[selected_cols['userId'] == userID]
    
    selected_cols = selected_cols[selected_cols['userId'].isin([user[1] for user in sorted_users[:10]])]
        
    return selected_cols.append(current_user)

## Find movies similar users liked

### extract the user's favorite movies (rating > 0)
### extract the movies that user has not watched

In [12]:
def _find_user_favorite_movies(userID, matrix):
    user_movies = matrix[userID]
    fave_movies = user_movies[user_movies.values > 0.0].sort_values(ascending = False)
    return fave_movies.index.values

In [13]:
def _find_unwatched(userID, matrix):
    unwatched_indexes = matrix.index[matrix[userID].isnull()]
    return unwatched_indexes.values

### Find the most similar movies (that the user hasn't watched) to movies that the user likes

In [14]:
def find_sim_movies_to_favorites(userID, matrix):
    user_fave = _find_user_favorite_movies(userID, matrix)
    user_unwatched = _find_unwatched(userID, matrix)
    
    matrix.fillna(0, inplace = True)
   
    sim_scores = {}

    for movie in user_fave:
        for unwatched_movie in user_unwatched:
            cosine_sim = cosine_similarity(matrix.loc[movie].values.reshape(1, -1),
                                          matrix.loc[unwatched_movie].values.reshape(1,-1))
            if cosine_sim[0][0] > 0:
                sim_scores[cosine_sim[0][0]] = (movie, unwatched_movie)
                
    return collections.OrderedDict(sorted(sim_scores.items(), reverse = True))

## Predict Ratings

##### un-normalize ratings so we can display 1-5 predicted rating scale 

In [15]:
def return_to_norm_ratings(matrix):
    for col in matrix.columns:
        condition = (matrix[col] != 0)
        mean_val = user_stats[user_stats.index == col].values[0][0]
        matrix.loc[condition, col] += mean_val
    return matrix

##### find how the user's rating compares to the other similar users on average
##### find the average rating of a given movie by similar users
**predicted rating will be the movie's average rating + the user's relative rating scale**

In [16]:
def predict_rating(userID, movieID, global_mean, matrix):
    rel_rating = user_stats[user_stats.index == userID] - global_mean
    
    predicted_rating = rel_rating + matrix.loc[movieID]['mean']
    predicted_rating = predicted_rating.values[0][0]
    
    if predicted_rating > 5:
        predicted_rating = 5
        
    return predicted_rating

## Function to print the movies that we predict the user would give the highest rating

In [17]:
def print_recommendations(n, sorted_predictions):
    for i in list(sorted_predictions.keys())[:n]:
        print(sorted_predictions[i][0], end = '')
        print(f', Predicted Rating: {i:.2f}')

## Recommendation Function

In [21]:
def recommend(userID, n, genre_scores):
    # create utility matrix w/ user and similar users
    selected_cols = find_similar_users(userID, genre_scores)
    matrix = selected_cols.pivot_table(index='movieId',columns='userId', values='norm_rating')
    
    # filter for movies that more than one similar user has rated
    non_nan = matrix.notna().sum(axis = 1)
    matrix = matrix[non_nan > 1]
    
    most_sim_movies = find_sim_movies_to_favorites(userID, matrix)
    
    # un-normalize the ratings
    matrix = return_to_norm_ratings(matrix)
    
    # calculate each movie's average rating
    row_mean = matrix[matrix != 0.0].mean(axis = 1)
    matrix['mean'] = row_mean
    
    # find each user's relative rating scale
    global_mean = user_stats[user_stats.index.isin(matrix.columns)].values.mean()
    
    predictions = {}
    reco_array = np.array([])

    for unwatched_movie in most_sim_movies:
        reco = most_sim_movies[unwatched_movie][1]
        reco_array = np.append(reco_array, reco)

    for recommendation in reco_array:
        predictions[predict_rating(userID, recommendation, global_mean, matrix)] = movies[movies['movieId'] == recommendation]['title'].values
        
    sorted_predictions = collections.OrderedDict(sorted(predictions.items(), reverse = True))

    print_recommendations(n, sorted_predictions)
      
    return sorted_predictions

## main - create genre scores and recommend different users

In [19]:
genre_scores = create_genre_scores(users_genres_filtered)

In [22]:
recommendations = recommend(100, 10, genre_scores)

  return selected_cols.append(current_user)


2001: A Space Odyssey (1968), Predicted Rating: 5.00
Godfather, The (1972), Predicted Rating: 4.91
Wonder Boys (2000), Predicted Rating: 4.78
Matrix, The (1999), Predicted Rating: 4.66
Shrek (2001), Predicted Rating: 4.58
Sixth Sense, The (1999), Predicted Rating: 4.53
Seven (a.k.a. Se7en) (1995), Predicted Rating: 4.45
Lord of the Rings: The Two Towers, The (2002), Predicted Rating: 4.41
One Flew Over the Cuckoo's Nest (1975), Predicted Rating: 4.28
Minority Report (2002), Predicted Rating: 4.16


In [23]:
recommendations

OrderedDict([(5, array(['2001: A Space Odyssey (1968)'], dtype=object)),
             (4.90884116690974,
              array(['Godfather, The (1972)'], dtype=object)),
             (4.78384116690974, array(['Wonder Boys (2000)'], dtype=object)),
             (4.65884116690974, array(['Matrix, The (1999)'], dtype=object)),
             (4.583841166909739, array(['Shrek (2001)'], dtype=object)),
             (4.53384116690974,
              array(['Sixth Sense, The (1999)'], dtype=object)),
             (4.450507833576406,
              array(['Seven (a.k.a. Se7en) (1995)'], dtype=object)),
             (4.40884116690974,
              array(['Lord of the Rings: The Two Towers, The (2002)'], dtype=object)),
             (4.28384116690974,
              array(["One Flew Over the Cuckoo's Nest (1975)"], dtype=object)),
             (4.15884116690974,
              array(['Minority Report (2002)'], dtype=object)),
             (4.117174500243073, array(['Alien (1979)'], dtype=object)),
    