In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from scipy.sparse import csr_matrix, coo_matrix

In [2]:
# Take in all of our movies and ratings csvs and read it into pandas
movies = "Resources/ml-latest-small/movies.csv"
ratings = "Resources/ml-latest-small/ratings.csv"
movies_df = pd.read_csv(movies)
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Extract the release year from the title column
movies_df["release_year"] = movies_df["title"].str.extract(r'\((\d{4})\)')

# Remove the year from the title column
movies_df["title"] = movies_df["title"].str.replace(r" \(\d{4}\)", "", regex=True)

# Display the updated DataFrame
movies_df.head()

Unnamed: 0,movieId,title,genres,release_year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [4]:
# #cleaning up the genres column to a list instead of a string
# movies_df['genres'] = movies_df['genres'].apply(lambda x: x.split('|') )
# movies_df.head()

In [5]:

from collections import Counter

In [6]:
ratings_df = pd.read_csv(ratings)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Merge the DataFrames on movieId
consolidated_movies_df = movies_df.merge(ratings_df, on = 'movieId', how ='inner')
consolidated_movies_df.head()

Unnamed: 0,movieId,title,genres,release_year,userId,rating,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,964982703
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,847434962
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,1106635946
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,1510577970
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,1305696483


In [8]:
# Convert timestamp to year and store in a new column `rating_year`
consolidated_movies_df['rating_year'] = consolidated_movies_df['timestamp'].apply(
    lambda x: datetime.utcfromtimestamp(x).year
)

# Display the DataFrame
consolidated_movies_df.head()

Unnamed: 0,movieId,title,genres,release_year,userId,rating,timestamp,rating_year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,964982703,2000
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,847434962,1996
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,1106635946,2005
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,1510577970,2017
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,1305696483,2011


In [9]:
consolidated_movies_df.dtypes

movieId           int64
title            object
genres           object
release_year     object
userId            int64
rating          float64
timestamp         int64
rating_year       int64
dtype: object

In [10]:
print(consolidated_movies_df.isnull().sum())

movieId          0
title            0
genres           0
release_year    18
userId           0
rating           0
timestamp        0
rating_year      0
dtype: int64


**Data Preprocessing**:

*Collaborative Filtering* 

We will use a method called collaborative filtering to generate user recommendations. This approach operates on the principle that individuals with similar preferences tend to enjoy similar items.

The first step involves converting our dataset into a user-item matrix, also referred to as a "utility matrix." In this matrix, each row corresponds to a user, and each column represents a movie. A key advantage of collaborative filtering is its ability to generate recommendations without relying on additional information about the users or the items.

The create_X() function produces a sparse matrix along with four mapping dictionaries:

user_mapper: Maps user IDs to their corresponding row indices.

movie_mapper: Maps movie IDs to their corresponding column indices.

user_inv_mapper: Maps row indices back to user IDs.

movie_inv_mapper: Maps column indices back to movie IDs.

These dictionaries are essential for linking the rows and columns of the utility matrix to their respective user and movie IDs.

The resulting user-item matrix is a scipy.sparse.csr_matrix, designed to efficiently store and handle sparse data.


In [11]:
from scipy.sparse import csr_matrix

# def create_X(consolidated_movies_df):
#     """
#     Generates a sparse matrix from ratings dataframe.
    
#     Args:
#         consolidated_movies_df: pandas dataframe containing columns (userId, movieId, rating)
    
#     Returns:
#         X: sparse matrix
#         user_mapper: dict mapping user id's to user indices
#         user_inv_mapper: dict mapping user indices to user id's
#         movie_mapper: dict mapping movie id's to movie indices
#         movie_inv_mapper: dict mapping movie indices to movie id's
#     """
#     # Number of unique users and movies
#     M = consolidated_movies_df['userId'].nunique()
#     N = consolidated_movies_df['movieId'].nunique()

#     # Mapping dictionaries
#     user_mapper = dict(zip(np.unique(consolidated_movies_df["userId"]), list(range(M))))
#     movie_mapper = dict(zip(np.unique(consolidated_movies_df["movieId"]), list(range(N))))
    
#     user_inv_mapper = dict(zip(list(range(M)), np.unique(consolidated_movies_df["userId"])))
#     movie_inv_mapper = dict(zip(list(range(N)), np.unique(consolidated_movies_df["movieId"])))
    
#     # Map user and movie IDs to their matrix indices
#     user_index = [user_mapper[i] for i in consolidated_movies_df['userId']]
#     item_index = [movie_mapper[i] for i in consolidated_movies_df['movieId']]

#     # Create the sparse matrix
#     X = csr_matrix((consolidated_movies_df["rating"], (user_index, item_index)), shape=(M, N))
    
#     return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# # Example usage
# X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(consolidated_movies_df)

In [12]:
# ARIELMY: SEE XPERT RECC TO INCLUDE GENRE INFO [SAVED IN ME-NOTES WORD DOC]
def create_X(consolidated_movies_df):
    """ Generates a user-movie-genre-ratings interaction matrix.
    
    Args:
        consolidated_movies_df: pandas dataframe containing columns (userId, movieId, rating, genres)
    
    Returns:
        user_movie_genre_ratings_matrix: sparse matrix of user-movie-genre ratings interactions
        user_mapper: dict mapping user id's to user indices
        movie_mapper: dict mapping movie id's to movie indices
        genre_mapper: dict mapping genre names to genre indices
    """
    # Number of unique users, movies, and genres
    users = consolidated_movies_df['userId'].unique()
    movies = consolidated_movies_df['movieId'].unique()
    
    user_mapper = {user: idx for idx, user in enumerate(users)}
    movie_mapper = {movie: idx for idx, movie in enumerate(movies)}
    
    # Ensure genres are in string format (if they are lists, convert to a string)
    if isinstance(consolidated_movies_df['genres'].iloc[0], list):
        consolidated_movies_df['genres'] = consolidated_movies_df['genres'].apply(lambda x: '|'.join(x))
    
    # Create a one-hot encoding for genres
    genres_dummies = consolidated_movies_df[['movieId', 'genres']].drop_duplicates()
    genres_one_hot = genres_dummies['genres'].str.get_dummies(sep='|')
    
    # Create genre mapping
    genre_names = genres_one_hot.columns
    genre_mapper = {genre: idx for idx, genre in enumerate(genre_names)}
    
    # Prepare data for the user-movie-genre-ratings interaction matrix
    data = []
    row_indices = []
    col_indices = []
    
    for _, row in consolidated_movies_df.iterrows():
        user_idx = user_mapper[row['userId']]
        movie_idx = movie_mapper[row['movieId']]
        
        # Get the genres for the movie
        genres = row['genres'].split('|')
        
        for genre in genres:
            if genre in genre_mapper:
                genre_idx = genre_mapper[genre]
                # Use the rating as the interaction value
                data.append(row['rating'])  
                row_indices.append(user_idx)  # User index
                # Combine movie index and genre index
                col_indices.append(movie_idx * len(genre_names) + genre_idx)  
    
    # Create a COO sparse matrix
    user_movie_genre_ratings_matrix = coo_matrix((data, (row_indices, col_indices)), 
                                                  shape=(len(users), len(movies) * len(genre_names)))
    
    return user_movie_genre_ratings_matrix, user_mapper, movie_mapper, genre_mapper

# Example usage
# Assuming consolidated_movies_df has columns: userId, movieId, rating, genres
X, user_mapper, movie_mapper, genre_mapper = create_X(consolidated_movies_df)

# Check the shape of the resulting matrix
print("Shape of X:", X.shape)

Shape of X: (610, 194480)


In [13]:
# # ARIELMY
# def create_user_movie_genre_interaction(consolidated_movies_df):
#     """ Generates a user-movie-genre interaction matrix.
    
#     Args:
#         consolidated_movies_df: pandas dataframe containing columns (userId, movieId, rating, genres)
    
#     Returns:
#         user_movie_genre_matrix: sparse matrix of user-movie-genre interactions
#         user_mapper: dict mapping user id's to user indices
#         movie_mapper: dict mapping movie id's to movie indices
#         genre_mapper: dict mapping genre names to genre indices
#     """
#     # Number of unique users, movies, and genres
#     users = consolidated_movies_df['userId'].unique()
#     movies = consolidated_movies_df['movieId'].unique()
    
#     user_mapper = {user: idx for idx, user in enumerate(users)}
#     movie_mapper = {movie: idx for idx, movie in enumerate(movies)}
    
#     # Create a one-hot encoding for genres
#     genres_dummies = consolidated_movies_df[['movieId', 'genres']].drop_duplicates()
#     genres_dummies['genres'] = genres_dummies['genres'].str.get_dummies(sep='|')
    
#     # Create genre mapping
#     genre_names = genres_dummies['genres'].str.get_dummies(sep='|').columns
#     genre_mapper = {genre: idx for idx, genre in enumerate(genre_names)}
    
#     # Prepare data for the user-movie-genre interaction matrix
#     data = []
#     row_indices = []
#     col_indices = []
    
#     for _, row in consolidated_movies_df.iterrows():
#         user_idx = user_mapper[row['userId']]
#         movie_idx = movie_mapper[row['movieId']]
        
#         # Get the genres for the movie
#         genres = row['genres'].split('|')
        
#         for genre in genres:
#             if genre in genre_mapper:
#                 genre_idx = genre_mapper[genre]
#                 data.append(row['rating'])  # Use the rating as the interaction value
#                 row_indices.append(user_idx)  # User index
#                 col_indices.append(movie_idx * len(genre_names) + genre_idx)  # Combine movie and genre index
    
#     # Create a COO sparse matrix
#     user_movie_genre_matrix = coo_matrix((data, (row_indices, col_indices)), 
#                                           shape=(len(users), len(movies) * len(genre_names)))
    
#     return user_movie_genre_matrix, user_mapper, movie_mapper, genre_mapper

# # Example usage
# # Assuming consolidated_movies_df has columns: userId, movieId, rating, genres
# user_movie_genre_matrix, user_mapper, movie_mapper, genre_mapper = create_user_movie_genre_interaction(consolidated_movies_df)

# # Check the shape of the resulting matrix
# print("Shape of user-movie-genre interaction matrix:", user_movie_genre_matrix.shape)


Our X matrix contains 610 users and 9724 movies [UPDATE THIS]

Evaluating Sparsity : Evaluating sparsity in collaborative filtering models is essential to address data challenges, improve performance, and ensure effective recommendations. Sparse data limits the system's ability to learn preferences, exacerbates cold start issues, and risks overfitting or biasing predictions. It influences algorithm suitability, often favoring matrix factorization over neighborhood-based methods, and highlights the need for hybrid approaches. Sparse datasets also demand efficient computational structures and can complicate optimization. Understanding sparsity aids in refining data collection, imputation, and preprocessing strategies, while revealing user behavior and item popularity trends. This evaluation is vital for designing robust, scalable recommendation systems. Here, we calculate sparsity by dividing the number of stored elements by total number of elements. The number of stored (non-empty) elements in our matrix ([nnz](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.nnz.html)) is equivalent to the number of ratings in our dataset.

In [14]:
n_total = X.shape[0]*X.shape[1]
n_ratings = X.nnz
sparsity = n_ratings/n_total
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 0.23%


Usually a sparsity of 0.1 or above we are good to use this method of collaborative filtering else we have to use the Content based filtering. The **cold start problem** is when there are new users and movies in our matrix that do not have any ratings. In our Movielens dataset, all users and movies have at least one rating but in general, it's useful to check which users and movies have few interactions.

In [15]:
n_ratings_per_user = X.getnnz(axis=1)
len(n_ratings_per_user)

610

In [16]:
print(f"Most active user rated {n_ratings_per_user.max()} movies.")
print(f"Least active user rated {n_ratings_per_user.min()} movies.")
# THIS MAY NOW MEAN SOMETHING ELSE. LIKE MOVIE-GENRES??

Most active user rated 6616 movies.
Least active user rated 40 movies.


In [17]:
n_ratings_per_movie = X.getnnz(axis=0)
len(n_ratings_per_movie)
# THIS MAY NOW BE SMTH LIKE RATINGS PER MOVIE PER GENRE

194480

In [18]:
print(f"Most rated movie has {n_ratings_per_movie.max()} ratings.")
print(f"Least rated movie has {n_ratings_per_movie.min()} ratings.")
# UPDATE LOGIC EXPLAINED IN PRINT...

Most rated movie has 329 ratings.
Least rated movie has 0 ratings.


We will be Implementing Collaborative Filtering using the K-Nearest Neighbors algorithm, considering that the matrix is dense, to identify movies with similar user engagement patterns

In [19]:
from sklearn.neighbors import NearestNeighbors

In [20]:
# # ARIELMY ATTEMPT TO ADD USER INPUT TO ABOVE
# def find_similar_movies(movie_id, X, movie_mapper, genre_mapper, k, metric='cosine'):
#     """
#     Finds k-nearest neighbors for a given movie id.
    
#     Args:
#         movie_id: id of the movie of interest
#         X: user-item utility matrix
#         k: number of similar movies to retrieve
#         metric: distance metric for kNN calculations
    
#     Output: returns list of k similar movie ID's
#     """
#     X = X.T
#     neighbour_ids = []
    
#     movie_ind = movie_mapper[movie_id]
#     movie_vec = X[movie_ind]
#     if isinstance(movie_vec, (np.ndarray)):
#         movie_vec = movie_vec.reshape(1,-1)
#     # use k+1 since kNN output includes the movieId of interest
#     kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
#     kNN.fit(X)
#     neighbour = kNN.kneighbors(movie_vec, return_distance=False)
#     for i in range(0, k):
#         n = neighbour.item(i)
#         neighbour_ids.append(movie_inv_mapper[n])
#     neighbour_ids.pop(0)
#     return neighbour_ids


In [21]:
# from sklearn.metrics.pairwise import cosine_similarity

# def find_similar_movies(movie_id, X, movie_mapper, genre_mapper, k=5):
#     # Convert the COO matrix to CSR format for efficient row slicing
#     X_csr = X.tocsr()
    
#     # Get the index of the movie_id
#     if movie_id not in movie_mapper:
#         raise ValueError("Movie ID not found in movie_mapper.")
    
#     movie_index = movie_mapper[movie_id]
    
#     # Get the vector for the specified movie
#     movie_vector = X_csr[movie_index].toarray()  # Convert to dense array for similarity computation
    
#     # Compute cosine similarity between the movie vector and all other movie vectors
#     similarities = cosine_similarity(movie_vector, X_csr).flatten()  # Flatten to get a 1D array
    
#     # Get the indices of the top k similar movies (excluding the movie itself)
#     similar_indices = similarities.argsort()[-k-1:-1][::-1]  # Get top k indices, excluding the movie itself
    
#     # Map indices back to movie IDs
#     similar_movies = [(movie_mapper_inv[i], similarities[i]) for i in similar_indices]
    
#     return similar_movies


In [22]:
# from sklearn.metrics.pairwise import cosine_similarity
# def find_similar_movies(movie_id, X, movie_mapper, genre_mapper, k=5):
#     # Convert the COO matrix to CSR format for efficient row slicing
#     X_csr = X.tocsr()
    
#     # Check if the movie_id exists in the movie_mapper
#     if movie_id not in movie_mapper:
#         raise ValueError("Movie ID not found in movie_mapper.")
    
#     movie_index = movie_mapper[movie_id]
    
#     # Get the vector for the specified movie
#     movie_vector = X_csr[movie_index].toarray()  # Convert to dense array for similarity computation
    
#     # Compute cosine similarity between the movie vector and all other movie vectors
#     similarities = cosine_similarity(movie_vector, X_csr).flatten()  # Flatten to get a 1D array
    
#     # Get the genre of the specified movie
#     movie_genre = genre_mapper[movie_id]  # Assuming genre_mapper maps movie_id to its genre
    
#     # Filter similar movies by genre (this is a simple example; adjust as necessary)
#     similar_movies = []
#     for i in range(len(similarities)):
#         if genre_mapper.get(movie_mapper[i]) == movie_genre:  # Check if genres match
#             similar_movies.append((i, similarities[i]))
    
#     # Sort and get the top k similar movies
#     similar_movies.sort(key=lambda x: x[1], reverse=True)
#     return similar_movies[:k]

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
def find_similar_movies(movie_id, X, movie_mapper, genre_mapper, k=5, genre_weight=1.5):
    # Convert the COO matrix to CSR format for efficient row slicing
    X_csr = X.tocsr()
    
    # Check if the movie_id exists in the movie_mapper
    if movie_id not in movie_mapper:
        raise ValueError("Movie ID not found in movie_mapper.")
    
    movie_index = movie_mapper[movie_id]
    
    # Get the vector for the specified movie
    movie_vector = X_csr[movie_index].toarray()  # Convert to dense array for similarity computation
    
    # Compute cosine similarity between the movie vector and all other movie vectors
    similarities = cosine_similarity(movie_vector, X_csr).flatten()  # Flatten to get a 1D array
    
    # Get the genre of the specified movie
    movie_genre = genre_mapper[movie_id]  # Assuming genre_mapper maps movie_id to its genre
    
    # Create a list to hold weighted similarities
    weighted_similarities = []
    
    for i in range(len(similarities)):
        # Check if genres match
        if genre_mapper.get(movie_mapper[i]) == movie_genre:
            # Increase similarity score by genre_weight if genres match
            weighted_similarity = similarities[i] * genre_weight
        else:
            weighted_similarity = similarities[i]
        
        weighted_similarities.append((i, weighted_similarity))
    
    # Sort and get the top k similar movies based on weighted similarity
    weighted_similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Map indices back to movie IDs for the output
    similar_movies = [(movie_mapper[i], weighted_similarity) for i, weighted_similarity in weighted_similarities[:k]]
    
    return similar_movies

find_similar_movies() takes in a movieId and X matrix, and outputs a list of

movies that are similar to the movieId of interest.

Let's see how it works in action. We will first create another mapper that maps movieId to title so that our results are interpretable.

In [24]:
# ARIELMY: HELP USER SEARCH FOR THEIR MOVIE ID TO USE IN NEXT STEP
# Create a dictionary mapping movie IDs to titles
movie_titles = dict(zip(consolidated_movies_df['movieId'], consolidated_movies_df['title']))

# Get user input for the movie title
search_title = input("Please enter the movie title you want to search for: ")

# Find the movie ID based on the title
movie_id = [key for key, value in movie_titles.items() if value.lower() == search_title.lower()]

if movie_id:
    movie_id = movie_id[0]  # Get the first match
    print(f"The movie ID for '{search_title}' is: {movie_id}")

Please enter the movie title you want to search for:  Toy Story


The movie ID for 'Toy Story' is: 1


In [30]:
# Check the range of valid movie IDs
min_movie_id = min(movie_mapper.keys())
max_movie_id = max(movie_mapper.keys())
print(f"The valid range of movie IDs is from {min_movie_id} to {max_movie_id}.")

# Display a sample of valid movie IDs
sample_movie_ids = list(movie_mapper.keys())[:10]  # Get the first 10 valid movie IDs
print("Here are some valid movie IDs you can choose from:", sample_movie_ids)

The valid range of movie IDs is from 1 to 193609.
Here are some valid movie IDs you can choose from: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [28]:
# Get user input for movie_id, k, and genre_weight
user_movie_id = int(input("Enter the movie ID: "))
user_k = int(input("Enter the number of similar movies to retrieve (k): "))
user_genre_weight = float(input("Enter the genre weight (e.g., 1.5 for a 50% increase): "))

# Check if the movie_id exists in the movie_mapper
if user_movie_id not in movie_mapper:
    print("Error: Movie ID not found. Please enter a valid movie ID.")
    print("Available Movie IDs:", list(movie_mapper.keys())[:10])  # Show a sample of available IDs
else:
    # Call the function with the user-defined k value and genre weight
    similar_movies = find_similar_movies(user_movie_id, X, movie_mapper, genre_mapper, k=user_k, genre_weight=user_genre_weight)

    # Print the similar movies
    print("Similar Movies:")
    for movie in similar_movies:
        print(movie)

Enter the movie ID:  2
Enter the number of similar movies to retrieve (k):  10
Enter the genre weight (e.g., 1.5 for a 50% increase):  1.5


KeyError: 2

In [25]:
# # ARIELMY
# # Get user input for movie_id and k
# user_movie_id = int(input("Enter the movie ID: "))
# user_k = int(input("Enter the number of similar movies to retrieve (k): "))

# # Call the function with the user-defined k value

# similar_movies = find_similar_movies(user_movie_id, X, movie_mapper, genre_mapper, k=user_k)
# print("Similar Movies:", similar_movies)

Enter the movie ID:  1791
Enter the number of similar movies to retrieve (k):  10


IndexError: row index (1323) out of range

find_similar_movies() returns a list of movieId's that are most similar to your movie of interest. Let's convert these id's to titles so that we can interpret our results. To make things easier, we will create a dictionary that maps movieId to title.

In [32]:
movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}:")
for i in similar_movies:
    print(movie_titles[i])

Because you watched Twilight:
Damsels in Distress
Father Hood
Jack the Bear
Nick Fury: Agent of S.H.I.E.L.D.
Ski School
The Hound of the Baskervilles
Assassination Tango
Dragon Ball Z: Bojack Unbound (Doragon bôru Z 9: Ginga girigiri!! Butchigiri no sugoi yatsu)
Meet the Applegates
2048: Nowhere to Run
Christopher Columbus: The Discovery
The Night Before
Meteor Man, The
Godzilla vs. Mechagodzilla (Gojira tai Mekagojira)
Double Trouble
The Prime Gig
Plain Clothes
Dragon Ball Z: Broly - The Legendary Super Saiyan (Doragon bôru Z 8: Moetsukiro!! Nessen retsusen-chô gekisen)
Omega Doom
The Punisher: Dirty Laundry
A Man Called Blade
Zone 39
Jetsons: The Movie
Hamburger Hill


In [15]:
# Save the DataFrame to a CSV file
consolidated_movies_df.to_csv('consolidated_movies.csv', index=False)

In [16]:
# Analysis on Data
# Count of Unique users and movies
print("Number of unique movies:", consolidated_movies_df['movieId'].nunique())
print("Number of unique users:", consolidated_movies_df['userId'].nunique())


Number of unique movies: 9724
Number of unique users: 610


In [17]:
#what are the average ratings per movie
average_ratings = consolidated_movies_df.groupby('movieId')['rating'].mean()
print(average_ratings.head())


movieId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: rating, dtype: float64


In [None]:
# Take in the tags csv and read it into pandas
tags = "Resources/ml-latest-small/tags.csv"
tags_df = pd.read_csv(tags)
tags_df.head()

In [None]:
# Merge the DataFrames on movieId with tags 
tags_movies_df = consolidated_movies_df.merge(tags_df , on = 'movieId', how ='outer')
tags_movies_df.head()