In [1]:
# Import dependencies 
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
# from scipy.sparse import csr_matrix



In [2]:
#import our input dataset
#SAMPLES from MovieLens

In [3]:
links_df = pd.read_csv("Resources/ml-latest-small/links.csv")
links_df.head()

#example of imdb id for toy story is tt0114709 -- could use with OMDB API for extra info (excerpt etc) once presenting reccs?

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
movies_df = pd.read_csv("Resources/ml-latest-small/movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings_df = pd.read_csv("Resources/ml-latest-small/ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags_df = pd.read_csv("Resources/ml-latest-small/tags.csv")
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
#merge rating + movies to have titles ready
merged_movie_ratings = pd.merge(movies_df, ratings_df, on='movieId', how='outer')
merged_movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [8]:
#check shape of new merged movies+ratings
print("Rows, Columns:", merged_movie_ratings.shape)

#outer join: rows = 100854 ; include all and give nulls to those with no ratings. keeps individual user ratings!
# inner join: rows = 100836 ; exclude those with no ratings

Rows, Columns: (100854, 6)


In [9]:
# How many user ratings total
count_user_ratings = merged_movie_ratings['rating'].dropna().count()
print("Count of user ratings (drop NAs):", count_user_ratings)

# How many unique users (Meena may have already started) @ ratings
unique_user_count = merged_movie_ratings['userId'].nunique()
print("Count of unqiue users (movies+ratings):", unique_user_count)

# How many unique movies total in this ratings merged df
unique_movie_count = merged_movie_ratings['movieId'].nunique()
print("Count of unqiue movies (movies+ratings):", unique_movie_count)

Count of user ratings (drop NAs): 100836
Count of unqiue users (movies+ratings): 610
Count of unqiue movies (movies+ratings): 9742


In [27]:
# How many user ratings for each movie (viz?). what is distrib/avg of ratings for each movie?
# count
ratings_count_per_movie = merged_movie_ratings.groupby('movieId')['rating'].count()
print(ratings_count_per_movie)

# Reset the index to convert the Series to a DataFrame
ratings_count_per_movie_df = ratings_count_per_movie.reset_index(name='rating_count')

# Merge with the movies+ratings df to add movie titles
ratings_count_with_titles_df = ratings_count_per_movie_df.merge(merged_movie_ratings[['movieId', 'title']], on='movieId', how='left')
ratings_count_with_titles_df = ratings_count_with_titles_df.drop_duplicates(subset='movieId')
# Display the resulting DataFrame
ratings_count_with_titles_df.head()

#try to get viz/df with ratings for each movie?

movieId
1         215
2         110
3          52
4           7
5          49
         ... 
193581      1
193583      1
193585      1
193587      1
193609      1
Name: rating, Length: 9742, dtype: int64


Unnamed: 0,movieId,rating_count,title
0,1,215,Toy Story (1995)
215,2,110,Jumanji (1995)
325,3,52,Grumpier Old Men (1995)
377,4,7,Waiting to Exhale (1995)
384,5,49,Father of the Bride Part II (1995)


In [25]:
# Calc avg user rating for each movieId (Meena already did)
avg_ratings = merged_movie_ratings.groupby('movieId')['rating'].mean().reset_index()
avg_ratings.columns = ['movieId', 'average_user_rating']
print(avg_ratings)
# Merge with the movies+ratings df to add movie titles
avg_ratings_with_titles_df = avg_ratings.merge(merged_movie_ratings[['movieId', 'title']], on='movieId', how='left')
avg_ratings_with_titles_df = avg_ratings_with_titles_df.drop_duplicates(subset='movieId')
avg_ratings_with_titles_df.head()

      movieId  average_user_rating
0           1             3.920930
1           2             3.431818
2           3             3.259615
3           4             2.357143
4           5             3.071429
...       ...                  ...
9737   193581             4.000000
9738   193583             3.500000
9739   193585             3.500000
9740   193587             3.500000
9741   193609             4.000000

[9742 rows x 2 columns]


Unnamed: 0,movieId,average_user_rating,title
0,1,3.92093,Toy Story (1995)
215,2,3.431818,Jumanji (1995)
325,3,3.259615,Grumpier Old Men (1995)
377,4,2.357143,Waiting to Exhale (1995)
384,5,3.071429,Father of the Bride Part II (1995)


In [31]:
# Top rated overall based on score and number of ratings given
top_rated = merged_movie_ratings.groupby('movieId').agg(
    average_user_rating=('rating', 'mean'),
    rating_count=('rating', 'count')
).reset_index()

top_rated_df = top_rated.merge(merged_movie_ratings[['movieId', 'title']], on='movieId', how='left')
top_rated_df = top_rated_df.drop_duplicates(subset='movieId')
# top_rated_df = top_rated_df.sort_values(by=['average_user_rating', 'rating_count'], ascending=False) #rating then amt votes
top_rated_df = top_rated_df.sort_values(by=['rating_count', 'average_user_rating'], ascending=False) #amt votes then rating
top_rated_df.head(10)

# LATER: check top 10 for each genre?

Unnamed: 0,movieId,average_user_rating,rating_count,title
10019,356,4.164134,329,Forrest Gump (1994)
8652,318,4.429022,317,"Shawshank Redemption, The (1994)"
7860,296,4.197068,307,Pulp Fiction (1994)
16228,593,4.16129,279,"Silence of the Lambs, The (1991)"
45015,2571,4.192446,278,"Matrix, The (1999)"
6853,260,4.231076,251,Star Wars: Episode IV - A New Hope (1977)
13085,480,3.75,238,Jurassic Park (1993)
3387,110,4.031646,237,Braveheart (1995)
15651,589,3.970982,224,Terminator 2: Judgment Day (1991)
14106,527,4.225,220,Schindler's List (1993)


In [None]:
# 2.	Explore genres info (see if good to use as feature, where need to clean)
# a.	How many genres? [see readme; may need to parse column with genres then count unique]
# b.	Did each user give a genre to the movies or only 1 entry for genre for each movie at original?
# i.	A: 1 genre entry for each movie (entry can have more than 1 genre)
# c.	Group movies by genre? (viz)
# d.	[see where may need to clean ex. nulls, readability]
# e.	[]
# f.	[eventually] figure out hot to categ/scale based on genre – dummies?
# g.	[]


In [None]:
# 3.	Explore user info from unique user ids (can I build user profiles based on this to do collaborative with later for ML?)
# a.	Compile all data based on user ids (group by?) + clean/rmv if no user data
# b.	[]
# c.	[eventually] For each user, cluster on chosen features
# d.	[eventually] Predict our new input based on prev clustering of user profiles that user most sim to?
