In [2]:
import pandas as pd
import numpy as np

In [4]:
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

In [5]:
print(ratings.head())
print(movies.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [6]:
# Los géneros mejor rankeados en promedio en el año 2005
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings_2005 = ratings[ratings['timestamp'].dt.year == 2005]
ratings_2005 = ratings_2005.merge(movies, on='movieId')
mean_ratings_2005 = ratings_2005.groupby('genres')['rating'].mean().reset_index()
mean_ratings_2005 = mean_ratings_2005.sort_values(by='rating', ascending=False)
print(mean_ratings_2005.head())

                         genres    rating
407        Drama|Mystery|Sci-Fi  5.000000
181  Adventure|Animation|Comedy  5.000000
211       Adventure|Crime|Drama  5.000000
249            Animation|Comedy  5.000000
434  Film-Noir|Mystery|Thriller  4.666667


In [7]:
# Cuál es el top 5 de películas mejor rankeadas en promedio?
ratings_mean = ratings.groupby('movieId')['rating'].mean().reset_index()
ratings_mean = ratings_mean.merge(movies, on='movieId')
top_5_movies = ratings_mean.sort_values(by='rating', ascending=False).head(5)
print(top_5_movies[['title', 'rating']])

                                                  title  rating
87                 Heidi Fleiss: Hollywood Madam (1995)     5.0
6903                               Che: Part Two (2008)     5.0
1036  Vampire in Venice (Nosferatu a Venezia) (Nosfe...     5.0
7583                           Idiots and Angels (2008)     5.0
7582               Louis Theroux: Law & Disorder (2008)     5.0


In [8]:
# Cuál es el género con más reviews?
genre_counts = movies['genres'].str.get_dummies(sep='|').sum().reset_index()
genre_counts.columns = ['genre', 'count']
genre_counts = genre_counts.sort_values(by='count', ascending=False)
print(genre_counts.head())

       genre  count
8      Drama   4361
5     Comedy   3756
17  Thriller   1894
1     Action   1828
15   Romance   1596


In [9]:
# Cuáles es el género por año con mejor rating entre el año 2000 y 2010?
ratings['year'] = ratings['timestamp'].dt.year
ratings_2000_2010 = ratings[(ratings['year'] >= 2000) & (ratings['year'] <= 2010)]
ratings_2000_2010 = ratings_2000_2010.merge(movies, on='movieId')
mean_ratings_2000_2010 = ratings_2000_2010.groupby(['year', 'genres'])['rating'].mean().reset_index()
mean_ratings_2000_2010 = mean_ratings_2000_2010.sort_values(by=['year', 'rating'], ascending=[True, False])
print(mean_ratings_2000_2010.groupby('year').first().reset_index())

    year                                      genres  rating
0   2000              Animation|Comedy|Drama|Fantasy     5.0
1   2001                        Action|Drama|Western     5.0
2   2002    Action|Adventure|Animation|Horror|Sci-Fi     5.0
3   2003    Action|Adventure|Animation|Drama|Fantasy     5.0
4   2004   Adventure|Children|Comedy|Fantasy|Musical     5.0
5   2005                  Adventure|Animation|Comedy     5.0
6   2006                  Action|Romance|War|Western     5.0
7   2007  Adventure|Animation|Children|Comedy|Sci-Fi     5.0
8   2008  Adventure|Animation|Children|Comedy|Sci-Fi     5.0
9   2009   Action|Adventure|Mystery|Romance|Thriller     5.0
10  2010                       Action|Comedy|Musical     5.0


In [10]:
#  Cuál es el usuario con más críticas y cuál es su género favorito?
user_review_counts = ratings['userId'].value_counts().reset_index()
user_review_counts.columns = ['userId', 'review_count']
top_user = user_review_counts.iloc[0]
top_user_reviews = ratings[ratings['userId'] == top_user['userId']]
top_user_reviews = top_user_reviews.merge(movies, on='movieId')
top_user_genres = top_user_reviews['genres'].str.get_dummies(sep='|').sum().reset_index()
top_user_genres.columns = ['genre', 'count']
top_user_genres = top_user_genres.sort_values(by='count', ascending=False)
print(f"User ID: {top_user['userId']}, Review Count: {top_user['review_count']}")
print(top_user_genres.head(1))

User ID: 414, Review Count: 2698
   genre  count
8  Drama   1309
