# Filter the Full Dataset to get a manageable size

Import Packages

In [218]:
import pandas as pd
import numpy as np

Load Data
+ Movies from the small data set
+ Ratings from the full data set

In [220]:
# Movies come from the small data set, ratings from the full data set
MOVIES_CSV = '../data/ml-latest-small/movies.csv'
RATINGS_CSV = '../data/ml-latest/ratings.csv'

movies = pd.read_csv(MOVIES_CSV)
ratings = pd.read_csv(RATINGS_CSV)

## Let's filter the full dataset, so we have only the movies from the small data set left:

In [221]:
# IDs of the movies in the small data set; only ratings for these are kept
movie_list = movies['movieId'].tolist()
in_small_set = ratings['movieId'].isin(movie_list)
ratings = ratings[in_small_set]

The shape of the ratings dataframe shows that we still have over 26 million ratings:

In [222]:
# (rows, columns) of the ratings left after keeping only movies from the small data set
ratings.shape

(26782448, 4)

## Defining Filter Functions:

In [223]:
def filter_user(ratings, num):
    """Keep only the ratings made by users who have at least ``num`` ratings.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Ratings table; must contain the columns ``userId`` and ``rating``.
    num : int
        Minimum number of non-null ratings a user needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``ratings`` whose ``userId`` meets the threshold, with the
        original index labels and columns. The input frame is not modified.
    """
    # Count just the 'rating' column; counting every column of the frame and
    # then discarding all but one is wasted work on a large table.
    rating_counts = ratings.groupby('userId')['rating'].count()
    active_users = rating_counts[rating_counts >= num].index
    return ratings[ratings['userId'].isin(active_users)]

def filter_movies(ratings, num):
    """Keep only the ratings of movies that have at least ``num`` ratings.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Ratings table; must contain the columns ``movieId`` and ``rating``.
    num : int
        Minimum number of non-null ratings a movie needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``ratings`` whose ``movieId`` meets the threshold, with the
        original index labels and columns. The input frame is not modified.
    """
    # Count just the 'rating' column; counting every column of the frame and
    # then discarding all but one is wasted work on a large table.
    rating_counts = ratings.groupby('movieId')['rating'].count()
    popular_movies = rating_counts[rating_counts >= num].index
    return ratings[ratings['movieId'].isin(popular_movies)]

Keep only users with at least 500 ratings, then keep only movies with at least 500 ratings:

In [224]:
# Minimum number of ratings a user / a movie needs in order to be kept
MIN_RATINGS_PER_USER = 500
MIN_RATINGS_PER_MOVIE = 500

# Order matters: the movie filter runs on the already user-filtered ratings
ratings = filter_movies(
    filter_user(ratings, MIN_RATINGS_PER_USER),
    MIN_RATINGS_PER_MOVIE,
)

In [225]:
# (rows, columns) of the ratings left after the user and movie filters
ratings.shape

(7398697, 4)

In [226]:
# Users with the fewest remaining ratings: the movie filter ran after the
# user filter, so a user's count can fall back below 500
(
    ratings
    .groupby('userId')['rating']
    .count()
    .reset_index()
    .sort_values(by='rating')
    .head()
)

Unnamed: 0,userId,rating
8668,253425,235
2991,88422,255
9507,277211,327
4577,133710,335
522,15235,340


In [227]:
# Movies with the fewest remaining ratings
(
    ratings
    .groupby('movieId')['rating']
    .count()
    .reset_index()
    .sort_values(by='rating')
    .head()
)

Unnamed: 0,movieId,rating
1142,2280,500
3378,71252,500
3581,90522,501
2371,5680,501
3419,74452,501


In [228]:
# Column-wise minimum: smallest userId and smallest number of ratings per user
(
    ratings
    .groupby('userId')['rating']
    .count()
    .reset_index()
    .min()
)

userId      4
rating    235
dtype: int64

In [229]:
# Column-wise minimum: smallest movieId and smallest number of ratings per movie
(
    ratings
    .groupby('movieId')['rating']
    .count()
    .reset_index()
    .min()
)

movieId      1
rating     500
dtype: int64

So we end up with a minimum of 235 ratings per user and 500 ratings per movie.

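(Users drop below 500 because the movie filter runs after the user filter and removes some of their ratings. We could repeat both filters until users and movies each have at least 500 ratings, but 235 ratings per user is already an acceptable number.)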

### Write to a CSV file:

In [230]:
# Write the filtered ratings to a CSV file, leaving out the DataFrame index
OUTPUT_CSV = '../data/ratings_235_500.csv'
ratings.to_csv(OUTPUT_CSV, index=False)