In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import dask.dataframe as dd
from tqdm import tqdm

In [2]:
# Input CSV files; the paths are relative to the current working directory.
MOVIES_DATASET = "./data/movies.csv"
RATINGS_DATASET = "./data/ratings.csv"
# A movie needs at least this many rows in the ratings file to be given an
# index by map_movie_to_idx().
NUMBER_OF_USER_WATCHED_MIN = 5
# A user needs at least this many ratings for sequentialize() to keep their
# sequence.
NUMBER_OF_MOVIES_WATCHED_MIN = 3

# Preprocessing Movies Dataset

In [3]:
# Maps movieId -> index used for the one-hot encoding further down; it is
# filled in by map_movie_to_idx().
# NOTE(review): because this is a defaultdict(int), indexing it with a movieId
# that was never mapped returns 0 *and* inserts that key (changing len()), so
# use `in` / `.get()` wherever a miss is possible.
movie_mapper = defaultdict(int)
def map_movie_to_idx():
    """Fill the module-level ``movie_mapper`` with movieId -> index entries.

    Reads the movies and ratings CSVs, counts the rating rows per movieId,
    and walks the movies file in order. Every movie that has at least
    ``NUMBER_OF_USER_WATCHED_MIN`` rating rows is given the next consecutive
    index, starting at 0. Movie rows containing any missing value are ignored.

    Side effects:
        Clears and refills ``movie_mapper`` and prints how many indexes were
        assigned. Returns ``None``.
    """
    # .dropna() returns a new frame, so nothing is mutated in place.
    movies_df = pd.read_csv(MOVIES_DATASET).dropna()
    ratings_df = pd.read_csv(RATINGS_DATASET)

    # One vectorised pass instead of a Python loop over every rating row.
    rating_counts = ratings_df['movieId'].value_counts().to_dict()

    # Start from a clean slate: drop entries left by an earlier call and any
    # keys that a defaultdict miss inserted as a side effect.
    movie_mapper.clear()

    counter = 0
    for movie_id in tqdm(movies_df['movieId'].tolist()):
        n_ratings = rating_counts.get(movie_id)
        # Skip movies that were never rated or were rated too rarely.
        if n_ratings is None or n_ratings < NUMBER_OF_USER_WATCHED_MIN:
            continue
        movie_mapper[movie_id] = counter
        counter += 1
    print("Number of movies {}".format(counter))

# Populate movie_mapper now; the encoding functions defined later read it.
map_movie_to_idx()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9742/9742 [00:00<00:00, 2784009.65it/s]

Number of movies 3650





# Preprocessing Ratings Dataset

In [4]:
def one_hot_encode_movie(movieId, mapper=None):
    """Return a one-hot list for ``movieId``.

    Args:
        movieId: id of the movie to encode.
        mapper: optional movieId -> index mapping; defaults to the
            module-level ``movie_mapper``.

    Returns:
        list[int]: a list of ``len(mapper)`` zeros with a single 1 at the
        movie's index. A movieId that is not in the mapping yields an
        all-zero list.
    """
    if mapper is None:
        mapper = movie_mapper
    encoded_movie = [0] * len(mapper)
    # Test membership before indexing: indexing a defaultdict with an unknown
    # key would return 0 (colliding with the movie at index 0) and insert the
    # key, growing len(mapper) and making later vectors a different length.
    if movieId in mapper:
        encoded_movie[mapper[movieId]] = 1
    return encoded_movie

def one_hot_encode_user_seq(user_seq):
    """Encode each movieId of ``user_seq`` with ``one_hot_encode_movie``.

    Returns a plain Python list holding one encoded vector per element of
    ``user_seq``, in the same order as the input.
    """
    return [one_hot_encode_movie(movie_id) for movie_id in user_seq]

def sequentialize():
    """Build one encoded, time-ordered movie sequence per user.

    Reads the ratings CSV, drops rows with missing values, orders the rows by
    ``userId`` and then ``timestamp``, and passes each user's list of
    movieIds to ``one_hot_encode_user_seq``. Users with fewer than
    ``NUMBER_OF_MOVIES_WATCHED_MIN`` ratings are left out.

    Returns:
        list: one entry per retained user, in ascending ``userId`` order,
        each being the value returned by ``one_hot_encode_user_seq``.
    """
    ratings_df = (
        pd.read_csv(RATINGS_DATASET)
        .dropna()
        .sort_values(by=['userId', 'timestamp'])
    )

    # A single groupby splits the frame once, instead of building a boolean
    # mask over every row for every user. Groups come out in ascending userId
    # order and keep the timestamp ordering established above.
    movies_by_user = ratings_df.groupby('userId')['movieId']

    seq = []
    for _, user_movies in tqdm(movies_by_user, total=movies_by_user.ngroups):
        user_seq = list(user_movies)
        if len(user_seq) >= NUMBER_OF_MOVIES_WATCHED_MIN:
            seq.append(one_hot_encode_user_seq(user_seq))
    return seq
# Encode the rating history of every qualifying user; `res` holds the list
# returned by sequentialize().
res = sequentialize()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 610/610 [00:06<00:00, 100.89it/s]
