In [1]:
# https://udemy.com/recommender-systems
# https://deeplearningcourses.com/recommender-systems
from __future__ import print_function, division
from builtins import range, input
from io import StringIO

import pandas as pd
import numpy as np
import time
import pickle
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [2]:
#carregando dataset
df = pd.read_csv("large_files/movielens-20m-dataset/rating.csv")

In [3]:
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
5,1,112,3.5,2004-09-10 03:09:00
6,1,151,4.0,2004-09-10 03:08:54
7,1,223,4.0,2005-04-02 23:46:13
8,1,253,4.0,2005-04-02 23:35:40
9,1,260,4.0,2005-04-02 23:33:46


In [4]:
print("tamanho do dataset: ", df.shape, "\nquantidade de usuários : ", df["userId"].unique().shape[0], "\nquantidade de filmes : ", df["movieId"].unique().shape[0])

tamanho do dataset:  (20000263, 4) 
quantidade de usuários :  138493 
quantidade de filmes :  26744


In [5]:
# faz ids comecarem no 0
df["userId"] = df["userId"]-1

In [6]:
# nem todos os ids de filmes aparecem no dataset
print("maior id de filme: ", max(df["movieId"]), "\nquantidade de filmes : ", df["movieId"].unique().shape[0])

maior id de filme:  131262 
quantidade de filmes :  26744


In [7]:
inicio = time.time()
# colocando o id dos filmes sequenciais
unique_movie_ids = set(df.movieId.values)
movie2idx = {}
count = 0
for movie_id in unique_movie_ids:
    movie2idx[movie_id] = count
    count +=1
df["movie_idx"] = df.apply(lambda row: movie2idx[row.movieId], axis = 1)
fim = time.time()
print("tempo de execução:",(fim - inicio)/60," minutos")

tempo de execução: 5.450014364719391  minutos


In [8]:
# excluindo coluna indeseja e salvando este dataset
df = df.drop(columns=['timestamp'])
df.to_csv('large_files/movielens-20m-dataset/edited_rating.csv', index=False)

In [9]:
# selecionando subset do dataset, pelos usuarios e filmes mais comuns

# criando contador de ids
user_ids_count = Counter(df.userId)
movie_ids_count = Counter(df.movie_idx)
n = 10000
m = 2000

# buscando os usuarios e filmes mais comumns
user_ids = [u for u, c in user_ids_count.most_common(n)]#para cada id:valor do counter pega apenas o id
movie_ids = [m for m, c in movie_ids_count.most_common(m)]#para cada id:valor do counter pega apenas o id

# criando dataframe de usuários e filmes mais comuns
df_small = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy()

In [10]:
# refazendo sequencia de user id e id do filme 
# para os ids e filmes selecionados

# criando mapa de userid e movieId
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
print("i:", i)

new_movie_id_map = {}
j = 0
for old in movie_ids:
  new_movie_id_map[old] = j
  j += 1
print("j:", j)

# Aplicando mapa dos novos ids
df_small.loc[:, 'userId'] = df_small.apply(lambda row: new_user_id_map[row.userId], axis=1)
df_small.loc[:, 'movie_idx'] = df_small.apply(lambda row: new_movie_id_map[row.movie_idx], axis=1)
df_small.drop(columns=['userId', 'movie_idx'])
df_small.rename(index=str, columns={'new_userId': 'userId', 'new_movie_idx': 'movie_idx'})
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())

print("small dataframe size:", len(df_small))

i: 10000
j: 2000
max user id: 9999
max movie id: 1999
small dataframe size: 5392025


In [11]:
print("small dataframe size:", len(df_small))
df_small.to_csv('large_files/movielens-20m-dataset/small_rating.csv', index=False)

small dataframe size: 5392025


In [12]:
df = df_small
df_small = 0

# dividindo base de treino e teste
df = shuffle(df)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [13]:
# criando dicionario para dizer qual usuário deu nota a qual filme
user2movie = {}
# criando dicionario para dizer qual filme foi avaliado por qual usuário
movie2user = {}
# criando dicionario usuario+filme vs. nota
usermovie2rating = {}

# preenchendo dicionario da base de treino
def update_user2movie_and_movie2user(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/cutoff))

  i = int(row.userId)
  j = int(row.movie_idx)
  if i not in user2movie:# se ainda nao existe aquele usuário no dicionario criar usuário e respectivo id de filme
    user2movie[i] = [j]
  else:# caso ja exista o usuário adicionar o id do filme para este
    user2movie[i].append(j)

  if j not in movie2user:# se ainda nao existe aquele filme no dicionario criar filme e respectivo id de usuário
    movie2user[j] = [i]
  else:# caso ja exista o usuario adicionar o id do filme para este
    movie2user[j].append(i)

  usermovie2rating[(i,j)] = row.rating #dicionario onde a chqave é id de usuários + id de filme e o valor é a respectiva nota
df_train.apply(update_user2movie_and_movie2user, axis=1)#aplicando funcao para todo o df

# preenchendo dicionario da base de teste(somente o usuario+filme vs. nota)
usermovie2rating_test = {}
print("Calling: update_usermovie2rating_test")
count = 0
def update_usermovie2rating_test(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/len(df_test)))

  i = int(row.userId)
  j = int(row.movie_idx)
  usermovie2rating_test[(i,j)] = row.rating
df_test.apply(update_usermovie2rating_test, axis=1)

# note: these are not really JSONs
with open('user2movie.json', 'wb') as f:
  pickle.dump(user2movie, f)

with open('movie2user.json', 'wb') as f:
  pickle.dump(movie2user, f)

with open('usermovie2rating.json', 'wb') as f:
  pickle.dump(usermovie2rating, f)

with open('usermovie2rating_test.json', 'wb') as f:
  pickle.dump(usermovie2rating_test, f)


processed: 0.023
processed: 0.046
processed: 0.070
processed: 0.093
processed: 0.116
processed: 0.139
processed: 0.162
processed: 0.185
processed: 0.209
processed: 0.232
processed: 0.255
processed: 0.278
processed: 0.301
processed: 0.325
processed: 0.348
processed: 0.371
processed: 0.394
processed: 0.417
processed: 0.440
processed: 0.464
processed: 0.487
processed: 0.510
processed: 0.533
processed: 0.556
processed: 0.580
processed: 0.603
processed: 0.626
processed: 0.649
processed: 0.672
processed: 0.695
processed: 0.719
processed: 0.742
processed: 0.765
processed: 0.788
processed: 0.811
processed: 0.835
processed: 0.858
processed: 0.881
processed: 0.904
processed: 0.927
processed: 0.950
processed: 0.974
processed: 0.997
Calling: update_usermovie2rating_test
processed: 0.093
processed: 0.185
processed: 0.278
processed: 0.371
processed: 0.464
processed: 0.556
processed: 0.649
processed: 0.742
processed: 0.835
processed: 0.927
