In [1]:
import pickle
import pandas as pd
import random
import tqdm

In [2]:
# Read the raw ratings table and cast both id columns to int in one chain.
ratings = pd.read_csv('./ml-25m/ratings.csv').astype({'userId': int, 'movieId': int})
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [3]:
# Keep only ratings strictly above 4.0; everything else is discarded.
liked_mask = ratings['rating'] > 4.0
ratings = ratings.loc[liked_mask]
len(ratings)


5813013

In [4]:
# Number of retained ratings per user.
counts = ratings['userId'].value_counts()

# Keep the rows of users that have at least 10 retained ratings.
active_users = counts.index[counts >= 10]
res = ratings[ratings['userId'].isin(active_users)]

In [5]:
# How many distinct users are left after the activity filter.
res['userId'].nunique()

118779

In [6]:
# Order each user's ratings by timestamp, then group the rows per user.
ratings_grouped = (
    res
    .sort_values(['userId', 'timestamp'])
    .groupby('userId')
)

In [7]:
# Fixed seed: without it the per-user sample sizes differ on every run, so
# the exported files could not be reproduced with Restart & Run All.
SAMPLING_SEED = 42
rng = random.Random(SAMPLING_SEED)

# For every user, collect the index labels of their earliest ratings.
# The number kept per user is drawn uniformly from 10..20 (inclusive).
idxs = []
for name, g in tqdm.tqdm(ratings_grouped):
    n_keep = rng.randint(10, 20)
    # nsmallest keeps the original index labels, which is exactly what the
    # later `.loc[idxs]` selection needs.
    idxs.extend(g.timestamp.nsmallest(n_keep).index)

100%|██████████| 118779/118779 [01:46<00:00, 1119.55it/s]


In [8]:
# Preview only: the sorted frame is displayed but not assigned.
res.sort_values(['userId', 'timestamp'])

Unnamed: 0,userId,movieId,rating,timestamp
41,1,6711,5.0,1147868622
23,1,3949,5.0,1147868678
2,1,307,5.0,1147868828
8,1,1237,5.0,1147868839
56,1,8154,5.0,1147868865
...,...,...,...,...
25000002,162541,2324,4.5,1240953595
25000013,162541,2788,4.5,1240953598
24999915,162541,47,4.5,1240953606
24999971,162541,1259,4.5,1240953609


In [9]:
# Restrict the table to the sampled index labels; label-based selection
# returns the rows in the order given by `idxs`.
res = res.loc[idxs, :]
res.head(30)

Unnamed: 0,userId,movieId,rating,timestamp
41,1,6711,5.0,1147868622
23,1,3949,5.0,1147868678
2,1,307,5.0,1147868828
8,1,1237,5.0,1147868839
56,1,8154,5.0,1147868865
44,1,7234,4.5,1147868869
20,1,2843,4.5,1147868891
24,1,4144,5.0,1147868898
29,1,4973,4.5,1147869080
37,1,6016,5.0,1147869090


In [10]:
# From here on `ratings` refers to the filtered, per-user sampled table.
ratings = res

In [11]:
# Dense user ids: each original userId gets its position in order of first
# appearance in `ratings`.
user_ids = ratings['userId'].unique()
user_df = pd.DataFrame(list(enumerate(user_ids)), columns=['newUserId', 'oldId'])
user_df.head()

Unnamed: 0,newUserId,oldId
0,0,1
1,1,2
2,2,3
3,3,4
4,4,5


In [12]:
# One dense user id per line.
# The ids are written straight from the column: scanning the whole frame for
# every row was quadratic, and Series.to_string is a display helper whose
# padding is not guaranteed to be stable across pandas versions.
with open('./files_big/users.txt', 'w') as f:
    f.writelines(f'{new_user_id}\n' for new_user_id in user_df['newUserId'])

In [13]:
# Movie ids that occur at least once in the sampled ratings.
valid_movies = ratings['movieId'].unique()
(valid_movies, len(valid_movies))

(array([  6711,   3949,    307, ...,  84983,   7625, 148685]), 14600)

In [14]:
# Load the movie metadata and report how many rows it contains.
movies = pd.read_csv('./ml-25m/movies.csv')
print(movies.shape[0])
movies.head()

62423


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
# Restrict to rated movies *before* de-duplicating titles. Doing it the other
# way round can keep an unrated entry as the "first" occurrence of a title and
# throw away the rated one, whose ratings would then vanish in the inner
# merge that builds the interactions file.
movies = movies.loc[movies.movieId.isin(valid_movies)]
movies = movies.drop_duplicates('title', keep='first')
num_movies = len(movies)
num_movies

14596

In [16]:
# Spot check: look up a single title in the filtered movie table.
movies[movies['title'] == 'Emma (1996)']

Unnamed: 0,movieId,title,genres
820,838,Emma (1996),Comedy|Drama|Romance


In [17]:
# Dense movie ids 0..len(movies)-1 in current row order.
# `assign` returns a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which pandas may flag with a
# SettingWithCopyWarning.
movies = movies.assign(newId=range(len(movies)))
movies.head()

Unnamed: 0,movieId,title,genres,newId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [18]:
# One indicator column per distinct genre label ('|' is the default separator
# of Series.str.get_dummies).
genre_labels = movies['genres'].str.get_dummies().columns
# Remove the placeholder label by name. Slicing off the first column only
# works when the placeholder is present and happens to sort first; otherwise
# a real genre would be dropped.
genres = genre_labels[genre_labels != '(no genres listed)']
genres_df = pd.DataFrame(list(enumerate(genres)), columns=['genreId', 'genre'])

genres_df.head()

Unnamed: 0,genreId,genre
0,0,Action
1,1,Adventure
2,2,Animation
3,3,Children
4,4,Comedy


In [19]:
# Genre name -> genre id as a string. The ids stay strings because they are
# later joined with " " when the mapping is written to disk.
genre_to_id = {
    genre: str(genre_id)
    for genre_id, genre in zip(genres_df.genreId, genres_df.genre)
}

# Dense movie id -> list of genre-id strings; the placeholder label is skipped.
# A dict lookup replaces a DataFrame scan plus Series.to_string per genre.
movie_to_genres = {}
for _, row in movies.iterrows():
    movie_to_genres[row['newId']] = [
        genre_to_id[mg]
        for mg in row['genres'].split('|')
        if mg != '(no genres listed)'
    ]

In [20]:
# Tag assignments for the retained movies, with the tag text lower-cased.
df_tags_movie = pd.read_csv('./ml-25m/tags.csv')
df_tags_movie = df_tags_movie[df_tags_movie['movieId'].isin(valid_movies)]
df_tags_movie = df_tags_movie.assign(tag=df_tags_movie['tag'].str.lower())

# Keep only tags that occur in more than two rows.
df_tags_movie = df_tags_movie.groupby('tag').filter(lambda group: len(group) > 2)

# Tag vocabulary in order of first appearance, then one row per (movie, tag).
tags = df_tags_movie['tag'].unique()
df_tags_movie = df_tags_movie.drop_duplicates(subset=['movieId', 'tag'])

In [21]:
# Tag id table: each tag's id is its position in `tags`.
df_tags = pd.DataFrame(list(enumerate(tags)), columns=['tagId', 'tagName'])
df_tags

Unnamed: 0,tagId,tagName
0,0,classic
1,1,sci-fi
2,2,dark comedy
3,3,great dialogue
4,4,so bad it's good
...,...,...
20127,20127,waatch top pick
20128,20128,pg-13:smoking
20129,20129,pg:rude humor
20130,20130,what a twist!


In [22]:
# One tag name per line, in tagId order (line number == tagId).
# Names are written directly from the column: the previous per-id frame scan
# was quadratic and went through Series.to_string, a display helper whose
# padding is not guaranteed to be stable across pandas versions.
with open('./files_big/tags.txt', 'w') as f:
    f.writelines(f'{tag_name}\n' for tag_name in df_tags.sort_values('tagId').tagName)

In [23]:
# Every movie starts with an empty list so untagged movies still get an entry.
movie_to_tags = {new_id: [] for new_id in range(num_movies)}

# Attach the tag id and the dense movie id to every (movie, tag) row.
tmp_merged = (
    df_tags_movie
    .merge(df_tags, left_on='tag', right_on='tagName')
    .merge(movies, on='movieId')
)

# A single groupby pass replaces filtering the merged frame once per movie.
# Rows keep their relative order inside each group.
for new_id, tag_ids in tmp_merged.groupby('newId')['tagId']:
    movie_to_tags[new_id] = list(tag_ids)

In [24]:
# Sanity check: the number of distinct movieIds should equal num_movies.
movies['movieId'].nunique(), num_movies

(14596, 14596)

In [25]:
# Line i holds the space-separated tag ids of the movie with dense id i
# (an empty line when the movie has no tags).
with open('./files_big/movie_to_tags.txt', 'w') as f:
    lines = [
        ' '.join(map(str, movie_to_tags[movie_id])) + '\n'
        for movie_id in range(len(movie_to_tags))
    ]
    f.writelines(lines)

In [26]:
# Line i holds the space-separated genre ids of the movie with dense id i.
with open('./files_big/movie_to_genres.txt', 'w') as f:
    lines = [' '.join(movie_to_genres[movie_id]) + '\n' for movie_id in range(num_movies)]
    f.writelines(lines)

# One genre name per line, in genreId order (line number == genreId).
# Names come straight from the column instead of Series.to_string, a display
# helper whose padding is not guaranteed to be stable across pandas versions.
with open('./files_big/genres.txt', 'w') as f:
    f.writelines(f'{genre}\n' for genre in genres_df.sort_values('genreId').genre)

In [27]:
# One movie title per line, in newId order (line number == dense movie id).
# Titles are written directly from the column: the previous per-id frame scan
# was quadratic and went through Series.to_string, a display helper whose
# padding is not guaranteed to be stable across pandas versions.
with open('./files_big/movies.txt', 'w') as f:
    f.writelines(f'{title}\n' for title in movies.sort_values('newId').title)

In [28]:
# Display the sampled ratings table before it is exported.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
41,1,6711,5.0,1147868622
23,1,3949,5.0,1147868678
2,1,307,5.0,1147868828
8,1,1237,5.0,1147868839
56,1,8154,5.0,1147868865
...,...,...,...,...
25000077,162541,7147,4.5,1240952343
25000094,162541,63876,5.0,1240952515
24999935,162541,541,5.0,1240952537
24999975,162541,1275,4.5,1240952554


In [29]:
# Translate raw MovieLens ids into the dense ids used by the exported files.
# Both merges use pandas' default inner join, so a rating whose movie is not
# in `movies` is dropped instead of being written with a missing id.
interactions = (
    ratings
    .merge(movies, on='movieId')
    .merge(user_df, left_on='userId', right_on='oldId')
)

# Stable sort: rows are grouped by user without reshuffling each user's rows
# relative to the merge output. The default quicksort gives no such guarantee.
interactions = (
    interactions
    .loc[:, ['newUserId', 'newId']]
    .sort_values(by='newUserId', kind='mergesort')
)

# One "<newUserId> <newId>" pair per line, no header and no index column.
interactions.to_csv('./files_big/ratings.txt', index=False, header=False, sep=' ')

In [30]:
# Dense user id -> list of dense movie ids, in the order the pairs appear in
# the exported interactions file.
ratings_by_user = {}

with open('./files_big/ratings.txt', 'r') as f:
    for raw_line in f:
        uid, mid = raw_line.strip().split()
        ratings_by_user.setdefault(int(uid), []).append(int(mid))

# Persist the mapping as a plain dict.
with open('./files_big/ratings_by_user.pkl', 'wb') as f:
    pickle.dump(ratings_by_user, f)


In [31]:
# Save the filtered movie table (including the dense `newId` column) as CSV
# without the DataFrame index.
movies.to_csv('./ml-25m/movies_cleaned.csv', index=False)