In [1]:
import pickle
import pandas as pd
import random

In [2]:
# Load the raw MovieLens ratings table.
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
# DataFrame.astype returns a new frame; assign it back so the integer cast
# of the id columns actually takes effect instead of being discarded.
ratings = ratings.astype({'movieId': int, 'userId': int})
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Keep only the ratings of 4.0 and above.
is_liked = ratings['rating'].ge(4.0)
ratings = ratings[is_liked]
ratings.shape[0]


48580

In [4]:
# Number of remaining ratings per user.
counts = ratings['userId'].value_counts()

# Drop every user that has fewer than 10 remaining ratings.
sparse_user_ids = counts.loc[counts.lt(10)].index
ratings = ratings[~ratings['userId'].isin(sparse_user_ids)]

In [5]:
# Number of distinct users left after filtering.
ratings.groupby('userId').ngroups

579

In [6]:
# Keep only ratings >= 4.0 (a no-op when the earlier filter cell already ran).
ratings = ratings.loc[ratings.rating >= 4.0]

# Private, seeded generator: the per-user cut-off below is random, so without
# a fixed seed every run would export a different dataset. Using a dedicated
# Random instance leaves the module-level `random` state untouched.
history_rng = random.Random(42)

# For each user keep the 10-30 rows with the lowest timestamps.
# Slicing past the end of a group just returns the whole group, so no
# explicit min(len(group), ...) clamp is required.
ratings = (
    ratings
    .sort_values(by=['userId', 'timestamp'])
    .groupby("userId", as_index=False)
    .apply(lambda group: group[:history_rng.randint(10, 30)])
    .reset_index(drop=True)
)

In [7]:
# Map each original userId (in order of first appearance) to a dense 0-based id.
user_ids = ratings['userId'].unique()
user_df = pd.DataFrame(list(enumerate(user_ids)), columns=['newUserId', 'oldId'])
user_df.head(), len(user_ids)

(   newUserId  oldId
 0          0      1
 1          1      2
 2          2      3
 3          3      4
 4          4      5,
 579)

In [8]:
# Write one newUserId per line.
# Values are taken straight from the column instead of re-filtering the frame
# and going through Series.to_string for every row: that was quadratic and
# made the file contents depend on pandas' display formatting.
with open('./files/users.txt', 'w') as f:
    f.writelines(f'{new_user_id}\n' for new_user_id in user_df['newUserId'])

In [9]:
# Distinct movieIds that still have at least one retained rating.
valid_movies = ratings['movieId'].unique()
valid_movies

array([  804,  1210,  2018, ..., 26612, 26401, 27790], dtype=int64)

In [10]:
# Load the movie catalogue and report how many rows it has.
movies = pd.read_csv('./ml-latest-small/movies.csv')
print(movies.shape[0])
movies.head()

9742


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# Collapse repeated titles (first occurrence wins), then keep only the movies
# that appear in the retained ratings.
movies = movies.drop_duplicates(subset='title', keep='first')
has_ratings = movies['movieId'].isin(valid_movies)
movies = movies.loc[has_ratings]
num_movies = movies.shape[0]
num_movies

2255

In [12]:
# Spot check: show the row(s) whose title is exactly 'Emma (1996)'.
movies[movies['title'].eq('Emma (1996)')]

Unnamed: 0,movieId,title,genres
650,838,Emma (1996),Comedy|Drama|Romance


In [13]:
# Give every remaining movie a dense 0-based id in current row order.
# `movies` is a filtered slice of the original frame at this point, so assigning
# a column in place triggers SettingWithCopyWarning; assign() builds a new frame.
movies = movies.assign(newId=range(len(movies)))
movies.head()

Unnamed: 0,movieId,title,genres,newId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
4,5,Father of the Bride Part II (1995),Comedy,3
5,6,Heat (1995),Action|Crime|Thriller,4


In [14]:
# Distinct genre names, derived from the '|'-separated `genres` column.
genre_columns = movies['genres'].str.get_dummies(sep='|').columns
# Drop the placeholder by name rather than by position: `columns[1:]` only
# works while '(no genres listed)' is present and sorts first, and would
# otherwise silently discard a real genre.
genres = genre_columns[genre_columns != '(no genres listed)']
genres_df = pd.DataFrame(list(zip(range(len(genres)), genres)), columns=['genreId','genre'])

genres_df

Unnamed: 0,genreId,genre
0,0,Action
1,1,Adventure
2,2,Animation
3,3,Children
4,4,Comedy
5,5,Crime
6,6,Documentary
7,7,Drama
8,8,Fantasy
9,9,Film-Noir


In [15]:
# Lookup table: genre name -> genreId rendered as a string.
# Ids are kept as strings because they are later joined with " ".join(...).
# A dict replaces the per-genre DataFrame scan + Series.to_string, which was
# slow and produced the empty-Series repr for a genre missing from genres_df.
genre_id_by_name = {
    genre_name: str(genre_id)
    for genre_id, genre_name in zip(genres_df['genreId'], genres_df['genre'])
}

# newId -> list of genre-id strings for that movie.
movie_to_genres = {}

for new_id, genre_field in zip(movies['newId'], movies['genres']):
    # A genre that is missing from genres_df now raises KeyError here instead
    # of being written out as garbage.
    movie_to_genres[new_id] = [
        genre_id_by_name[genre_name]
        for genre_name in genre_field.split('|')
        if genre_name != '(no genres listed)'
    ]

In [16]:
# Load user-applied tags and keep those attached to retained movies.
df_tags_movie = pd.read_csv('./ml-latest-small/tags.csv')
# .copy() gives an independent frame, so the lower-casing below is a plain
# column assignment rather than a write onto a filtered slice
# (SettingWithCopyWarning).
df_tags_movie = df_tags_movie.loc[df_tags_movie.movieId.isin(valid_movies)].copy()
df_tags_movie['tag'] = df_tags_movie['tag'].str.lower()

# Keep only tags that occur on more than two rows (counted before the
# (movieId, tag) de-duplication below).
df_tags_movie = df_tags_movie.groupby('tag').filter(lambda df:df.shape[0] > 2)

# Tag vocabulary in order of first appearance, then one row per (movie, tag).
tags = df_tags_movie.tag.unique()
df_tags_movie = df_tags_movie.drop_duplicates(subset=['movieId', 'tag'])

In [17]:
# Assign each tag a dense 0-based id following the order of `tags`.
df_tags = pd.DataFrame(list(enumerate(tags)), columns=['tagId','tagName'])
df_tags

Unnamed: 0,tagId,tagName
0,0,funny
1,1,highly quotable
2,2,will ferrell
3,3,tom hardy
4,4,drugs
...,...,...
233,233,heartbreaking
234,234,loneliness
235,235,intense
236,236,bad


In [18]:
# Write one tag name per line, ordered by tagId.
# The names are read straight from the column instead of filtering the frame
# and calling Series.to_string once per tag, which was quadratic and tied the
# file contents to pandas' display formatting.
with open('./files/tags.txt', 'w') as f:
    f.writelines(
        f'{tag_name}\n'
        for tag_name in df_tags.sort_values('tagId', kind='stable')['tagName']
    )

In [19]:
# Start with an empty tag list for every movie id 0..num_movies-1.
movie_to_tags = {movie_idx: [] for movie_idx in range(num_movies)}

# Attach tagId (via tag name) and newId (via movieId) to every (movie, tag) row.
tmp_merged = (
    df_tags_movie
    .merge(df_tags, left_on='tag', right_on='tagName')
    .merge(movies, left_on='movieId', right_on='movieId')
)

# One pass over the groups; sort=False keeps first-appearance order of newId
# and row order inside each group.
for new_id, tag_id_column in tmp_merged.groupby('newId', sort=False)['tagId']:
    movie_to_tags[new_id] = list(tag_id_column)

In [20]:
# One line per movie id (0..len-1): its tag ids separated by single spaces;
# a movie without tags yields an empty line.
with open('./files/movie_to_tags.txt', 'w') as f:
    for movie_id in range(len(movie_to_tags)):
        tag_tokens = map(str, movie_to_tags[movie_id])
        f.write(" ".join(tag_tokens) + '\n')

In [21]:
# One line per movie id (0..num_movies-1): its genre ids separated by spaces.
# str(...).strip() makes the join tolerant of ids stored as ints or as
# whitespace-padded strings.
with open('./files/movie_to_genres.txt', 'w') as f:
    f.writelines(
        " ".join(str(genre_id).strip() for genre_id in movie_to_genres[movie_id]) + '\n'
        for movie_id in range(num_movies)
    )

# One genre name per line, ordered by genreId. Names are read straight from
# the column rather than via a per-row filter + Series.to_string.
with open('./files/genres.txt', 'w') as f:
    f.writelines(
        f'{genre_name}\n'
        for genre_name in genres_df.sort_values('genreId', kind='stable')['genre']
    )

In [22]:
# One movie title per line, ordered by newId so that line i is movie i.
# Titles are read straight from the column; the previous per-row filter +
# Series.to_string was quadratic and tied the file contents to pandas'
# display formatting.
with open('./files/movies.txt', 'w') as f:
    f.writelines(
        f'{title}\n'
        for title in movies.sort_values('newId', kind='stable')['title']
    )

In [23]:
# Build (newUserId, newId) pairs by joining ratings to the re-indexed movies
# and users. Both joins are inner joins, so ratings whose movie was filtered
# out of `movies` are dropped.
interactions = ratings.merge(movies, left_on='movieId', right_on='movieId').merge(user_df, left_on='userId', right_on='oldId')

# kind='stable' keeps rows with the same newUserId in their incoming order;
# the default sort algorithm gives no such guarantee and could shuffle a
# user's movies relative to the upstream (userId, timestamp) ordering.
interactions = interactions.loc[:, ['newUserId', 'newId']].sort_values(by='newUserId', kind='stable')

# Space-separated "newUserId newId" per line, no header and no index.
interactions.to_csv('./files/ratings.txt', index=False, header=False, sep=' ')

In [24]:
# user id -> list of movie ids, in the order they appear in ratings.txt.
ratings_by_user = {}

with open('./files/ratings.txt', 'r') as f:
    for raw_line in f:
        # Each line holds exactly two whitespace-separated integers.
        uid, mid = raw_line.strip().split()
        ratings_by_user.setdefault(int(uid), []).append(int(mid))

# Persist the mapping for downstream consumers.
with open('./files/ratings_by_user.pkl', 'wb') as f:
    pickle.dump(ratings_by_user, f)


In [25]:
# Save the filtered movie table (including the newId column) without the index.
movies.to_csv(path_or_buf='./ml-latest-small/movies_cleaned.csv', index=False)