In [1]:
import pandas as pd
from tqdm import tqdm
import datetime

In [None]:
# Load the ratings table and derive calendar features from the Unix timestamp.
# NOTE(review): this path is under data/ml-1m while movies/tags below are read
# from data/ml-latest-small -- confirm this is the intended ratings file.
ratings = pd.read_csv('data/ml-1m/ratings.csv')

# datetime.fromtimestamp() without a tz argument converts to the machine's
# local timezone, so month/day/hour depend on where the notebook is executed.
for part in ('month', 'day', 'hour'):
    ratings[part] = ratings['timestamp'].map(
        lambda ts, part=part: getattr(datetime.datetime.fromtimestamp(ts), part)
    )

print(ratings.shape)
ratings.head()

(100836, 7)


Unnamed: 0,userId,movieId,rating,timestamp,month,day,hour
0,1,1,4.0,964982703,7,31,1
1,1,3,4.0,964981247,7,31,1
2,1,6,4.0,964982224,7,31,1
3,1,47,5.0,964983815,7,31,2
4,1,50,5.0,964982931,7,31,1


In [3]:
# Load the movie metadata table and preview its first rows.
movies_path = 'data/ml-latest-small/movies.csv'
movies = pd.read_csv(movies_path)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Collect every distinct genre label; each movie stores its genres as a single
# '|'-separated string.
genre_vocab = {
    label
    for genre_field in movies['genres']
    for label in genre_field.split('|')
}
len(genre_vocab)

20

In [5]:
# Persist the vocabulary, one genre per line, in sorted order.
with open('data/ml-latest-small/genre_vocab.txt', 'w') as vocab_file:
    ordered_genres = sorted(genre_vocab)
    vocab_file.write('\n'.join(ordered_genres))

In [6]:
# Load the user-supplied tags and preview them.
# NOTE(review): `tags` is not referenced by any other cell visible in this notebook.
tags_path = 'data/ml-latest-small/tags.csv'
tags = pd.read_csv(tags_path)
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
# Attach title/genres to every rating. A left join keeps each rating row even
# when its movieId has no match in `movies` (title/genres are then NaN).
df = ratings.merge(movies, how='left', on='movieId').reset_index(drop=True)
print(df.shape)
df.head()

(100836, 9)


Unnamed: 0,userId,movieId,rating,timestamp,month,day,hour,title,genres
0,1,1,4.0,964982703,7,31,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,7,31,1,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,7,31,1,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,7,31,2,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,7,31,1,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [8]:
# Distinct user ids, in order of first appearance in the merged frame.
users = pd.unique(df['userId'])
len(users)

610

In [15]:
# Thresholds controlling the per-user split (previously inline magic numbers).
MIN_RATINGS_PER_USER = 20     # users with fewer ratings than this are skipped
TEST_FRACTION = 0.2           # share of a user's history held out as test targets
MAX_TEST_PER_USER = 20        # upper bound on held-out ratings per user
TRAIN_SEQ_FRACTION = 0.5      # share of the remaining history used as sequence targets
MAX_TRAIN_SEQ_PER_USER = 200  # upper bound on sequence training examples per user

# Column stores for the sequence datasets. The names are kept as module-level
# lists because the cells that build the output DataFrames read them directly.
test_user_ids = []
test_feat_genres = []
test_feat_movie_ids = []
test_feat_ratings = []
test_feat_months = []
test_feat_days = []
test_feat_hours = []
test_label_genres = []
test_label_movie_ids = []
test_label_ratings = []
test_label_months = []
test_label_days = []
test_label_hours = []

train_user_ids = []
train_feat_genres = []
train_feat_movie_ids = []
train_feat_ratings = []
train_feat_months = []
train_feat_days = []
train_feat_hours = []
train_label_genres = []
train_label_movie_ids = []
train_label_ratings = []
train_label_months = []
train_label_days = []
train_label_hours = []

# vanilla dataset: one DataFrame per user, concatenated in a later cell
train_vanilla_dfs = []

# field name -> (feature-sequence sink, label sink)
_test_sinks = {
    'genre': (test_feat_genres, test_label_genres),
    'movie': (test_feat_movie_ids, test_label_movie_ids),
    'rating': (test_feat_ratings, test_label_ratings),
    'month': (test_feat_months, test_label_months),
    'day': (test_feat_days, test_label_days),
    'hour': (test_feat_hours, test_label_hours),
}
_train_sinks = {
    'genre': (train_feat_genres, train_label_genres),
    'movie': (train_feat_movie_ids, train_label_movie_ids),
    'rating': (train_feat_ratings, train_label_ratings),
    'month': (train_feat_months, train_label_months),
    'day': (train_feat_days, train_label_days),
    'hour': (train_feat_hours, train_label_hours),
}


def _pop_examples(uid, history, n_examples, user_ids, sinks):
    """Turn the tail of a user's history into (prefix, next-item) examples.

    For each of ``n_examples`` rounds, the last element of every list in
    ``history`` is popped and stored as the label, and the remaining elements
    are comma-joined and stored as the feature sequence.

    Args:
        uid: user id; appended to ``user_ids`` as a string once per example.
        history: dict mapping field name -> list of str, ordered oldest first.
            The lists are mutated in place (shortened by ``n_examples``).
        n_examples: number of trailing items to convert into examples.
        user_ids: list receiving ``str(uid)`` for every example.
        sinks: dict mapping field name -> (feature list, label list).
    """
    for _ in range(n_examples):
        user_ids.append(str(uid))
        for field, values in history.items():
            feat_sink, label_sink = sinks[field]
            label_sink.append(values.pop())
            feat_sink.append(','.join(values))


for uid in tqdm(users):
    # NOTE(review): sort_values defaults to kind='quicksort', which is not a
    # stable sort, so ratings sharing a timestamp have no guaranteed order.
    user_rows = df[df['userId'] == uid].sort_values('timestamp').reset_index(drop=True)
    if len(user_rows) < MIN_RATINGS_PER_USER:
        continue

    history = {
        'movie': user_rows['movieId'].astype(str).tolist(),
        'rating': user_rows['rating'].astype(str).tolist(),
        'genre': user_rows['genres'].astype(str).tolist(),
        'month': user_rows['month'].astype(str).tolist(),
        'day': user_rows['day'].astype(str).tolist(),
        'hour': user_rows['hour'].astype(str).tolist(),
    }

    # generate test dataset: the most recent ratings become held-out targets
    test_size = min(int(len(user_rows) * TEST_FRACTION), MAX_TEST_PER_USER)
    _pop_examples(uid, history, test_size, test_user_ids, _test_sinks)

    # generate train vanilla dataset: everything left after removing the test
    # tail. The DataFrame copies the list contents, so the pops below do not
    # affect it.
    train_vanilla_dfs.append(pd.DataFrame({
        "userId": [uid] * len(history['rating']),
        "movieId": history['movie'],
        "rating": history['rating'],
        "genre": history['genre'],
        "month": history['month'],
        "day": history['day'],
        "hour": history['hour']
    }))

    # generate train dataset for seq: keep peeling targets off the remaining tail
    train_size = min(int(len(history['rating']) * TRAIN_SEQ_FRACTION), MAX_TRAIN_SEQ_PER_USER)
    _pop_examples(uid, history, train_size, train_user_ids, _train_sinks)

100%|██████████| 610/610 [00:01<00:00, 536.11it/s]


In [20]:
# Vanilla datasets (one row per rating).
# Train: the per-user frames collected by the split loop, stacked together.
train_vanilla_df = pd.concat(train_vanilla_dfs)
print("train vanilla shape: {}".format(train_vanilla_df.shape))
train_vanilla_df.to_csv('data/ml-latest-small/vanilla_train.csv', index=False)

# Test: one row per held-out rating, assembled from the test label lists.
test_vanilla_columns = [
    ("userId", test_user_ids),
    ("movieId", test_label_movie_ids),
    ("genre", test_label_genres),
    ("rating", test_label_ratings),
    ("month", test_label_months),
    ("day", test_label_days),
    ("hour", test_label_hours),
]
test_vanilla_df = pd.DataFrame(dict(test_vanilla_columns))
print("test vanilla shape: {}".format(test_vanilla_df.shape))
test_vanilla_df.to_csv('data/ml-latest-small/vanilla_test.csv', index=False)

train vanilla shape: (92708, 7)
test vanilla shape: (8128, 7)


In [21]:


# Sequence datasets: each row pairs a comma-joined history prefix (the *Seq
# columns) with the next interaction (the target* columns). Train and test
# share one column layout.
seq_columns = [
    'userId',
    'movieIdSeq', 'genreSeq', 'ratingSeq', 'monthSeq', 'daySeq', 'hourSeq',
    'targetMovieId', 'targetGenre', 'targetRating',
    'targetMonth', 'targetDay', 'targetHour',
]

train_values = [
    train_user_ids,
    train_feat_movie_ids, train_feat_genres, train_feat_ratings,
    train_feat_months, train_feat_days, train_feat_hours,
    train_label_movie_ids, train_label_genres, train_label_ratings,
    train_label_months, train_label_days, train_label_hours,
]
train_df = pd.DataFrame(dict(zip(seq_columns, train_values)))
print("train seq shape: {}".format(train_df.shape))
train_df.to_csv('data/ml-latest-small/sequence_train.csv', index=False)

test_values = [
    test_user_ids,
    test_feat_movie_ids, test_feat_genres, test_feat_ratings,
    test_feat_months, test_feat_days, test_feat_hours,
    test_label_movie_ids, test_label_genres, test_label_ratings,
    test_label_months, test_label_days, test_label_hours,
]
test_df = pd.DataFrame(dict(zip(seq_columns, test_values)))
print("test seq shape: {}".format(test_df.shape))
test_df.to_csv('data/ml-latest-small/sequence_test.csv', index=False)

train seq shape: (34953, 13)
test seq shape: (8128, 13)


In [22]:
# Preview the first rows of the sequence training frame.
train_df.head()

Unnamed: 0,userId,movieIdSeq,genreSeq,ratingSeq,monthSeq,daySeq,hourSeq,targetMovieId,targetGenre,targetRating,targetMonth,targetDay,targetHour
0,1,"1210,804,2018,2628,2826,3578,3617,3744,2858,10...","Action|Adventure|Sci-Fi,Comedy|Romance,Animati...","5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4....","7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,...","31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,3...","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",3702,Action|Adventure|Sci-Fi,5.0,7,31,2
1,1,"1210,804,2018,2628,2826,3578,3617,3744,2858,10...","Action|Adventure|Sci-Fi,Comedy|Romance,Animati...","5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4....","7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,...","31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,3...","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",1206,Crime|Drama|Sci-Fi|Thriller,5.0,7,31,2
2,1,"1210,804,2018,2628,2826,3578,3617,3744,2858,10...","Action|Adventure|Sci-Fi,Comedy|Romance,Animati...","5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4....","7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,...","31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,3...","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",1240,Action|Sci-Fi|Thriller,5.0,7,31,2
3,1,"1210,804,2018,2628,2826,3578,3617,3744,2858,10...","Action|Adventure|Sci-Fi,Comedy|Romance,Animati...","5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4....","7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,...","31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,3...","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",1270,Adventure|Comedy|Sci-Fi,5.0,7,31,2
4,1,"1210,804,2018,2628,2826,3578,3617,3744,2858,10...","Action|Adventure|Sci-Fi,Comedy|Romance,Animati...","5.0,4.0,5.0,4.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4....","7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,...","31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,3...","1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,...",2291,Drama|Fantasy|Romance,5.0,7,31,2


In [None]:
# Map each genre name to a 1-based integer id. Ids start at 1 so that 0 stays
# available as the fallback passed to genre_map.get(..., 0) when encoding.
# The vocabulary is a set of strings, whose iteration order can change between
# interpreter runs (string hash randomisation); sorting first makes the ids
# reproducible and aligned with the sorted lines of genre_vocab.txt.
genre_map = {g: i for i, g in enumerate(sorted(genre_vocab), start=1)}
genre_map

In [None]:
# Encode the genre history of the first two training rows (index labels 0 and 1)
# as nested lists of genre ids; genres missing from genre_map become 0.
# After the loop, genre_seq holds the encoding of the last row processed.
for idx, row in train_df.iterrows():
    if idx == 2:
        break
    genre_seq = []
    for movie_genres in row['genreSeq'].split(','):
        genre_seq.append([genre_map.get(name, 0) for name in movie_genres.split('|')])

In [None]:
# Display the nested genre-id lists produced by the encoding loop.
genre_seq

In [None]:
# Toy sequence of three integer ids used for the truncation/padding experiments.
seq = [18, 12, 17]

In [None]:
# Fixed length that the toy sequence is truncated to and zero-padded up to.
max_len = 5
max_len

In [None]:
# All-zero buffer with max_len slots.
out = [0 for _ in range(max_len)]
out

In [None]:
# Keep only the last max_len items, i.e. drop the head of an over-long sequence.
# The slice leaves a shorter sequence unchanged, so no explicit length check is
# needed. Gotcha: with max_len == 0 the slice is seq[-0:], which is the whole list.
seq = seq[-max_len:]
seq

In [None]:
# Right-pad with zeros: copy seq into the front of a max_len-long zero buffer.
# Raises IndexError if seq is longer than max_len, so truncate first.
out_seq = [0] * max_len
for position in range(len(seq)):
    out_seq[position] = seq[position]

In [None]:
# Display the zero-padded sequence.
out_seq

In [None]:
# 10 x 21 zero matrix built from independent row lists (no row is shared).
out_seq = [[0 for _ in range(21)] for _ in range(10)]
out_seq

In [None]:
# Inspect the raw comma-joined genre history string of the first training row.
train_df.loc[0, 'genreSeq']

In [None]:
# List the column names of the sequence training frame.
train_df.columns

In [None]:
# Inspect the target genre string of the first training row (index label 0).
train_df.loc[0, 'targetGenre']

In [2]:
import pandas as pd

In [4]:
# Load the MovieLens-1M ratings file: '::'-delimited, no header row.
# A separator longer than one character is treated as a regular expression and
# is only supported by the python parser; naming the engine explicitly avoids
# the ParserWarning pandas emits when it has to fall back from the C engine.
# Note: this rebinds `ratings`, replacing the frame loaded earlier in the notebook.
ratings = pd.read_csv(
    'data/ml-1m/ratings.dat',
    sep='::',
    engine='python',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
ratings.head()

  ratings = pd.read_csv('data/ml-1m/ratings.dat', sep='::', header=None, names=['userId', 'movieId', 'rating', 'timestamp'])


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Load the ml-latest-small ratings for comparison with the ml-1m layout.
# Note: this rebinds `ratings` again.
ratings_small_path = 'data/ml-latest-small/ratings.csv'
ratings = pd.read_csv(ratings_small_path)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Load the ml-latest-small movie metadata and preview it.
movies_small_path = 'data/ml-latest-small/movies.csv'
movies = pd.read_csv(movies_small_path)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# Load the MovieLens-1M movie file: '::'-delimited, no header row, read as
# ISO-8859-1. The multi-character separator needs the python parser; naming the
# engine explicitly avoids the ParserWarning from the C-engine fallback.
# Note: this rebinds `movies`, replacing the ml-latest-small frame.
movies = pd.read_csv(
    'data/ml-1m/movies.dat',
    sep='::',
    engine='python',
    header=None,
    names=['movieId', 'title', 'genres'],
    encoding="ISO-8859-1",
)
movies.head()

  movies = pd.read_csv('data/ml-1m/movies.dat', sep='::', header=None, names=['movieId', 'title', 'genres'], encoding="ISO-8859-1")


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
