<a href="https://colab.research.google.com/github/flywithu/cornac/blob/master/examples/RecVAE_Example_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install cornac==1.17 bottleneck



In [None]:
# Detect whether the notebook is running on Google Colab by probing for its package.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; a bare `except` would also
    # hide unrelated problems such as KeyboardInterrupt.
    IN_COLAB = False

In [None]:
# Root folder for data and outputs: a Google Drive folder on Colab,
# the current working directory everywhere else.
if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/drive')
    FILE_PREFIX = "/content/drive/MyDrive/mycornac"
else:
    FILE_PREFIX = "."

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Put FILE_PREFIX at the front of the import path, unless it is already listed.
already_on_path = FILE_PREFIX in sys.path
if not already_on_path:
    sys.path.insert(0, FILE_PREFIX)

In [None]:
import os
import numpy as np
import random
import torch
import cornac
SEED = 42


def seed_everything(seed):
    """Seed Python, NumPy and PyTorch random generators and configure cuDNN.

    Args:
        seed: integer seed applied to every generator below.

    Side effects:
        Sets the PYTHONHASHSEED environment variable, seeds the global
        `random`, `np.random` and torch (CPU and CUDA) generators, and sets
        the cuDNN flags `deterministic=True`, `benchmark=False`,
        `enabled=False`.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy global generators.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # PyTorch generators: CPU, current CUDA device, all CUDA devices.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # cuDNN flags.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False


In [None]:
# Input folder (read for ratings.csv) and output folder, both under FILE_PREFIX.
data_set = FILE_PREFIX + "/data/ml-20m"
output_set = FILE_PREFIX + "/data/20m"

In [None]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sn
sn.set()

import pandas as pd
import tensorflow as tf
import bottleneck as bn

In [None]:
# Read the ratings table; the first row of the file holds the column names.
ratings_path = os.path.join(data_set, 'ratings.csv')
raw_data = pd.read_csv(ratings_path, header=0)


In [None]:
# Keep only the rows whose rating is strictly above 3.5.
above_threshold = raw_data['rating'] > 3.5
raw_data = raw_data.loc[above_threshold]


In [None]:
# Preview the first rows that survived the rating filter.
raw_data.head()


Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703


In [None]:
def get_count(tp, id):
    """Count how many rows of `tp` share each distinct value of column `id`.

    Args:
        tp: DataFrame containing the column named by `id`.
        id: name of the column to group by.

    Returns:
        The result of a group-by `size()` taken with `as_index=False`. In the
        output saved in this notebook that is a DataFrame with the `id` column
        and a 'size' column, on a plain positional index.
    """
    id_only = tp[[id]]
    return id_only.groupby(id, as_index=False).size()

In [None]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop the ratings of rarely-rated movies and of low-activity users.

    Args:
        tp: DataFrame of ratings with at least 'userId' and 'movieId' columns.
        min_uc: keep only users that have at least this many rows; 0 disables
            the user filter.
        min_sc: keep only movies that have at least this many rows; 0 disables
            the movie filter.

    Returns:
        (tp, usercount, itemcount): the filtered frame, plus the per-user and
        per-movie counts produced by get_count on the filtered frame.
    """
    # Counts are looked up by id *value* (value_counts is keyed by the id).
    # Selecting ids through the index of a count table is wrong whenever that
    # table carries a plain positional index: row positions would then be
    # compared against ids.

    # Only keep the triplets for items which were clicked on by at least min_sc users.
    if min_sc > 0:
        rows_per_movie = tp['movieId'].map(tp['movieId'].value_counts())
        tp = tp[rows_per_movie >= min_sc]

    # Only keep the triplets for users who clicked on at least min_uc items.
    # After doing this, some of the items will have less than min_sc users,
    # but that should only be a small proportion.
    if min_uc > 0:
        rows_per_user = tp['userId'].map(tp['userId'].value_counts())
        tp = tp[rows_per_user >= min_uc]

    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
    return tp, usercount, itemcount

Only keep users who have clicked on at least 5 items (the default `min_uc=5`; item filtering is off by default because `min_sc=0`)


In [None]:
# Apply the filter with its default thresholds spelled out:
# users need at least 5 rows, movies are not filtered.
raw_data, user_activity, item_popularity = filter_triplets(
    raw_data, min_uc=5, min_sc=0
)


*****************
        userId  size
0            1    88
1            2    43
2            3   145
3            4    16
4            5    50
...        ...   ...
138282  138489    27
138283  138490    86
138284  138491     5
138285  138492    61
138286  138493   301

[138287 rows x 2 columns]
*****************
          userId  movieId  rating   timestamp
6              1      151     4.0  1094785734
7              1      223     4.0  1112485573
8              1      253     4.0  1112484940
9              1      260     4.0  1112484826
10             1      293     4.0  1112484703
...          ...      ...     ...         ...
19972658  138286     2941     4.0   956600243
19972659  138286     2946     5.0   956600210
19972660  138286     3072     4.0   956600423
19972661  138286     3159     5.0   956600061
19972663  138286     3471     5.0   956600893

[9857803 rows x 4 columns]


In [None]:
# Row counts of the filtered ratings and of the two count tables.
n_events = raw_data.shape[0]
n_active_users = user_activity.shape[0]
n_movies = item_popularity.shape[0]

# Share of the user x movie grid that has a rating.
sparsity = float(n_events) / (n_active_users * n_movies)

summary_template = "After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)"
print(summary_template % (n_events, n_active_users, n_movies, sparsity * 100))

After filtering, there are 9857803 watching events from 136477 users and 20649 movies (sparsity: 0.350%)


In [None]:
# Take the user ids from the filtered ratings themselves. `user_activity` comes
# from a group-by with as_index=False, so its index is a row position (0..n-1)
# rather than a user id; using it would split on positions instead of ids.
# Sorting gives the same ascending id order a group-by produces.
unique_uid = np.sort(raw_data['userId'].unique())

# Shuffle the users with a fixed seed so the split below is reproducible.
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [None]:
# Length of the permutation, i.e. how many users are being shuffled.
len(idx_perm)

136477

In [None]:
# create train/validation/test users
n_users = unique_uid.size
n_heldout_users = 10000

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [None]:
# Number of training users.
len(tr_users)

116477

In [None]:
# Number of validation users.
len(vd_users)

10000

In [None]:
# Number of test users.
len(te_users)

10000

In [None]:
# Ratings that belong to the training users.
is_train_user = raw_data['userId'].isin(tr_users)
train_plays = raw_data.loc[is_train_user]
train_plays


Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703
...,...,...,...,...
19709802,136476,1019,5.0,842660232
19709803,136476,1027,4.0,842660064
19709804,136476,1031,4.0,842660320
19709805,136476,1035,5.0,842660064


In [None]:
# Distinct movie ids that occur in the training ratings.
unique_sid = train_plays['movieId'].unique()
unique_sid

array([   151,    223,    253, ..., 129313,   6745, 126106])

In [None]:
# Map each raw id to its position in unique_sid / unique_uid.
show2id = {sid: position for position, sid in enumerate(unique_sid)}
profile2id = {pid: position for position, pid in enumerate(unique_uid)}

In [None]:
# Folder that receives the processed files.
pro_dir = os.path.join(output_set, 'pro_sg')

# exist_ok=True replaces the separate existence check, so nothing can slip in
# between the check and the creation.
os.makedirs(pro_dir, exist_ok=True)

# Write the ids one per line, in the order of unique_sid / unique_uid.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)
with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
    f.writelines('%s\n' % uid for uid in unique_uid)

In [None]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's rows into a kept part and a held-out part.

    Users with at least 5 rows have int(test_prop * n_rows) randomly chosen
    rows moved to the held-out frame; users with fewer rows stay entirely in
    the kept frame.

    Args:
        data: DataFrame with a 'userId' column.
        test_prop: fraction of a user's rows to hold out.

    Returns:
        (data_tr, data_te): kept rows and held-out rows. When one side has no
        rows, an empty frame with the columns of `data` is returned for it
        (pd.concat would raise on an empty list).

    Side effects:
        Re-seeds NumPy's global generator with 98765 on every call and prints
        a progress line every 1000 users.
    """
    tr_list, te_list = [], []

    # Fixed seed: the same input always yields the same held-out rows.
    np.random.seed(98765)

    for i, (_, group) in enumerate(data.groupby('userId')):
        n_items_u = len(group)

        if n_items_u >= 5:
            n_held_out = int(test_prop * n_items_u)
            held_out = np.zeros(n_items_u, dtype='bool')
            held_out[np.random.choice(n_items_u, size=n_held_out, replace=False)] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # Fall back to an empty frame with the input's columns when a side is empty.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0].copy()
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0].copy()

    return data_tr, data_te

In [None]:
# Ratings of the validation users, restricted to movies listed in unique_sid.
is_vd_user = raw_data['userId'].isin(vd_users)
is_known_movie = raw_data['movieId'].isin(unique_sid)
vad_plays = raw_data.loc[is_vd_user & is_known_movie]

In [None]:
# Split each validation user's rows, using the default held-out fraction.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays, test_prop=0.2)


0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [None]:
# Ratings of the test users, restricted to movies listed in unique_sid.
is_te_user = raw_data['userId'].isin(te_users)
is_known_movie = raw_data['movieId'].isin(unique_sid)
test_plays = raw_data.loc[is_te_user & is_known_movie]

In [None]:
# Split each test user's rows, using the default held-out fraction.
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays, test_prop=0.2)


0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


Save the data into (user_index, item_index) format


In [None]:
def numerize(tp):
    """Translate raw user and movie ids into their integer indices.

    Args:
        tp: DataFrame with 'userId' and 'movieId' columns.

    Returns:
        DataFrame with columns 'uid' and 'sid', obtained by mapping 'userId'
        through the global profile2id and 'movieId' through the global
        show2id. An id missing from its dictionary maps to NaN instead of
        raising.
    """
    indexed_columns = {
        'uid': tp['userId'].map(profile2id),
        'sid': tp['movieId'].map(show2id),
    }
    return pd.DataFrame(data=indexed_columns, columns=['uid', 'sid'])

In [None]:
# Index the training ratings and write them to train.csv without the row index.
train_csv_path = os.path.join(pro_dir, 'train.csv')
train_data = numerize(train_plays)
train_data.to_csv(train_csv_path, index=False)

In [None]:
# Index the kept part of the validation ratings and write it to validation_tr.csv.
validation_tr_csv_path = os.path.join(pro_dir, 'validation_tr.csv')
vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(validation_tr_csv_path, index=False)

In [None]:
# Index the held-out part of the validation ratings and write it to validation_te.csv.
validation_te_csv_path = os.path.join(pro_dir, 'validation_te.csv')
vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(validation_te_csv_path, index=False)

In [None]:
# Index the kept part of the test ratings and write it to test_tr.csv.
test_tr_csv_path = os.path.join(pro_dir, 'test_tr.csv')
test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(test_tr_csv_path, index=False)

In [None]:
# Index the held-out part of the test ratings and write it to test_te.csv.
test_te_csv_path = os.path.join(pro_dir, 'test_te.csv')
test_data_te = numerize(test_plays_te)
test_data_te.to_csv(test_te_csv_path, index=False)