In [1]:
import numpy as np 
import pandas as pd 
import random

In [2]:
# Read only the three columns the rest of the notebook uses.
data = pd.read_csv(
    'QB-video.csv',
    usecols=['user_id', 'item_id', 'click'],
)

In [3]:
# Keep a single row per (user_id, item_id) pair. keep='first' (the pandas
# default) retains the earliest row of each pair, whatever its click value is.
data = data.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

In [4]:
# 10-core filtering: repeatedly discard users and items that have fewer than
# `min_clicks` clicked interactions, until a full pass removes no rows.
min_clicks = 10
pos_data = data.loc[data['click'] == 1].reset_index(drop=True)
while True:
    rows_before = len(pos_data)

    # Users: keep those with enough clicked rows.
    user_id_counts = pos_data['user_id'].value_counts()
    save_user_ids = user_id_counts.loc[user_id_counts >= min_clicks].index
    pos_data = pos_data.loc[pos_data['user_id'].isin(save_user_ids)].reset_index(drop=True)

    # Items: same rule, counted on the rows that survived the user filter.
    item_id_counts = pos_data['item_id'].value_counts()
    save_item_ids = item_id_counts.loc[item_id_counts >= min_clicks].index
    pos_data = pos_data.loc[pos_data['item_id'].isin(save_item_ids)].reset_index(drop=True)

    # Removing items can push users back under the threshold (and vice
    # versa), so stop only when a whole pass left the row count unchanged.
    if len(pos_data) == rows_before:
        break

# The last pass removed nothing, so save_user_ids / save_item_ids are exactly
# the surviving users and items. Apply them to the full table so the
# non-click rows of those users and items are kept as well.
keep_mask = data['user_id'].isin(save_user_ids) & data['item_id'].isin(save_item_ids)
data = data.loc[keep_mask].reset_index(drop=True)
data

Unnamed: 0,user_id,item_id,click
0,3502303,32277,0
1,3502303,1370829,0
2,3502303,1357499,1
3,3502303,1353465,0
4,3502303,1354715,1
...,...,...,...
1975078,3729300,3202887,0
1975079,3729300,1434577,0
1975080,3729300,3202945,1
1975081,3729300,3305030,0


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Replace the raw user and item ids with integer codes produced by
# LabelEncoder. One fitted encoder is kept per column (user_trans, item_trans).
user_trans, item_trans = LabelEncoder(), LabelEncoder()
for column, encoder in (('user_id', user_trans), ('item_id', item_trans)):
    data[column] = encoder.fit_transform(data[column])
data

Unnamed: 0,user_id,item_id,click
0,1707,680,0
1,1707,4153,0
2,1707,2970,1
3,1707,2390,0
4,1707,2609,1
...,...,...,...
1975078,1878,9219,0
1975079,1878,6137,0
1975080,1878,9253,1
1975081,1878,15345,0


In [7]:
# The ids were label-encoded above, so the largest code plus one is the
# number of distinct users / items.
user_num, item_num = (data[column].max() + 1 for column in ('user_id', 'item_id'))
user_num, item_num

(27941, 15383)

In [8]:
# Split the table by feedback: rows with click == 1 are positives, every
# other row is treated as a negative.
pos_data = data.loc[data['click'].eq(1)].reset_index(drop=True)
neg_data = data.loc[data['click'].ne(1)].reset_index(drop=True)

In [9]:
# Group item ids per user: interaction_data holds each user's clicked items,
# dislike_data holds each user's non-clicked items (row order is preserved).
interaction_data = {}
dislike_data = {}
for frame, per_user in ((pos_data, interaction_data), (neg_data, dislike_data)):
    for user, item in zip(frame['user_id'], frame['item_id']):
        per_user.setdefault(user, []).append(item)

In [10]:
# split train/val set in 8:2 ratio 
train_inter = {} 
val_inter = {} 
for user in range(user_num):
    inter_num = len(interaction_data[user])
    assert(inter_num == len(set(interaction_data[user])))
    train_num = int(0.8 * inter_num) 
    random.shuffle(interaction_data[user])
    train_inter[user] = interaction_data[user][:train_num] 
    val_inter[user] = interaction_data[user][train_num:]

In [11]:
# Write each split as plain text, one "user item 1" line per interaction.
# NOTE(review): the validation split is written to 'test.txt', and the rows
# in 'neg.txt' carry the same trailing 1 as the positive files. Confirm that
# downstream readers expect both before changing either.
for path, user_items in (
    ('train.txt', train_inter),
    ('test.txt', val_inter),
    ('neg.txt', dislike_data),
):
    with open(path, 'w') as f:
        for user, items in user_items.items():
            for item in items:
                f.write(f'{user} {item} 1\n')

In [11]:
def _to_list_frame(user_items):
    """Return a two-column frame: one row per user with that user's item list."""
    return pd.DataFrame(list(user_items.items()), columns=['user', 'item_list'])


# Same splits as the text files, stored as one row per user. Each item_list
# cell is a Python list, which to_csv writes out as its string form.
# NOTE(review): neg_data is rebound here from the row-level negative table to
# the per-user list table, so the cell that builds dislike_data cannot be
# re-run after this one without re-running the pos/neg split first.
train_data = _to_list_frame(train_inter)
val_data = _to_list_frame(val_inter)
neg_data = _to_list_frame(dislike_data)

for frame, path in ((train_data, 'train.csv'), (val_data, 'val.csv'), (neg_data, 'neg.csv')):
    frame.to_csv(path, index=False)