# Run Baselines

## chainRec Goodreads

In [2]:
import tensorflow as tf
import gzip
import pandas as pd
import json

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Pull down and store GoodReads data and user/id map needed for baseline

In [102]:
# pull down user interactions map
!gdown https://drive.google.com/uc?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

!gzip -f ./goodreads_interactions.csv; mv ./goodreads_interactions.csv.gz ./chainRec-master/data/

Downloading...
From: https://drive.google.com/uc?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon
To: /scratch/charles/RankPSL/Baselines/goodreads_interactions.csv
4.32GB [01:40, 43.1MB/s]


In [4]:
# Load the gzip-compressed GoodReads interactions CSV that the chainRec preprocessing starts from.
filename = "goodreads_interactions.csv.gz"
df = pd.read_csv(
    "chainRec-master/data/" + filename,
    sep=',',
    header=0,
    quotechar='"',
    compression='gzip',
)

In [None]:
def load_data(file_name, head = 1000):
    """Read a gzip-compressed JSON-lines file into a list of parsed records.

    Parameters
    ----------
    file_name : str or path-like
        Path of the ``.gz`` file; every line must hold one JSON document.
    head : int or None
        Maximum number of lines to read. A falsy value (``None`` or ``0``)
        disables the limit and the whole file is read.

    Returns
    -------
    list
        One parsed JSON value per line read, in file order.
    """
    records = []
    with gzip.open(file_name) as fin:
        for line_no, line in enumerate(fin):
            # Stop *before* parsing line number `head` so that exactly `head`
            # records are returned (the limit used to be overshot by one).
            if head and line_no >= head:
                break
            records.append(json.loads(line))
    return records

In [None]:
# NOTE(review): `os` and `DIR` are not imported/defined in any cell visible here — confirm they exist on a fresh kernel.
# head=None disables load_data's line limit, so the whole file is read.
books = load_data(os.path.join(DIR, 'goodreads_books_comics_graphic.json.gz'), head=None)

In [None]:
# NOTE(review): `os` and `DIR` are not imported/defined in any cell visible here — confirm they exist on a fresh kernel.
# head=None disables load_data's line limit, so the whole file is read.
interactions = load_data(os.path.join(DIR, 'goodreads_interactions_comics_graphic.json.gz'), head=None)

In [None]:
# NOTE(review): `os` and `DIR` are not imported/defined in any cell visible here — confirm they exist on a fresh kernel.
# head=None disables load_data's line limit, so the whole file is read.
reviews = load_data(os.path.join(DIR, 'goodreads_reviews_dedup.json.gz'), head=None)

Apply preprocessing mentioned in paper: 

"We apply the same preprocessing criteria for all five datasets: we
discard users who have never reached the last stage of any behavior
chain and items with fewer than 5 associated interactions in
the system. Statistics and distributions of the above datasets after
preprocessing are included in Table 2. For each dataset, we sample
100,000 interaction chains for validation and another 100,000
for testing. Within each of these two sets, each interaction chain
corresponds to a different user. Data and code are available at
https://github.com/MengtingWan/chainRec."

In [5]:
# Items with fewer than this many interactions are discarded (the paper's preprocessing rule).
MIN_ITEM_INTERACTIONS = 5

# Keep only users who reached the last chain stage (wrote a review) at least once.
# A vectorised transform + boolean mask replaces groupby.filter, which calls a Python
# lambda once per group and is impractically slow on the full interactions table.
reviews_per_user = df.groupby('user_id')['is_reviewed'].transform('sum')
filtered_goodreads_df = df[reviews_per_user > 0]

# Then drop rarely seen books. "Fewer than 5" are discarded, so exactly 5 interactions is kept (>=).
interactions_per_book = filtered_goodreads_df.groupby('book_id')['book_id'].transform('count')
filtered_goodreads_df = filtered_goodreads_df[interactions_per_book >= MIN_ITEM_INTERACTIONS]

KeyboardInterrupt: 

In [None]:
# Shape of the array of distinct user_id values left after filtering, i.e. how many users remain.
filtered_goodreads_df.user_id.unique().shape

In [106]:
# Small working subset: the first 10,000 rows of the filtered frame (a prefix, not a random sample).
sampled_filtered_goodreads_df = filtered_goodreads_df.iloc[:10000]

Build the user-item map needed for the chainRec method

In [None]:
# Monotonic behaviour chain from the paper, ordered from weakest to strongest signal.
# Bound to a name so it is no longer a stray expression statement with no effect.
# NOTE(review): presumably index i matches the stage value i returned by chain_stage — confirm.
CHAIN_STAGES = ['shelve', 'read', 'rate', 'recommend']

# Output location: the map is written to DATA_DIR + DATA_NAME + ".user_item_map".
DATA_DIR = './chainRec-master/data/'
DATA_NAME = 'goodreads'

user_item_map = []

# One group per user; each group becomes one line of the output file.
df_by_user = filtered_goodreads_df.groupby('user_id')

def chain_stage(row):
    """Return the furthest chain stage (0-3) reached by one interaction row.

    Checks run from the strongest signal to the weakest and the first match wins:
    3 when ``row.is_reviewed == 1``, 2 when ``row.rating > 0``,
    1 when ``row.is_read == 1``, and 0 otherwise.
    """
    stage = 0
    if row.is_reviewed == 1:
        stage = 3
    elif row.rating > 0:
        stage = 2
    elif row.is_read == 1:
        stage = 1
    return stage
    
# Stream one record per user straight to disk so the full map never has to sit in memory.
# Each line is the repr of {'user_id': ..., 'items': {book_id (as str): chain stage}}.
# The `with` block guarantees the file is flushed and closed even if a row fails part-way.
with open(DATA_DIR+DATA_NAME+".user_item_map", "w") as f:
    for user_id, user_df in df_by_user:
        # itertuples keeps each column's own dtype (iterrows upcasts a row to one common dtype,
        # which can turn integer ids into floats) and is much faster.
        items = {str(row.book_id): chain_stage(row) for row in user_df.itertuples(index=False)}
        f.write(str({'user_id': user_id, 'items': items}))
        f.write('\n')

In [144]:
# NOTE(review): the visible cell above sets user_item_map = [] and never appends to it, so a fresh
# run would display 0 here; the count shown below presumably comes from an earlier kernel state — confirm.
len(user_item_map)

97808

Write user item map to file

In [None]:
# zip up file
file_name = DATA_DIR+DATA_NAME+".user_item_map"
!rm "{file_name}.gz"
!gzip $file_name

## Yoochoose

In [123]:
import numpy as np
import pandas as pd
import gzip
import sys
# Location of the gzip-compressed user-item map (one dict repr per line).
DATA_DIR = './chainRec-master/data/'
DATA_NAME = 'goodreads'
DATA_PATH = DATA_DIR+DATA_NAME+".user_item_map.gz"

# Running totals, filled in while the file is scanned:
#   n_user / n_item  -> largest id seen + 1
#   n_interaction    -> total number of (item, stage) pairs over all users
n_user = 0
n_item = 0
n_interaction = 0
try:
    # uid -> int array with one row per interaction: column 0 = item id, column 1 = stage value
    user_item_map = {}
    with gzip.open(DATA_PATH) as fin:
        for l in fin:
            # SECURITY: eval() executes whatever expression the line contains. Only load files you
            # generated yourself; consider ast.literal_eval or JSON once the file format is pinned down.
            d = eval(l)
            uid = int(d['user_id'])
            n_user = max(n_user, uid+1)

            # reshape keeps the array two-dimensional even when a user has no items
            items = np.array(list(d['items'].items()), dtype=int).reshape(-1, 2)
            n_interaction += items.shape[0]

            if items.shape[0]:
                n_item = max(n_item, items[:,0].max()+1)
            user_item_map[uid] = items
except (OSError, EOFError, NameError, SyntaxError) as err:
    # OSError/EOFError: missing, unreadable or truncated archive.
    # NameError/SyntaxError: a line that is not a valid dict repr.
    print(err)
    print("Fail to load", DATA_PATH, ". Please check if the file exists and is in a correct format!")
    # Re-raise so later cells never run against a missing or half-loaded map.
    raise

In [129]:
# Interaction array stored for user id 0: column 0 holds the item ids, column 1 the values of the 'items' dict.
items = user_item_map[0]
# Elementwise test for values above 4 in column 1; the full boolean array is displayed as the cell output.
items[:, 1] > 4

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,