In [1]:
import numpy as np
import pandas as pd
import pickle
from pathlib import Path

In [2]:
# Artifact folders are located two levels above the resolved working directory.
base_artifacts = Path.cwd().resolve().parents[1] / 'CausalI2I_artifacts'
data_path_goodreads = base_artifacts.joinpath('Datasets', 'Processed', 'goodreads')
data_path_sequels = base_artifacts.joinpath('Datasets', 'Sequels')

# Existing train/test tables, stacked into one frame with a fresh index.
train = pd.read_csv(data_path_goodreads / 'train.csv')
test = pd.read_csv(data_path_goodreads / 'test.csv')
full_data = pd.concat([train, test], ignore_index=True)

# item_dict maps item id -> title.
# NOTE(review): pickle can execute arbitrary code on load; only open trusted files.
with (data_path_goodreads / 'item_dict.pkl').open('rb') as item_dict_file:
    item_dict = pickle.load(item_dict_file)

n_users = len(train['user_id'].unique())
n_items = len(item_dict)

# Reverse lookup title -> item id; if several ids share a title, the last one wins.
title2id = {title: item_id for item_id, title in item_dict.items()}

### Find titles that:
 * Belong to a series, and
 * Whose series has more than one book in the data

In [3]:
# Title -> series metadata; each entry is read via its 'series' and 'number' keys.
# NOTE(review): pickle can execute arbitrary code on load; only open trusted files.
with (data_path_sequels / 'name2series.pkl').open('rb') as series_file:
    name2series = pickle.load(series_file)

In [4]:
# Keep only the titles for which series metadata was precomputed.
all_titles = [*item_dict.values()]
ser_titles = list(filter(lambda title: title in name2series, all_titles))
print(f'Number of items with precomputed series: {len(ser_titles)} out of {n_items}')

Number of items with precomputed series: 3780 out of 6384


In [5]:
# Count how many of the matched titles fall in each series and keep the
# series that contribute more than one title.
serNames = [name2series[title]['series'] for title in ser_titles]
ser = pd.Series(serNames).value_counts()
good_series = ser[ser > 1].index.tolist()

# Membership is tested against a set so each title costs one hash lookup
# instead of a scan over the whole `good_series` list.
good_series_lookup = set(good_series)
good_titles = [
    title for title in ser_titles
    if name2series[title]['series'] in good_series_lookup
]

### Out of the `good_titles`, find those that belong to the most popular series in the data

In [6]:
# Item id -> series name, for every title kept in `good_titles`.
id2series = dict(
    (title2id[book_title], name2series[book_title]['series'])
    for book_title in good_titles
)

In [7]:
# Positive interactions only, tagged with the series of the interacted item;
# rows whose item has no entry in `id2series` are dropped.
df = (
    full_data.loc[full_data['interaction'] == 1]
    .assign(series=lambda frame: frame['item_id'].map(id2series))
    .dropna(subset=['series'])
)

# Per series: number of positive interactions and number of distinct items,
# ordered from most to least interacted, plus a running total of items.
series_summaries = (
    df.groupby('series')
    .agg(interaction=('interaction', 'size'), item_id=('item_id', 'nunique'))
    .sort_values(by='interaction', ascending=False)
    .assign(cumsum=lambda summary: summary['item_id'].cumsum())
)

In [8]:
# Keep the most-interacted series for which the running item total stays
# below 20% of `n_items`.
candidates = series_summaries[series_summaries['cumsum'] < n_items * 0.2]
chosen_series = candidates.index.tolist()

# Membership is tested against a set so each item costs one hash lookup
# instead of a scan over the whole `chosen_series` list.
chosen_series_lookup = set(chosen_series)
chosen_ids = [
    item_id for item_id, series in id2series.items()
    if series in chosen_series_lookup
]

print(f"Number of chosen items: {len(chosen_ids):,}")
print(f"Number of chosen series: {len(chosen_series):,}")
# A series with x distinct items contributes x * (x - 1) ordered pairs.
print(f"Number of labeled pairs: {candidates['item_id'].apply(lambda x: x * (x-1)).sum():,}")

Number of chosen items: 1,274
Number of chosen series: 248
Number of labeled pairs: 7,386


### Set new train and test sets

In [9]:
unique_users = full_data['user_id'].unique().tolist()
unique_items = full_data['item_id'].unique().tolist()

# Seeded generator so the user split is reproducible; half of the users
# (rounded down) are drawn without replacement as test users.
rng = np.random.default_rng(42)
test_users = rng.choice(unique_users, size=int(0.5 * len(unique_users)), replace=False).tolist()

# Sets make each membership check a hash lookup; scanning the lists inside the
# comprehensions was quadratic in the number of users / items. The resulting
# lists keep the same elements in the same order.
test_user_lookup = set(test_users)
train_users = [u for u in unique_users if u not in test_user_lookup]

test_items = chosen_ids
test_item_lookup = set(test_items)
train_items = [i for i in unique_items if i not in test_item_lookup]

In [10]:
# Test rows pair a test user with a test item; train rows are those where the
# user is a train user or the item is a train item.
user_in_train = full_data['user_id'].isin(train_users)
item_in_train = full_data['item_id'].isin(train_items)
user_in_test = full_data['user_id'].isin(test_users)
item_in_test = full_data['item_id'].isin(test_items)

new_train = full_data.loc[user_in_train | item_in_train].copy()
new_test = full_data.loc[user_in_test & item_in_test].copy()

In [11]:
def _series_record(item_id):
    """Return the title, series name and series number stored for one item id."""
    book_title = item_dict[item_id]
    series_entry = name2series[book_title]
    return {
        'title': book_title,
        'series': series_entry['series'],
        'number': series_entry['number'],
    }


# Metadata for every chosen (test) item, keyed by item id.
id2info = {item_id: _series_record(item_id) for item_id in chosen_ids}

In [12]:
# Persist the new split and the per-item series metadata to the Sequels folder.
print("Saving files to disk...\n")
train_csv_path = data_path_sequels / 'train.csv'
test_csv_path = data_path_sequels / 'test.csv'
new_train.to_csv(train_csv_path, index=False)
new_test.to_csv(test_csv_path, index=False)

with (data_path_sequels / 'id2info.pkl').open('wb') as info_file:
    pickle.dump(id2info, info_file)

print('Files saved to disk.')

Saving files to disk...

Files saved to disk.
