In [179]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
from matplotlib import pyplot as plt
import json
from ortools.sat.python import cp_model
from collections import Counter
import math

In [4]:
# Root folder holding every CSV this notebook reads or writes.
data_dir = './data/'
books_csv = os.path.join(data_dir, 'books.csv')
books = pd.read_csv(books_csv)
# Display the table size together with the available columns.
(books.shape, books.columns)

((10000, 23),
 Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
        'isbn13', 'authors', 'original_publication_year', 'original_title',
        'title', 'language_code', 'average_rating', 'ratings_count',
        'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
        'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
        'small_image_url'],
       dtype='object'))

In [158]:
# Ratings table with one row per (user_id, book_id, rating) record.
ratings_csv = os.path.join(data_dir, 'new_filter_ratings.csv')
ratings = pd.read_csv(ratings_csv)
# Display the table size together with the available columns.
(ratings.shape, ratings.columns)

((979436, 3), Index(['user_id', 'book_id', 'rating'], dtype='object'))

In [159]:
# Distinct rating values that occur in the data.
rating_column = ratings['rating']
rating_column.unique()

array([4, 3, 5, 1, 2])

In [160]:
# Number of ratings contributed by each user, most active users first.
ratings_per_user = ratings['user_id'].value_counts()
ratings_per_user

12874    200
30944    200
12381    199
28158    199
52036    199
        ... 
46124      2
32154      2
3207       2
46130      2
17384      2
Name: user_id, Length: 53382, dtype: int64

In [162]:
# Sorted array of every user id that appears in the ratings table.
user_ids = np.sort(ratings['user_id'].unique())
# Check whether the ids form a gap-free integer range min..max.
# np.array_equal returns False when the two arrays differ in length, whereas
# `np.all(a == b)` emits an elementwise-comparison warning (and raises on newer
# NumPy releases) for mismatched shapes.
np.array_equal(user_ids, np.arange(user_ids.min(), user_ids.max() + 1, 1))

  np.all(user_ids == np.arange(user_ids.min(), user_ids.max() + 1, 1))


False

In [163]:
# Sorted array of every id listed in the books table.
book_ids = np.sort(books['id'].unique())
# Check whether the ids form a gap-free integer range min..max.
# np.array_equal handles a length mismatch by returning False instead of
# relying on a broadcasted `==`, which warns or raises when shapes differ.
np.array_equal(book_ids, np.arange(book_ids.min(), book_ids.max() + 1, 1))

True

In [164]:
# Sorted array of every book id that actually received a rating.
# NOTE: this rebinds `book_ids`, replacing any value computed from the books table.
book_ids = np.sort(ratings['book_id'].unique())
# Check whether the rated ids form a gap-free integer range min..max.
# np.array_equal handles a length mismatch by returning False instead of
# relying on a broadcasted `==`, which warns or raises when shapes differ.
np.array_equal(book_ids, np.arange(book_ids.min(), book_ids.max() + 1, 1))

True

In [165]:
# Preview the first five rating records.
ratings.head(n=5)

Unnamed: 0,user_id,book_id,rating
0,1,1180,4
1,1,4893,3
2,1,6285,4
3,2,8034,4
4,2,8855,5


In [137]:
# Disabled one-off preprocessing step, kept for reference only.
# For every (user_id, book_id) pair it reduced the recorded ratings to the most
# frequent value and wrote the result to filter_ratings.csv.
# NOTE(review): the helper name shadows the built-in `filter`; rename it before
# re-enabling this cell.
# def filter(s):
#     return s.value_counts().sort_index(ascending=False).sort_values(ascending=False).index[0]
# ratings.groupby(by=['user_id', 'book_id'], as_index=False).agg(lambda s: filter(s)).to_csv(os.path.join(data_dir, 'filter_ratings.csv'), index=False)

In [161]:
# Disabled one-off preprocessing step, kept for reference only.
# It removed every user with exactly one rating and saved the remaining rows as
# new_filter_ratings.csv.
# NOTE(review): the second line below discards its result, so the contiguity
# check that follows would inspect whatever `book_ids` already held — confirm
# before re-enabling this cell.
# tmp = ratings['user_id'].value_counts()
# ratings[~ ratings['user_id'].isin(tmp[tmp == 1].index)]['book_id'].unique()
# book_ids.sort()
# print(np.all(book_ids == np.arange(book_ids.min(), book_ids.max() + 1, 1)))
# ratings = ratings[~ ratings['user_id'].isin(tmp[tmp == 1].index)]
# ratings.to_csv(os.path.join(data_dir, 'new_filter_ratings.csv'), index=False)

In [169]:
# Map every user id to the list of book ids that user rated (table row order).
# A single groupby pass replaces one full-table boolean scan per user, which
# was quadratic in practice (number of users x number of rating rows).
_books_by_user = {
    key: group.to_list()
    for key, group in tqdm(ratings.groupby('user_id')['book_id'])
}
# Re-key by `user_ids` so the mapping keeps exactly the same keys and key order
# as before; an id without any rating row maps to an empty list.
uid_bid = {uid: _books_by_user.get(uid, []) for uid in user_ids}

  0%|          | 0/53382 [00:00<?, ?it/s]

In [171]:
# Map every book id to the list of user ids that rated it (table row order).
# A single groupby pass replaces one full-table boolean scan per book.
_users_by_book = {
    key: group.to_list()
    for key, group in tqdm(ratings.groupby('book_id')['user_id'])
}
# Re-key by `book_ids` so the mapping keeps exactly the same keys and key order
# as before; a book without any rating row maps to an empty list.
bid_uid = {bid: _users_by_book.get(bid, []) for bid in book_ids}

  0%|          | 0/10000 [00:00<?, ?it/s]

In [182]:
# Minimum share of each user's ratings that must receive x = 1
# (a later cell writes x = 1 records to the test split).
ratio = 0.3
model = cp_model.CpModel()

# x[uid, bid]: 0/1 decision variable for every observed (user, book) rating.
x = dict()
for uid in user_ids:
    rated_books = uid_bid[uid]
    for bid in rated_books:
        x[uid, bid] = model.NewIntVar(0, 1, 'x[{}, {}]'.format(uid, bid))
    # At least ceil(ratio * n) of the user's n ratings, and never fewer than one.
    min_selected = max(1, math.ceil(ratio * len(rated_books)))
    model.Add(sum(x[uid, bid] for bid in rated_books) >= min_selected)

# y[bid]: 0/1 indicator that can only be 1 when at least one rating of the book
# has x = 0, because sum(x) + y is capped by the book's number of ratings.
y = dict()
for bid in book_ids:
    raters = bid_uid[bid]
    y[bid] = model.NewIntVar(0, 1, 'y[{}]'.format(bid))
    model.Add(sum(x[uid, bid] for uid in raters) + y[bid] <= len(raters))

# Maximise the number of books whose indicator is 1.
model.Maximize(sum(y.values()))

solver = cp_model.CpSolver()
status = solver.Solve(model)

In [183]:
# Report the objective value when a solution exists; otherwise stop here with
# the solver status, because the following cells read solver.Value(...) and
# cannot work without a solution.
if status in (cp_model.OPTIMAL, cp_model.FEASIBLE):
    print(solver.ObjectiveValue())
else:
    raise RuntimeError(
        'CP-SAT returned no usable solution (status: {})'.format(solver.StatusName(status))
    )

10000.0


In [189]:
# Print every user whose ratings were all assigned x = 1 by the solver.
for uid in tqdm(user_ids):
    rated_books = uid_bid[uid]
    selected_count = sum(solver.Value(x[uid, bid]) for bid in rated_books)
    if selected_count == len(rated_books):
        print(uid)

  0%|          | 0/53382 [00:00<?, ?it/s]

In [194]:
# Nested lookup: large_dict[user_id][book_id] -> rating.
# Start with an empty inner dict per known user so the key set and key order
# match `user_ids`.
large_dict = {uid: dict() for uid in user_ids}
# Fill the lookup in a single pass over the column arrays instead of filtering
# the whole table once per user and walking the matches with iterrows().
_rating_rows = zip(
    ratings['user_id'].to_numpy(),
    ratings['book_id'].to_numpy(),
    ratings['rating'].to_numpy(),
)
for uid, bid, rating in tqdm(_rating_rows, total=len(ratings)):
    # A later row for the same (user, book) pair overwrites an earlier one.
    large_dict[uid][bid] = rating

  0%|          | 0/53382 [00:00<?, ?it/s]

In [196]:
# Column-oriented record buffers for the two splits.
train_data = {'user_id': list(), 'book_id': list(), 'rating': list()}
test_data = {'user_id': list(), 'book_id': list(), 'rating': list()}

# Route every rating by its solved x value: 0 goes to train, anything else to test.
for uid in tqdm(user_ids):
    for bid in uid_bid[uid]:
        target = train_data if solver.Value(x[uid, bid]) == 0 else test_data
        target['user_id'].append(uid)
        target['book_id'].append(bid)
        target['rating'].append(large_dict[uid][bid])

  0%|          | 0/53382 [00:00<?, ?it/s]

In [198]:
# Turn both record buffers into DataFrames with identical column layout.
train_ratings, test_ratings = (
    pd.DataFrame(data=records) for records in (train_data, test_data)
)

In [200]:
# Write both splits next to the input data, without the index column.
for split_file, split_frame in (
    ('train_ratings.csv', train_ratings),
    ('test_ratings.csv', test_ratings),
):
    split_frame.to_csv(os.path.join(data_dir, split_file), index=False)