## Creating a smaller version of `goodbooks-10k`

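In this notebook, we reduce the `goodbooks-10k` dataset to a smaller dataset with 5,000 users and 7,960 rated books: we keep the 5,000 users with the most ratings, and then only the books that received at least 10 ratings from those users.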

In [1]:
import csv
import os
import pandas as pd

In [2]:
# Gather the names of all CSV files found in the current working directory.
csv_files = [
    filename
    for filename in os.listdir('.')
    if filename.endswith('.csv')
]

In [3]:
# Display the discovered CSV file names as a quick sanity check.
csv_files

['to_read.csv', 'books.csv', 'book_tags.csv', 'tags.csv', 'ratings.csv']

In [4]:
# Load every discovered CSV into a DataFrame, keyed by its file name.
dfs = {name: pd.read_csv(f'./{name}') for name in csv_files}

In [5]:
# Pull out the ratings table; the user and book selection is derived from it.
ratings_df = dfs['ratings.csv']

In [6]:
# Count ratings per user and order users from most to least active.
ratings_per_user = ratings_df['user_id'].value_counts()
top_users = ratings_per_user.sort_values(ascending=False)

In [7]:
# Keep the first 5,000 entries of the sorted counts, i.e. the most active users.
# `.iloc` makes it explicit that this is a positional slice, not a user_id range.
top_users = top_users.iloc[:5000]

In [8]:
# The index of the per-user counts holds the user ids; keep them as a plain list.
top_user_ids = top_users.index.tolist()

In [9]:
# Keep only the rating rows that belong to the selected users.
is_top_user = ratings_df['user_id'].isin(top_user_ids)
limited_ratings_df = ratings_df[is_top_user]

In [10]:
# Number of rating rows left after restricting to the selected users.
len(limited_ratings_df)

801625

In [11]:
# Count how many ratings each book received from the selected users.
book_counts = limited_ratings_df['book_id'].value_counts()

In [12]:
# Drop books with fewer than 10 ratings among the selected users.
has_enough_ratings = book_counts >= 10
book_counts = book_counts[has_enough_ratings]

In [13]:
# The index of the per-book counts holds the book ids; keep them as a plain list.
top_book_ids = book_counts.index.tolist()

In [14]:
# Create the output directory in pure Python so the notebook does not depend
# on a POSIX shell; `exist_ok=True` keeps the cell safe to re-run.
os.makedirs('reduced', exist_ok=True)

In [15]:
def reduce_dataset_and_save(df, top_user_ids, top_book_ids, name, output_dir='./reduced'):
    """Restrict a table to the selected users/books and save it as CSV.

    Rows are kept only if their ``user_id`` is in ``top_user_ids`` (applied
    when the table has a ``user_id`` column) and their ``book_id`` is in
    ``top_book_ids`` (applied when the table has a ``book_id`` column).
    A table with neither column is written out unfiltered. The input
    frame itself is not modified.

    Args:
        df: DataFrame to reduce.
        top_user_ids: Collection of user ids to keep.
        top_book_ids: Collection of book ids to keep.
        name: File name of the CSV to write inside ``output_dir``.
        output_dir: Directory that receives the file. It is created if it
            does not exist yet. Defaults to ``'./reduced'``.

    Returns:
        None. The reduced table is written to ``output_dir/name`` without
        the index and with every field quoted.
    """
    # Boolean indexing returns new frames, so no defensive copy of `df` is needed.
    limit_df = df
    if 'user_id' in limit_df.columns:
        limit_df = limit_df[limit_df['user_id'].isin(top_user_ids)]
    if 'book_id' in limit_df.columns:
        limit_df = limit_df[limit_df['book_id'].isin(top_book_ids)]

    # Make sure the target directory exists so the function works on its own,
    # without relying on a separate cell having created it.
    os.makedirs(output_dir, exist_ok=True)
    limit_df.to_csv(os.path.join(output_dir, name), index=False, quoting=csv.QUOTE_ALL)

In [16]:
# Write a reduced copy of every loaded table, reusing each table's file name.
for name, df in dfs.items():
    reduce_dataset_and_save(
        df=df,
        top_user_ids=top_user_ids,
        top_book_ids=top_book_ids,
        name=name,
    )