In [1]:
import numpy as np
import pandas as pd

## Goodreads Samples

This notebook samples the files generated in the preprocessing notebook to bring them down to a more managable level. For our purposes, we don't need to utilize the full dataset, but just to prove that the reproducibility issues also happen with the Goodreads dataset.

In [2]:
# Input
responses_csv = 'responses.csv.gz'
user_features_csv = 'user_features.csv.gz'

# Output
sample_responses_csv = 'sample_responses.csv.gz'
sample_user_features_csv = 'sample_user_features.csv.gz'

In [3]:
user_features = pd.read_csv(user_features_csv)
user_features.head()

Unnamed: 0,user_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,read-2013,scifi,faves,hunger-games,did-not-finish,english,lgbt,reread,e-books,rick-riordan
0,00000377eea48021d3002730d56aca9a,0.0,0.008206,0.0,0.0,0.013352,0.006544,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
1,00009e46d18f223a82b22da38586b605,7.6e-05,0.006519,9e-06,0.00086,0.004576,0.001978,0.000135,0.001225,0.000869,...,False,False,False,False,False,False,False,False,False,False
2,0000c3d51aa099745e93a4e99c4856c8,0.001248,0.003721,0.000594,0.000242,0.007567,0.003567,0.006358,0.001862,0.001025,...,False,False,False,False,False,False,False,False,False,False
3,0001085188e302fc6b2568de45a5f56b,0.004398,0.006148,0.002103,0.000484,0.019721,0.002422,3.9e-05,0.00182,0.001387,...,False,False,False,False,False,False,False,False,False,False
4,000157a6f8331e9c9a21252e1fee91d1,0.0,0.00341,0.010195,0.00056,0.003218,0.001702,0.0,0.0,0.000493,...,False,False,False,False,False,False,False,False,False,False


We take a sample of 7K train users and 3K test users.

In [4]:
np.random.seed(42)

train_sample = user_features[user_features['set'] == 'train'].sample(7000)
test_sample = user_features[user_features['set'] == 'test'].sample(3000)
sample_features = pd.concat([train_sample, test_sample], ignore_index=True)
print(sample_features.shape)
sample_features.head()

(10000, 132)


Unnamed: 0,user_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,read-2013,scifi,faves,hunger-games,did-not-finish,english,lgbt,reread,e-books,rick-riordan
0,838242f8a023a925393313ecacf63876,0.004155,0.003627,0.0,0.000429,0.016731,0.005133,0.000126,0.004856,0.006303,...,False,False,False,False,False,False,False,False,False,False
1,ace33950b7a3a3e4ce3c2bdbfc5c6797,0.0,1.4e-05,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,1a5a22320fa146de0437df0bf5f7335f,0.002643,0.003326,0.000646,4.5e-05,0.011235,0.001576,0.000141,3.9e-05,0.003314,...,False,False,False,False,False,False,False,False,False,False
3,214ba9ed6431176800ea3fea13b50578,0.000659,0.005181,0.0,0.000163,0.018886,0.000528,8.6e-05,5e-05,0.000413,...,False,False,False,False,False,False,False,False,False,False
4,1af22c7610c97e28a4225e192cfb4d29,0.017151,0.007885,0.0,0.0,0.019878,0.000784,0.002049,0.0,0.001679,...,False,False,False,False,False,False,False,False,False,False


We restrict the responses to those only made by the sampled train and test users.

In [5]:
responses = pd.read_csv(responses_csv)
responses.head()

Unnamed: 0,user_id,book_id,response
0,8842281e1d1347389f2ab93d60773d4d,8684868,False
1,8842281e1d1347389f2ab93d60773d4d,8423493,False
2,8842281e1d1347389f2ab93d60773d4d,87976,True
3,8842281e1d1347389f2ab93d60773d4d,18116,True
4,8842281e1d1347389f2ab93d60773d4d,2767052,True


In [6]:
sample_responses = responses[responses['user_id'].isin(sample_features['user_id'])]
print(sample_responses.shape)
sample_responses.head()

(502542, 3)


Unnamed: 0,user_id,book_id,response
2092,b1360d644d3f5a5a604d6fcfbf772aec,2342280,True
2093,b1360d644d3f5a5a604d6fcfbf772aec,12977172,False
2094,b1360d644d3f5a5a604d6fcfbf772aec,7631105,False
2095,b1360d644d3f5a5a604d6fcfbf772aec,10401084,True
2096,b1360d644d3f5a5a604d6fcfbf772aec,8755785,True


Furthermore, in order to reduce computational cost, we restrict the books to the top 500 books.

In [7]:
book_counts = sample_responses.groupby('book_id')['response'].count().sort_values(ascending=False)
book_ids = book_counts[book_counts > 500].index
sample_responses = sample_responses[sample_responses['book_id'].isin(book_ids)]
print(sample_responses.shape)
sample_responses.head()

(140808, 3)


Unnamed: 0,user_id,book_id,response
2094,b1360d644d3f5a5a604d6fcfbf772aec,7631105,False
2096,b1360d644d3f5a5a604d6fcfbf772aec,8755785,True
2097,b1360d644d3f5a5a604d6fcfbf772aec,6752378,False
2098,b1360d644d3f5a5a604d6fcfbf772aec,7747374,False
2099,b1360d644d3f5a5a604d6fcfbf772aec,18710190,False


In [8]:
sample_features = sample_features[sample_features['user_id'].isin(sample_responses['user_id'])]
print(sample_features.shape)
sample_features.head()

(9017, 132)


Unnamed: 0,user_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,read-2013,scifi,faves,hunger-games,did-not-finish,english,lgbt,reread,e-books,rick-riordan
0,838242f8a023a925393313ecacf63876,0.004155,0.003627,0.0,0.000429,0.016731,0.005133,0.000126,0.004856,0.006303,...,False,False,False,False,False,False,False,False,False,False
1,ace33950b7a3a3e4ce3c2bdbfc5c6797,0.0,1.4e-05,8e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,False,False
2,1a5a22320fa146de0437df0bf5f7335f,0.002643,0.003326,0.000646,4.5e-05,0.011235,0.001576,0.000141,3.9e-05,0.003314,...,False,False,False,False,False,False,False,False,False,False
3,214ba9ed6431176800ea3fea13b50578,0.000659,0.005181,0.0,0.000163,0.018886,0.000528,8.6e-05,5e-05,0.000413,...,False,False,False,False,False,False,False,False,False,False
4,1af22c7610c97e28a4225e192cfb4d29,0.017151,0.007885,0.0,0.0,0.019878,0.000784,0.002049,0.0,0.001679,...,False,False,False,False,False,False,False,False,False,False


In [9]:
sample_features.to_csv(sample_user_features_csv, index=False)
sample_responses.to_csv(sample_responses_csv, index=False)

As a result of this sampling, we end up with ~9000 users, 140K responses, and 500 books.