In [2]:
import pandas as pd
import numpy as np
np.random.seed(11)

In [3]:
# Enter data paths for raw_text version and parsed_text version
data_raw = "./data/reviews_raw.json"
data_parsed = "./data/reviews_parsed.json"

data_with_raw = pd.read_json(data_raw, lines=True)
data_with_parsed = pd.read_json(data_parsed, lines=True)

In [4]:
new_df = pd.merge(data_with_parsed, data_with_raw, on=["review_id"])

In [5]:
new_df = new_df.rename(columns={"rating_x": "rating", "user_id_x": "user_id", "rating_x": "rating",
                       "book_id_x": "book_id",})
selected_cols = ["user_id", "book_id", "review_sentences", "rating", 
                 "timestamp", "n_votes", "n_comments", "has_spoiler"]
new_df = new_df[selected_cols]

In [6]:
new_df.head()

Unnamed: 0,user_id,book_id,review_sentences,rating,timestamp,n_votes,n_comments,has_spoiler
0,8842281e1d1347389f2ab93d60773d4d,18245960,"[[0, This is a special book.], [0, It started ...",5,2017-08-30,28,1,True
1,8842281e1d1347389f2ab93d60773d4d,16981,"[[0, Recommended by Don Katz.], [0, Avail for ...",3,2017-03-22,1,0,False
2,8842281e1d1347389f2ab93d60773d4d,28684704,"[[0, A fun, fast paced science fiction thrille...",3,2017-03-20,22,0,True
3,8842281e1d1347389f2ab93d60773d4d,27161156,"[[0, Recommended reading to understand what is...",0,2016-11-09,5,1,False
4,8842281e1d1347389f2ab93d60773d4d,25884323,"[[0, I really enjoyed this book, and there is ...",4,2016-04-25,9,1,True


In [7]:
# Dataset reduction

print(f"number of unique books at the beginning: {new_df.book_id.nunique()}")

select_number = 15000
random_books = np.random.choice(new_df.book_id.unique().tolist(), size=select_number)
reduced_df = new_df[new_df.book_id.isin(random_books)]

print(f"number of unique books at the end: {reduced_df.book_id.nunique()}")

print(f"Shape at the end: {reduced_df.shape}")

number of unique books at the beginning: 25475
number of unique books at the end: 11339
Shape at the end: (604686, 8)


In [32]:
print(f"ratio of non-spoiler labels: {reduced_df.has_spoiler.value_counts()[0] / len(reduced_df)}%")

0.9344783904373509

In [26]:
reduced_df.to_csv("./data/reduced.csv", index=False)

In [27]:
df = pd.read_csv("./data/reduced.csv")

In [28]:
df.head()

Unnamed: 0,user_id,book_id,review_sentences,rating,timestamp,n_votes,n_comments,has_spoiler
0,8842281e1d1347389f2ab93d60773d4d,16981,"[[0, 'Recommended by Don Katz.'], [0, 'Avail f...",3,2017-03-22,1,0,False
1,8842281e1d1347389f2ab93d60773d4d,28684704,"[[0, 'A fun, fast paced science fiction thrill...",3,2017-03-20,22,0,True
2,8842281e1d1347389f2ab93d60773d4d,19398490,"[[0, 'A beautiful story.'], [0, 'It is rare to...",4,2016-09-20,35,5,False
3,8842281e1d1347389f2ab93d60773d4d,24189224,"[[0, 'Numerous people in publishing have told ...",0,2015-05-29,11,5,False
4,8842281e1d1347389f2ab93d60773d4d,22551730,"[[0, 'Another hard to put down nonfiction book...",4,2016-12-14,20,6,False


In [29]:
df.shape

(604686, 8)