In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import fasttext
from tqdm import tqdm
tqdm.pandas()

In [2]:
# PARAMETERS
n = None #numbers of lines to load from dataset

# Load Data

In [3]:
def load_data(file_name, n = None):
    """Load data from json.gz files"""
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the 100th line
            if (n is not None) and (count > n):
                break
    return data

    
def get_values(cell, dict):
    """Assigns a string value based on an int key from dict data"""
    return [dict[id] for id in cell]

def return_ids(cell, key):
    """Extracts values of key"""
    return [x[key] for x in cell]



In [4]:
#Import book dataset
books= pd.DataFrame(load_data('goodreads_books.json.gz'), n=n)



In [None]:
books.head()

In [4]:
genres= pd.DataFrame(load_data('goodreads_book_genres_initial.json.gz', n = n))
genres.head()

Unnamed: 0,book_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"
1,1333909,"{'fiction': 219, 'history, historical fiction,..."
2,7327624,"{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,6066819,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,287140,{'non-fiction': 3}


In [5]:
genres.tail()

Unnamed: 0,book_id,genres
499996,379374,"{'history, historical fiction, biography': 31,..."
499997,7529684,"{'fantasy, paranormal': 998, 'young-adult': 33..."
499998,18051119,{'children': 1}
499999,32147891,{}
500000,33597722,"{'fantasy, paranormal': 15582, 'young-adult': ..."


In [5]:
import pickle

with open('book_ids', 'rb') as f:
    book_ids = pickle.load(f)
df['book_title'] = df.book_id.map(book_ids)

In [4]:
books.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [6]:
books = pd.merge(books, genres, on = 'book_id', how = 'outer')
books

NameError: name 'books' is not defined

In [7]:
columns = ['series', 'language_code', 'country_code' 'is_ebook', 
'similar_books', 'description', 'format', 'authors','publisher', 'num_pages', 
'publication_year', 'book_id','title', 'title_without_series', 'genres', 'average_rating']
# keep following columns: ['series', 'language_code', 'country_code', 'popular_shelves', 'is_ebook', 'similar_books', 
# 'description', 'format', 'authors','publisher', 'num_pages', 'publication_year', 'book_id','title',
# 'title_without_series']
books = books.loc[:, books.columns.isin(columns)]
books.head()

Unnamed: 0,series,country_code,language_code,popular_shelves,is_ebook,similar_books,description,format,authors,publisher,num_pages,publication_year,book_id,title,title_without_series,genres
0,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",False,[],,Paperback,"[{'author_id': '604031', 'role': ''}]",St. Martin's Press,256.0,1984.0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,"{'history, historical fiction, biography': 1}"
1,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",False,"[8709549, 17074050, 28937, 158816, 228563, 112...","Anita Diamant's international bestseller ""The ...",Audio CD,"[{'author_id': '626222', 'role': ''}]",Simon & Schuster Audio,,2001.0,1333909,Good Harbor,Good Harbor,"{'fiction': 219, 'history, historical fiction,..."
2,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",False,"[19997, 828466, 1569323, 425389, 1176674, 2627...",Omnibus book club edition containing the Ladie...,Hardcover,"[{'author_id': '10333', 'role': ''}]","Nelson Doubleday, Inc.",600.0,1987.0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...","{'fantasy, paranormal': 31, 'fiction': 8, 'mys..."
3,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",False,"[6604176, 6054190, 2285777, 82641, 7569453, 70...",Addie Downs and Valerie Adler were eight when ...,Hardcover,"[{'author_id': '9212', 'role': ''}]",Atria Books,368.0,2009.0,6066819,Best Friends Forever,Best Friends Forever,"{'fiction': 555, 'romance': 23, 'mystery, thri..."
4,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",False,[],,,"[{'author_id': '149918', 'role': ''}]",,,,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,{'non-fiction': 3}


In [8]:
#remove duplicate rows that shares the same title and description
books = books.drop_duplicates(subset=["title", "description"], keep='first', inplace=False)

In [9]:
model = fasttext.load_model('/home/naimrod/miniconda3/envs/naimrod/lib/python3.10/site-packages/fasttext/lid.176.bin')

def get_language(text, 
             model,
             lang = ['__label__en']):
    '''Return True if description is written in wanted languages, else False'''
    if text == '':
        return True
    text = text.replace('\n', '')
    pred = model.predict(text.replace('\n', ''),k=1)[0][0]

    return True if pred in lang else False



In [10]:
#Get language of descriptions
books['is_english'] = books.description.progress_apply(get_language, model = model)
#Keep only books that have English descriptions
books = books[books.is_english]

100%|██████████| 2160491/2160491 [03:43<00:00, 9662.21it/s] 


In [11]:
books.shape

(1841442, 17)

In [12]:
def filter_shelves(l, quantile = 0.5):
    """Return shelf names if count > quantile(default setting: median)"""
    if len(l)<1:
        return ''
    l = [(int(d['count']), d['name']) for d in l] #extracts count and name values for each dict in l
    #print(l)
    quartile1 = np.quantile([x[0] for x in l ], quantile)
    return ', '.join([x[1].replace('-s-', "'s-").replace('-t', "'t").replace('-', ' ') for x in l if x[0]>quartile1])

In [13]:
#extract relevant shelf names
books['popular_shelves_names'] = books.popular_shelves.progress_apply(filter_shelves)

100%|██████████| 1841442/1841442 [03:46<00:00, 8119.90it/s] 


In [12]:
books.head()

Unnamed: 0,series,country_code,language_code,popular_shelves,is_ebook,similar_books,description,format,authors,publisher,num_pages,publication_year,book_id,title,title_without_series,is_english,popular_shelves_names,text
0,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",False,[],,Paperback,"[{'author_id': '604031', 'role': ''}]",St. Martin's Press,256.0,1984.0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,True,to read,W.C. Fields: A Life on Film to read
1,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",False,"[8709549, 17074050, 28937, 158816, 228563, 112...","Anita Diamant's international bestseller ""The ...",Audio CD,"[{'author_id': '626222', 'role': ''}]",Simon & Schuster Audio,,2001.0,1333909,Good Harbor,Good Harbor,True,"to read, fiction, currently reading, contempor...","Good Harbor to read, fiction, currently readin..."
2,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",False,"[19997, 828466, 1569323, 425389, 1176674, 2627...",Omnibus book club edition containing the Ladie...,Hardcover,"[{'author_id': '10333', 'role': ''}]","Nelson Doubleday, Inc.",600.0,1987.0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...",True,"to read, fantasy, fiction, owned, hardcover, s...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",False,"[6604176, 6054190, 2285777, 82641, 7569453, 70...",Addie Downs and Valerie Adler were eight when ...,Hardcover,"[{'author_id': '9212', 'role': ''}]",Atria Books,368.0,2009.0,6066819,Best Friends Forever,Best Friends Forever,True,"to read, chick lit, currently reading, fiction...","Best Friends Forever to read, chick lit, curre..."
4,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",False,[],,,"[{'author_id': '149918', 'role': ''}]",,,,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,True,"to read, runes, owned, nonfiction",Runic Astrology: Starcraft and Timekeeping in ...


In [14]:
#Import series dataset
series = pd.DataFrame(load_data('goodreads_book_series.json.gz'))

#map series_id to their title
map_series = dict(zip(series['series_id'], series.title))
map_series

{'189911': 'Sun Wolf and Starhawk',
 '151854': 'Avalon: Web of Magic',
 '169353': 'Crowner John Mystery',
 '1052227': "Pluto's Snitch",
 '1070125': 'The Way, With The Enchanted Neighborhood',
 '147734': 'Dark Visions',
 '953679': 'Inspector Capstan',
 '811663': "Dante's Circle,",
 '408775': 'Patrick Grant',
 '250807': 'Foundation (Publication Order)',
 '415703': 'Foundation (Chronological Order)',
 '181883': 'Foundation Universe',
 '938303': 'Finding Fatherhood',
 '584334': 'Sweet Water',
 '274410': 'Aristotle',
 '414880': 'The Norton History of Modern Europe',
 '425995': 'The Ambrosia Sequence',
 '493993': 'The Half Bad Trilogy',
 '199039': 'Miss Marple',
 '391329': 'The World of Beatrix Potter: Peter Rabbit',
 '326237': 'Wycliffe',
 '514956': 'On the Island',
 '246830': 'Captain America, Volume V',
 '362583': 'Marvel Ultimate Graphic Novels Collection',
 '362581': 'Marvel Ultimate Graphic Novels Collection: Publication Order',
 '623032': 'Capitan America Marvel Deluxe',
 '152366': 'W

In [15]:
# import authors dataset
authors = pd.DataFrame(load_data('goodreads_book_authors.json.gz'))

#map authors to their name
map_authors = dict(zip(authors['author_id'], authors.name))
map_authors

{'604031': 'Ronald J. Fields',
 '626222': 'Anita Diamant',
 '10333': 'Barbara Hambly',
 '9212': 'Jennifer Weiner',
 '149918': 'Nigel Pennick',
 '3041852': 'Alfred J. Church',
 '215594': 'Michael Halberstam',
 '19158': 'Rachel Roberts',
 '5807700': 'V.L. Locey',
 '2983296': 'Anton Szandor LaVey',
 '40075': 'Kenneth Anger',
 '37778': 'Bernard Knight',
 '242185': 'Carolyn Haines',
 '27397': 'Benjamin Hoff',
 '35259': 'Christopher Ross',
 '3083854': 'Tom Wolfe',
 '39054': 'Don Piper',
 '11099': 'Cecil Murphey',
 '4862': 'Randy Alcorn',
 '137561': 'Jennifer L. Holm',
 '5411': 'Cynthia Rylant',
 '4092328': 'Margot Hovley',
 '75424': 'Wayne Kinsey',
 '1357577': 'Francoise Sagan',
 '3389': 'Stephen King',
 '105602': 'Edward Herrmann',
 '31308': 'Peter Maass',
 '50873': 'L.J. Smith',
 '232533': 'Anne Emmert',
 '166520': 'Christopher Michael McDonough',
 '166517': 'Richard E. Prior',
 '3958428': 'Mark Jackson Stansbury',
 '3958429': 'Mark Stansbury',
 '6536612': 'Zabrina Murray',
 '15585': 'Jame

In [16]:
books.series = books.series.apply(get_values, dict = map_series)
books.similar_books = books.similar_books.apply(get_values, dict = map_books)

In [17]:
books.authors = books.authors.apply(return_ids, key='author_id')
books.authors = books.authors.apply(get_values, dict = map_authors)
books.head()

Unnamed: 0,series,country_code,language_code,popular_shelves,is_ebook,similar_books,description,format,authors,publisher,num_pages,publication_year,book_id,title,title_without_series,genres,is_english,popular_shelves_names
0,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",False,[],,Paperback,[Ronald J. Fields],St. Martin's Press,256.0,1984.0,5333265,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film,"{'history, historical fiction, biography': 1}",True,to read
1,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",False,[Hope Will Find You: My Search for the Wisdom ...,"Anita Diamant's international bestseller ""The ...",Audio CD,[Anita Diamant],Simon & Schuster Audio,,2001.0,1333909,Good Harbor,Good Harbor,"{'fiction': 219, 'history, historical fiction,...",True,"to read, fiction, currently reading, contempor..."
2,[Sun Wolf and Starhawk],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",False,"[Through a Brazen Mirror, The Hound and the Fa...",Omnibus book club edition containing the Ladie...,Hardcover,[Barbara Hambly],"Nelson Doubleday, Inc.",600.0,1987.0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ...","{'fantasy, paranormal': 31, 'fiction': 8, 'mys...",True,"to read, fantasy, fiction, owned, hardcover, s..."
3,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",False,"[The Opposite of Me, Dune Road, A Summer Affai...",Addie Downs and Valerie Adler were eight when ...,Hardcover,[Jennifer Weiner],Atria Books,368.0,2009.0,6066819,Best Friends Forever,Best Friends Forever,"{'fiction': 555, 'romance': 23, 'mystery, thri...",True,"to read, chick lit, currently reading, fiction..."
4,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",False,[],,,[Nigel Pennick],,,,287140,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...,{'non-fiction': 3},True,"to read, runes, owned, nonfiction"


In [18]:
#Concatenate title, shelf names and description
books['text'] = books.title + ' '+ books.description

In [19]:
books.to_csv('en_desc_books.csv', index=False)


(92072, 19)

In [None]:
books = books[books.publication_year != '']
books.to_csv('en_desc_books_year.csv', index=False)

In [20]:
#Take 5% of the raws
books_sample = books.sample(frac = 0.05, random_state=57, ignore_index=True)

books_sample.to_csv('5_percents_books.csv', index=False)
books_sample.to_pickle('5p_books.pickle')
books_sample.shape

(92072, 19)

In [21]:
books_sample.head()

Unnamed: 0,series,country_code,language_code,popular_shelves,is_ebook,similar_books,description,format,authors,publisher,num_pages,publication_year,book_id,title,title_without_series,genres,is_english,popular_shelves_names,text
0,[],US,,"[{'count': '13', 'name': 'to-read'}, {'count':...",False,[],Working for tech support is a boring yet easy ...,Paperback,[Clancy Nacht],Xlibris Corporation,128.0,2009.0,6597590,The Night Caller,The Night Caller,"{'romance': 2, 'mystery, thriller, crime': 2}",True,"to read, m m",The Night Caller Working for tech support is a...
1,[Nutshell Library],US,eng,"[{'count': '329', 'name': 'picture-books'}, {'...",False,"[A Hole is to Dig, What Do You Say, Dear?, The...","Maurice Sendak, the Caldecott Medal-winning cr...",Library Binding,[Maurice Sendak],HarperCollins,30.0,1962.0,1592486,Chicken Soup with Rice: A Book of Months,Chicken Soup with Rice: A Book of Months,"{'children': 1083, 'poetry': 162, 'fiction': 7...",True,"picture books, childrens, to read, children's ...",Chicken Soup with Rice: A Book of Months Mauri...
2,[],US,,"[{'count': '78', 'name': 'to-read'}, {'count':...",False,[],"Includes tales from Iceland, Sudan, Palestine,...",Paperback,[Lawrence Millman],Interlink Books,208.0,2004.0,1823244,A Kayak Full of Ghosts: Eskimo Tales,A Kayak Full of Ghosts: Eskimo Tales,"{'fiction': 4, 'non-fiction': 2, 'history, his...",True,"to read, folklore, favorites, fiction, horror,...",A Kayak Full of Ghosts: Eskimo Tales Includes ...
3,[Scarlet Order],US,,"[{'count': '30', 'name': 'to-read'}, {'count':...",False,[],,,[David Lee Summers],,,,1949018,Vampires of the Scarlet Order,Vampires of the Scarlet Order,"{'romance': 1, 'fantasy, paranormal': 2, 'myst...",True,"to read, horror, vampires, favorites",Vampires of the Scarlet Order
4,[],US,per,"[{'count': '141828', 'name': 'to-read'}, {'cou...",False,"[Franz Kafka's The Castle, Notes from Undergro...",khtb dr qT` jyby w shml dstn hy zyr st\nmskh\n...,,[Franz Kafka],,,,7176642,مسخ,مسخ,"{'fiction': 5061, 'fantasy, paranormal': 1034}",True,"to read, classics, currently reading, fiction,...",مسخ khtb dr qT` jyby w shml dstn hy zyr st\nms...
