# Prepare notebook

In [1]:
import json 
import pandas as pd

# Import data

In [2]:
# Load the pickled book table; later cells read its `goodreads_shelves` column and its index.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
books = pd.read_pickle('2019112_goodreads_book_data.pkl')

# Initial exploration of the data

In [3]:
# Expand each row's shelf labels into one long Series, one-hot encode the
# labels, then add the indicator rows back together per original row so every
# row of `books` that has labels gets a single row of per-shelf counts.
# The `level=` argument of `DataFrame.sum` was removed in pandas 2.0; grouping
# on index level 0 is the supported spelling. `sort=False` keeps the groups in
# first-appearance order instead of sorting them by index value.
shelves = (
    pd.get_dummies(books.goodreads_shelves.apply(pd.Series).stack())
    .groupby(level=0, sort=False)
    .sum()
)

In [4]:
# Collapse the per-shelf counts into presence flags (any non-zero count becomes True).
shelves = shelves.astype(bool)

In [5]:
# Show the thirty shelves with the highest totals, largest first.
(
    shelves
    .sum()
    .sort_values(ascending=False)
    .head(30)
)

non-fiction         526
science             278
history             224
biography           119
memoir              115
psychology          109
nature              109
biology              89
audiobook            86
philosophy           79
animals              63
classics             60
self-help            59
natural-history      44
travel               44
politics             41
historical           35
essays               33
health               33
evolution            30
spirituality         29
fiction              28
feminism             27
medicine             27
neuroscience         26
humor                25
religion             25
business             24
biography-memoir     24
environment          24
dtype: int64

In [6]:
# Align the flag table with `books` so both share the same index.
# Any row of `books` that is absent from `shelves` (e.g. one that contributed
# no shelf labels) is filled with False rather than NaN; NaN would silently
# convert the boolean columns to object dtype.
shelves = shelves.reindex(books.index, fill_value=False)

In [7]:
# Coarse academic groupings: each key is a field name and each list holds the
# shelf labels that count towards that field.
academic_field_mappings = {
    'humanities': [
        'archaeology', 'art', 'architecture', 'cartography',
        'history', 'music', 'library-science', 'linguistics',
    ],
    'social_sciences': [
        'business', 'economics', 'education', 'finance',
        'psychology', 'geography', 'political-science', 'sociology',
        'social-science', 'journalism', 'management',
    ],
    'physical_sciences': [
        'anthropology', 'astronomy', 'biology', 'natural-history',
        'chemistry', 'computer-science', 'ecology', 'earth-sciences',
        'engineering', 'geology', 'medicine', 'neuroscience',
        'palaeontology', 'mathematics', 'physics',
    ],
}



In [8]:
# Finer-grained reading categories: each key is a category name and each list
# holds the shelf labels that count towards that category.
category_mappings = {
    'history': [
        'history', '19th-century', 'archaeology', 'ancient',
        'american-history', 'art-history', 'ancient-history',
        'classical-studies', 'civil-war', 'egyptology', 'european-history',
        'food-history', 'historical', 'history-of-science', 'holocaust',
        'medieval', 'medieval-history', 'medievalism', 'microhistory',
        'military-history', 'prehistory', 'roman', 'victorian', 'war',
        'world-history', 'world-war-ii', 'palaeontology',
    ],
    'sociology': [
        'activism', 'african-american', 'communication', 'crime',
        'disability', 'family', 'feminism', 'gay', 'gender',
        'gender-studies', 'health-care', 'social-justice', 'relationships',
        'leadership', 'lgbt', 'politics', 'parenting', 'money',
        'productivity', 'queer', 'race', 'social-issues', 'society',
        'witchcraft', 'womens', 'business', 'economics', 'education',
        'finance', 'psychology', 'geography', 'political-science',
        'sociology', 'social-science', 'journalism', 'management',
    ],
    'sciences': [
        'ecology', 'earth-sciences', 'biology', 'natural-history',
        'anthropology', 'animals', 'birds', 'cats', 'conservation',
        'dinosaurs', 'dogs', 'science-nature', 'geology', 'environment',
        'evolution', 'genetics', 'nature', 'ornithology', 'outdoors',
        'plants', 'space', 'wildlife', 'science', 'astronomy', 'chemistry',
        'mathematics', 'physics',
    ],
    'technology': [
        'computer-science', 'artificial-intelligence', 'computers',
        'hackers', 'internet', 'technology', 'theory', 'engineering',
    ],
    'philosophy': [
        'atheism', 'buddhism', 'christianity', 'eastern-philosophy',
        'inspirational', 'islam', 'judaism', 'occult', 'philosophy',
        'spirituality', 'religion', 'taoism',
    ],
    'culture': [
        'art', 'architecture', 'cartography', 'alcohol', 'basketball',
        'classical-music', 'cultural', 'book-club', 'crafts', 'design',
        'espionage', 'fashion', 'gardening', 'jazz', 'music', 'photography',
        'sports', 'theatre', 'language', 'librarianship', 'writing', 'magic',
        'maps', 'witches', 'plays', 'poetry', 'classics', 'library-science',
        'linguistics',
    ],
    'literatures': [
        'literature', 'british-literature', 'classic-literature',
        'english-literature', 'french-literature', 'italian-literature',
        'japanese-literature', 'russian-literature', 'spanish-literature',
    ],
    'other': [
        'comedy', 'drama', 'fantasy', 'horror', 'humor', 'mystery',
        'romance', 'novels', 'science-fiction', 'short-stories', 'thriller',
        'tragedy', 'true-crime', 'books-about-books', 'collections', 'essays',
    ],
    'mythology': [
        'arthurian', 'fairy-tales', 'folk-tales', 'folklore', 'mythology',
    ],
    'biography': [
        'autobiography', 'biography', 'biography-memoir', 'diary', 'memoir',
        'travelogue',
    ],
    'travel': [
        'africa', 'adventure', 'american', 'asia', 'australia', 'canada',
        'china', 'egypt', 'france', 'greece', 'india', 'ireland', 'travel',
        'italy', 'japan', 'new-york', 'pakistan', 'russia', 'spain',
        'the-united-states-of-america', 'united-states', 'western-africa',
    ],
    'culinary': [
        'cookbooks', 'cooking', 'culinary', 'food', 'food-and-drink',
        'foodie', 'tea',
    ],
    'health': [
        'medicine', 'neuroscience', 'brain', 'health', 'fitness', 'medical',
        'mental-health', 'nutrition', 'personal-development', 'self-help',
    ],
}

 

In [9]:
def _flag_shelf_groups(shelf_flags, mappings):
    """Build one boolean column per group named in ``mappings``.

    Parameters
    ----------
    shelf_flags : pandas.DataFrame
        One column per shelf label; a truthy cell marks the row as carrying
        that shelf.
    mappings : dict
        Group name -> list of shelf column names that belong to the group.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``shelf_flags`` with one column per group, in the order
        the groups appear in ``mappings``. A cell is True when the row carries
        at least one of the group's shelves.

    Raises
    ------
    KeyError
        If a group lists a shelf that is not a column of ``shelf_flags``.
    """
    group_flags = pd.DataFrame(index=shelf_flags.index)
    for group, shelf_names in mappings.items():
        missing = [name for name in shelf_names if name not in shelf_flags.columns]
        if missing:
            # Plain column selection would raise KeyError as well; naming the
            # group makes the offending mapping entry easy to find.
            raise KeyError(
                'shelves not found for group {!r}: {}'.format(group, missing)
            )
        # A row belongs to the group when any of its shelves is set.
        group_flags[group] = shelf_flags[shelf_names].sum(axis=1) > 0
    return group_flags


academic_fields = _flag_shelf_groups(shelves, academic_field_mappings)

categories = _flag_shelf_groups(shelves, category_mappings)

In [10]:
# Count how many rows are flagged True in each category column.
categories.sum()

history        235
sociology      202
sciences       302
technology      13
philosophy      99
culture        141
literatures     23
other           95
mythology       19
biography      159
travel          92
culinary        20
health         111
dtype: int64

In [11]:
# Count how many rows are flagged True in each academic-field column.
academic_fields.sum()

humanities           249
social_sciences      134
physical_sciences    175
dtype: int64

In [12]:
# Write the shelf flag table to disk as a pickle.
shelves.to_pickle('2019112_goodreads_book_shelves.pkl')

In [13]:
# Save the category flags together with the mapping that produced them.
categories.to_pickle('2019112_goodreads_book_categories.pkl')
# Open the JSON file in a context manager so it is flushed and closed as soon
# as the dump finishes, instead of leaving the handle to the garbage collector.
with open("category_mappings.json", "w") as mapping_file:
    json.dump(category_mappings, mapping_file)

In [14]:
# Save the academic-field flags together with the mapping that produced them.
academic_fields.to_pickle('2019112_goodreads_book_academic_fields.pkl')
# Open the JSON file in a context manager so it is flushed and closed as soon
# as the dump finishes, instead of leaving the handle to the garbage collector.
with open("academic_field_mappings.json", "w") as mapping_file:
    json.dump(academic_field_mappings, mapping_file)