In [1]:
import os
import glob

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Have the inline backend emit figures as SVG (vector graphics), so plots
# stay sharp when the notebook is zoomed.
%config InlineBackend.figure_format = 'svg'

In [2]:
# Load the metadata spreadsheet into a DataFrame (path is relative to the
# notebook's working directory).
meta = pd.read_excel('data/metadata.xlsx')
# Peek at 5 random rows; no random_state is passed, so the rows shown change
# on every run.
meta.sample(5)

Unnamed: 0,File name,Author,Title,Publication date,Series + n°,Editor,Illustrator,Publisher,Place of publication,Intended reader,Source intended reader,Date of debut,Author's birth date,Author's age at time of publication,Author's gender,Country,Comments,English title
111,COTTRELL-BOYCE_runawayrobot_2019,Frank Cottrell-Boyce,Runaway Robot,2019,,,Steven Lenton,Macmillan,London,9,amazon,2004,1959,60,M,UK,,
728,WOLTZ_aangespoeld_2007,Anna Woltz,Aangespoeld,2007,,,,Leopold,Amsterdam,10,website author,1998,1981,26,F,NL,,
575,VANDEVENDEL_sofieenhetijsbeertje_2013,Edward van de Vendel,Sofie en het ijsbeertje,2013,Sofie#3,,Floor de Goede,Querido,Amsterdam,8,website author,1996,1964,49,M,NL,,
210,FRANCK_amerikaofdenoordpool_1990,Ed Franck,Amerika of de Noordpool,1990,,,Gerda Dendooven,Clavis,Hasselt,9,CBK,1985,1941,49,M,VL,,
659,WILSON_hettyfeather_2009,Jacqueline Wilson,Hetty Feather,2009,Hetty Feather #1,,Nick Sharratt,Doubleday,London,9,website author,1969,1945,64,F,UK,,


In [3]:
# (rows, columns) of the raw frame, before any cleaning.
meta.shape

(753, 18)

In [4]:
# Per-column non-null counts and dtypes of the raw frame.
meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   File name                            753 non-null    object
 1   Author                               753 non-null    object
 2   Title                                753 non-null    object
 3   Publication date                     753 non-null    int64 
 4   Series  + n°                         167 non-null    object
 5   Editor                               1 non-null      object
 6   Illustrator                          410 non-null    object
 7   Publisher                            741 non-null    object
 8   Place of publication                 739 non-null    object
 9   Intended reader                      753 non-null    object
 10  Source intended reader               643 non-null    object
 11  Date of debut                        753 non-

In [5]:
# Total memory footprint of the raw frame in KB (bytes / 1024). deep=True
# makes pandas measure the actual contents of object columns as well.
memory_before = meta.memory_usage(deep=True).sum() / 1024
# Bare expression as the last line so the notebook displays the value.
f'{memory_before:.2f} KB'

'596.49 KB'

## Cleaning up the dataset

The clean-up below follows [Matt Harrison's](https://twitter.com/__mharrison__) 'chaining' approach to pandas: every step is written as one link in a single chain of DataFrame methods.

In [6]:
# NOTE: this cell rebinds `meta` to its cleaned version, so it cannot be
# re-run on its own -- a second pass would try to select 'country', which the
# first pass already dropped. Re-load the spreadsheet before re-running.
meta = (
    meta
    # Normalise every header: spaces become underscores, then lower-case.
    .rename(columns=lambda name: name.replace(' ', '_').lower())
    # Shorten the unwieldy headers.
    .rename(columns={
        'publication_date': 'pub_year',
        "author's_age_at_time_of_publication": 'author_age_pub',
        "author's_gender": 'author_gender',
    })
    # Keep only the columns used further on ('country' is still needed for
    # the filter below).
    .loc[:, [
        'file_name',
        'author',
        'title',
        'pub_year',
        'intended_reader',
        'author_age_pub',
        'author_gender',
        'country',
    ]]
    # Discard every row with a missing value in any of the kept columns.
    .dropna(how='any')
    # UK titles only, skipping rows whose intended_reader is '-'
    # (presumably a "not available" marker -- confirm against the spreadsheet).
    .query("country == 'UK' & intended_reader != '-'")
    # Store the repetitive columns as categoricals.
    .astype({
        name: 'category'
        for name in ('author', 'intended_reader', 'author_gender')
    })
    # Every remaining row is 'UK', so the column carries no information.
    .drop(columns='country')
)

# Spot-check 10 random rows of the cleaned frame (unseeded, varies per run).
meta.sample(n=10)

Unnamed: 0,file_name,author,title,pub_year,intended_reader,author_age_pub,author_gender
402,MURPHY_jeffreystrangeways_1992,Jill Murphy,Jeffrey Strangeways,1992,8,43,F
434,PULLMAN_thebrokenbridge_1990,Philip Pullman,The Broken Bridge,1990,12,44,M
154,FINE_incolddomain_1994,Anne Fine,In Cold Domain,1994,18,47,F
653,WILSON_fallingapart_1989,Jacqueline Wilson,Falling Apart,1989,15,44,F
141,FINE_crummymummyandme_1988,Anne Fine,Crummy Mummy and Me,1988,9,41,F
162,FINE_madamedoubtfire_1987,Anne Fine,Madame Doubtfire,1987,9,40,F
409,MURPHY_worldsapart_1988,Jill Murphy,Worlds Apart,1988,9,39,F
31,ALMOND_warisover_2018,David Almond,War is Over,2018,9,67,M
81,BLACKMAN_themonstercrispguzzler_2002,Malorie Blackman,The Monster Crisp-Guzzler,2002,5,40,F
147,FINE_frozenbilly_2004,Anne Fine,Frozen Billy,2004,9,57,F


In [8]:
# (rows, columns) left after the clean-up chain.
meta.shape

(331, 7)

In [7]:
# Total memory footprint of the cleaned frame in KB (bytes / 1024), measured
# with deep=True so object columns are counted by their actual contents.
memory_after = meta.memory_usage(deep=True).sum() / 1024
# Bare expression as the last line so the notebook displays the value.
f'{memory_after:.2f} KB'

'62.63 KB'

In [10]:
# Memory saved by the clean-up, in KB. The clean-up removed rows and columns
# as well as casting to category, so this is not a measure of the dtype
# change alone.
print(f'{abs(memory_after - memory_before):.2f} KB')

533.87 KB


In [11]:
# Persist the cleaned metadata as UTF-8 CSV without the row index. CSV is
# plain text, so the category dtypes must be re-applied when reading it back.
meta.to_csv('data/metadata.csv', index=False, encoding='utf-8')