# Cleaning data

In [16]:
from collections import Counter
import pandas as pd
import re

In [7]:
# Display floats with two decimal places instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Load the data extracted in `get_data.ipynb`.
# Unpickling can execute arbitrary code, so only load pickle files you produced yourself.
raw_books_path = '../temp/ya-fiction-books.pickle'
books = pd.read_pickle(raw_books_path)
# Preview the first five rows
books.head(5)

Unnamed: 0,id,editions id,title,author,published year,rating,ratings,genres,synopsis
0,8492825,10706553,Where She Went,Gayle Forman (Goodreads Author),2011,4.0,278348,"[Young Adult, Romance, Contemporary, Fiction, ...",It's been three years since the devastating ac...
1,9961796,7149084,Lola and the Boy Next Door,Stephanie Perkins (Goodreads Author),2011,3.93,159795,"[Young Adult, Romance, Contemporary, Womens Fi...",Alternate Cove edition for ISBN 9780525423287L...
2,8492856,13014066,What Happened to Goodbye,Sarah Dessen (Goodreads Author),2011,3.94,87726,"[Young Adult, Romance, Contemporary, Fiction, ...",Who is the real McLean? Since her parents' b...
3,9464733,10808145,Beauty Queens,Libba Bray (Goodreads Author),2011,3.62,56909,"[Young Adult, Contemporary, Humor, Fiction, LG...",Teen beauty queens. A lost island. Mysteries a...
4,8662836,13534308,Chain Reaction,Simone Elkeles (Goodreads Author),2011,4.1,61978,"[Romance, Young Adult, Contemporary, Realistic...",Luis Fuentes is a good boy who doesn't live wi...


## Exploration

In [3]:
# Inspect each column's data type and non-null count.
# No column has null values, but some columns have unsuitable types:
# `rating` and `ratings` are stored as objects and should be float and integer, respectively.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              220 non-null    object
 1   editions id     220 non-null    object
 2   title           220 non-null    object
 3   author          220 non-null    object
 4   published year  220 non-null    int64 
 5   rating          220 non-null    object
 6   ratings         220 non-null    object
 7   genres          220 non-null    object
 8   synopsis        220 non-null    object
dtypes: int64(1), object(8)
memory usage: 15.6+ KB


In [5]:
# Cast both numeric columns in a single pass:
# `rating` becomes a float column and `ratings` an integer column
books = books.astype({'rating': 'float', 'ratings': 'int'})

In [8]:
# This shows why it matters to declare those columns as numbers:
# describe() can now summarise them.
# Sanity-checking the values of the numeric columns: they all look plausible.
books.describe()

Unnamed: 0,published year,rating,ratings
count,220.0,220.0,220.0
mean,2015.98,4.03,96340.26
std,3.17,0.2,306752.84
min,2010.0,3.53,572.0
25%,2013.0,3.89,16929.75
50%,2016.0,4.04,36161.0
75%,2019.0,4.17,69759.0
max,2021.0,4.5,4158348.0


In [9]:
# Look for books whose `id` repeats an earlier row. There are none.
is_repeated_id = books.duplicated(subset='id')
books.loc[is_repeated_id]

Unnamed: 0,id,editions id,title,author,published year,rating,ratings,genres,synopsis


## Removing text between parentheses from `author` column

In [11]:
# Regular expression matching a parenthesised annotation such as "(Goodreads Author)"
re_text_between_parentheses = (
    r'\('               # literal opening parenthesis
    r'[a-zA-ZÀ-ÿ\s]*'   # any run of ASCII letters, characters in the À-ÿ range, and whitespace
    r'\)'               # literal closing parenthesis
)

In [14]:
# Count the rows whose author value contains text between parentheses.
# It is 210 rows. A lot.
has_annotation = books['author'].str.contains(re_text_between_parentheses)
books.loc[has_annotation].shape

(210, 9)

In [18]:
# Gather every parenthesised annotation found in the author column and tally them,
# to confirm that nothing important would be lost by removing them
attributes = [
    annotation
    for author in books['author']
    for annotation in re.findall(re_text_between_parentheses, author)
]

Counter(attributes).most_common()

[('(Goodreads Author)', 218),
 ('(Translator)', 2),
 ('(Contributor)', 2),
 ('(Narradora)', 1),
 ('(Illustrations)', 1)]

For my purposes, this information is not really important, so let's remove it.

In [25]:
# Remove the parenthesised annotations (e.g. "(Goodreads Author)") from the author column.
# The pattern is prefixed with `\s*` so the whitespace in front of each annotation is
# removed together with it; without that, a value such as "A (Translator), B" would be
# left as "A , B" instead of "A, B".
# Vectorised `.str` methods are used instead of a per-value `apply` with `re.sub`.
books['author'] = (
    books['author']
    .str.replace(r'\s*' + re_text_between_parentheses, '', regex=True)
    .str.strip()                          # trim leading/trailing whitespace
    .str.replace('\n', '', regex=False)   # drop any line breaks left inside the value
)

In [28]:
# Done. The author column reads more clearly without those annotations.
# NOTE(review): this displays the whole frame; `books.head()` would keep the output shorter.
books

Unnamed: 0,id,editions id,title,author,published year,rating,ratings,genres,synopsis
0,8492825,10706553,Where She Went,Gayle Forman,2011,4.00,278348,"[Young Adult, Romance, Contemporary, Fiction, ...",It's been three years since the devastating ac...
1,9961796,7149084,Lola and the Boy Next Door,Stephanie Perkins,2011,3.93,159795,"[Young Adult, Romance, Contemporary, Womens Fi...",Alternate Cove edition for ISBN 9780525423287L...
2,8492856,13014066,What Happened to Goodbye,Sarah Dessen,2011,3.94,87726,"[Young Adult, Romance, Contemporary, Fiction, ...",Who is the real McLean? Since her parents' b...
3,9464733,10808145,Beauty Queens,Libba Bray,2011,3.62,56909,"[Young Adult, Contemporary, Humor, Fiction, LG...",Teen beauty queens. A lost island. Mysteries a...
4,8662836,13534308,Chain Reaction,Simone Elkeles,2011,4.10,61978,"[Romance, Young Adult, Contemporary, Realistic...",Luis Fuentes is a good boy who doesn't live wi...
...,...,...,...,...,...,...,...,...,...
215,54860459,75186585,Hani and Ishu's Guide to Fake Dating,Adiba Jaigirdar,2021,4.21,10835,"[Romance, LGBT, Contemporary, Young Adult, LGB...","Everyone likes Humaira ""Hani"" Khan—she’s easy ..."
216,54998272,71881363,The Girls I've Been,Tess Sharpe,2021,4.18,12437,"[Young Adult, LGBT, Thriller, Contemporary, My...","A slick, twisty YA page-turner about the daugh..."
217,49204960,74656790,Perfect on Paper,Sophie Gonzales,2021,4.13,10903,"[Romance, Contemporary, Young Adult, LGBT, LGB...",In Perfect on Paper: a bisexual girl who gives...
218,49399658,73513987,Counting Down with You,Tashie Bhuiyan,2021,4.17,10641,"[Romance, Contemporary, Young Adult, Romance, ...",A reserved Bangladeshi teenager has twenty-eig...


In [31]:
# Persist the frame as a pickle so that the column data types are preserved
clean_books_path = '../temp/ya-fiction-books-clean.pickle'
books.to_pickle(path=clean_books_path)