# Importing and Reading Data

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
# Notebook-wide display settings: ggplot styling for figures, and show up to
# 200 columns when a DataFrame is rendered.
plt.style.use("ggplot")
pd.options.display.max_columns = 200

# Understanding The Data

- **.shape / .info()** – to check the size and structure of the dataset
- **.dtypes** – to understand data types for each column
- **.describe()** – to get summary statistics of numeric columns

In [3]:
# Load the book list from CSV into the working frame.
books_csv = "books_data.csv"
df = pd.read_csv(books_csv)

In [4]:
# (rows, columns) of the frame as loaded.
df.shape

(616, 7)

In [5]:
# Column labels as they appear in the CSV, before any renaming.
df.columns

Index(['Title', 'Author', 'Rating', 'Avg Rating', 'Date Added', 'Num Pages',
       'Genre'],
      dtype='object')

In [6]:
# Per-column dtypes as parsed by read_csv; columns reported as `object`
# still need explicit conversion before numeric or date operations.
df.dtypes

Title          object
Author         object
Rating         object
Avg Rating    float64
Date Added     object
Num Pages      object
Genre          object
dtype: object

# Data Preparation

In [9]:
# Preview the first rows to see the raw value formats that need cleaning.
df.head()

Unnamed: 0,Title,Author,Rating,Avg Rating,Date Added,Num Pages,Genre
0,Hunger(Unbound #3),"Gonnella, Nicoli",it was ok,4.43,"Apr 19, 2025","1,035pp",Fantasy
1,Silence(Unbound #2),"Gonnella, Nicoli",liked it,4.23,"Apr 19, 2025",482pp,Fantasy
2,Dissonance(Unbound #1),"Gonnella, Nicoli",really liked it,4.38,"Apr 16, 2025",778pp,Fantasy
3,The Dungeon Anarchist's Cookbook(Dungeon Crawl...,"Dinniman, Matt",it was ok,4.48,"Apr 15, 2025",534pp,Fantasy
4,"Carl's Doomsday Scenario(Dungeon Crawler Carl,...","Dinniman, Matt",liked it,4.53,"Apr 14, 2025",364pp,Fantasy


In [10]:
# Map the original CSV headers to snake_case names used in the rest of the notebook.
snake_case_names = {
    "Title": "title",
    "Author": "author",
    "Rating": "rating",
    "Avg Rating": "avg_rating",
    "Date Added": "date_added",
    "Num Pages": "num_of_pages",
    "Genre": "genre",
}
df = df.rename(columns=snake_case_names)

In [11]:
# Rating labels in ascending order; each label's score is its 1-based position.
rating_labels = (
    "did not like it",
    "it was ok",
    "liked it",
    "really liked it",
    "it was amazing",
)
rating_map = {label: score for score, label in enumerate(rating_labels, start=1)}

# Derive a numeric score column from the text labels via rating_map
# (labels that are not keys of rating_map come out as NaN).
rating_text = df["rating"]
df["rating_num"] = rating_text.map(rating_map)

In [12]:

# Turn page counts such as "1,035pp" into numbers.
# - astype(str) keeps this cell re-runnable: once the column is numeric, the
#   .str accessor would otherwise raise AttributeError on a second run.
# - regex=False makes both replacements literal regardless of pandas version.
pages_text = (
    df["num_of_pages"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.replace("pp", "", regex=False)
    .str.strip()
)
df["num_of_pages"] = pd.to_numeric(pages_text, errors="coerce")  # Converts invalid values to NaN


In [13]:
# Parse the date strings (e.g. "Apr 19, 2025") into datetime values.
date_text = df["date_added"]
df["date_added"] = pd.to_datetime(date_text)

In [14]:
# Turn "Last, First" into "First Last": split on ", ", reverse the parts, rejoin with a space.
name_parts = df["author"].str.split(", ")
reversed_parts = name_parts.str[::-1]
df["author"] = reversed_parts.str.join(" ")

In [15]:
# Normalise genre labels: lower-case, with spaces replaced by underscores.
genre_lower = df["genre"].str.lower()
df["genre"] = genre_lower.str.replace(" ", "_")


In [16]:

# The text rating is superseded by rating_num, so remove it.
# errors="ignore" keeps this cell re-runnable: without it, a second run raises
# KeyError because the column is already gone.
df = df.drop(columns="rating", errors="ignore")

# Write the cleaned frame to its own CSV (the source file is left untouched);
# index=False omits the row index from the output.
df.to_csv("cleaned_data.csv", index=False)


In [17]:
# Confirm the column set after cleaning: `rating` is gone and `rating_num` was added.
df.columns

Index(['title', 'author', 'avg_rating', 'date_added', 'num_of_pages', 'genre',
       'rating_num'],
      dtype='object')

In [18]:
# Preview the cleaned rows to check the converted columns.
df.head()

Unnamed: 0,title,author,avg_rating,date_added,num_of_pages,genre,rating_num
0,Hunger(Unbound #3),Nicoli Gonnella,4.43,2025-04-19,1035.0,fantasy,2.0
1,Silence(Unbound #2),Nicoli Gonnella,4.23,2025-04-19,482.0,fantasy,3.0
2,Dissonance(Unbound #1),Nicoli Gonnella,4.38,2025-04-16,778.0,fantasy,4.0
3,The Dungeon Anarchist's Cookbook(Dungeon Crawl...,Matt Dinniman,4.48,2025-04-15,534.0,fantasy,2.0
4,"Carl's Doomsday Scenario(Dungeon Crawler Carl,...",Matt Dinniman,4.53,2025-04-14,364.0,fantasy,3.0
