### Q2 - Data Cleaning and Preprocessing
This notebook performs:
- **Data Cleaning** on `books_data.csv` (Part B)

In [25]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd

In [26]:
# Load the dataset.
# The location can be overridden with the BOOKS_DATA_CSV environment variable so the
# notebook is not tied to one machine's drive layout; when the variable is unset the
# original absolute path is used unchanged.
DEFAULT_BOOKS_CSV = 'G:\\My Drive\\MSC\\project\\question2_social_media_analysis\\data_collection\\books_data.csv'
books_csv_path = os.environ.get('BOOKS_DATA_CSV', DEFAULT_BOOKS_CSV)
df = pd.read_csv(books_csv_path)
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,title,price,rating,category,availability,description
0,A Light in the Attic,51.77,Three,Poetry,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,One,Historical Fiction,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.1,One,Fiction,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,Four,Mystery,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,Five,History,In stock (20 available),From a renowned historian comes a groundbreaki...


In [27]:
# Strip dollar signs and thousands separators from the price values,
# then cast the column to float
price_without_symbols = df['price'].replace(to_replace=r'[\$,]', value='', regex=True)
df['price'] = price_without_symbols.astype(float)
df

Unnamed: 0,title,price,rating,category,availability,description
0,A Light in the Attic,51.77,Three,Poetry,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,One,Historical Fiction,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.10,One,Fiction,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,Four,Mystery,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,Five,History,In stock (20 available),From a renowned historian comes a groundbreaki...
...,...,...,...,...,...,...
993,Beyond Good and Evil,43.38,One,Philosophy,In stock (1 available),Friedrich Nietzsche's Beyond Good and Evil is ...
994,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,Four,Sequential Art,In stock (1 available),High school student Kei Nagai is struck dead i...
995,A Spy's Devotion (The Regency Spies of London #1),16.97,Five,Historical Fiction,In stock (1 available),"In England’s Regency era, manners and elegance..."
996,1st to Die (Women's Murder Club #1),53.98,One,Mystery,In stock (1 available),"James Patterson, bestselling author of the Ale..."


In [28]:
# Convert the textual star ratings ('One' ... 'Five') to the integers 1-5.
rating_mapping = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5
}
# Only map while the column still holds the text labels. Series.map turns every
# value that is not a key of the mapping into NaN, so re-running this cell on the
# already-converted integers would blank the whole column, and a later
# dropna on 'rating' would then discard every row.
if not pd.api.types.is_numeric_dtype(df['rating']):
    df['rating'] = df['rating'].map(rating_mapping)
df

Unnamed: 0,title,price,rating,category,availability,description
0,A Light in the Attic,51.77,3,Poetry,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,1,Historical Fiction,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.10,1,Fiction,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,4,Mystery,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,5,History,In stock (20 available),From a renowned historian comes a groundbreaki...
...,...,...,...,...,...,...
993,Beyond Good and Evil,43.38,1,Philosophy,In stock (1 available),Friedrich Nietzsche's Beyond Good and Evil is ...
994,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4,Sequential Art,In stock (1 available),High school student Kei Nagai is struck dead i...
995,A Spy's Devotion (The Regency Spies of London #1),16.97,5,Historical Fiction,In stock (1 available),"In England’s Regency era, manners and elegance..."
996,1st to Die (Women's Murder Club #1),53.98,1,Mystery,In stock (1 available),"James Patterson, bestselling author of the Ale..."


In [29]:
# Discard every record that lacks any of the essential fields,
# then print a structural summary of what remains
required_fields = ['title', 'price', 'rating']
df = df.dropna(axis=0, how='any', subset=required_fields)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 998 entries, 0 to 997
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         998 non-null    object 
 1   price         998 non-null    float64
 2   rating        998 non-null    int64  
 3   category      998 non-null    object 
 4   availability  998 non-null    object 
 5   description   998 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 46.9+ KB


In [30]:
# Keep only the first occurrence of each fully identical row
is_repeated_row = df.duplicated(keep='first')
df = df[~is_repeated_row]
df

Unnamed: 0,title,price,rating,category,availability,description
0,A Light in the Attic,51.77,3,Poetry,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,1,Historical Fiction,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.10,1,Fiction,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,4,Mystery,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,5,History,In stock (20 available),From a renowned historian comes a groundbreaki...
...,...,...,...,...,...,...
993,Beyond Good and Evil,43.38,1,Philosophy,In stock (1 available),Friedrich Nietzsche's Beyond Good and Evil is ...
994,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4,Sequential Art,In stock (1 available),High school student Kei Nagai is struck dead i...
995,A Spy's Devotion (The Regency Spies of London #1),16.97,5,Historical Fiction,In stock (1 available),"In England’s Regency era, manners and elegance..."
996,1st to Die (Women's Murder Club #1),53.98,1,Mystery,In stock (1 available),"James Patterson, bestselling author of the Ale..."


In [31]:
# Renumber the rows 0..n-1; the old index labels are discarded rather than
# being kept as a new column
renumbered = df.reset_index(drop=True)
df = renumbered
df

Unnamed: 0,title,price,rating,category,availability,description
0,A Light in the Attic,51.77,3,Poetry,In stock (22 available),It's hard to imagine a world without A Light i...
1,Tipping the Velvet,53.74,1,Historical Fiction,In stock (20 available),"""Erotic and absorbing...Written with starling ..."
2,Soumission,50.10,1,Fiction,In stock (20 available),"Dans une France assez proche de la nôtre, un h..."
3,Sharp Objects,47.82,4,Mystery,In stock (20 available),"WICKED above her hipbone, GIRL across her hear..."
4,Sapiens: A Brief History of Humankind,54.23,5,History,In stock (20 available),From a renowned historian comes a groundbreaki...
...,...,...,...,...,...,...
993,Beyond Good and Evil,43.38,1,Philosophy,In stock (1 available),Friedrich Nietzsche's Beyond Good and Evil is ...
994,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4,Sequential Art,In stock (1 available),High school student Kei Nagai is struck dead i...
995,A Spy's Devotion (The Regency Spies of London #1),16.97,5,Historical Fiction,In stock (1 available),"In England’s Regency era, manners and elegance..."
996,1st to Die (Women's Murder Club #1),53.98,1,Mystery,In stock (1 available),"James Patterson, bestselling author of the Ale..."


In [32]:
# Persist the cleaned frame (without writing the index as a column)
# and report how many records were written
output_csv = 'cleaned_books_data.csv'
df.to_csv(output_csv, index=False)
print(f"Cleaned data saved to 'cleaned_books_data.csv' with {len(df)} records.")

Cleaned data saved to 'cleaned_books_data.csv' with 998 records.
