In [None]:
import warnings
warnings.filterwarnings("ignore")

# Loading Dataset

- The dataset has been published by [Möbius](https://www.kaggle.com/arashnic) on [Kaggle](https://www.kaggle.com). You can access the data by following the [data link](https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset)
- You can download the dataset using the API command, but make sure you have Kaggle's API credentials
```
kaggle datasets download -d arashnic/book-recommendation-dataset
```
- Here I have already downloaded the dataset in "data" folder, so I can just load it in pandas like below

In [15]:
import pandas as pd

books = pd.read_csv("data/Books.csv")
users = pd.read_csv("data/Users.csv")
ratings = pd.read_csv("data/Ratings.csv")

print("Books Shape:", books.shape)
print("Users Shape:", users.shape)
print("Ratings Shape:", ratings.shape)

Books Shape: (271360, 8)

Users Shape: (278858, 3)

Ratings Shape: (1149780, 3)


In [16]:
books.sample(3)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
58378,140436766,"Hope Leslie, Or, Early Times in the Massachuse...",Catharine Maria Sedgwick,1998,Penguin Books,http://images.amazon.com/images/P/0140436766.0...,http://images.amazon.com/images/P/0140436766.0...,http://images.amazon.com/images/P/0140436766.0...
153962,380774445,The Man Who Shorted Out the Electric Chair,Mitchell Symons,1996,Harper Mass Market Paperbacks (Mm),http://images.amazon.com/images/P/0380774445.0...,http://images.amazon.com/images/P/0380774445.0...,http://images.amazon.com/images/P/0380774445.0...
76407,307020134,Ten items or less: A counting book (A Little g...,Stephanie Calmenson,1985,Western Pub. Co,http://images.amazon.com/images/P/0307020134.0...,http://images.amazon.com/images/P/0307020134.0...,http://images.amazon.com/images/P/0307020134.0...


In [17]:
users.sample(3)

Unnamed: 0,User-ID,Location,Age
48846,48847,"edinburgh, scotland, united kingdom",21.0
39109,39110,"mons, hainaut, belgium",
265326,265327,"bremen, bremen, germany",19.0


In [18]:
ratings.sample(3)

Unnamed: 0,User-ID,ISBN,Book-Rating
660290,160401,786890436,0
211697,49144,8466605029,5
804705,195694,451183843,0


# Merging All DataFrames

In [23]:
# Merging Books and Ratings
merged = ratings.merge(books, on="ISBN")

# Merging Users
merged = merged.merge(users, on="User-ID")

# printing shape of merged dataframe
print("Shape:", merged.shape)

# Showing 3 random samples
merged.sample(3)

Shape: (1031136, 12)


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Location,Age
720703,106131,380804557,0,Stardust,Neil Gaiman,2003,Avon,http://images.amazon.com/images/P/0380804557.0...,http://images.amazon.com/images/P/0380804557.0...,http://images.amazon.com/images/P/0380804557.0...,"lorton, virginia, usa",32.0
83107,113519,316902918,0,Bob Vila's Guide to Buying Your Dream House (B...,Bob Vila,1990,"Little, Brown",http://images.amazon.com/images/P/0316902918.0...,http://images.amazon.com/images/P/0316902918.0...,http://images.amazon.com/images/P/0316902918.0...,"pleasanton, california, usa",
936084,201792,440904935,3,The Boy Who Drank Too Much,SHEP GREENE,1980,Laure Leaf,http://images.amazon.com/images/P/0440904935.0...,http://images.amazon.com/images/P/0440904935.0...,http://images.amazon.com/images/P/0440904935.0...,"tacoma, washington, usa",21.0


# Removing Duplicates

In [21]:
merged = merged[~ merged.duplicated(subset=["User-ID", "Book-Title"], keep='first')]
merged.head(3)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,Location,Age
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"tyler, texas, usa",
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,"cincinnati, ohio, usa",23.0
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,http://images.amazon.com/images/P/0812533550.0...,http://images.amazon.com/images/P/0812533550.0...,http://images.amazon.com/images/P/0812533550.0...,"cincinnati, ohio, usa",23.0


# Saving the Merged Dataset

In [22]:
merged.to_csv("data/merged.csv", index=False)