## Importing necessary library

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Web scraping book pages

In [None]:
from concurrent.futures import ThreadPoolExecutor
import httplib2

def scrape_page_and_get_links(url):
    http = httplib2.Http()
    response, content = http.request(url)
    page = content.decode('utf-8')

    data = BeautifulSoup(page, 'html.parser')
    book_link_tags = data.find_all('a', attrs={'class': 'bookTitle'})
    book_links = ['https://www.goodreads.com/' + link['href'] for link in book_link_tags]
    return book_links

if __name__ == '__main__':
    all_pages = []
    for serial in range(1, 100, 1):
        page = f"https://www.goodreads.com/list/show/19106.MUST_READS_?page={serial}"
        all_pages.append(page)

    # Use multithreading with ThreadPoolExecutor
    num_threads = 10  # we can adjust the number of threads as needed
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        results = list(executor.map(scrape_page_and_get_links, all_pages))

    # Flatten the results into a single list of book links
    book_links = [link for sublist in results for link in sublist]

## web scraping book details from every book page

In [None]:
from concurrent.futures import ThreadPoolExecutor
import time

def scrape_book_details(link):
    try:
        response = requests.get(link)
        response.raise_for_status()  # Raise an exception if the response status code is not 200

        book_data = BeautifulSoup(response.text, 'html.parser')

        title = book_data.find('h1', attrs={'class': 'Text Text__title1'})
        author = book_data.find('span', attrs={'class': 'ContributorLink__name'})
        rating = book_data.find('div', attrs={'class': 'RatingStatistics__rating'})
        no_of_ratings = book_data.find('div', attrs={'class': 'RatingStatistics__meta'})
        description = book_data.find('div', attrs={'class': 'DetailsLayoutRightParagraph__widthConstrained'})
        genres = book_data.find_all('span', attrs={'class': 'BookPageMetadataSection__genreButton'})

        # Extract the text from each element (if found) or set as None if not found
        title_text = title.text.strip() if title else None
        author_text = author.text.strip() if author else None
        rating_text = rating.text.strip() if rating else None
        no_of_ratings_text = no_of_ratings.text.strip() if no_of_ratings else None
        description_text = description.text.strip() if description else None
        genres_list = [genre.text.strip() for genre in genres] if genres else []

        # Create a dictionary for the current book
        book_dict = {
            'Title': title_text,
            'Author': author_text,
            'Rating': rating_text,
            'Number of Ratings': no_of_ratings_text,
            'Description': description_text,
            'Genres': genres_list
        }

        return book_dict
    except requests.exceptions.RequestException as e:
        print(f"Error while scraping {link}: {e}")
        return None

if __name__ == '__main__':
    # list of book links
    your_book_links = book_links

    # Use multithreading to scrape book details for each link
    num_threads = 300  # Adjust this number based on server capacity and system capabilities

    # Introduce a delay of 2 seconds between each request
    time_delay = 2

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        book_details = list(executor.map(scrape_book_details, your_book_links))
        time.sleep(time_delay)  # Add a delay between requests to be considerate to the server

In [None]:
# Filter out the None values from book_details
book_details = [book for book in book_details if book is not None]

# Create the DataFrame
book_details = pd.DataFrame(book_details)

In [None]:
# Dropping nan values from the dataset
book_details = book_details.dropna(how='any')

In [None]:
# Dropping duplicates from as per title
book_details = book_details.drop_duplicates(subset='Title')

In [None]:
# saving data as csv file 
book_details.to_csv('book_details.csv',index=False)

In [52]:
# Reading data from the csv file
book_details = pd.read_csv('book_details.csv',encoding= 'unicode_escape')

In [53]:
book_details.head(2)

Unnamed: 0,title,author,rating,no_of_ratings,no_of_reviews,description,genres
0,Divergent,Veronica Roth,4.15,3765886,117791,"In Beatrice Prior's dystopian Chicago world, s...","Young Adult, Dystopia, Fantasy, Fiction, Scien..."
1,Catching Fire,Suzanne Collins,4.31,3305054,113480,Sparks are igniting.Flames are spreading.And t...,"Young Adult, Dystopia, Fiction, Fantasy, Scien..."


In [54]:
# Checking dataset information
book_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13324 entries, 0 to 13323
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          13324 non-null  object 
 1   author         13324 non-null  object 
 2   rating         13324 non-null  float64
 3   no_of_ratings  13324 non-null  int64  
 4   no_of_reviews  13324 non-null  object 
 5   description    13273 non-null  object 
 6   genres         12327 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 728.8+ KB


In [55]:
book_details.isnull().sum()

title              0
author             0
rating             0
no_of_ratings      0
no_of_reviews      0
description       51
genres           997
dtype: int64

In [56]:
# Drop null values as we find some null values 
book_details = book_details.dropna(how='any')

In [58]:
# Change column type from object to int
book_details['no_of_ratings'] = book_details['no_of_ratings'].astype('int')

In [59]:
# Replacing unncessary characters from the column
book_details['no_of_reviews'] = book_details['no_of_reviews'].str.replace(',', '').str.replace('\xa0', '').astype(int)

In [60]:
book_details['no_of_reviews'] = book_details['no_of_reviews'].astype('int')

In [61]:
book_details.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12300 entries, 0 to 13322
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          12300 non-null  object 
 1   author         12300 non-null  object 
 2   rating         12300 non-null  float64
 3   no_of_ratings  12300 non-null  int32  
 4   no_of_reviews  12300 non-null  int32  
 5   description    12300 non-null  object 
 6   genres         12300 non-null  object 
dtypes: float64(1), int32(2), object(4)
memory usage: 672.7+ KB


In [62]:
# Checking dataset shape
book_details.shape

(12300, 7)

## Importing necessary library for the books recommendation engine

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack

In [64]:
book_details.columns

Index(['title', 'author', 'rating', 'no_of_ratings', 'no_of_reviews',
       'description', 'genres'],
      dtype='object')

## Data Preprocessing of numerical columns

In [65]:
scaler = MinMaxScaler()
book_details['rating_scaled'] = scaler.fit_transform(book_details['rating'].values.reshape(-1, 1))
book_details['no_of_ratings_scaled'] = scaler.fit_transform(book_details['no_of_ratings'].values.reshape(-1, 1))
book_details['no_of_reviews_scaled'] = scaler.fit_transform(book_details['no_of_reviews'].values.reshape(-1, 1))

## Vectorize the author,description and genres columns

In [66]:
cv = CountVectorizer(max_features=10000, stop_words='english', binary=True)
author_vectorized = cv.fit_transform(book_details['author'])
description_vectorized = cv.fit_transform(book_details['description'])
genres_vectorized = cv.fit_transform(book_details['genres'])

## Combine the vectorized features with the scaled features

In [67]:
combined_features = hstack([author_vectorized, description_vectorized,
                            genres_vectorized,
                            book_details['rating_scaled'].values.reshape(-1, 1),
                            book_details['no_of_ratings_scaled'].values.reshape(-1, 1),
                            book_details['no_of_reviews_scaled'].values.reshape(-1, 1)])

In [68]:
combined_features

<12300x17830 sparse matrix of type '<class 'numpy.float64'>'
	with 911596 stored elements in COOrdinate format>

## Applying similiarity Matrix to get the pattern

In [69]:
similarity_matrix = cosine_similarity(combined_features)

In [70]:
def get_top_recommendations(book_title, top_n=5):
    book_index = book_details[book_details['title'] == book_title].index[0]
    similar_books_indices = similarity_matrix[book_index].argsort()[::-1][1:top_n+1]
    similar_books = book_details.iloc[similar_books_indices]['title']
    return similar_books

## Interface for checking the recommendation engine

In [71]:
def get_top_recommendations(book_title, top_n=5):
    # This function should return a list of recommended books based on the selected book.

    # Find the index of the book in the DataFrame
    book_index = book_details[book_details['title'] == book_title].index[0]

    # Retrieve the similarity scores of the selected book with all other books
    similar_books_indices = similarity_matrix[book_index].argsort()[::-1][1:top_n+1]

    # Retrieve the titles of the recommended books using the indices
    recommended_books = book_details.iloc[similar_books_indices]['title'].tolist()

    return recommended_books

def main():
    while True:
        print("\nBOOK RECOMMENDATION SYSTEM")
        print("1. Input a book title")
        print("2. Exit")
        choice = input("Enter your choice (1/2): ")

        if choice == "1":
            book_title = input("Enter the book title: ")
            if book_title in final_df['title'].tolist():
                top_recommendations = get_top_recommendations(book_title, top_n=5)
                print("\nTop Recommended Books:")
                for idx, book in enumerate(top_recommendations, 1):
                    print(f"{idx}. {book}")
            else:
                print("Book not found in the database.")
        elif choice == "2":
            print("Exiting the program.")
            break
        else:
            print("Invalid choice. Please try again.")


if __name__ == "__main__":
    main()


BOOK RECOMMENDATION SYSTEM
1. Input a book title
2. Exit


Enter your choice (1/2):  1
Enter the book title:  Catching Fire



Top Recommended Books:
1. The Hunger Games Trilogy Boxset
2. The Hunger Games
3. Hungerspelen
4. The Soul
5. Mockingjay

BOOK RECOMMENDATION SYSTEM
1. Input a book title
2. Exit


Enter your choice (1/2):  2


Exiting the program.
