In [1]:
import pandas as pd

# Load dataset
data = pd.read_csv('books_new.csv')

# Explore the data
print(data.head())
print(data.info())

# Handle missing values
# data.dropna(inplace=True)  # or use fillna() for specific columns

# Remove duplicates
# data.drop_duplicates(inplace=True)

# Normalize genre column
# data['genre'] = data['genre'].str.lower()


                      Title            Author       Genre           SubGenre  \
0  Fundamentals of Wavelets  Goswami, Jaideva        tech  signal_processing   
1                Data Smart     Foreman, John        tech       data_science   
2  God Created the Integers  Hawking, Stephen        tech        mathematics   
3         Superfreakonomics   Dubner, Stephen     science          economics   
4               Orientalism      Said, Edward  nonfiction            history   

   Height      Publisher  
0     228          Wiley  
1     235          Wiley  
2     197        Penguin  
3     179  HarperCollins  
4     197        Penguin  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      211 non-null    object
 1   Author     187 non-null    object
 2   Genre      211 non-null    object
 3   SubGenre   211 non-null    object
 4   Height     211

In [2]:
# Fill missing authors with 'Unknown Author'
data['Author'].fillna('Unknown Author', inplace=True)

# Fill missing publishers with 'Unknown Publisher'
data['Publisher'].fillna('Unknown Publisher', inplace=True)

# Check again for missing values
print(data.isnull().sum())


Title        0
Author       0
Genre        0
SubGenre     0
Height       0
Publisher    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Author'].fillna('Unknown Author', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Publisher'].fillna('Unknown Publisher', inplace=True)


In [6]:
# Step 1: Combine 'Title', 'Author', 'Publisher', 'Genre', and 'SubGenre' into a single feature
data['combined_features'] = data['Title'] + " " + data['Author'] + " " + data['Publisher'] + " " + data['Genre'] + " " + data['SubGenre']

# Step 2: Use TF-IDF Vectorizer to convert the combined features into vectors
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'combined_features' column into a TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

# Step 3: Calculate cosine similarity matrix based on the new TF-IDF vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 4: Update the recommendation function
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the book that matches the title
    idx = data[data['Title'] == title].index[0]

    # Get pairwise similarity scores for all books
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the books based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 5 most similar books (excluding the book itself)
    book_indices = [i[0] for i in sim_scores[1:6]]

    # Return the top 5 most similar books
    return data['Title'].iloc[book_indices]

# Test the updated function
print(get_recommendations('Fundamentals of Wavelets'))


1                               Data Smart
154         Elements of Information Theory
58                         Learning OpenCV
207           Image Processing with MATLAB
61     Principles of Communication Systems
Name: Title, dtype: object


In [7]:
import string

# Function to clean text
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Apply the function to relevant columns
for col in ['Title', 'Author', 'Publisher', 'Genre', 'SubGenre']:
    data[col] = data[col].apply(clean_text)


In [8]:
# Emphasize Genre and SubGenre by repeating them
data['combined_features'] = (
    data['Title'] + ' ' +
    data['Author'] + ' ' +
    data['Publisher'] + ' ' +
    (data['Genre'] + ' ') * 2 +  # Double weighting
    (data['SubGenre'] + ' ') * 2  # Double weighting
)


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english', ngram_range=(1, 2))
count_matrix = count.fit_transform(data['combined_features'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)


In [10]:
print(get_recommendations('fundamentals of wavelets'))


1                               data smart
154         elements of information theory
58                         learning opencv
207           image processing with matlab
61     principles of communication systems
Name: Title, dtype: object
