In [None]:
# Cell 1: Data Collection
import pandas as pd

# Assuming you have a CSV file named 'books.csv' with columns 'title', 'description'
df = pd.read_csv('books.csv')

# Display the first few rows of the dataset
df.head()
# Cell 2: Data Preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Handle missing values in the 'description' column
df['description'].fillna('', inplace=True)

# Combine 'title' and 'description' and tokenize the combined text
df['combined_text'] = df['title'] + ' ' + df['description']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['combined_text'])

# Convert combined texts into sequences and pad them
sequences = tokenizer.texts_to_sequences(df['combined_text'])
max_sequence_length = max(len(seq) for seq in sequences)
total_words = len(tokenizer.word_index) + 1

X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

import numpy as np

X_array = np.array(X)

# Cell 3: Model Architecture
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout

# Define the input layer
input_layer = Input(shape=(max_sequence_length,))
# Embedding layer
embedding_layer = Embedding(total_words, 100, input_length=max_sequence_length)(input_layer)
# LSTM layer with dropout
lstm_layer = LSTM(50, dropout=0.2, recurrent_dropout=0.2)(embedding_layer)
# Dense layer for reconstruction with a non-linear activation function
output_layer = Dense(max_sequence_length, activation='relu')(lstm_layer)

# Build the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()

# Cell 4: Model Training
model.fit(X_array, X_array, epochs=10, batch_size=64, validation_split=0.2)

# Cell 5: User Input and Recommendation
from sklearn.metrics.pairwise import cosine_similarity

user_query = input("Enter a book description: ")
print(user_query)

# Tokenize and pad user input
user_sequence = tokenizer.texts_to_sequences([user_query])
user_padded = pad_sequences(user_sequence, maxlen=max_sequence_length, padding='post')

# Get reconstructed sequence
reconstructed_sequence = model.predict(user_padded)

# Store reconstructed sequence in the original DataFrame
df['reconstructed_sequence'] = model.predict(X_array).tolist()

# Reshape arrays for cosine similarity calculation
user_array_reshaped = np.array(reconstructed_sequence).reshape(1, -1)
book_arrays_reshaped = np.array(df['reconstructed_sequence'].tolist())

# Calculate cosine similarity and recommend books
cosine_similarities = cosine_similarity(book_arrays_reshaped, user_array_reshaped)
df['normalized_similarity'] = (cosine_similarities - np.min(cosine_similarities)) / (
    np.max(cosine_similarities) - np.min(cosine_similarities)
)

# Sort by similarity to user input
recommended_books = df.sort_values(by='normalized_similarity', ascending=False).head(12)[['title', 'normalized_similarity']]

print("Recommended Books:")
print(recommended_books)


In [None]:
# Cell 1: Data Collection
import pandas as pd

# Assuming you have a CSV file named 'books.csv' with columns 'title', 'description'
df = pd.read_csv('books.csv')

# Display the first few rows of the dataset
df.head()

# Cell 2: Data Preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Handle missing values in the 'description' column
df['description'].fillna('', inplace=True)

# Tokenize book descriptions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['description'])

# Convert descriptions into sequences and pad them
sequences = tokenizer.texts_to_sequences(df['description'])
max_sequence_length = max(len(seq) for seq in sequences)
total_words = len(tokenizer.word_index) + 1

X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Display the tokenized sequences
print(X[:5])

import numpy as np

X_array = np.array(X)

# Cell 3: Model Architecture
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense

# Define the input layer
input_layer = Input(shape=(max_sequence_length,))
# Embedding layer
embedding_layer = Embedding(total_words, 100, input_length=max_sequence_length)(input_layer)
# LSTM layer
lstm_layer = LSTM(50)(embedding_layer)
# Dense layer for reconstruction
output_layer = Dense(max_sequence_length, activation='linear')(lstm_layer)

# Build the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()
# ...
# Cell 4: Model Training
model.fit(X_array, X_array, epochs=1, batch_size=32, validation_split=0.2)

In [2]:
# Cell 1: Data Collection
import pandas as pd

# Assuming you have a CSV file named 'books.csv' with columns 'title', 'description'
df = pd.read_csv('books.csv')

# Display the first few rows of the dataset
df.head()

# Cell 2: Data Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Handle missing values in the 'description' column
df['description'].fillna('', inplace=True)
df['title'].fillna('', inplace=True)

# Concatenate title and description
df['combined_text'] = df['title'] + ' ' + df['description']

# Tokenize combined text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['combined_text'])

# Convert combined texts into sequences and pad them
sequences = tokenizer.texts_to_sequences(df['combined_text'])
max_sequence_length = max(len(seq) for seq in sequences)
total_words = len(tokenizer.word_index) + 1

X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Display the tokenized sequences
print(X[:5])

import numpy as np

X_array = np.array(X)

# Cell 3: Model Architecture
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional

# Define the input layer
input_layer = Input(shape=(max_sequence_length,))
# Embedding layer
embedding_layer = Embedding(total_words, 100, input_length=max_sequence_length)(input_layer)
# LSTM layer
lstm_layer = LSTM(50, return_sequences=True)(embedding_layer)
# Bidirectional LSTM layer for better representation
bidirectional_lstm = Bidirectional(LSTM(50))(lstm_layer)
# Dense layer for reconstruction
output_layer = Dense(max_sequence_length, activation='linear')(bidirectional_lstm)

# Build the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()

# Cell 4: Model Training
model.fit(X_array, X_array, epochs=5, batch_size=32, validation_split=0.2)




[[6171    4   52 ...    0    0    0]
 [6173  577    4 ...    0    0    0]
 [   1   25 1395 ...    0    0    0]
 [3816    2 3083 ...    0    0    0]
 [   1  161 1021 ...    0    0    0]]


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 936)]             0         
                                                                 
 embedding (Embedding)       (None, 936, 100)          3470800   
                                                                 
 lstm (LSTM)                 (None, 936, 50)           30200     
                                                                 
 bidirectional (Bidirection  (None, 100)               40400     
 al)                                                             
                                                                 
 dense (Dense)               (None, 936)               94536     
    

KeyboardInterrupt: 

In [1]:
# Cell 5: Recommendation
# Take user input from console
user_query = input("Enter a book description: ")

# Tokenize and pad user input
user_sequence = tokenizer.texts_to_sequences([user_query])
user_padded = pad_sequences(user_sequence, maxlen=max_sequence_length, padding='post')

# Get reconstructed sequence
reconstructed_sequence = model.predict(user_padded)

# Store reconstructed sequence in the original DataFrame
df['reconstructed_sequence'] = model.predict(X_array).tolist()

# Calculate similarity and recommend books
df['similarity'] = df['reconstructed_sequence'].apply(
    lambda x: np.linalg.norm(np.array(x) - np.array(reconstructed_sequence[0]))
)

# Sort by similarity to user input
recommended_books = df.sort_values(by='similarity',ascending=False).head(12)[['title' , 'similarity']]

print("Recommended Books:")
print(recommended_books)


NameError: name 'tokenizer' is not defined