In [None]:
# Cell 1: Data Collection
import pandas as pd

# Load the book catalogue. The rest of the notebook reads the 'title' and
# 'description' columns from this CSV file.
df = pd.read_csv('books.csv')

# Preview the first few rows. An explicit print is needed here: a bare
# `df.head()` is only rendered when it is the last expression of a notebook
# cell, and more statements follow it in this cell.
print(df.head())

# Cell 2: Data Preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Replace missing titles/descriptions with empty strings.
# The result is assigned back to the column instead of calling
# `df[col].fillna('', inplace=True)`: that form operates on an intermediate
# Series (chained assignment), emits a FutureWarning on pandas 2.x and leaves
# `df` untouched once Copy-on-Write is active. `astype(str)` guards the string
# concatenation below against columns that pandas parsed as non-text.
df['description'] = df['description'].fillna('').astype(str)
df['title'] = df['title'].fillna('').astype(str)

# Build one text field per book: "<title> <description>".
df['combined_text'] = df['title'] + ' ' + df['description']

# Fit the vocabulary on the combined text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['combined_text'])

# Convert each text into a list of integer token ids.
sequences = tokenizer.texts_to_sequences(df['combined_text'])
# Every sequence is padded to the length of the longest one.
max_sequence_length = max(len(seq) for seq in sequences)
# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
total_words = len(tokenizer.word_index) + 1

# Row i of X corresponds positionally to row i of df.
X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Display the first few padded token sequences
print(X[:5])

import numpy as np

# `X_array` is the name the training cell feeds to model.fit.
X_array = np.array(X)

# Cell 3: Model Architecture
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional

# Tunable sizes, named so they are not buried in the layer calls.
EMBEDDING_DIM = 100
LSTM_UNITS = 50

# NOTE: later cells address layers by position (model.layers[1], [2], [3] =
# Embedding, LSTM, Bidirectional), so the layer order below must not change.

# Input: one padded sequence of token ids per sample.
input_layer = Input(shape=(max_sequence_length,))
# Embedding layer. The sequence length is already fixed by the Input shape,
# so the deprecated `input_length` argument is not passed.
embedding_layer = Embedding(total_words, EMBEDDING_DIM)(input_layer)
# First LSTM keeps the full sequence so the next recurrent layer can consume it.
lstm_layer = LSTM(LSTM_UNITS, return_sequences=True)(embedding_layer)
# Bidirectional LSTM collapses the sequence into a single vector per sample.
bidirectional_lstm = Bidirectional(LSTM(LSTM_UNITS))(lstm_layer)
# Dense layer that reconstructs a vector of length max_sequence_length.
# NOTE(review): the training target is the raw token-id sequence with an MSE
# loss, which treats vocabulary ids as numeric quantities - confirm this is
# the intended objective.
output_layer = Dense(max_sequence_length, activation='linear')(bidirectional_lstm)

# Build and compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()

# Cell 4: Model Training
# The padded sequences serve as both input and target (reconstruction task);
# the last 20% of the samples are held out for validation.
model.fit(
    x=X_array,
    y=X_array,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)


In [None]:

# Cell 5: Recommendation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def _encode_sequences(padded_sequences):
    """Return the Bidirectional LSTM output for a batch of padded token ids.

    The batch is passed through model.layers[1] (Embedding), model.layers[2]
    (LSTM) and model.layers[3] (Bidirectional LSTM), i.e. everything except
    the final Dense reconstruction layer.
    """
    hidden = model.layers[1](padded_sequences)  # Embedding layer
    hidden = model.layers[2](hidden)  # LSTM layer
    return model.layers[3](hidden)  # Bidirectional LSTM layer


# Take user input for title and description
user_title = "The Reverse of the Medal"
user_description = "In this book, Jack Aubrey returns from his duties protecting whalers off the South American coast and is persuaded by a casual acquaintance to make investments in the City on the strength of supposedly certain information. From there he is led into the half-worlds of the London criminal underground and of government espionage - the province of his friend, Stephen Maturin."

# Concatenate title and description the same way the training text was built
user_combined_text = user_title + ' ' + user_description

# Tokenize and pad user input with the tokenizer fitted on the dataset
user_sequence = tokenizer.texts_to_sequences([user_combined_text])
user_padded = pad_sequences(user_sequence, maxlen=max_sequence_length, padding='post')

# Full-model output for the user input. This is the Dense reconstruction
# (length max_sequence_length), not a latent embedding; the variable name is
# kept so existing references to it keep working.
user_embedding = model.predict(user_padded)

# Latent representation of the user input (same layers as the title below)
user_embedding_through_layers = _encode_sequences(user_padded)

# Locate the book in the DataFrame. Row positions are used to index X
# (X rows are positional), while `movie_index` stays the DataFrame index
# label for code that addresses df by label.
matching_positions = np.flatnonzero((df['title'] == user_title).to_numpy())
if matching_positions.size == 0:
    raise IndexError(f"Title {user_title!r} was not found in the dataset")
movie_position = int(matching_positions[0])
movie_index = df.index[movie_position]

# Latent representation of the matching dataset entry
title_embedding = _encode_sequences(np.array(X[movie_position]).reshape(1, -1))

# Print the embeddings. Both come from the Bidirectional LSTM output, so they
# have the same dimensionality and can be compared directly.
print("User Input Embedding:")
print(user_embedding_through_layers.numpy())
print("\nEmbedding of the Given Title:")
print(title_embedding.numpy())

In [None]:


# Rank every book in the dataset by similarity to the user input.
#
# Both sides are encoded with the same sub-network: the trained model cut off
# after the Bidirectional LSTM (model.layers[3]), i.e. without the final Dense
# reconstruction layer. Scoring a single row, or comparing a reconstructed
# sequence against raw token ids, would leave every other book without a
# usable score.
encoder = Model(inputs=model.input, outputs=model.layers[3].output)
book_embeddings = encoder.predict(X)
user_embedding_vector = encoder.predict(user_padded)

# cosine_similarity returns a (1, n_books) matrix; take its single row.
cosine_similarities = cosine_similarity(user_embedding_vector, book_embeddings)[0]

# Store the scores in the original DataFrame. Rows of X were built from df in
# order, so the positional assignment lines up with the books.
df['cosine_similarity'] = cosine_similarities

# Sort by cosine similarity to user input and keep the 12 best matches.
# If the queried title is itself in the dataset it will rank at or near the top.
recommended_movies = df.sort_values(by='cosine_similarity', ascending=False).head(12)[['title', 'cosine_similarity']]

print("\nRecommended Movies:")
print(recommended_movies)
