In [5]:
# Cell 1: Data Collection
import pandas as pd

# Load the book catalogue. The file is expected to be a CSV named 'books.csv'
# with at least the columns 'title' and 'description'.
df = pd.read_csv('books.csv')

# The rest of the notebook indexes these two columns directly, so fail here
# with an explicit message rather than with a bare KeyError further down.
missing_columns = {'title', 'description'} - set(df.columns)
if missing_columns:
    raise KeyError(f"books.csv is missing required column(s): {sorted(missing_columns)}")

# Show the first few rows of the dataset. An explicit print is required:
# a bare expression is only rendered when it is the last statement of a cell,
# and more code follows in this one.
print(df.head())

# Cell 2: Data Preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Replace missing descriptions with empty strings.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the in-place form operates on
# an intermediate object and, with pandas Copy-on-Write, leaves `df` unchanged.
df['description'] = df['description'].fillna('')

# Build the vocabulary from the book descriptions
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['description'])

# Convert every description into a list of integer word ids
sequences = tokenizer.texts_to_sequences(df['description'])

# Pad to the length of the longest description so all rows share one shape
max_sequence_length = max(len(seq) for seq in sequences)

# Vocabulary size used for the Embedding layer; +1 because Tokenizer word
# indices start at 1 and index 0 is used as the padding value.
total_words = len(tokenizer.word_index) + 1

# Zero-pad at the end ('post') of each sequence
X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

# Display the first few padded sequences
print(X[:5])

# pad_sequences already returns a NumPy array; asarray keeps the X_array name
# used by the training step without making a second copy of the data.
X_array = np.asarray(X)

# Cell 3: Model Architecture
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional

# Layer sizes, named so they can be tuned in one place
EMBEDDING_DIM = 100
LSTM_UNITS = 50

# Input: one padded sequence of word ids per sample
input_layer = Input(shape=(max_sequence_length,))

# Embedding layer: maps each word id to a dense EMBEDDING_DIM vector.
# `input_length` is intentionally not passed: the sequence length is already
# fixed by the Input layer above, and the argument is deprecated in current
# Keras releases.
embedding_layer = Embedding(total_words, EMBEDDING_DIM)(input_layer)

# First LSTM returns its output at every timestep so it can feed another
# recurrent layer.
lstm_layer = LSTM(LSTM_UNITS, return_sequences=True)(embedding_layer)

# Bidirectional LSTM collapses the sequence into a single vector
bidirectional_lstm = Bidirectional(LSTM(LSTM_UNITS))(lstm_layer)

# Dense layer producing one value per input position (reconstruction target).
# NOTE(review): training fits this output against the raw word ids with MSE,
# which treats vocabulary indices as ordinal numbers - confirm this is the
# intended objective.
output_layer = Dense(max_sequence_length, activation='linear')(bidirectional_lstm)

# Build and compile the model.
# Resulting layer order: [0] Input, [1] Embedding, [2] LSTM,
# [3] Bidirectional, [4] Dense. The recommendation step indexes
# model.layers[1] and model.layers[2], so keep this order stable.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')

# Display the model summary
model.summary()

# Cell 4: Model Training
# Train the network to reproduce its own input: the padded id matrix is
# passed as both the features and the targets, with a fifth of the rows
# held out for validation.
training_options = {'epochs': 10, 'batch_size': 32, 'validation_split': 0.2}
model.fit(X_array, X_array, **training_options)




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x294d47fed50>

In [6]:
# Cell 5: Recommendation
from sklearn.metrics.pairwise import cosine_similarity

# Number of texts pushed through the encoder per call. Batching replaces the
# previous one-row-at-a-time loop, which made a full pass over the catalogue
# impractically slow, while keeping memory bounded.
EMBEDDING_BATCH_SIZE = 64
# How many of the most similar books to report
NUM_RECOMMENDATIONS = 12


def embed_texts(texts):
    """Encode a batch of texts with the trained Embedding and first LSTM layers.

    Args:
        texts: non-empty list of strings to encode.

    Returns:
        2-D NumPy array with one row per text. Each row is the LSTM's
        per-timestep output flattened into a single vector.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(token_ids, maxlen=max_sequence_length, padding='post')
    # model.layers[1] is the Embedding layer and model.layers[2] the first
    # (unidirectional, return_sequences=True) LSTM; the Bidirectional layer
    # is not applied here.
    hidden_states = model.layers[2](model.layers[1](padded)).numpy()
    return hidden_states.reshape(len(padded), -1)


# Define the book title to look up (variable names keep the original
# "movie" wording so later cells that reference them keep working)
movie_title_to_print = "The Reverse of the Medal"  # Replace with the actual title

# Find the index of the title in the DataFrame
movie_index = df[df['title'] == movie_title_to_print].index

if movie_index.empty:
    # Nothing to use as a query: report it and skip the recommendation step
    # instead of falling through to code that needs movie_description.
    print(f"\nMovie '{movie_title_to_print}' not found in the dataset.")
else:
    movie_index = movie_index[0]

    # Retrieve the title and description of the matched row
    movie_title = df.loc[movie_index, 'title']
    movie_description = df.loc[movie_index, 'description']
    if pd.isna(movie_description):
        # A missing description would break the string join below
        movie_description = ''

    # Combine title and description, then encode them
    combined_text = ' '.join([movie_title, movie_description])
    embedding_of_movie_to_print = embed_texts([combined_text])[0]

    # Print the book information and its embedding
    print(f"\nEmbedding of Movie Title: {movie_title}")
    print(f"Movie Description: {movie_description}")
    print(f"Embedding:\n{embedding_of_movie_to_print}")

    # The query text is the description of the selected book
    user_query = movie_description
    print(user_query)
    user_embedding = embed_texts([user_query])[0]

    # Text for every catalogue row: "<title> <description>". Missing values
    # become empty strings so one bad row cannot abort the whole pass.
    titles = df['title'].fillna('').astype(str)
    descriptions = df['description'].fillna('').astype(str)
    catalogue_texts = (titles + ' ' + descriptions).tolist()

    # Score the catalogue batch by batch; only the similarity values are
    # kept, never the full embedding matrix.
    similarity_scores = np.empty(len(catalogue_texts), dtype=float)
    query_row = user_embedding.reshape(1, -1)
    for start in range(0, len(catalogue_texts), EMBEDDING_BATCH_SIZE):
        stop = start + EMBEDDING_BATCH_SIZE
        batch_embeddings = embed_texts(catalogue_texts[start:stop])
        similarity_scores[start:stop] = cosine_similarity(query_row, batch_embeddings)[0]

    # Clip to [-1, 1] to absorb floating-point overshoot
    similarity_scores = np.clip(similarity_scores, -1, 1)

    # Keep the index -> similarity mapping under its original name
    cosine_similarities = dict(zip(df.index, similarity_scores))

    # Scores are in row order, so they can be assigned to the column directly
    df['cosine_similarity'] = similarity_scores

    # Sort by cosine similarity to the query in descending order
    recommended_books = (
        df.sort_values(by='cosine_similarity', ascending=False)
        .head(NUM_RECOMMENDATIONS)[['title', 'cosine_similarity']]
    )

    print("Recommended Books:")
    print(recommended_books)


Embedding of Movie Title: The Reverse of the Medal
Movie Description: In this book, Jack Aubrey returns from his duties protecting whalers off the South American coast and is persuaded by a casual acquaintance to make investments in the City on the strength of supposedly certain information. From there he is led into the half-worlds of the London criminal underground and of government espionage - the province of his friend, Stephen Maturin.
Embedding:
[ 0.6445068   0.73176515 -0.730848   ...  0.99121255 -0.99019396
  0.98869324]
In this book, Jack Aubrey returns from his duties protecting whalers off the South American coast and is persuaded by a casual acquaintance to make investments in the City on the strength of supposedly certain information. From there he is led into the half-worlds of the London criminal underground and of government espionage - the province of his friend, Stephen Maturin.


KeyboardInterrupt: 