In [1]:
# Cell 1: Data Collection and Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the catalogue; the CSV is expected to provide 'title' and 'description' columns
df = pd.read_csv('books.csv')

# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Replace missing descriptions with empty strings.
# Calling fillna(..., inplace=True) on a column selected from the split frames is
# chained assignment: pandas warns about it and does not guarantee that the frame
# itself is modified. Building new frames with assign() always takes effect.
train_df = train_df.assign(description=train_df['description'].fillna(''))
test_df = test_df.assign(description=test_df['description'].fillna(''))

# Tokenize book descriptions (the vocabulary is fitted on the training split only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['description'])

# Convert descriptions into integer sequences
train_sequences = tokenizer.texts_to_sequences(train_df['description'])
test_sequences = tokenizer.texts_to_sequences(test_df['description'])

# Both splits are padded to the longest *training* sequence so they share one width
max_sequence_length = max(len(seq) for seq in train_sequences)
total_words = len(tokenizer.word_index) + 1

X_train = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Display the first few padded sequences
print(X_train[:5])
# ... (Your existing code)
# Cell 2: Model Architecture (Adjusted)
from keras.layers import Bidirectional

# Network: token ids -> 100-d embeddings -> bidirectional LSTM (one output per time
# step) -> softmax over the vocabulary at every time step
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(
    input_dim=total_words,
    output_dim=100,
    input_length=max_sequence_length,
)(input_layer)
lstm_layer = Bidirectional(
    LSTM(units=100, return_sequences=True)
)(embedding_layer)
output_layer = Dense(units=total_words, activation='softmax')(lstm_layer)

# Targets are integer token ids, hence the sparse categorical loss
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

model.summary()

# Cell 3: Model Training — the padded sequences serve as both input and target;
# 20% of the training rows are held back for validation
model.fit(
    x=X_train,
    y=X_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)

# Cell 4: Model Evaluation on the test split, using the same input/target convention
loss = model.evaluate(x=X_test, y=X_test)
print(f"Test Loss: {loss}")

# Cell 5: User Input and Recommendation
from keras.layers import GlobalAveragePooling1D  # only this cell needs the pooling layer

user_query = input("Enter a book description: ")
print(user_query)

# The full model returns a (time steps, vocabulary) matrix for each text. Two such
# matrices cannot be combined with np.dot (their inner dimensions do not line up), and
# one matrix per book is too large to keep around. Instead, reuse the trained layers
# up to the bidirectional LSTM and average over time steps, which gives one
# fixed-size vector per text.
encoder = Model(inputs=input_layer, outputs=GlobalAveragePooling1D()(lstm_layer))

# X_train was built from train_df['description'] in row order, so row i of the
# prediction belongs to row i of train_df. One batched call replaces a predict() per book.
train_vectors = encoder.predict(X_train, batch_size=32)

# Create a dictionary to store book embeddings, keyed by title
# (when a title occurs more than once, the last occurrence is kept)
book_embeddings = {
    title: {'embedding': vector, 'description': description}
    for title, description, vector in zip(
        train_df['title'], train_df['description'], train_vectors
    )
}

# Tokenize and pad user input the same way as the training descriptions
user_sequence = tokenizer.texts_to_sequences([user_query])
user_padded = pad_sequences(user_sequence, maxlen=max_sequence_length, padding='post')

# Get the user's input embedding
user_embedding = encoder.predict(user_padded)[0]
user_norm = np.linalg.norm(user_embedding)

# Calculate cosine similarity for each book in the dictionary
similarities = {}
for title, data in book_embeddings.items():
    book_embedding = data['embedding']
    denominator = np.linalg.norm(book_embedding) * user_norm
    # A zero-length vector has no direction; score it 0 instead of dividing by zero
    if denominator:
        similarities[title] = float(np.dot(book_embedding, user_embedding) / denominator)
    else:
        similarities[title] = 0.0

# Sort by similarity and get top 10 recommendations
top_recommendations = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:10]

print("Top 10 Recommended Books:")
for title, similarity in top_recommendations:
    print(f"{title}")




[[   0    0    0 ...    0    0    0]
 [   6    1  103 ...    0    0    0]
 [ 102   90 1161 ...    0    0    0]
 [  18  256  108 ...    0    0    0]
 [   4  828 4435 ...    0    0    0]]


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 933)]             0         
                                                                 
 embedding (Embedding)       (None, 933, 100)          3013500   
                                                                 
 bidirectional (Bidirection  (None, 933, 200)          160800    
 al)                                                             
                                                                 
 dense (Dense)               (None, 933, 30135)        6057135   
                                                                 
Total params: 9231435 (35.22 MB)
Trainable params: 9231435 (35.22 MB)


In [None]:
# Cell 1: Data Collection and Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the catalogue; the CSV is expected to provide 'title' and 'description' columns
df = pd.read_csv('books.csv')

# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Replace missing descriptions with empty strings.
# fillna(..., inplace=True) on a column selected from the split frames is chained
# assignment and is not guaranteed to modify the frame, so build new frames instead.
train_df = train_df.assign(description=train_df['description'].fillna(''))
test_df = test_df.assign(description=test_df['description'].fillna(''))

# Tokenize book descriptions (the vocabulary is fitted on the training split only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['description'])

# Convert descriptions into integer sequences
train_sequences = tokenizer.texts_to_sequences(train_df['description'])
test_sequences = tokenizer.texts_to_sequences(test_df['description'])

# Without an explicit maxlen each split is padded to its own longest sequence, which
# gives X_train and X_test different widths. Pad both to the training maximum.
max_sequence_length = max(len(seq) for seq in train_sequences)
X_train = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
X_test = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Display the first few padded token-id sequences
print(X_train[:5])


In [None]:

# Assuming you have a CSV file named 'books.csv' with columns 'title', 'description'
import pandas as pd
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.callbacks import EarlyStopping

# Fetch the NLTK stop-word corpus that preprocess_text relies on, then read the catalogue
nltk.download('stopwords')
df = pd.read_csv('books.csv')

# Filled on first use so the stop-word list is loaded once, not once per description
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stop words as a frozenset, loading them on first call only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Lower-case `text` and drop English stop words.

    Args:
        text: The raw description. Anything that is not a `str` (for example the
            NaN pandas uses for a missing value) is treated as "no text".

    Returns:
        The remaining whitespace-separated words joined by single spaces, or ''
        when `text` is not a string.
    """
    # Reject non-strings before touching the stop-word corpus
    if not isinstance(text, str):
        return ''
    stop_words = _english_stop_words()
    # Plain whitespace split: punctuation stays attached to its word
    return ' '.join(word for word in text.lower().split() if word not in stop_words)

# Clean every description in place (non-string values become '')
df['description'] = df['description'].map(preprocess_text)

# Fit the vocabulary on all descriptions and integer-encode them
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['description'])
sequences = tokenizer.texts_to_sequences(df['description'])

# Sizes needed by the model: vocabulary size (+1) and the longest sequence
total_words = 1 + len(tokenizer.word_index)
max_sequence_length = max(map(len, sequences))

# Right-pad every sequence to the common length
X = pad_sequences(sequences, padding='post', maxlen=max_sequence_length)

# Keep a shuffled 70% of the rows for training; the remaining 30% is not used further
X_train, _ = train_test_split(X, test_size=0.3, random_state=42)
from keras.layers import TimeDistributed

# Model Architecture: embedding -> LSTM with dropout (one output per time step) ->
# linear projection to max_sequence_length units applied at every time step
embedding_size = 250
book_model = Sequential([
    Embedding(total_words, embedding_size, input_length=max_sequence_length),
    LSTM(50, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    TimeDistributed(Dense(max_sequence_length, activation='linear')),
])
book_model.compile(loss='mean_squared_error', optimizer='adam')

# Model Training: the padded sequences are both input and target. Early stopping
# watches the training loss, although only a single epoch is requested here.
early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
book_model.fit(
    x=X_train,
    y=X_train,
    batch_size=32,
    epochs=1,
    callbacks=[early_stopping],
)

# Book Recommendation Function
def recommend_books(user_query, book_model, tokenizer, df, top_n=5):
    """Return the `top_n` rows of `df` whose model output is closest to the query's.

    Every description in `df` is encoded from `df` itself, so position i of the score
    list always refers to row i of `df`. Model outputs are flattened to one row per
    text because `cosine_similarity` only accepts 2-D input. Books are scored in small
    chunks so the full set of model outputs never has to sit in memory at once.

    Relies on the notebook-level names `preprocess_text`, `pad_sequences`,
    `cosine_similarity` and `max_sequence_length`.

    Args:
        user_query: Free-text description typed by the user.
        book_model: Trained model exposing `predict`.
        tokenizer: Tokenizer fitted on the (preprocessed) descriptions.
        df: DataFrame with 'title' and 'description' columns.
        top_n: Number of rows to return.

    Returns:
        DataFrame with the 'title' and 'description' of the best matches, most
        similar first.
    """
    def _encode(texts):
        """Turn texts into right-padded id sequences of the model's input length."""
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

    # The descriptions were cleaned with preprocess_text before the tokenizer was
    # fitted, so the query goes through the same cleaning
    user_padded = _encode([preprocess_text(user_query)])
    user_embedding = book_model.predict(user_padded).reshape(1, -1)

    # Missing descriptions are encoded as empty text rather than raising
    book_padded = _encode(df['description'].fillna('').astype(str))

    chunk_size = 32
    scores = []
    for start in range(0, len(book_padded), chunk_size):
        chunk = book_model.predict(book_padded[start:start + chunk_size], verbose=0)
        chunk_2d = chunk.reshape(len(chunk), -1)
        scores.extend(cosine_similarity(user_embedding, chunk_2d)[0])

    # Row positions ordered from most to least similar
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_n]

    # Positional lookup: scores are indexed by row position, not by index label
    recommended_books = df.iloc[ranked][['title', 'description']]

    return recommended_books

# User Interaction: read a description from the user and show the matching books
user_query = input("Enter a book description: ")
recommended_books = recommend_books(
    user_query=user_query,
    book_model=book_model,
    tokenizer=tokenizer,
    df=df,
)
print("Recommended Books:")
print(recommended_books)




In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 2: Load the Dataset
dataset_path = "books.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Step 3: Text Vectorization — TF-IDF over the descriptions, with missing values
# treated as empty text and English stop words ignored
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'].fillna(value=''))

# Step 4: Calculate Similarity Scores — with a single argument the kernel compares
# the matrix against itself, giving a book-by-book score table
similarity_scores = linear_kernel(tfidf_matrix)

# Step 5: Recommendation Function
def get_recommendations(title, similarity_matrix, dataframe, top_n=10):
    """Return the titles of the `top_n` books most similar to `title`.

    Args:
        title: Exact title to look up in `dataframe['title']`. If it occurs more
            than once, the first occurrence is used.
        similarity_matrix: Square table where entry [i][j] is the similarity between
            the books at row positions i and j of `dataframe`.
        dataframe: DataFrame with a 'title' column, in the same row order as
            `similarity_matrix`.
        top_n: Maximum number of recommendations to return.

    Returns:
        A Series of titles, most similar first, never containing the queried row.

    Raises:
        IndexError: If `title` is not present in `dataframe['title']`.
    """
    # Work with row positions, not index labels, because the matrix is positional
    matches = [pos for pos, hit in enumerate(dataframe['title'] == title) if hit]
    if not matches:
        raise IndexError(f"title not found in dataframe: {title!r}")
    idx = matches[0]

    # Drop the queried row by position. Another book with an equal score may sort
    # ahead of it, so "skip the first entry" would not reliably remove it.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[idx])
        if pos != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    book_indices = [pos for pos, _ in candidates[:top_n]]
    return dataframe['title'].iloc[book_indices]

# Step 6: Test the Recommendation System
book_title = "The Catcher in the Rye"  # Replace with your desired title
recommendations = get_recommendations(
    title=book_title,
    similarity_matrix=similarity_scores,
    dataframe=df,
)
print("Recommendations for {}: \n{}".format(book_title, recommendations))


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Step 2: Load the Dataset
dataset_path = "books.csv"
df = pd.read_csv(dataset_path)

# Step 3: Text Vectorization
# Missing descriptions become empty strings so the vectorizer only ever sees text
df['description'] = df['description'].fillna('')

# Fit one vectorizer and keep the matrix it produced. `tfidf` is bound to the same
# fitted object so that code using either name transforms queries with the exact
# vocabulary behind `tfidf_matrix`.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['description'])
tfidf = tfidf_vectorizer

# Step 4: Calculate Similarity Scores (book-by-book table)
similarity_scores = linear_kernel(tfidf_matrix, tfidf_matrix)

# Step 5: Recommendation Function for User Description
def get_recommendations_for_user(description, similarity_matrix, dataframe, top_n=10):
    """Return the titles of the `top_n` books whose descriptions best match `description`.

    Uses the notebook-level `tfidf` vectorizer and the `tfidf_matrix` it produced.

    Args:
        description: Free-text description supplied by the user.
        similarity_matrix: Not used by this function; kept so existing calls still work.
        dataframe: DataFrame with a 'title' column, in the same row order as
            `tfidf_matrix`.
        top_n: Maximum number of titles to return.

    Returns:
        A Series of titles ordered from most to least similar.
    """
    # Vectorize the query with the same fitted vectorizer that built tfidf_matrix,
    # so the query columns are guaranteed to line up with the matrix columns
    user_vector = tfidf.transform([description])

    # Calculate similarity scores between user description and book summaries
    user_similarity_scores = linear_kernel(user_vector, tfidf_matrix).flatten()

    # argsort is ascending: reverse it and keep the leading top_n positions
    book_indices = user_similarity_scores.argsort()[::-1][:top_n]

    return dataframe['title'].iloc[book_indices]

# Step 6: Test the Recommendation System with User Description
user_description = "romeo and juliet hate story."
print("user input : {}".format(user_description))
recommendations = get_recommendations_for_user(
    description=user_description,
    similarity_matrix=similarity_scores,
    dataframe=df,
)
print("Recommendations based on user description: \n{}".format(recommendations))


user input : romeo and juliet hate story.
Recommendations based on user description: 
4597              Falling for You
5469     Mockingbird Wish Me Luck
330     The Chaneysville Incident
572           In Watermelon Sugar
1507                  Plum Lovin'
3                  Rage of angels
882             Henry IV Part Two
879        The Merchant of Venice
5922                   Harm's Way
4022          Written on the Body
Name: title, dtype: object


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset and replace missing descriptions with empty strings in one step
df = pd.read_csv('books.csv').assign(
    description=lambda frame: frame['description'].fillna('')
)

# Text Vectorization: TF-IDF over the descriptions, ignoring English stop words
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df['description'])

# Calculate Cosine Similarity — with one argument the matrix is compared with itself
cosine_sim = cosine_similarity(tfidf_matrix)

# Recommendation Function for User Description
def content_based_recommender(description, cosine_sim, dataframe):
    """Return the titles of the ten books that best match a free-text description.

    Reads the notebook-level `tfidf` vectorizer and `tfidf_matrix`. The `cosine_sim`
    argument is accepted but not used.

    Args:
        description: Text describing what the user wants to read.
        cosine_sim: Unused.
        dataframe: DataFrame with a 'title' column, in the same row order as
            `tfidf_matrix`.

    Returns:
        A Series of up to ten titles, most similar first.
    """
    query_vector = tfidf.transform([description])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Ascending argsort, reversed, then cut to the ten best row positions
    best_first = scores.argsort()[::-1]
    top_positions = best_first[:10]

    return dataframe['title'].iloc[top_positions]

# Test the Recommendation System with User Description
user_description = "I enjoy thrilling stories with a touch of suspense."
recommendations = content_based_recommender(
    description=user_description,
    cosine_sim=cosine_sim,
    dataframe=df,
)
print("Recommendations based on user description: \n{}".format(recommendations))
