In [None]:
import pandas as pd

# Read the two input tables from disk.
books_df = pd.read_csv('../data/books.csv')
clustered_users_df = pd.read_csv('../data/produced/clustered_users.csv')

# Inner join on the shared 'isbn' column: only rows whose ISBN appears in
# both tables survive the merge.
data_df = clustered_users_df.merge(books_df, how='inner', on='isbn')

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

# Tokenize every summary. str() guards against non-string cells, so a missing
# value (NaN) is tokenized as the literal word "nan" rather than raising.
data_df['tokenized_summary'] = data_df['summary'].apply(lambda x: word_tokenize(str(x)))

# Train the Word2Vec embedding on the tokenized summaries.
# It is bound to `w2v_model` (not `model`) on purpose: the name `model` is
# re-bound to the Keras classifier in a later cell, and summary_to_vec() is
# called again after that point, so it must not depend on `model`.
w2v_model = Word2Vec(sentences=data_df['tokenized_summary'].tolist(), vector_size=100, window=5, min_count=1, workers=4)


def summary_to_vec(tokens):
    """Embed a tokenized summary as the mean of its word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one summary. Tokens missing from the Word2Vec vocabulary
        are skipped.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``w2v_model.wv.vector_size``. When no token is
        in the vocabulary (empty summary, or text made only of unseen words)
        a zero vector is returned, so every row has the same shape and the
        feature matrix never contains NaN.
    """
    wv = w2v_model.wv
    known_vectors = [wv[word] for word in tokens if word in wv.key_to_index]
    if not known_vectors:
        # np.mean([]) would yield a scalar NaN (plus a RuntimeWarning) and
        # break the later np.array(...) stacking; fall back to zeros instead.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)
    return np.mean(known_vectors, axis=0)


data_df['summary_vec'] = data_df['tokenized_summary'].apply(summary_to_vec)

In [None]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Build the feature matrix (one summary vector per row) and the label vector.
X = np.array(data_df['summary_vec'].tolist())
y = data_df['cluster'].values

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Derive the layer sizes from the data instead of hard-coding them, so the
# network stays valid if the embedding size or the number of clusters changes.
# sparse_categorical_crossentropy expects integer class ids in
# [0, n_clusters), hence max label + 1 output units.
n_features = X.shape[1]
n_clusters = int(y.max()) + 1

# Define the model: two hidden ReLU layers and a softmax over the clusters.
model = Sequential()
model.add(Dense(128, input_dim=n_features, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(n_clusters, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model. NOTE(review): the held-out split doubles as validation
# data here, so the per-epoch validation metrics and the final test score
# are computed on the same rows.
model.fit(X_train, y_train, epochs=50, batch_size=10, validation_data=(X_test, y_test))

In [None]:
# Score the trained classifier on the held-out split. The model was compiled
# with a single metric ('accuracy'), so evaluate() yields loss then accuracy.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test Accuracy: {accuracy * 100:.2f}%")

In [None]:
# NOTE(review): `unlabeled_df` is not created in any cell visible here;
# confirm where it is loaded so this cell survives Restart & Run All.

# Apply the same preprocessing as for the training data: stringify, tokenize,
# then average the word vectors of each summary.
unlabeled_df['tokenized_summary'] = unlabeled_df['summary'].map(str).map(word_tokenize)
unlabeled_df['summary_vec'] = unlabeled_df['tokenized_summary'].apply(summary_to_vec)

# Stack the per-row vectors into a 2-D feature matrix.
X_unlabeled = np.array(list(unlabeled_df['summary_vec']))

# Predict clusters: pick the class with the highest softmax score per row.
cluster_probs = model.predict(X_unlabeled)
predicted_clusters = cluster_probs.argmax(axis=-1)
unlabeled_df['predicted_cluster'] = predicted_clusters