# Summary processing for DL

In [54]:
import pandas as pd
import numpy as np
import os
import re
import ast
import pickle

from pathlib import Path
from sklearn.neighbors import NearestNeighbors

In [22]:
# Resolve the project root as two directory levels above the current working
# directory, then read the transformed Goodreads CSV stored beneath it.
project_root = Path().resolve().parent.parent
file_path = str(project_root.joinpath('booklore', 'raw_data', 'goodreads_transformed.csv'))
goodreads_df = pd.read_csv(file_path)

In [35]:
# Reduce each raw book identifier to its leading run of digits, kept as a
# string (e.g. '8058301-ghost-story' -> '8058301').
_LEADING_DIGITS = re.compile(r'^\d+')


def _extract_book_id(raw_id):
    """Return the leading digits of ``raw_id`` as a string.

    Returns None when ``raw_id`` is not a string or does not start with a
    digit, so one malformed identifier no longer aborts the whole column
    with an AttributeError.
    """
    if not isinstance(raw_id, str):
        return None
    found = _LEADING_DIGITS.match(raw_id)
    return found.group() if found else None


goodreads_df['bookId'] = goodreads_df['bookId'].apply(_extract_book_id)

In [36]:
# Display the column labels currently present in the frame.
goodreads_df.columns

Index(['bookId', 'embed'], dtype='object')

In [37]:
# Keep only the identifier and embedding columns. The explicit copy makes the
# result an independent frame, so assigning into its columns afterwards cannot
# raise pandas' SettingWithCopyWarning or touch the original data.
goodreads_df = goodreads_df[['bookId', 'embed']].copy()

In [25]:
def _parse_embedding(raw):
    """Convert one serialized embedding into a list of floats.

    The column appears to hold bracketed, whitespace-separated numbers
    (e.g. '[ 0.08  0.04 -0.02]'). Splitting on whitespace tolerates any
    amount of padding, line breaks and commas, which the previous chain of
    ``str.replace`` calls did not. Values that are not strings are returned
    unchanged so that re-running the cell on already-parsed data is harmless.
    """
    if not isinstance(raw, str):
        return raw
    tokens = raw.strip().strip('[]').replace(',', ' ').split()
    return [float(token) for token in tokens]


goodreads_df['embed'] = goodreads_df['embed'].apply(_parse_embedding)

In [38]:
# Display the frame after the bookId / embed processing above.
goodreads_df

Unnamed: 0,bookId,embed
0,2767052,"[0.0884953663, 0.0424903035, 0.0208997913, -0...."
1,2,"[0.0389282592, 0.0435557254, 0.0674254596, 0.0..."
2,2657,"[0.0115293786, 0.0193687603, -0.0441893749, 0...."
3,1885,"[-0.0680252761, -0.0345517881, 0.0353352986, 0..."
4,41865,"[-0.0513570197, 0.0767711252, 0.0321286283, 0...."
...,...,...
52473,11492014,"[-0.102054641, -0.0632190257, 0.0984262526, 0...."
52474,11836711,"[0.0575922169, 0.0450388901, 0.0201384202, -0...."
52475,10815662,"[-0.0623739175, -0.0394319966, 0.0193678271, 0..."
52476,11330278,"[-0.0867372677, 0.00575684151, -0.0123552233, ..."


In [27]:
# Stack the per-book embedding lists into a single 2-D array (one row per
# book, in frame order), then fit a cosine-distance nearest-neighbour index.
embeddings_matrix = np.vstack(goodreads_df['embed'].to_numpy())
knn = NearestNeighbors(metric='cosine', n_neighbors=5)
knn.fit(embeddings_matrix)

In [55]:
# Serialize the fitted neighbour model with pickle. The target path is built
# relative to the current working directory.
notebook_dir = os.getcwd()
model_path = os.path.join(
    notebook_dir, '..', 'package_folder', 'workflow', 'models', 'DL_model.pkl'
)
with open(model_path, mode='wb') as model_file:
    pickle.dump(knn, model_file)
print("Model trained and saved as DL_model.pkl")

Model trained and saved as DL_model.pkl


In [28]:
# Display the stacked embedding array that the neighbour model was fitted on.
embeddings_matrix

array([[ 0.08849537,  0.0424903 ,  0.02089979, ..., -0.03617722,
        -0.04057312,  0.01302451],
       [ 0.03892826,  0.04355573,  0.06742546, ..., -0.01198787,
        -0.01258975, -0.02005503],
       [ 0.01152938,  0.01936876, -0.04418937, ...,  0.02866794,
        -0.00965863, -0.00498144],
       ...,
       [-0.06237392, -0.039432  ,  0.01936783, ..., -0.0296031 ,
        -0.10156478,  0.01152874],
       [-0.08673727,  0.00575684, -0.01235522, ...,  0.01661649,
        -0.03094   , -0.062875  ],
       [-0.04016149,  0.09059699, -0.02471548, ..., -0.02790975,
         0.02638798,  0.02002146]])

In [47]:
# Preview the first rows of the processed frame.
goodreads_df.head()

Unnamed: 0,bookId,embed
0,2767052,"[0.0884953663, 0.0424903035, 0.0208997913, -0...."
1,2,"[0.0389282592, 0.0435557254, 0.0674254596, 0.0..."
2,2657,"[0.0115293786, 0.0193687603, -0.0441893749, 0...."
3,1885,"[-0.0680252761, -0.0345517881, 0.0353352986, 0..."
4,41865,"[-0.0513570197, 0.0767711252, 0.0321286283, 0...."


In [50]:
# Index label of the first row whose bookId equals '2'.
# NOTE(review): this is a label, not a position; the two coincide only while
# the frame keeps its default 0..n-1 index.
goodreads_df[goodreads_df['bookId']=='2'].index[0]

np.int64(1)

In [51]:
# Locate the query book by row *position* rather than by index label.
# ``embeddings_matrix`` is addressed positionally (and neighbours are read back
# with ``.iloc`` further down), so an index label would select the wrong row as
# soon as the frame is filtered or re-indexed. Raises IndexError when the id is
# not present, as before.
query_book_id = '2'
query_position = np.flatnonzero((goodreads_df['bookId'] == query_book_id).to_numpy())[0]
target_embedding = embeddings_matrix[query_position]

In [30]:
# Shape the query vector as a single-row 2-D array (1, n_features), the layout
# scikit-learn estimators expect for a batch of one sample.
target_embedding = np.reshape(target_embedding, (1, -1))

In [31]:
# Request six neighbours and discard the first hit. The query vector was taken
# from the fitted matrix, so the closest match is presumably the query book
# itself (TODO confirm this holds when several rows share an embedding).
distance, indices = knn.kneighbors(X=target_embedding, n_neighbors=6)
similar_books_indices = indices[0, 1:]

In [32]:
# Fetch the neighbour rows by position; the neighbour indices refer to rows of
# the fitted matrix, which was built in frame order.
similar_books = goodreads_df.take(similar_books_indices)

In [33]:
# Display the rows selected as most similar to the query book.
similar_books

Unnamed: 0,bookId,embed
32,3.Harry_Potter_and_the_Sorcerer_s_Stone,"[-0.024627883, 0.0366332047, 0.035225857, 0.02..."
2658,91476.Grave_Peril,"[-0.00130891672, 0.0532921962, 0.0451767445, 0..."
93,5.Harry_Potter_and_the_Prisoner_of_Azkaban,"[0.0142691461, 0.0314959586, -0.0240748879, -0..."
2784,91474.Proven_Guilty,"[-0.043068666, 0.0142278988, -0.00867341552, 0..."
7169,8058301-ghost-story,"[0.016107263, 0.0372369438, -0.016295189, 0.00..."
