**Imports**

In [1]:
import os
from pathlib import Path
# Check if the code is running on Google Colab
try:
    import google.colab
    IN_COLAB = True
    base_path = "/content/"
    if Path(f"{base_path}final_project").is_dir():
      %cd {base_path}final_project
      !git pull
      %cd {base_path}
    else:
      !git clone https://github.com/fernandaluft/final_project.git
except ImportError:
    IN_COLAB = False
    base_path = "/workspaces/"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from pickle import dump, load
import re
import sklearn

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
def preprocess_text(text):
    # remove special chars and digits
    text = re.sub(r'\W|\d', ' ', text).lower()
    text = re.sub(r' +', ' ', text)
    return text.strip()

In [7]:
!unzip ../preprocessed_data/xaa_books_reviews.zip

Archive:  ../preprocessed_data/xaa_books_reviews.zip
  inflating: content/final_project/data/books_reviews.csv  


In [8]:
df = pd.read_csv('content/final_project/data/books_reviews.csv')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               59995 non-null  object 
 1   description         47100 non-null  object 
 2   authors             52127 non-null  object 
 3   image               49220 non-null  object 
 4   previewLink         53366 non-null  object 
 5   publisher           44299 non-null  object 
 6   publishedDate       52903 non-null  object 
 7   infoLink            53366 non-null  object 
 8   categories          48894 non-null  object 
 9   ratingsCount        32878 non-null  float64
 10  Id                  60000 non-null  object 
 11  Price               9789 non-null   float64
 12  User_id             48733 non-null  object 
 13  profileName         48731 non-null  object 
 14  review/helpfulness  60000 non-null  object 
 15  review/score        60000 non-null  float64
 16  revi

In [10]:
df.columns

Index(['Title', 'description', 'authors', 'image', 'previewLink', 'publisher',
       'publishedDate', 'infoLink', 'categories', 'ratingsCount', 'Id',
       'Price', 'User_id', 'profileName', 'review/helpfulness', 'review/score',
       'review/time', 'review/summary', 'review/text'],
      dtype='object')

In [11]:
df_rec = df[['Title', 'description', 'review/text', 'categories', 'authors']]

In [12]:
#Disable SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)

In [13]:
df_rec = df_rec.map(lambda x: preprocess_text(x) if isinstance(x, str) else x)

In [14]:
df_rec.head()

Unnamed: 0,Title,description,review/text,categories,authors
0,run baby run,this is the thrilling story of nicky cruz s de...,this is an inspiring and heart breaking story ...,conversion,nicky cruz jamie buckingham
1,discover your sales strengths how the world s ...,in the ever changing world of sales there is n...,reconmended to me by extremely successful sale...,business economics,benson smith tony rutigliano
2,daily with the king a devotional for self disc...,,i have been using this book for more than fift...,,
3,animal farm th anniversary edition,a satire on totalitarianism in which farm anim...,animal farm is an allegory of a communist nati...,fiction,george orwell
4,mash,before the movie this is the novel that gave l...,very humorous exciting book it really was enjo...,fiction,richard hooker


In [15]:
df_rec['tags'] = df_rec['Title'] + ' ' + df_rec['description'] + ' ' + df_rec['review/text'] + ' ' + df_rec['categories'] + ' ' + df_rec['authors']

In [16]:
df_rec['tags'] = df_rec['tags'].fillna('')

In [19]:
df_rec.to_csv('../data/books_processed.csv')

In [21]:
!zip -r /workspaces/final_project/src/books_processed.zip /workspaces/final_project/data/books_processed.csv

  adding: workspaces/final_project/data/books_processed.csv (deflated 77%)


**Transforming the text in array**

In [20]:
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(df_rec['tags'])

In [30]:
dump(vector, open("/workspaces/final_project/models/vector_books.sav", "wb"))

**Train the model**

In [23]:
model=NearestNeighbors(n_neighbors=7, metric='cosine', n_jobs=-1)
model.fit(vector)

**Saving the model**

In [31]:
dump(model, open("/workspaces/final_project/models/knn_neighbors_books.sav", "wb"))

**Making a recommendation**

In [32]:
model = load(open("../models/knn_neighbors_books.sav", "rb"))

In [33]:
def rec(book, df_rec, model):
    recs=[]
    book = preprocess_text(book)
    book_index = df_rec[df_rec['Title'] == book].index[0]
    distances, indices = model.kneighbors(vector[book_index], n_neighbors=12)
    similar_books = [(df_rec['Title'][i], distances[0][j]) for j, i in enumerate(indices[0])]
    for m in range(len(similar_books)-1):
        recs.append((similar_books[1:][m][0]).capitalize())
    recs = [b for b in recs if preprocess_text(b) != book]
    if recs == []:
        return f"Book '{book}' not found in the dataset."
    else:
        return list(set(recs))[0:5]        

In [34]:
for book in rec("run baby run", df_rec, model):
    print(book)

Polar star
A breath of snow and ashes outlander
The poisonwood bible a novel
Mere christianity
The cross and the switchblade
