## Criando módulos em Python para um Sistema de Recomendação

O objetivo aqui é organizar a base de código para obtermos a funcionalidade de recomendação.

In [None]:
%%writefile data.py
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def get_data(path):
  """Read a CSV source into a pandas DataFrame.

  Args:
    path: Passed straight to ``pd.read_csv``.

  Returns:
    The DataFrame produced by ``pd.read_csv`` with its default options.
  """
  dataframe = pd.read_csv(path)
  return dataframe


def start_pipeline(dataf):
  """Return ``dataf.copy()``.

  Used as the first ``.pipe`` step so the following steps operate on a copy
  instead of the frame the caller passed in.
  """
  snapshot = dataf.copy()
  return snapshot


def get_book_counts(dataf):
  """Count the distinct users that rated each book.

  Args:
    dataf: DataFrame with at least the columns ``ISBN`` and ``User-ID``.

  Returns:
    A DataFrame indexed by ``ISBN`` with a single column ``User-ID_count``,
    ordered from the largest count to the smallest.
  """
  users_per_book = dataf.groupby('ISBN')['User-ID'].nunique()
  counts = users_per_book.to_frame('User-ID_count')
  return counts.sort_values('User-ID_count', ascending=False)


def get_n_top_values(dataf, n):
  """Return the index labels of the first ``n`` rows of ``dataf`` as a list.

  The frame is not re-sorted here; the labels come out in the order the rows
  already have.
  """
  leading_rows = dataf.head(n)
  labels = leading_rows.index
  return labels.tolist()


def format_sample(dataf):
  """Project the ratings frame onto generic ``user`` / ``book`` / ``action`` columns.

  Args:
    dataf: DataFrame with the columns ``User-ID``, ``ISBN`` and ``Book-Rating``.

  Returns:
    A new DataFrame holding only those three columns, renamed (in this order)
    to ``user``, ``book`` and ``action``.
  """
  # target column name -> source column name; dict order fixes the column order
  column_sources = {
      "user": "User-ID",
      "book": "ISBN",
      "action": "Book-Rating",
  }
  return pd.DataFrame(
      {target: dataf[source] for target, source in column_sources.items()}
  )


def get_top_books_ids(dataf, sample_size=1000):
  """Return the ISBNs of the ``sample_size`` books rated by the most distinct users.

  Args:
    dataf: Ratings DataFrame with ``ISBN`` and ``User-ID`` columns.
    sample_size: How many of the top books to keep (default 1000).

  Returns:
    A list of ISBN labels, ordered from most to fewest distinct users.
  """
  working_copy = start_pipeline(dataf)
  counts_by_book = get_book_counts(working_copy)
  return get_n_top_values(counts_by_book, sample_size)


def get_item_based_similarity_matrix(data, sample_size=1000):
  """Build a book-to-book cosine-similarity matrix from the ratings.

  Only the ``sample_size`` books with the most distinct raters are kept. The
  ratings are pivoted into a user x book table (missing ratings become 0;
  ``pivot_table`` aggregates repeated user/book pairs with its default
  function) and the similarity is computed between book columns.

  Args:
    data: Ratings DataFrame with ``User-ID``, ``ISBN`` and ``Book-Rating``.
    sample_size: Number of most-rated books to include. Defaults to 1000,
      the value that was previously implied by ``get_top_books_ids``.

  Returns:
    A tuple ``(similarity, book_index)``: the result of ``cosine_similarity``
    over the book rows, and the index of ISBNs labelling those rows.
  """
  top_book_ids = get_top_books_ids(data, sample_size)
  sampled_ratings = format_sample(data[data["ISBN"].isin(top_book_ids)])
  user_by_book = sampled_ratings \
    .pivot_table(index="user", columns="book", values="action") \
    .fillna(0)
  # Transpose once: rows become books, so similarities are book-to-book.
  book_by_user = user_by_book.T
  return cosine_similarity(book_by_user), book_by_user.index


Writing data.py


In [None]:
%%writefile item_based_recommender.py
import random

import numpy as np

from data import get_data, get_item_based_similarity_matrix


class ItemBasedRecommender:
  """Item-to-item recommender backed by a precomputed similarity matrix.

  ``setup()`` must be called before the lookup methods: it is what populates
  ``sim_matrix`` and ``available_items_index``.
  """

  def __init__(self, data_path):
    """Remember where the ratings CSV lives; nothing is loaded here."""
    self.data_path = data_path

  def setup(self):
    """Load the ratings file and compute the item similarity matrix."""
    data = get_data(self.data_path)
    sim_matrix, available_items_index = get_item_based_similarity_matrix(data)
    self.available_items_index = available_items_index
    self.sim_matrix = sim_matrix

  def get_random_item_index(self):
    """Return the id of one randomly chosen available item."""
    # randrange excludes the upper bound. randint(0, size) includes it and
    # would occasionally index one past the end of the item index.
    position = random.randrange(self.sim_matrix.shape[0])
    return self.available_items_index[position]

  def get_n_closest(self, item_id, n=10):
    """Return the ``n`` items most similar to ``item_id``.

    Args:
      item_id: Id of the query item; must be in ``available_items_index``.
      n: Number of neighbours to return (fewer if not enough items exist).

    Returns:
      A tuple of ``(item_id, similarity)`` pairs, most similar first. The
      query item itself is never included.

    Raises:
      ValueError: If ``item_id`` is not among the available items.
    """
    if item_id not in self.available_items_index:
      raise ValueError('Item is not available on data.')
    item_index = self.available_items_index.tolist().index(item_id)
    similarities = self.sim_matrix[item_index]
    # Stable sort keeps tie order deterministic.
    ranked = np.argsort(-similarities, kind='stable')
    # Drop the query item explicitly instead of assuming it sorts first
    # (ties or an all-zero row can put another item ahead of it).
    neighbours = ranked[ranked != item_index][:max(n, 0)]
    return tuple(zip(self.available_items_index[neighbours], similarities[neighbours]))

Writing item_based_recommender.py


## Setup do Sistema de Recomendação

In [None]:
# NOTE(review): the Kaggle API key below is hard-coded in the notebook source and
# is echoed again in this cell's output. It should be revoked and supplied from a
# secret store or an interactive prompt rather than committed with the notebook.
%env KAGGLE_USERNAME=ricoms
%env KAGGLE_KEY=8bd3a4b719368399d9965b8cdf83f8d9

# Download and unzip the dataset into /content/book-recommendation-dataset,
# then list the extracted files.
!kaggle datasets download -d arashnic/book-recommendation-dataset --unzip -p /content/book-recommendation-dataset
!ls /content/book-recommendation-dataset

env: KAGGLE_USERNAME=ricoms
env: KAGGLE_KEY=8bd3a4b719368399d9965b8cdf83f8d9
Downloading book-recommendation-dataset.zip to /content/book-recommendation-dataset
 38% 9.00M/23.8M [00:00<00:00, 23.0MB/s]
100% 23.8M/23.8M [00:00<00:00, 48.4MB/s]
Books.csv  Ratings.csv	Users.csv


In [None]:
from pathlib import Path

from item_based_recommender import ItemBasedRecommender

# Directory the Kaggle dataset was unpacked into.
DATA_PATH = Path("/content/book-recommendation-dataset")

# Ratings file the recommender is built from.
ratings_df_path = DATA_PATH.joinpath('Ratings.csv')

# Create the recommender, then load the data and compute the similarity matrix.
item_recommender = ItemBasedRecommender(ratings_df_path)
item_recommender.setup()

## Usuário do Sistema de Recomendação


In [None]:
# Pick one of the items known to the recommender at random.
some_item_id = item_recommender.get_random_item_index()



# Bare last expression: the notebook displays the (item_id, similarity) pairs.
item_recommender.get_n_closest(some_item_id)

(('0671004549', 0.1809031859927611),
 ('0440200989', 0.17667998827911666),
 ('0671888587', 0.16220220780328537),
 ('0671867172', 0.1580539855861806),
 ('0671793489', 0.1468981632092825),
 ('0671014919', 0.13109387654292504),
 ('0671867091', 0.1298307241434932),
 ('0743460529', 0.1258442116639573),
 ('0671701231', 0.12575068228674863))

In [None]:
# An id that is not in the recommender's item index makes get_n_closest raise ValueError.
item_recommender.get_n_closest('BOOK_ID_Q_NAO_EXISTE')

ValueError: ignored