# Importando Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display

## Carregando zip

In [2]:
import zipfile

dataset_orders = "order_products__train.csv"
dataset_products = 'products.csv'
dataset_departments = 'departments.csv'
dataset_aisles = 'aisles.csv'

archive_orders = zipfile.ZipFile(dataset_orders+".zip","r")
archive_products = zipfile.ZipFile(dataset_products+".zip","r")
archive_departments = zipfile.ZipFile(dataset_departments+".zip","r")
archive_aisles = zipfile.ZipFile(dataset_aisles+".zip","r")

df_order = pd.read_csv(archive_orders.open('order_products__train.csv'))
df_product = pd.read_csv(archive_products.open('products.csv'))
df_departments = pd.read_csv(archive_departments.open('departments.csv'))
df_aisles = pd.read_csv(archive_aisles.open('aisles.csv'))

# Metadata-based Recommender System

* Metadata-based Recommendation System como descrito em [Medium tutorial](https://medium.com/analytics-vidhya/metadata-based-recommender-systems-in-python-c6aae213b25c). Todo o crédito aos autores do artigo!

## Importando bibliotecas necessárias

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer

* O sistema funciona basicamente transformando os meta dados em vetores, jogando todos num mesmo plano vetorial, comparando seus cossenos e recomendando uma quantidade x de vetores (produtos) mais próximos do vetor analisado. Os metadados são fundamentais para o desempenho de modelos desse tipo e é exatamente neles que podemos inserir dados como quem são os amigos de cada usuário, quantas compras fazem por mês, ticket médio do cliente, tamanho médio do carro de compras etc. Por questões de simplificação nos atemos a passa, em um bando sem dados dos clientes, as informações de departamento e categoria de cada produto no contexto de compras no geral, tendo em vista que esse modelo é meramente ilustrativo e que os produtos das lojas Americanas são bem mais complexos que os aqui exemplificados.



# Definindo função de recomendação

In [4]:
def recommend_products_based_on_metadata(product_input):
    
    product_index = mapping[product_input]
    
    similarity_score = list(enumerate(cosine_sim_matrix[product_index]))
    similarity_score = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    
    similarity_score = similarity_score[1:10]
    product_indices = [i[0] for i in similarity_score]
    
    return (df_vec['product_name'].iloc[product_indices])

In [5]:
df_order.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1


In [6]:
df_product.head(2)

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13


In [7]:
df_departments.head(2)

Unnamed: 0,department_id,department
0,1,frozen
1,2,other


In [8]:
df_aisles.head(2)

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses


# Cria tabela compras

In [9]:
df = pd.merge(pd.merge(pd.merge(df_order, 
                                          df_product, 
                                          on="product_id", how = 'inner'), 
                                   df_aisles, 
                                   on="aisle_id", how='inner'),
                      df_departments, 
                      on="department_id", how = 'inner')

In [10]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,1,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
1,816049,49302,7,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
2,1242203,49302,1,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
3,1383349,49302,11,1,Bulgarian Yogurt,120,16,yogurt,dairy eggs
4,1787378,49302,8,0,Bulgarian Yogurt,120,16,yogurt,dairy eggs


# Filtrando departamentos de interesse!

In [11]:
df_departments.department

0              frozen
1               other
2              bakery
3             produce
4             alcohol
5       international
6           beverages
7                pets
8     dry goods pasta
9                bulk
10      personal care
11       meat seafood
12             pantry
13          breakfast
14       canned goods
15         dairy eggs
16          household
17             babies
18             snacks
19               deli
20            missing
Name: department, dtype: object

In [12]:
df_filtered_departments = df_departments[df_departments.department.isin(['bakery','alcool','babies','snacks','missing','personal care','pets'])]

In [13]:
df = df[df.department.isin(df_filtered_departments.department)]

In [14]:
df.sample(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
1241309,3070525,48647,2,1,Mini Chocolate Croissant,93,3,breakfast bakery,bakery
946184,1522726,13740,2,0,Organic Summer Strawberry Bunny Fruit Snacks,50,19,fruit vegetable snacks,snacks
1247048,1000578,36412,1,1,Pure Castile Peppermint Soap,25,11,soap,personal care
881213,2074086,43234,4,1,Parmesan Cheese Crisps,107,19,chips pretzels,snacks
859463,1273712,43530,3,0,Caramel Corn Rice Cakes,78,19,crackers,snacks


# Criando dataframe com metadados

In [15]:
df_meta = df[['product_id','department','aisle','product_name']].groupby('product_id', as_index=False).agg(set)

In [16]:
df_meta.head(2)

Unnamed: 0,product_id,department,aisle,product_name
0,1,{snacks},{cookies cakes},{Chocolate Sandwich Cookies}
1,13,{personal care},{cold flu allergy},{Saline Nasal Mist}


In [17]:
df_meta['aisle'] = df_meta['aisle'].apply(lambda x: [i.replace(' ','') for i in x])
df_meta['department'] = df_meta['department'].apply(lambda x: [i.replace(' ','') for i in x])

In [18]:
df_meta.columns

Index(['product_id', 'department', 'aisle', 'product_name'], dtype='object')

# Inserindo os metadados

In [19]:
df_meta['metadata'] = df_meta.apply(lambda x : ' ' + ' '.join(x['aisle']) + ' ' + ' '.join(x['department']), axis = 1)
df_meta

Unnamed: 0,product_id,department,aisle,product_name,metadata
0,1,[snacks],[cookiescakes],{Chocolate Sandwich Cookies},cookiescakes snacks
1,13,[personalcare],[coldfluallergy],{Saline Nasal Mist},coldfluallergy personalcare
2,15,[babies],[diaperswipes],{Overnight Diapers Size 6},diaperswipes babies
3,16,[snacks],[icecreamtoppings],{Mint Chocolate Flavored Syrup},icecreamtoppings snacks
4,21,[pets],[dogfoodcare],{Small & Medium Dental Dog Treats},dogfoodcare pets
...,...,...,...,...,...
13171,49671,[snacks],[candychocolate],{Milk Chocolate Drops},candychocolate snacks
13172,49679,[snacks],[cookiescakes],{Famous Chocolate Wafers},cookiescakes snacks
13173,49686,[bakery],[bread],{Artisan Baguette},bread bakery
13174,49687,[pets],[catfoodcare],{Smartblend Healthy Metabolism Dry Cat Food},catfoodcare pets


In [20]:
df_vec = df_meta   #.sample(10000), caso estrapole a memória!
df_produtos_previstos = pd.merge(df_vec[['product_id']], df_product[['product_id','product_name']], on='product_id', how='inner')

count_vec = CountVectorizer(stop_words='english')
count_vec_matrix = count_vec.fit_transform(df_vec['metadata'])
cosine_sim_matrix = cosine_similarity(count_vec_matrix, count_vec_matrix)
mapping = pd.Series(df_produtos_previstos.index, index = df_produtos_previstos['product_name'])

# Garando recomendação:

In [21]:
recommend_products_based_on_metadata('Cat Food, Premium, Adult Cat Formula') #lista produtos recomendados, dado que o produto passado foi comprado!

13                         {24/7 Performance Cat Litter}
78     {Prime Filets Chicken & Tuna Dinner in Gravy C...
94     {Complete Health Deboned Chicken, Chicken Meal...
137    {Proactive Health Mature Adult Hairball Care C...
146              {Tasty Treasures Cat Food Variety Pack}
181                 {Chicken Liver Paté Canned Cat Food}
218               {Cat Treats Hair Ball Control Chicken}
237               {Cat Food, Premium, Adult Cat Formula}
267               {Salmon and Tuna Formula Dry Cat Food}
Name: product_name, dtype: object

## Produtos que podem ser passados para a função acima:

In [22]:
df_produtos_previstos.sample(15) #Mostra 15 produtos aleatórios para testar o sistema de recomendação!

Unnamed: 0,product_id,product_name
3823,14682,Whole Grain Bread
8000,30437,Lemongrass Pure Essential Oil
7806,29718,Variety Snaps Little Bites with Beef Chicken L...
3918,15063,Organic Cashew Nondairy Vanilla Yogurt
5772,21934,Foaming Body Wash - Cucumber Mint
10543,39850,Sesame Street Organic Original Cruchin' Cracker
5121,19564,"Invisible Solid Anti-Perspirant Deodorant, Sho..."
4195,16196,"Shower Gel, Peaceful Patchouli"
949,3700,Organic Coconut Assam Whole Leaf Tea
5285,20208,Savory Chicken in Gravy Cat Food
