In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [17]:
# Load the base dataset from its Feather file. Later cells read the columns
# 'Sales', 'Profit', 'Quantity' and 'Order Date', and group by 'State',
# 'Category', 'Sub-Category' and 'Product Name'.
data = pd.read_feather('database/base.feather')

In [6]:
## RFM => Recency (latest -> date - previous date) / Frequency (count -> n) / Monetary (sum/mean -> R$ / x̄)

def rfm_variables(df):
    """Summarise a slice of sales rows into frequency, monetary and timing indicators.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to summarise. The columns 'Profit', 'Sales', 'Quantity' and
        'Order Date' are read; 'Order Date' must support the ``.dt`` accessor
        after subtraction (i.e. be datetime-like).

    Returns
    -------
    tuple
        ``(f_sales, f_profit, m_sales, m_profit, m_quantity, r_days)`` where
        ``f_sales`` is the number of rows, ``f_profit`` the number of rows with
        a strictly positive profit, ``m_sales`` / ``m_profit`` the totals
        rounded to two decimals, ``m_quantity`` the total quantity, and
        ``r_days`` the mean gap in days between consecutive distinct order
        dates, rounded to two decimals (NaN when there are fewer than two
        distinct dates).
    """
    # Frequency: how many sales, and how many of them were profitable.
    f_sales = df.shape[0]
    profitable = df['Profit'] > 0
    f_profit = len(df.loc[profitable])

    # Monetary: totals over the slice.
    m_sales = round(df['Sales'].sum(), 2)
    m_profit = round(df['Profit'].sum(), 2)
    m_quantity = df['Quantity'].sum()

    # Periodicity: gap, in whole days, between each distinct order date and
    # the distinct date that precedes it (the first date has no predecessor).
    order_dates = df['Order Date'].drop_duplicates().sort_values()
    gaps = order_dates - order_dates.shift(1)
    gap_days = gaps.dt.days

    # The value reported as "recency" is the average of those gaps.
    r_days = round(gap_days.mean(), 2)

    return f_sales, f_profit, m_sales, m_profit, m_quantity, r_days

In [18]:
def fit_data(data, variable):
    """Build one row of RFM indicators per distinct value of ``data[variable]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Sales rows; must contain ``variable`` plus the columns read by
        ``rfm_variables``.
    variable : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        Columns 'referencia' (the group value), 'm_vendas', 'm_lucro',
        'm_qtde', 'r_dias', 'f_vendas' and 'f_lucro', one row per group, in
        order of first appearance. The columns are present even when there
        are no groups.
    """
    colunas = ['referencia', 'm_vendas', 'm_lucro', 'm_qtde', 'r_dias', 'f_vendas', 'f_lucro']
    linhas = []
    # Missing keys are skipped: `== NaN` never matches, so such a group would
    # only yield an all-empty placeholder row.
    for variavel in data[variable].dropna().unique():
        dados = data[data[variable] == variavel]
        f_vendas, f_lucro, m_vendas, m_lucro, m_qtde, r_dias = rfm_variables(dados)
        linhas.append({
            'referencia': variavel,
            'm_vendas': m_vendas,
            'm_lucro': m_lucro,
            'm_qtde': m_qtde,
            'r_dias': r_dias,
            'f_vendas': f_vendas,
            'f_lucro': f_lucro,
        })
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(linhas, columns=colunas)

In [None]:
# RFM indicators per state. Missing values (e.g. r_dias when a state has a
# single distinct order date) are replaced by 0 so the table can feed the KNN.
original = fit_data(data, 'State').fillna(0)
original

In [29]:
# Numeric indicator columns produced by fit_data that make up the feature
# space for the nearest-neighbour search.
variaveis = [
    'm_vendas',
    'm_lucro',
    'm_qtde',
    'r_dias',
    'f_vendas',
    'f_lucro',
]

In [None]:
# Feature matrix: only the numeric indicator columns, without 'referencia'.
base = original.loc[:, variaveis]
base

In [None]:
# Nearest neighbours in the RFM feature space. Up to 4 neighbours are requested
# because each reference is normally returned as its own closest neighbour,
# which leaves up to 3 real matches once it is filtered out.
# NOTE(review): the features are not scaled, so large-magnitude columns
# dominate the distance - consider standardising `base` first.
vizinhos = NearestNeighbors(n_neighbors=min(4, len(base))).fit(base)

# One batched query returns, for every row of `base`, the positions of its
# neighbours; positions map straight back to the reference names.
indices_vizinhos = vizinhos.kneighbors(base, return_distance=False)
nomes = original['referencia'].to_numpy()

# Records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0.
pares = []
for nome, indices in zip(nomes, indices_vizinhos):
    print('Referencia: {0}'.format(nome))
    print('Referencias Similares:')
    for nome_vizinho in nomes[indices]:
        if nome_vizinho != nome:
            print('--> {0}'.format(nome_vizinho))
            pares.append({'referencia': nome, 'vizinho': nome_vizinho})
similares = pd.DataFrame(pares, columns=['referencia', 'vizinho'])
similares

In [27]:
# Persist the (referencia, vizinho) pairs to Feather; when the cells run in
# order these are the pairs computed from the 'State' aggregation.
similares.to_feather('database/knn_estado.feather')

In [None]:
## Lesson 15 - KNN - nearest neighbours, grouped by 'Category'
original = fit_data(data, 'Category')
original = original.fillna(0)
base = original[variaveis]
vizinhos = NearestNeighbors(n_neighbors=min(4, len(base))).fit(base)

# One batched query returns, for every row of `base`, the positions of its
# neighbours; positions map straight back to the reference names.
indices_vizinhos = vizinhos.kneighbors(base, return_distance=False)
nomes = original['referencia'].to_numpy()

# Records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0.
pares = []
for nome, indices in zip(nomes, indices_vizinhos):
    print('Referencia: {0}'.format(nome))
    print('Referencias Similares:')
    for nome_vizinho in nomes[indices]:
        if nome_vizinho != nome:
            print('--> {0}'.format(nome_vizinho))
            pares.append({'referencia': nome, 'vizinho': nome_vizinho})
similares = pd.DataFrame(pares, columns=['referencia', 'vizinho'])
similares

similares.to_feather('database/knn_categoria.feather')

In [None]:
## Lesson 15 - KNN - nearest neighbours, grouped by 'Sub-Category'
original = fit_data(data, 'Sub-Category')
original = original.fillna(0)
base = original[variaveis]
vizinhos = NearestNeighbors(n_neighbors=min(4, len(base))).fit(base)

# One batched query returns, for every row of `base`, the positions of its
# neighbours; positions map straight back to the reference names.
indices_vizinhos = vizinhos.kneighbors(base, return_distance=False)
nomes = original['referencia'].to_numpy()

# Records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0.
pares = []
for nome, indices in zip(nomes, indices_vizinhos):
    print('Referencia: {0}'.format(nome))
    print('Referencias Similares:')
    for nome_vizinho in nomes[indices]:
        if nome_vizinho != nome:
            print('--> {0}'.format(nome_vizinho))
            pares.append({'referencia': nome, 'vizinho': nome_vizinho})
similares = pd.DataFrame(pares, columns=['referencia', 'vizinho'])
similares

similares.to_feather('database/knn_subcategoria.feather')

In [None]:
## Lesson 15 - KNN - nearest neighbours, grouped by 'Product Name'
original = fit_data(data, 'Product Name')
original = original.fillna(0)
base = original[variaveis]
vizinhos = NearestNeighbors(n_neighbors=min(4, len(base))).fit(base)

# One batched query returns, for every row of `base`, the positions of its
# neighbours; positions map straight back to the reference names.
indices_vizinhos = vizinhos.kneighbors(base, return_distance=False)
nomes = original['referencia'].to_numpy()

# Records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied the whole frame for every pair.
pares = []
for nome, indices in zip(nomes, indices_vizinhos):
    print('Referencia: {0}'.format(nome))
    print('Referencias Similares:')
    for nome_vizinho in nomes[indices]:
        if nome_vizinho != nome:
            print('--> {0}'.format(nome_vizinho))
            pares.append({'referencia': nome, 'vizinho': nome_vizinho})
similares = pd.DataFrame(pares, columns=['referencia', 'vizinho'])
similares

similares.to_feather('database/knn_produto.feather')

In [None]:
# Nearest neighbours grouped by 'State'.
# NOTE(review): this repeats the 'State' run done earlier in the notebook and
# rewrites the same output file - confirm whether both are needed.
original = fit_data(data, 'State')
original = original.fillna(0)
base = original[variaveis]
vizinhos = NearestNeighbors(n_neighbors=min(4, len(base))).fit(base)

# One batched query returns, for every row of `base`, the positions of its
# neighbours; positions map straight back to the reference names.
indices_vizinhos = vizinhos.kneighbors(base, return_distance=False)
nomes = original['referencia'].to_numpy()

# Records are collected in a list and the frame is built once:
# DataFrame.append was removed in pandas 2.0.
pares = []
for nome, indices in zip(nomes, indices_vizinhos):
    print('Referencia: {0}'.format(nome))
    print('Referencias Similares:')
    for nome_vizinho in nomes[indices]:
        if nome_vizinho != nome:
            print('--> {0}'.format(nome_vizinho))
            pares.append({'referencia': nome, 'vizinho': nome_vizinho})
similares = pd.DataFrame(pares, columns=['referencia', 'vizinho'])
similares

similares.to_feather('database/knn_estado.feather')