<img src="./assets/img/univasf-logo.png" height=100 width=100/>

<center>
<h3>
    UNIVERSIDADE FEDERAL DO VALE DO SÃO FRANCISCO
    <br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;COLEGIADO DE ENGENHARIA DE COMPUTAÇÃO
</h3>

<h3>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Orientador</h3>
<span>Prof. Dr. Rosalvo Ferreira de Oliveira Neto</span>

<h3>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Discentes</h3>
<span>Anísio Pereira Batista Filho
<br>Edjair Aguiar Gomes Filho
<br>Elayne Rute Lessa Lemos</span>
</center>
<br><br>

## Predição de pedidos com Redes Neurais e Random Forest

### Importação da base de dados

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the transaction-level training data (pipe-separated text file)
# and preview three randomly chosen rows.
transact_train_database = pd.read_csv(
    'data/transact_train.txt',
    sep='|',
)
transact_train_database.sample(n=3)

In [None]:
# Load the transaction-level file to be classified (pipe-separated text file)
# and preview three randomly chosen rows.
transact_test_database = pd.read_csv(
    'data/transact_class.txt',
    sep='|',
)
transact_test_database.sample(n=3)

<br>

### Alteração da granularidade da base de dados

In [None]:
# Collapse the data to one row per session: for every sessionNo keep only
# the last row that appears in the file.
session_train_database = transact_train_database.drop_duplicates(
    subset='sessionNo',
    keep='last',
)

# Split features and target: the target is the last column of the frame,
# encoded as 1 for 'y' and 0 for 'n'.
session_train_X = session_train_database.iloc[:, :-1]
session_train_y = session_train_database.iloc[:, -1].replace({'y': 1, 'n': 0})

In [None]:
# One row per session for the file to be classified (last row of each sessionNo).
session_test_X = transact_test_database.drop_duplicates(
    subset='sessionNo',
    keep='last',
)
# session_test_y holds the session identifiers of the test rows (the sessionNo column).
session_test_y = session_test_X.loc[:, 'sessionNo']

In [None]:
# Preview three random rows of the session-level test frame.
session_test_X.sample(n=3)

<br>

### Tratamento de valores ausentes

In [None]:
def replace_missing_value(df, value, features):
    """Turn a placeholder value into NaN in the selected columns.

    Args:
        df: DataFrame to clean; its columns are overwritten in place.
        value: Placeholder marking a missing entry (e.g. '?').
        features: Names of the columns to process.

    Returns:
        The same ``df`` object, with ``value`` replaced by NaN in ``features``.
    """
    for name in features:
        df[name] = df[name].replace(value, np.nan)

    return df

In [None]:
# convert_float() is based on the solution proposed in the Case Study of the book
# "Ciência dos Dados pelo Processo de KDD" by Prof. Dr. Rosalvo Neto.
# book: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementation: https://github.com/rosalvoneto/Livro

def convert_float(df, numeric_features):
    """Cast the listed columns to float.

    Args:
        df: DataFrame to convert; its columns are overwritten in place.
        numeric_features: Names of the columns to cast.

    Returns:
        The same ``df`` object, with every listed column stored as float.
    """
    for name in numeric_features:
        as_float = df[name].astype(float)
        df[name] = as_float

    return df

In [None]:
# replace_missing_by_mean() is based on the solution proposed in the Case Study of the book
# "Ciência dos Dados pelo Processo de KDD" by Prof. Dr. Rosalvo Neto.
# book: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementation: https://github.com/rosalvoneto/Livro

def replace_missing_by_mean(df, numeric_features):
    """Fill the NaNs of each listed column with that column's mean.

    The mean is computed on ``df`` itself, ignoring NaN entries. A column
    that contains only NaN has a NaN mean and is therefore left unchanged.

    Args:
        df: DataFrame to impute; its columns are overwritten in place, so
            callers that ignore the return value still see the change.
        numeric_features: Names of the numeric columns to impute.

    Returns:
        The same ``df`` object, with the listed columns imputed.
    """
    for column in numeric_features:
        average = df[column].mean()
        # Assign the filled column back instead of calling
        # df[column].fillna(..., inplace=True): that chained in-place call acts
        # on a temporary object under pandas Copy-on-Write and would leave df
        # untouched.
        df[column] = df[column].fillna(average)

    return df

In [None]:
def replace_by_reference(df, reference_feature, reference_feature_value, features, replace_to):
    """Overwrite columns on the rows where a reference column has a given value.

    Args:
        df: DataFrame to modify in place.
        reference_feature: Name of the column used to select rows.
        reference_feature_value: Rows whose ``reference_feature`` equals this
            value are the ones modified.
        features: Names of the columns to overwrite on the selected rows.
        replace_to: Value written into those columns.

    Returns:
        The same ``df`` object after the replacement.
    """
    selected_rows = df[reference_feature] == reference_feature_value
    for name in features:
        df.loc[selected_rows, name] = replace_to

    return df

In [None]:
def replace_missing_by_fixed_value(df, value, features):
    """Fill the NaNs of the listed columns with a constant.

    Args:
        df: DataFrame to impute; its columns are overwritten in place, so
            callers that ignore the return value still see the change.
        value: Constant written wherever a listed column is missing.
        features: Names of the columns to impute.

    Returns:
        The same ``df`` object, with the listed columns imputed.
    """
    for column in features:
        # Assign the filled column back instead of calling
        # df[column].fillna(..., inplace=True): that chained in-place call acts
        # on a temporary object under pandas Copy-on-Write and would leave df
        # untouched.
        df[column] = df[column].fillna(value)

    return df

In [None]:
# Columns that are cast to float and mean-imputed by the cleaning steps.
numeric_features = [
    'cMinPrice',
    'cMaxPrice',
    'cSumPrice',
    'bMinPrice',
    'bMaxPrice',
    'bSumPrice',
    'bStep',
    'maxVal',
    'customerScore',
    'accountLifetime',
    'payments',
    'age',
    'address',
    'lastOrder',
]

# Text columns whose missing entries are filled with a fixed label.
string_features = [
    'availability',
    'onlineStatus',
]

In [None]:
# Turn the '?' placeholder into NaN, first in the numeric columns and then in
# the text columns, for both session-level frames. Each call works on a copy,
# so the frames the session data was derived from are not modified.
for feature_group in (numeric_features, string_features):
    session_train_X = replace_missing_value(session_train_X.copy(), '?', feature_group)
    session_test_X = replace_missing_value(session_test_X.copy(), '?', feature_group)

In [None]:
# Cast every column listed in numeric_features to float in both session-level frames.
session_train_X, session_test_X = (
    convert_float(frame, numeric_features)
    for frame in (session_train_X, session_test_X)
)

In [None]:
# Fill the NaNs of the numeric columns with each frame's own column mean.
# The results are assigned back, like the other cleaning cells do, so this cell
# does not depend on in-place mutation and does not echo a whole DataFrame as
# its output.
session_train_X = replace_missing_by_mean(session_train_X, numeric_features)
session_test_X = replace_missing_by_mean(session_test_X, numeric_features)

In [None]:
# Fill the NaNs of the text columns with the label 'ausente'.
# The results are assigned back, like the other cleaning cells do, so this cell
# does not depend on in-place mutation and does not echo a whole DataFrame as
# its output.
session_train_X = replace_missing_by_fixed_value(session_train_X, 'ausente', string_features)
session_test_X = replace_missing_by_fixed_value(session_test_X, 'ausente', string_features)

In [None]:
# On rows whose customerNo is the '?' placeholder, set these columns to 0
# (this overrides the mean imputation applied to them earlier).
customer_features = [
    'maxVal',
    'customerScore',
    'accountLifetime',
    'payments',
    'age',
    'address',
    'lastOrder',
]
session_train_X = replace_by_reference(session_train_X, 'customerNo', '?', customer_features, 0)
session_test_X = replace_by_reference(session_test_X, 'customerNo', '?', customer_features, 0)

In [None]:
# Preview three random rows of the cleaned session-level training frame.
session_train_X.sample(n=3)

In [None]:
# Apply the same '?' -> NaN cleaning to the transaction-level frames
# (numeric columns first, then text columns), working on copies.
for feature_group in (numeric_features, string_features):
    transact_train_database = replace_missing_value(transact_train_database.copy(), '?', feature_group)
for feature_group in (numeric_features, string_features):
    transact_test_database = replace_missing_value(transact_test_database.copy(), '?', feature_group)

# Cast the numeric columns of both transaction-level frames to float.
transact_train_database, transact_test_database = (
    convert_float(frame, numeric_features)
    for frame in (transact_train_database, transact_test_database)
)

In [None]:
# Preview three random rows of the cleaned transaction-level training frame.
transact_train_database.sample(n=3)

In [None]:
# Make sessionNo the index of both session-level frames.
session_train_X = session_train_X.set_index('sessionNo')
session_test_X = session_test_X.set_index('sessionNo')

In [None]:
# Preview three random rows of the training frame, now indexed by sessionNo.
session_train_X.sample(n=3)

<br>

### Criação de variáveis

In [None]:
# Per-session aggregate features, built the same way for train and test:
#   bMeanSumPriceOverTransacitions: mean of bSumPrice over the session's transactions
#   meanInterationsDuration: the session row's duration divided by the number of
#       transactions of the session that have a duration value
#   bMeanCountOverTransacitions: mean of bCount over the session's transactions
# The session frames are indexed by sessionNo, so the groupby results align on
# that index when assigned.
aggregate_features = [
    'bMeanSumPriceOverTransacitions',
    'meanInterationsDuration',
    'bMeanCountOverTransacitions',
]

for sessions, transactions in (
    (session_train_X, transact_train_database),
    (session_test_X, transact_test_database),
):
    per_session = transactions.groupby('sessionNo')

    sessions['bMeanSumPriceOverTransacitions'] = per_session['bSumPrice'].mean()
    sessions['meanInterationsDuration'] = sessions['duration'] / per_session['duration'].count()
    sessions['bMeanCountOverTransacitions'] = per_session['bCount'].mean()

    # Missing aggregates become 0. The filled columns are assigned back instead
    # of calling sessions[col].fillna(0, inplace=True): that chained in-place
    # call acts on a temporary object under pandas Copy-on-Write and would
    # leave the NaNs in place.
    sessions[aggregate_features] = sessions[aggregate_features].fillna(0)

In [None]:
# Preview three random rows of the training frame with the new aggregate features.
session_train_X.sample(n=3)

<br>

### Normalização do conjunto de dados

In [None]:
# Create the dummy (one-hot) variables for the non-numeric columns.
# dtype=float keeps the indicator columns numeric: pandas >= 2.0 returns boolean
# dummies by default, and NumPy does not support the boolean subtraction used by
# the min-max normalisation applied afterwards.
session_train_X = pd.get_dummies(session_train_X, prefix_sep='_', dtype=float)
session_test_X = pd.get_dummies(session_test_X, prefix_sep='_', dtype=float)

# Train and test are encoded independently, so a category seen in only one of
# them would give the two frames different columns. Align test to the training
# columns: indicators missing from test are added as all-zero columns, and
# indicators unknown to train are dropped.
session_test_X = session_test_X.reindex(columns=session_train_X.columns, fill_value=0.0)

# NOTE(review): customerNo still holds '?' strings at this point, so it is
# presumably an object column that get_dummies also expands into one indicator
# per customer - confirm, and consider dropping customerNo before encoding.

In [None]:
# get_Min_Max() and normalize() were taken from the solution proposed in the Case Study
# of the book "Ciência dos Dados pelo Processo de KDD" by Prof. Dr. Rosalvo Neto.
# book: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementation: https://github.com/rosalvoneto/Livro

def get_Min_Max(X):
    """Collect the minimum and maximum of every column of ``X``.

    Args:
        X: DataFrame whose columns are scanned.

    Returns:
        A dict mapping each column name to a ``(min, max)`` tuple, in column order.
    """
    return {name: (np.min(X[name]), np.max(X[name])) for name in X.columns}

def normalize(X, MinMax):
    """Min-max scale the columns of ``X`` into the [0, 1] range.

    Each column ``v`` listed in ``MinMax`` is transformed with
    ``(X[v] - min) / (max - min)`` and then clamped to [0, 1], so values outside
    the reference range (e.g. test values beyond the training bounds) saturate
    at 0 or 1. When ``max == min`` the divisor is 1, which maps the reference
    value to 0. NaN entries stay NaN.

    Args:
        X: DataFrame to scale; it is not modified.
        MinMax: Mapping of column name to a ``(min, max)`` pair, as returned by
            ``get_Min_Max``. Every key must be a column of ``X``.

    Returns:
        A copy of ``X`` in which the columns listed in ``MinMax`` are replaced
        by their scaled float values; other columns are unchanged.

    Raises:
        KeyError: If a column listed in ``MinMax`` is missing from ``X``.
    """
    result = X.copy()
    for v in MinMax:
        min_v, max_v = MinMax[v]
        # Work in float: boolean bounds/columns (e.g. one-hot indicators) do not
        # support subtraction in NumPy and would raise a TypeError otherwise.
        min_v, max_v = float(min_v), float(max_v)
        div = max_v - min_v
        if div == 0:
            div = 1.0

        scaled = (X[v].astype(float) - min_v) / div
        result[v] = scaled.clip(lower=0, upper=1)

    return result


# The scaling bounds come from the training set only and are reused for the
# test set, so both frames are scaled with the same reference range.
norm_min_max = get_Min_Max(session_train_X)

session_train_X, session_test_X = (
    normalize(frame, norm_min_max)
    for frame in (session_train_X, session_test_X)
)

In [None]:
# Preview three random rows of the normalised training frame.
session_train_X.sample(n=3)

<br>

### Remoção de variáveis não significativas

In [None]:
# Remove the customerNo column from both sets.
# NOTE(review): this raises KeyError if customerNo was expanded into dummy
# columns by the earlier get_dummies step - confirm the column still exists here.
session_train_X = session_train_X.drop(columns=['customerNo'])
session_test_X = session_test_X.drop(columns=['customerNo'])


# Remove every column whose variance in the training set is at or below the
# threshold (with a threshold of 0 these are the constant columns). The same
# columns are removed from the test set so both keep identical features.
threshold_var = 0
l_var = [
    name
    for name in session_train_X.columns
    if session_train_X[name].var() <= threshold_var
]
session_train_X = session_train_X.drop(columns=l_var)
session_test_X = session_test_X.drop(columns=l_var)

In [None]:
# Preview three random rows of the final training frame.
session_train_X.sample(n=3)

<br>

### Exportação dos dados

In [None]:
# Write the prepared tables to CSV. index=False leaves the index out of the
# files, so only the columns (and their header) are written.
exports = (
    ('data/session_train_X.csv', session_train_X),
    ('data/session_train_y.csv', session_train_y),
    ('data/session_test_X.csv', session_test_X),
    ('data/session_test_y.csv', session_test_y),
)
for csv_path, table in exports:
    table.to_csv(csv_path, index=False)