<br>
<center>
<img src="https://raw.githubusercontent.com/elaynelemos/prediction-of-orders-dmc/main/assets/img/univasf-logo.png" width=200>
<h3>
    UNIVERSIDADE FEDERAL DO VALE DO SÃO FRANCISCO
    <br>COLEGIADO DE ENGENHARIA DE COMPUTAÇÃO
</h3>

<h3>Orientador</h3>
<span>Prof. Dr. Rosalvo Ferreira de Oliveira Neto</span>

<h3>Discentes</h3>
<span>Anísio Pereira Batista Filho
<br>Edjair Aguiar Gomes Filho
<br>Elayne Rute Lessa Lemos</span>
</center>
<br><br>

## Predição de pedidos com Redes Neurais e Random Forest

Projeto em: [github.com/elaynelemos/prediction-of-orders-dmc](https://github.com/elaynelemos/prediction-of-orders-dmc)


### Importação da base de dados

In [1]:
import numpy as np
import pandas as pd

In [2]:
repo_url = 'https://raw.githubusercontent.com/elaynelemos/prediction-of-orders-dmc/main'

transact_train_database = pd.read_csv(f'{repo_url}/data/transact_train.txt', sep = '|')
transact_train_database.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
120347,14304,20,5,3397.319,62,17.99,59.99,1933.71,14,17.99,59.99,498.73,?,?,?,7066,600,488,85,21,38,2,36,y
47411,5676,12,5,181.996,6,3.99,19.99,101.96,4,3.99,19.99,62.97,4,y,completely orderable,2861,5000,527,223,32,49,1,18,y
320832,37427,8,7,111.749,6,9.99,9.99,59.94,2,9.99,9.99,19.98,?,?,?,?,?,?,?,?,?,?,?,n


In [3]:
transact_test_database = pd.read_csv(f'{repo_url}/data/transact_class.txt', sep = '|')
transact_test_database.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
1105,116,18,7,43.328,5,459.99,899.99,3519.95,1,899.99,899.99,899.99,?,?,?,?,?,?,?,?,?,?,?
33583,3569,22,7,255.804,6,26.99,27.99,109.96,2,26.99,27.99,54.98,4,y,completely orderable,26802,1000,476,75,9,36,1,50
21521,2235,20,7,95.624,4,8.99,8.99,35.96,1,8.99,8.99,8.99,?,?,?,26173,900,493,46,15,37,2,11


<br>

### Alteração da granularidade da base de dados

In [4]:
# remove todas as linhas com valores de sessionNo iguais exceto a última
session_train_database = transact_train_database.drop_duplicates(subset=['sessionNo'], keep='last')

# separa variável alvo no conjunto de treinamento
session_train_X = session_train_database.iloc[:,:-1]
session_train_y = session_train_database.iloc[:,-1]
session_train_y = session_train_y.replace({'y': 1 , 'n': 0 })


# remove todas as linhas com valores de sessionNo iguais exceto a última na base de teste
session_test_X = transact_test_database.drop_duplicates(subset=['sessionNo'], keep='last')

In [5]:
session_test_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
30188,3177,21,7,3494.713,4,15.95,29.99,91.88,1,29.99,29.99,29.99,?,?,?,?,?,?,?,?,?,?,?
13226,1349,19,7,70.955,2,249.99,249.99,499.98,1,249.99,249.99,249.99,2,y,completely orderable,25735,4000,521,220,10,46,2,115
17263,1808,20,7,245.017,6,629.99,1799.99,6859.94,3,629.99,1799.99,3429.97,4,y,completely orderable,2055,15500,507,268,75,35,1,4


<br>

### Tratamento de valores ausentes

In [6]:
def replace_missing_value(df, value, features):
    replaced = df[features].replace(value, np.nan)
    for column in features:
        df[column] = replaced[column]

    return df

In [7]:
# convert_float() é baseadona solução proposta no Estudo de Caso
# do livro Ciência dos Dados pelo Processo de KDD do Prof. Dr. Rosalvo Neto
# livro em: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementação em: https://github.com/rosalvoneto/Livro

def convert_float(df, numeric_features):
    for column in numeric_features:
        df[column] = df[column].astype(float)

    return df

In [8]:
def replace_missing_by_fixed_value(df, value, features):
    for column in features:
        df[column].fillna(value, inplace=True)

    return df

In [9]:
# replace_missing_by_mean() é baseadona solução proposta no Estudo de Caso
# do livro Ciência dos Dados pelo Processo de KDD do Prof. Dr. Rosalvo Neto
# livro em: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementação em: https://github.com/rosalvoneto/Livro

def replace_missing_by_mean(df, numeric_features):
    for column in numeric_features:
        average = df[column].mean(axis=0)
        df[column].fillna(average, inplace=True)
    
    return df

In [10]:
def replace_missing_by_median(df, numeric_features):
    for column in numeric_features:
        med = df[column].median(axis=0)
        df[column].fillna(med, inplace=True)
    
    return df

In [11]:
def replace_missing_by_min(df, numeric_features):
    for column in numeric_features:
        minimum = df[column].min(axis=0)
        df[column].fillna(minimum, inplace=True)
    
    return df

In [12]:
numeric_features = ['cMinPrice', 'cMaxPrice', 'cSumPrice', 'bMinPrice', 'bMaxPrice',
    'bSumPrice', 'bStep','maxVal', 'customerScore', 'accountLifetime', 'payments', 
    'age', 'address', 'lastOrder']

string_features = ['availability', 'onlineStatus']

customerno_dependent_feats = ['maxVal', 'customerScore', 'accountLifetime', 'payments',
    'age', 'address', 'lastOrder']

In [13]:
# substitui '?' por NaN na base do projeto
session_train_X = replace_missing_value(session_train_X.copy(), '?', numeric_features)
session_test_X = replace_missing_value(session_test_X.copy(), '?', numeric_features)
session_train_X = replace_missing_value(session_train_X.copy(), '?', string_features)
session_test_X = replace_missing_value(session_test_X.copy(), '?', string_features)

# converte NaN para float
session_train_X = convert_float(session_train_X, numeric_features)
session_test_X = convert_float(session_test_X, numeric_features)

In [14]:
# substitui '?' por NaN na base original
transact_train_database = replace_missing_value(transact_train_database.copy(), '?', numeric_features)
transact_train_database = replace_missing_value(transact_train_database.copy(), '?', string_features)
transact_test_database = replace_missing_value(transact_test_database.copy(), '?', numeric_features)
transact_test_database = replace_missing_value(transact_test_database.copy(), '?', string_features)

# converte NAN para float
transact_train_database = convert_float(transact_train_database, numeric_features)
transact_test_database = convert_float(transact_test_database, numeric_features)

In [15]:
# estabelece valor fixo para valores ausentes em atributos não numéricos
session_train_X = replace_missing_by_fixed_value(session_train_X, 'ausente', string_features)
session_test_X = replace_missing_by_fixed_value(session_test_X, 'ausente', string_features)

In [16]:
session_train_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
322826,37666,9,7,220.474,7,5.99,5.99,41.93,1,5.99,5.99,5.99,1.0,y,completely orderable,18565,4000.0,478.0,360.0,12.0,73.0,2.0,39.0
118070,14018,19,5,76.525,3,19.99,19.99,59.97,1,19.99,19.99,19.99,,ausente,ausente,?,,,,,,,
167704,20130,9,6,1148.272,59,7.99,29.9,782.43,9,10.99,19.99,147.91,,ausente,ausente,?,,,,,,,


In [17]:
session_preprocessed = {}

#### Estratégia: substituição pela média

In [18]:
session_train_X_mean_replacing = replace_missing_by_mean(session_train_X.copy(), numeric_features)
session_test_X_mean_replacing = replace_missing_by_mean(session_test_X.copy(), numeric_features)

session_train_X_mean_replacing = replace_missing_value(session_train_X_mean_replacing.copy(),
    '?', customerno_dependent_feats)
session_test_X_mean_replacing = replace_missing_value(session_test_X_mean_replacing.copy(),
    '?', customerno_dependent_feats)

session_train_X_mean_replacing = replace_missing_by_mean(
    session_train_X_mean_replacing,
    customerno_dependent_feats
)
session_test_X_mean_replacing = replace_missing_by_mean(
    session_test_X_mean_replacing,
    customerno_dependent_feats
)

In [19]:
# armazena estágio de pré-processamento para facilitar na exportação
session_preprocessed['session_train_X_mean_replacing'] = session_train_X_mean_replacing
session_preprocessed['session_test_X_mean_replacing'] = session_test_X_mean_replacing
session_preprocessed['session_train_y_mean_replacing'] = session_train_y.copy()

In [20]:
session_train_X_mean_replacing.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
367904,43364,13,7,253.911,9,32.99,39.99,227.94,1,39.99,39.99,39.99,3.0,y,completely orderable,21452,4300.0,541.0,228.0,20.0,42.0,2.0,9.0
256343,30143,18,6,164.825,4,22.99,79.99,222.96,4,22.99,79.99,222.96,5.0,y,completely orderable,14870,13100.0,538.0,277.0,44.0,63.0,1.0,6.0
244107,28768,17,6,6902.995,26,9.0,199.95,1269.62,3,9.0,199.95,228.94,5.0,y,completely orderable,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675


#### Estratégia: remoção de registros pouco relevantes para o modelo após substituição pela média

In [21]:
# rows = session_train_database['customerNo'] == '?' 
#     and session_train_database['onlineStatus'] == 'ausente'
#     and session_train_database['availability'] == 'ausente'
temp = session_train_database[session_train_database['customerNo'] == '?']
temp = temp[temp['onlineStatus'] == '?']
temp = temp[temp['availability'] == '?']

# demonstação de que para quando customerNo, onlineStatus e availability
# são nulos, não há variabilidade na classe proporção aproximada de 1% para 'y'
temp.groupby('order').sessionNo.nunique()

order
n    6977
y      73
Name: sessionNo, dtype: int64

In [22]:
rows_to_drop = temp.index.values.tolist()

session_train_X_mean_drop_replacing = session_train_X_mean_replacing.drop(rows_to_drop, axis=0)
session_train_y_mean_drop_replacing = session_train_y.drop(rows_to_drop, axis=0)

In [23]:
session_preprocessed['session_train_X_mean_drop_replacing'] = session_train_X_mean_drop_replacing
session_preprocessed['session_test_X_mean_drop_replacing'] = session_test_X_mean_replacing.copy()
session_preprocessed['session_train_y_mean_drop_replacing'] = session_train_y_mean_drop_replacing

In [24]:
session_train_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
117876,13991,19,5,136.372,2,579.99,579.99,1159.98,1,579.99,579.99,579.99,5.0,y,completely orderable,6931,0.0,453.0,19.0,0.0,28.0,1.0,605.0
337676,39778,11,7,36.983,2,3.99,19.99,23.98,2,3.99,19.99,23.98,1.0,y,completely orderable,19622,3000.0,538.0,85.0,5.0,51.0,2.0,18.0
370344,43609,13,7,33.11,7,19.99,19.99,139.93,2,19.99,19.99,39.98,,ausente,ausente,21589,2600.0,511.0,223.0,10.0,49.0,1.0,94.0


#### Estratégia: substituição pela mediana

In [25]:
session_train_X_median_replacing = replace_missing_by_median(session_train_X.copy(), numeric_features)
session_test_X_median_replacing = replace_missing_by_median(session_test_X.copy(), numeric_features)

In [26]:
session_train_X_median_replacing = replace_missing_value(session_train_X_median_replacing.copy(),
    '?', customerno_dependent_feats)
session_test_X_median_replacing = replace_missing_value(session_test_X_median_replacing.copy(),
    '?', customerno_dependent_feats)

session_train_X_median_replacing = replace_missing_by_median(
    session_train_X_median_replacing,
    customerno_dependent_feats
)

session_test_X_median_replacing = replace_missing_by_median(
    session_test_X_median_replacing,
    customerno_dependent_feats
)

In [27]:
session_preprocessed['session_train_X_median_replacing'] = session_train_X_median_replacing
session_preprocessed['session_test_X_median_replacing'] = session_test_X_median_replacing
session_preprocessed['session_train_y_median_replacing'] = session_train_y.copy()

In [28]:
session_train_X_mean_replacing.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
304589,35379,22,6,830.304,46,12.99,59.95,1281.43,5,12.99,29.99,112.95,3.158804,ausente,ausente,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675
78761,9488,16,5,527.591,1,34.99,34.99,34.99,1,34.99,34.99,34.99,2.0,y,completely orderable,4682,5000.0,455.0,3.0,5.0,57.0,2.0,10.0
133017,15724,21,5,59.423,7,129.99,129.99,259.98,1,129.99,129.99,129.99,3.158804,ausente,ausente,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675


#### Estratégia: substituição pelo mínimo

In [29]:
session_train_X_min_replacing = replace_missing_by_min(session_train_X.copy(), numeric_features)
session_test_X_min_replacing = replace_missing_by_min(session_test_X.copy(), numeric_features)

In [30]:
session_train_X_min_replacing = replace_missing_value(session_train_X_min_replacing.copy(),
    '?', customerno_dependent_feats)
session_test_X_min_replacing = replace_missing_value(session_test_X_min_replacing.copy(),
    '?', customerno_dependent_feats)

session_train_X_min_replacing = replace_missing_by_min(
    session_train_X_min_replacing,
    customerno_dependent_feats
)

session_test_X_min_replacing = replace_missing_by_min(
    session_test_X_min_replacing,
    customerno_dependent_feats
)

In [31]:
session_preprocessed['session_train_X_min_replacing'] = session_train_X_min_replacing
session_preprocessed['session_test_X_min_replacing'] = session_test_X_min_replacing
session_preprocessed['session_train_y_min_replacing'] = session_train_y.copy()

In [32]:
session_train_X_min_replacing.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
313285,36470,4,7,26.098,2,599.99,599.99,599.99,2,599.99,599.99,599.99,1.0,ausente,ausente,?,0.0,0.0,0.0,0.0,17.0,1.0,3.0
154009,18269,2,6,53.906,3,4.99,4.99,9.98,1,4.99,4.99,4.99,5.0,y,completely orderable,?,0.0,0.0,0.0,0.0,17.0,1.0,3.0
23464,2862,10,5,438.96,13,4.99,27.99,244.79,4,5.99,27.99,67.96,1.0,y,completely orderable,?,0.0,0.0,0.0,0.0,17.0,1.0,3.0


<br>

### Criação de variáveis

In [33]:
# bMeanSumPriceOverTransacitions: valor médio do carrinho durante a sessão 
session_train_X['bMeanSumPriceOverTransacitions'] = transact_train_database.groupby('sessionNo').bSumPrice.mean()
session_train_X['bMeanSumPriceOverTransacitions'].fillna(0, inplace=True)
session_test_X['bMeanSumPriceOverTransacitions'] = transact_test_database.groupby('sessionNo').bSumPrice.mean()
session_test_X['bMeanSumPriceOverTransacitions'].fillna(0, inplace=True)

# meanInterationsDuration: valor médio de tempo entre uma transação e outra na sessão
session_train_X['meanInterationsDuration'] = session_train_X['duration']/transact_train_database.groupby('sessionNo').duration.count()
session_train_X['meanInterationsDuration'].fillna(0, inplace=True)
session_test_X['meanInterationsDuration'] = session_test_X['duration']/transact_test_database.groupby('sessionNo').duration.count()
session_test_X['meanInterationsDuration'].fillna(0, inplace=True)

# bMeanCountOverTransacitions: quantidade média de itens no carrinho durante a sessão 
session_train_X['bMeanCountOverTransacitions'] = transact_train_database.groupby('sessionNo').bCount.mean()
session_train_X['bMeanCountOverTransacitions'].fillna(0, inplace=True)
session_test_X['bMeanCountOverTransacitions'] = transact_test_database.groupby('sessionNo').bCount.mean()
session_test_X['bMeanCountOverTransacitions'].fillna(0, inplace=True)

In [34]:
from re import compile as mount


regex = mount('.*_X.*')
session_X = list(filter(regex.match, list(session_preprocessed.keys())))

average_prices = transact_train_database.groupby('sessionNo').bSumPrice.mean()
average_iter_durations = session_train_X['duration']/transact_train_database.groupby('sessionNo').duration.count()
average_counts = transact_train_database.groupby('sessionNo').bCount.mean()

for key in session_X:
    # bMeanSumPriceOverTransacitions: valor médio do carrinho durante a sessão 
    session_preprocessed[key]['bMeanSumPriceOverTransactions'] = average_prices.copy()
    # meanInterationsDuration: valor médio de tempo entre uma transação e outra na sessão
    session_preprocessed[key]['meanInterationsDuration'] = average_iter_durations.copy()
    # bMeanCountOverTransacitions: quantidade média de itens no carrinho durante a sessão 
    session_preprocessed[key]['bMeanCountOverTransactions'] = average_counts.copy() 

In [35]:
columns = ['bMeanSumPriceOverTransactions', 'meanInterationsDuration', 'bMeanCountOverTransactions']

session_preprocessed['session_train_X_mean_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_train_X_mean_replacing'],
    columns
)
session_preprocessed['session_test_X_mean_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_test_X_mean_replacing'],
    columns
)

session_preprocessed['session_train_X_mean_drop_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_train_X_mean_drop_replacing'],
    columns
)
session_preprocessed['session_test_X_mean_drop_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_test_X_mean_drop_replacing'],
    columns
)

session_preprocessed['session_train_X_median_replacing'] = replace_missing_by_median(
    session_preprocessed['session_train_X_median_replacing'],
    columns
)
session_preprocessed['session_test_X_median_replacing'] = replace_missing_by_median(
    session_preprocessed['session_test_X_median_replacing'],
    columns
)

session_preprocessed['session_train_X_min_replacing'] = replace_missing_by_min(
    session_preprocessed['session_train_X_min_replacing'],
    columns
)
session_preprocessed['session_test_X_min_replacing'] = replace_missing_by_min(
    session_preprocessed['session_test_X_min_replacing'],
    columns
)

In [36]:
session_preprocessed['session_test_X_min_replacing'].sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransactions,meanInterationsDuration,bMeanCountOverTransactions
35587,3786,22,7,1213.062,41,9.99,149.9,1907.06,6,9.99,49.99,125.94,1.0,y,completely orderable,?,0.0,0.0,0.0,0.0,17.0,1.0,4.0,107.197143,0.0,10.761905
28543,2981,21,7,711.2,14,19.99,39.99,379.86,2,19.99,24.99,44.98,1.0,ausente,ausente,3628,500.0,514.0,67.0,8.0,47.0,2.0,79.0,165.360313,0.0,7.84375
41395,4509,0,1,4467.46,31,19.99,699.99,4479.65,2,49.99,99.99,149.98,1.0,ausente,ausente,27188,2600.0,80.0,90.0,1.0,43.0,2.0,19.0,499.99,0.0,1.0


<br>

### Normalização do conjunto de dados

In [37]:
# Listagem dos X de treinamento e teste

regex = mount('.*_X.*')
session_X = list(filter(regex.match, list(session_preprocessed.keys())))

#### Ajusta Indexação e remove coluna não mais significativa

In [38]:
for key in session_X:
    session_preprocessed[key].set_index('sessionNo', inplace=True)
    session_preprocessed[key] = session_preprocessed[key].drop(['customerNo'], axis=1)

In [39]:
session_preprocessed['session_train_X_median_replacing'].sample(3)

Unnamed: 0_level_0,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransactions,meanInterationsDuration,bMeanCountOverTransactions
sessionNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
38559,10,7,215.608,5,39.99,39.99,159.96,2,39.99,39.99,79.98,3.0,ausente,ausente,1300.0,520.0,98.0,8.0,45.0,2.0,33.0,61.379554,92.078932,1.75
48639,17,7,1452.0,26,9.99,479.99,4842.75,1,479.99,479.99,479.99,2.0,y,completely orderable,1300.0,520.0,98.0,8.0,45.0,2.0,33.0,61.379554,92.078932,1.75
6567,13,5,685.068,21,3.99,34.99,179.15,3,9.99,15.0,39.98,1.0,y,completely orderable,1300.0,520.0,98.0,8.0,45.0,2.0,33.0,61.379554,92.078932,1.75


#### Normalização de valores

In [40]:
# criação das variáveis dummies

for key in session_X:
    session_preprocessed[key] = pd.get_dummies(session_preprocessed[key], prefix_sep='_')

In [41]:
# uniformização de valores numéricos

from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()


for key in session_X:
    X_train_norm = scaler.fit_transform(session_preprocessed[key])
    X_test_norm = scaler.fit_transform(session_preprocessed[key])
    session_preprocessed[key] = pd.DataFrame(dict(zip(session_preprocessed[key].columns.values, X_train_norm.T)))

In [42]:
session_preprocessed['session_test_X_mean_replacing'].sample(3)

Unnamed: 0,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransactions,meanInterationsDuration,bMeanCountOverTransactions,onlineStatus_ausente,onlineStatus_n,onlineStatus_y,availability_ausente,availability_completely not determinable,availability_completely not orderable,availability_completely orderable,availability_mainly not orderable,availability_mainly orderable,availability_mixed
1169,0.826087,1.0,0.090633,0.105,0.0035,0.012286,0.003424,0.116279,0.0035,0.007822,0.0067,1.0,0.07716,0.776256,0.230371,0.034159,0.363358,0.356199,0.117028,0.028584,0.024364,0.043478,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3006,0.913043,1.0,0.037902,0.06,0.009995,0.005,0.002232,0.046512,0.009995,0.010865,0.005026,1.0,0.1,0.825733,0.421756,0.158273,0.273973,0.5,0.020576,0.007622,0.024043,0.043478,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2371,0.869565,1.0,0.00506,0.015,0.008995,0.003541,0.000281,0.023256,0.008995,0.007822,0.00201,0.529633,0.07716,0.776256,0.230371,0.034159,0.363358,0.356199,0.117028,0.00381,0.024364,0.043478,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


<br>

### Remoção de variáveis não significativas

In [43]:
threshold_var=0

for key in session_X:
    l_var = [x for x in session_preprocessed[key].columns if session_preprocessed[key][x].var() <= threshold_var]
    for v in l_var:
        session_preprocessed[key] = session_preprocessed[key].drop([v], axis=1)

In [44]:
session_train_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransacitions,meanInterationsDuration,bMeanCountOverTransacitions
428542,49959,18,7,4230.592,51,1.0,20.0,472.92,3,3.0,17.99,27.99,1.0,y,ausente,25014,1000.0,563.0,123.0,26.0,48.0,2.0,36.0,0.0,0.0,0.0
411858,48084,17,7,335.08,4,599.0,699.0,2696.0,1,699.0,699.0,699.0,1.0,y,completely orderable,24022,2600.0,528.0,79.0,16.0,29.0,2.0,76.0,0.0,0.0,0.0
344425,40787,11,7,1496.703,17,14.99,44.99,309.88,5,14.99,29.99,114.95,,y,completely orderable,20111,600.0,566.0,222.0,10.0,61.0,1.0,225.0,0.0,0.0,0.0


<br>

### Exportação dos dados

In [45]:
import os


path = 'data'
if not os.path.exists(path):
    os.mkdir(path)

session = list(session_preprocessed.keys())
for key in session:
    session_preprocessed[key].to_csv(f'{path}/{key}.csv', index=False)