<br>
<center>
<img src="https://raw.githubusercontent.com/elaynelemos/prediction-of-orders-dmc/main/assets/img/univasf-logo.png" width=200>
<h3>
    UNIVERSIDADE FEDERAL DO VALE DO SÃO FRANCISCO
    <br>COLEGIADO DE ENGENHARIA DE COMPUTAÇÃO
</h3>

<h3>Orientador</h3>
<span>Prof. Dr. Rosalvo Ferreira de Oliveira Neto</span>

<h3>Discentes</h3>
<span>Anísio Pereira Batista Filho
<br>Edjair Aguiar Gomes Filho
<br>Elayne Rute Lessa Lemos</span>
</center>
<br><br>

## Predição de pedidos com Redes Neurais e Random Forest

Projeto em: [github.com/elaynelemos/prediction-of-orders-dmc](https://github.com/elaynelemos/prediction-of-orders-dmc)


### Importação da base de dados

In [1]:
import numpy as np
import pandas as pd

In [2]:
repo_url = 'https://raw.githubusercontent.com/elaynelemos/prediction-of-orders-dmc/main'

transact_train_database = pd.read_csv(f'{repo_url}/data/transact_train.txt', sep = '|')
transact_train_database.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
417786,48759,17,7,363.209,3,39.99,599.0,1237.99,1,599.0,599.0,599.0,2,y,completely orderable,?,?,?,?,?,?,?,?,n
248726,29300,17,6,1170.387,4,89.99,2599.99,2989.96,1,149.99,149.99,149.99,3,y,completely orderable,?,?,?,?,?,?,?,?,n
200929,23985,13,6,2949.756,59,12.99,69.99,1753.96,10,14.99,69.99,349.84,?,y,completely orderable,11551,800,571,7,4,39,2,172,y


In [3]:
transact_test_database = pd.read_csv(f'{repo_url}/data/transact_class.txt', sep = '|')
transact_test_database.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
5591,572,19,7,9852.053,124,4.0,100.83,2811.96,13,4.99,39.99,235.9,?,?,?,25351,2500,583,365,51,65,2,16
8482,858,19,7,126.796,6,29.99,29.99,179.94,1,29.99,29.99,29.99,?,?,?,?,?,?,?,?,?,?,?
18012,1876,20,7,3733.495,58,13.99,44.99,1309.58,7,13.99,34.99,143.94,?,?,?,26001,1700,499,86,14,48,2,44


<br>

### Alteração da granularidade da base de dados

In [4]:
# remove todas as linhas com valores de sessionNo iguais exceto a última
session_train_database = transact_train_database.drop_duplicates(subset=['sessionNo'], keep='last')

# separa variável alvo no conjunto de treinamento
session_train_X = session_train_database.iloc[:,:-1]
session_train_y = session_train_database.iloc[:,-1]
session_train_y = session_train_y.replace({'y': 1 , 'n': 0 })


# remove todas as linhas com valores de sessionNo iguais exceto a última na base de teste
session_test_X = transact_test_database.drop_duplicates(subset=['sessionNo'], keep='last')

In [5]:
session_test_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
2897,277,19,7,1563.853,8,5.95,14.99,41.88,2,5.95,14.99,20.94,5,y,completely orderable,?,?,?,?,?,?,?,?
37870,4054,23,7,2379.936,44,499.99,1599.99,21599.78,3,899.99,999.99,2899.97,3,n,completely not determinable,27011,500,520,58,0,24,1,35
35621,3792,22,7,5943.936,62,0.0,19.99,506.83,4,0.0,19.99,38.99,5,y,completely orderable,?,?,?,?,?,?,?,?


<br>

### Tratamento de valores ausentes

In [6]:
def replace_missing_value(df, value, features):
    replaced = df[features].replace(value, np.nan)
    for column in features:
        df[column] = replaced[column]

    return df

In [7]:
# convert_float() é baseadona solução proposta no Estudo de Caso
# do livro Ciência dos Dados pelo Processo de KDD do Prof. Dr. Rosalvo Neto
# livro em: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementação em: https://github.com/rosalvoneto/Livro

def convert_float(df, numeric_features):
    for column in numeric_features:
        df[column] = df[column].astype(float)

    return df

In [8]:
def replace_missing_by_fixed_value(df, value, features):
    for column in features:
        df[column].fillna(value, inplace=True)

    return df

In [9]:
# replace_missing_by_mean() é baseadona solução proposta no Estudo de Caso
# do livro Ciência dos Dados pelo Processo de KDD do Prof. Dr. Rosalvo Neto
# livro em: https://www.researchgate.net/publication/352749819_Ciencia_dos_Dados_pelo_Processo_de_KDD
# implementação em: https://github.com/rosalvoneto/Livro

def replace_missing_by_mean(df, numeric_features):
    for column in numeric_features:
        average = df[column].mean(axis=0)
        df[column].fillna(average, inplace=True)
    
    return df

In [10]:
def replace_missing_by_median(df, numeric_features):
    for column in numeric_features:
        med = df[column].median(axis=0)
        df[column].fillna(med, inplace=True)
    
    return df

In [11]:
def replace_missing_by_min(df, numeric_features):
    for column in numeric_features:
        minimum = df[column].min(axis=0)
        df[column].fillna(minimum, inplace=True)
    
    return df

In [12]:
numeric_features = ['cMinPrice', 'cMaxPrice', 'cSumPrice', 'bMinPrice', 'bMaxPrice',
    'bSumPrice', 'bStep','maxVal', 'customerScore', 'accountLifetime', 'payments', 
    'age', 'address', 'lastOrder']

string_features = ['availability', 'onlineStatus']

customerno_dependent_feats = ['maxVal', 'customerScore', 'accountLifetime', 'payments',
    'age', 'address', 'lastOrder']

In [13]:
# substitui '?' por NaN na base do projeto
session_train_X = replace_missing_value(session_train_X.copy(), '?', numeric_features)
session_test_X = replace_missing_value(session_test_X.copy(), '?', numeric_features)
session_train_X = replace_missing_value(session_train_X.copy(), '?', string_features)
session_test_X = replace_missing_value(session_test_X.copy(), '?', string_features)

# converte NaN para float
session_train_X = convert_float(session_train_X, numeric_features)
session_test_X = convert_float(session_test_X, numeric_features)

In [14]:
# substitui '?' por NaN na base original
transact_train_database = replace_missing_value(transact_train_database.copy(), '?', numeric_features)
transact_train_database = replace_missing_value(transact_train_database.copy(), '?', string_features)
transact_test_database = replace_missing_value(transact_test_database.copy(), '?', numeric_features)
transact_test_database = replace_missing_value(transact_test_database.copy(), '?', string_features)

# converte NAN para float
transact_train_database = convert_float(transact_train_database, numeric_features)
transact_test_database = convert_float(transact_test_database, numeric_features)

In [15]:
# estabelece valor fixo para valores ausentes em atributos não numéricos
session_train_X = replace_missing_by_fixed_value(session_train_X, 'ausente', string_features)
session_test_X = replace_missing_by_fixed_value(session_test_X, 'ausente', string_features)

In [16]:
session_train_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
135198,15980,21,5,336.3,5,2.5,2.5,5.0,1,2.5,2.5,2.5,2.0,y,completely orderable,?,,,,,,,
102554,12197,18,5,913.852,44,9.99,59.95,995.07,2,29.95,29.95,59.9,,ausente,ausente,?,,,,,,,
205591,24512,14,6,590.424,17,699.99,929.99,10329.88,1,799.99,799.99,799.99,2.0,y,completely orderable,?,,,,,,,


In [17]:
session_preprocessed = {}

#### Estratégia: substituição pela média

In [18]:
session_train_X_mean_replacing = replace_missing_by_mean(session_train_X.copy(), numeric_features)
session_test_X_mean_replacing = replace_missing_by_mean(session_test_X.copy(), numeric_features)

session_train_X_mean_replacing = replace_missing_value(session_train_X_mean_replacing.copy(),
    '?', customerno_dependent_feats)
session_test_X_mean_replacing = replace_missing_value(session_test_X_mean_replacing.copy(),
    '?', customerno_dependent_feats)

session_train_X_mean_replacing = replace_missing_by_mean(
    session_train_X_mean_replacing,
    customerno_dependent_feats
)
session_test_X_mean_replacing = replace_missing_by_mean(
    session_test_X_mean_replacing,
    customerno_dependent_feats
)

In [19]:
# armazena estágio de pré-processamento para facilitar na exportação
session_preprocessed['session_train_X_mean_replacing'] = session_train_X_mean_replacing
session_preprocessed['session_test_X_mean_replacing'] = session_test_X_mean_replacing
session_preprocessed['session_train_y_mean_replacing'] = session_train_y.copy()

In [20]:
session_train_X_mean_replacing.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
313221,36450,3,7,5.062,1,49.99,49.99,49.99,1,49.99,49.99,49.99,1.0,y,completely orderable,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675
342815,40548,11,7,113.723,6,9.99,59.99,269.94,1,59.99,59.99,59.99,2.0,y,completely orderable,19997,300.0,412.0,13.0,4.0,20.0,2.0,413.0
212613,25271,14,6,2918.425,37,3.0,17.0,321.77,7,3.0,14.99,45.96,5.0,y,completely orderable,12220,2300.0,522.0,214.0,7.0,46.0,2.0,37.0


#### Estratégia: remoção de registros pouco relevantes para o modelo após substituição pela média

In [21]:
# rows = session_train_database['customerNo'] == '?' 
#     and session_train_database['onlineStatus'] == 'ausente'
#     and session_train_database['availability'] == 'ausente'
temp = session_train_database[session_train_database['customerNo'] == '?']
temp = temp[temp['onlineStatus'] == '?']
temp = temp[temp['availability'] == '?']

# demonstação de que para quando customerNo, onlineStatus e availability
# são nulos, não há variabilidade na classe proporção aproximada de 1% para 'y'
temp.groupby('order').sessionNo.nunique()

order
n    6977
y      73
Name: sessionNo, dtype: int64

In [22]:
rows_to_drop = temp.index.values.tolist()

session_train_X_mean_drop_replacing = session_train_X_mean_replacing.drop(rows_to_drop, axis=0)
session_train_y_mean_drop_replacing = session_train_y.drop(rows_to_drop, axis=0)

In [23]:
session_preprocessed['session_train_X_mean_drop_replacing'] = session_train_X_mean_drop_replacing
session_preprocessed['session_test_X_mean_drop_replacing'] = session_test_X_mean_replacing.copy()
session_preprocessed['session_train_y_mean_drop_replacing'] = session_train_y_mean_drop_replacing

In [24]:
session_train_X.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
98837,11785,18,5,647.407,7,14.99,379.99,1199.93,2,14.99,379.99,394.98,5.0,y,completely orderable,?,,,,,,,
233678,27615,16,6,3950.775,116,5.99,2700.0,12701.27,8,8.99,19.99,136.92,5.0,y,completely orderable,13496,600.0,512.0,223.0,3.0,65.0,1.0,100.0
295734,34442,21,6,7751.796,66,3.99,99.95,2226.52,4,3.99,49.99,111.88,2.0,n,mixed,17175,800.0,545.0,210.0,12.0,29.0,2.0,8.0


#### Estratégia: substituição pela mediana

In [25]:
session_train_X_median_replacing = replace_missing_by_median(session_train_X.copy(), numeric_features)
session_test_X_median_replacing = replace_missing_by_median(session_test_X.copy(), numeric_features)

In [26]:
session_train_X_median_replacing = replace_missing_value(session_train_X_median_replacing.copy(),
    '?', customerno_dependent_feats)
session_test_X_median_replacing = replace_missing_value(session_test_X_median_replacing.copy(),
    '?', customerno_dependent_feats)

session_train_X_median_replacing = replace_missing_by_median(
    session_train_X_median_replacing,
    customerno_dependent_feats
)

session_test_X_median_replacing = replace_missing_by_median(
    session_test_X_median_replacing,
    customerno_dependent_feats
)

In [27]:
session_preprocessed['session_train_X_median_replacing'] = session_train_X_median_replacing
session_preprocessed['session_test_X_median_replacing'] = session_test_X_median_replacing
session_preprocessed['session_train_y_median_replacing'] = session_train_y.copy()

In [28]:
session_train_X_mean_replacing.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
128779,15243,20,5,1711.915,7,9.99,49.99,142.93,7,9.99,49.99,142.93,5.0,y,completely orderable,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675
142851,16852,22,5,896.504,9,44.99,99.99,734.91,1,79.99,79.99,79.99,5.0,y,completely orderable,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675
113622,13517,19,5,563.494,13,14.99,19.99,192.22,2,19.32,19.99,39.31,2.0,y,completely orderable,?,2266.582859,483.731568,128.169304,12.893535,44.921048,1.706919,79.007675


#### Estratégia: substituição pelo mínimo

In [29]:
session_train_X_min_replacing = replace_missing_by_min(session_train_X.copy(), numeric_features)
session_test_X_min_replacing = replace_missing_by_min(session_test_X.copy(), numeric_features)

In [30]:
session_train_X_min_replacing = replace_missing_value(session_train_X_min_replacing.copy(),
    '?', customerno_dependent_feats)
session_test_X_min_replacing = replace_missing_value(session_test_X_min_replacing.copy(),
    '?', customerno_dependent_feats)

session_train_X_min_replacing = replace_missing_by_min(
    session_train_X_min_replacing,
    customerno_dependent_feats
)

session_test_X_min_replacing = replace_missing_by_min(
    session_test_X_min_replacing,
    customerno_dependent_feats
)

In [31]:
session_preprocessed['session_train_X_min_replacing'] = session_train_X_min_replacing
session_preprocessed['session_test_X_min_replacing'] = session_test_X_min_replacing
session_preprocessed['session_train_y_min_replacing'] = session_train_y.copy()

In [32]:
session_train_X_min_replacing.sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder
19038,2338,9,5,29.046,3,34.99,34.99,104.97,1,34.99,34.99,34.99,1.0,ausente,ausente,?,0.0,0.0,0.0,0.0,17.0,1.0,3.0
330798,38639,10,7,24.189,3,219.0,269.99,707.99,1,219.0,219.0,219.0,1.0,ausente,ausente,?,0.0,0.0,0.0,0.0,17.0,1.0,3.0
116629,13852,19,5,815.393,15,9.99,19.99,229.85,5,9.99,19.99,74.95,1.0,y,completely orderable,6862,600.0,581.0,262.0,1.0,53.0,1.0,234.0


<br>

### Criação de variáveis

In [33]:
# bMeanSumPriceOverTransacitions: valor médio do carrinho durante a sessão 
session_train_X['bMeanSumPriceOverTransacitions'] = transact_train_database.groupby('sessionNo').bSumPrice.mean()
session_train_X['bMeanSumPriceOverTransacitions'].fillna(0, inplace=True)
session_test_X['bMeanSumPriceOverTransacitions'] = transact_test_database.groupby('sessionNo').bSumPrice.mean()
session_test_X['bMeanSumPriceOverTransacitions'].fillna(0, inplace=True)

# meanInterationsDuration: valor médio de tempo entre uma transação e outra na sessão
session_train_X['meanInterationsDuration'] = session_train_X['duration']/transact_train_database.groupby('sessionNo').duration.count()
session_train_X['meanInterationsDuration'].fillna(0, inplace=True)
session_test_X['meanInterationsDuration'] = session_test_X['duration']/transact_test_database.groupby('sessionNo').duration.count()
session_test_X['meanInterationsDuration'].fillna(0, inplace=True)

# bMeanCountOverTransacitions: quantidade média de itens no carrinho durante a sessão 
session_train_X['bMeanCountOverTransacitions'] = transact_train_database.groupby('sessionNo').bCount.mean()
session_train_X['bMeanCountOverTransacitions'].fillna(0, inplace=True)
session_test_X['bMeanCountOverTransacitions'] = transact_test_database.groupby('sessionNo').bCount.mean()
session_test_X['bMeanCountOverTransacitions'].fillna(0, inplace=True)

In [34]:
from re import compile as mount


regex = mount('.*_X.*')
session_X = list(filter(regex.match, list(session_preprocessed.keys())))

average_prices = transact_train_database.groupby('sessionNo').bSumPrice.mean()
average_iter_durations = session_train_X['duration']/transact_train_database.groupby('sessionNo').duration.count()
average_counts = transact_train_database.groupby('sessionNo').bCount.mean()

for key in session_X:
    # bMeanSumPriceOverTransacitions: valor médio do carrinho durante a sessão 
    session_preprocessed[key]['bMeanSumPriceOverTransactions'] = average_prices.copy()
    # meanInterationsDuration: valor médio de tempo entre uma transação e outra na sessão
    session_preprocessed[key]['meanInterationsDuration'] = average_iter_durations.copy()
    # bMeanCountOverTransacitions: quantidade média de itens no carrinho durante a sessão 
    session_preprocessed[key]['bMeanCountOverTransactions'] = average_counts.copy() 

In [35]:
columns = ['bMeanSumPriceOverTransactions', 'meanInterationsDuration', 'bMeanCountOverTransactions']

session_preprocessed['session_train_X_mean_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_train_X_mean_replacing'],
    columns
)
session_preprocessed['session_test_X_mean_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_test_X_mean_replacing'],
    columns
)

session_preprocessed['session_train_X_mean_drop_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_train_X_mean_drop_replacing'],
    columns
)
session_preprocessed['session_test_X_mean_drop_replacing'] = replace_missing_by_mean(
    session_preprocessed['session_test_X_mean_drop_replacing'],
    columns
)

session_preprocessed['session_train_X_median_replacing'] = replace_missing_by_median(
    session_preprocessed['session_train_X_median_replacing'],
    columns
)
session_preprocessed['session_test_X_median_replacing'] = replace_missing_by_median(
    session_preprocessed['session_test_X_median_replacing'],
    columns
)

session_preprocessed['session_train_X_min_replacing'] = replace_missing_by_min(
    session_preprocessed['session_train_X_min_replacing'],
    columns
)
session_preprocessed['session_test_X_min_replacing'] = replace_missing_by_min(
    session_preprocessed['session_test_X_min_replacing'],
    columns
)

In [36]:
session_preprocessed['session_test_X_min_replacing'].sample(3)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,customerNo,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransactions,meanInterationsDuration,bMeanCountOverTransactions
44917,5079,5,1,5.08,1,49.99,49.99,49.99,1,49.99,49.99,49.99,1.0,y,completely orderable,?,0.0,0.0,0.0,0.0,17.0,1.0,4.0,79.281,13.4909,2.7
5954,604,19,7,707.064,5,3.99,39.99,96.95,5,3.99,39.99,96.95,5.0,y,completely orderable,25366,3000.0,501.0,323.0,8.0,54.0,1.0,19.0,119.99,0.0,1.0
9191,943,19,7,146.742,1,0.0,1.0,3.0,1,0.0,0.0,0.0,1.0,y,completely orderable,25526,600.0,70.0,50.0,0.0,50.0,2.0,639.0,1399.99,0.0,1.0


<br>

### Normalização do conjunto de dados

In [37]:
# Listagem dos X de treinamento e teste

regex = mount('.*_X.*')
session_X = list(filter(regex.match, list(session_preprocessed.keys())))

#### Ajusta Indexação e remove coluna não mais significativa

In [38]:
for key in session_X:
    session_preprocessed[key].set_index('sessionNo', inplace=True)
    session_preprocessed[key] = session_preprocessed[key].drop(['customerNo'], axis=1)

In [39]:
session_preprocessed['session_train_X_median_replacing'].sample(3)

Unnamed: 0_level_0,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,onlineStatus,availability,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransactions,meanInterationsDuration,bMeanCountOverTransactions
sessionNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
41691,12,7,15719.552,4,9.49,68.99,97.46,1,9.49,9.49,9.49,2.0,y,completely orderable,1300.0,520.0,98.0,8.0,45.0,2.0,33.0,61.379554,92.078932,1.75
30174,18,6,2969.036,32,9.99,299.99,791.75,2,12.99,24.99,37.98,5.0,y,completely orderable,1300.0,520.0,98.0,8.0,45.0,2.0,33.0,61.379554,92.078932,1.75
23241,13,6,1123.138,16,99.99,799.99,2699.87,1,99.99,99.99,99.99,3.0,y,completely orderable,1300.0,520.0,98.0,8.0,45.0,2.0,33.0,61.379554,92.078932,1.75


In [40]:
print(list(set(session_preprocessed['session_train_X_median_replacing'].availability.unique()) - set(session_preprocessed['session_test_X_median_replacing'].availability.unique())))
print(list(set(session_preprocessed['session_train_X_mean_replacing'].availability.unique()) - set(session_preprocessed['session_test_X_mean_replacing'].availability.unique())))
print(list(set(session_preprocessed['session_train_X_mean_drop_replacing'].availability.unique()) - set(session_preprocessed['session_test_X_mean_drop_replacing'].availability.unique())))
print(list(set(session_preprocessed['session_train_X_mean_drop_replacing'].onlineStatus.unique()) - set(session_preprocessed['session_test_X_mean_drop_replacing'].onlineStatus.unique())))
print(list(set(session_preprocessed['session_train_X_median_replacing'].onlineStatus.unique()) - set(session_preprocessed['session_test_X_median_replacing'].onlineStatus.unique())))

['mainly not determinable']
['mainly not determinable']
['mainly not determinable']
[]
[]


#### Normalização de valores

In [41]:
# criação das variáveis dummies

for key in session_X:
    session_preprocessed[key] = pd.get_dummies(session_preprocessed[key], prefix_sep='_')

In [42]:
# Correção da diferença na quantidade de colunas para availability

# Listagem dos X de teste
regex = mount('.*_test_X.*')
session_test = list(filter(regex.match, list(session_preprocessed.keys())))

for key in session_test:
    session_preprocessed[key]['availability_mainly not determinable'] = 0

In [43]:
# uniformização de valores numéricos
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()

for key in session_X:
    X_train_norm = scaler.fit_transform(session_preprocessed[key])
    X_test_norm = scaler.fit_transform(session_preprocessed[key])
    session_preprocessed[key] = pd.DataFrame(dict(zip(session_preprocessed[key].columns.values, X_train_norm.T)))

In [44]:
session_preprocessed['session_test_X_mean_replacing'].sample(3)

Unnamed: 0,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,bMaxPrice,bSumPrice,bStep,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,bMeanSumPriceOverTransactions,meanInterationsDuration,bMeanCountOverTransactions,onlineStatus_ausente,onlineStatus_n,onlineStatus_y,availability_ausente,availability_completely not determinable,availability_completely not orderable,availability_completely orderable,availability_mainly not orderable,availability_mainly orderable,availability_mixed,availability_mainly not determinable
3376,0.956522,1.0,0.107223,0.47,0.003495,0.006042,0.014632,0.255814,0.0045,0.013039,0.023016,1.0,0.012,0.763844,0.425573,0.043165,0.493151,0.0,0.019204,0.00532,0.024364,0.276398,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2557,0.869565,1.0,0.000688,0.005,0.003495,0.001248,3.4e-05,0.023256,0.003495,0.003039,0.000781,0.529633,0.012,0.874593,0.009542,0.0,0.493151,0.0,0.041152,0.004763,0.024364,0.043478,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3122,0.913043,1.0,0.031445,0.08,0.009975,0.006034,0.00074,0.023256,0.009975,0.008674,0.002229,0.0,0.07716,0.776256,0.230371,0.034159,0.363358,0.356199,0.117028,0.00292,0.024364,0.086957,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


<br>

### Exportação dos dados

In [45]:
import os


path = 'data'
if not os.path.exists(path):
    os.mkdir(path)

session = list(session_preprocessed.keys())
for key in session:
    session_preprocessed[key].to_csv(f'{path}/{key}.csv', index=False)