# Tratamento dos arquivos de treinamento e teste

Neste notebook trataremos os arquivos `transact_train.csv`, `transact_class.csv` e `realclass_t1.csv`.

O objetivo é preparar os dados na forma de CSV para utilizá-los na fase seguinte: a escolha dos parâmetros dos algoritmos de classificação.

Ao final do processo serão gerados 3 pares de arquivos, cada par referente a uma estratégia para tratamento de missing values.

### Importar as bibliotecas necessárias

In [2]:
import pandas as pd
import numpy as np

### Ler os arquivos csv

In [42]:
# Load the three pipe-delimited source files; '?' marks a missing value.
# customerNo (the user ID) is discarded from the two transaction files.
transact_train = (
    pd.read_csv('data/transact_train.csv', sep='|', na_values='?')
    .drop(columns='customerNo')
)
transact_class = (
    pd.read_csv('data/transact_class.csv', sep='|', na_values='?')
    .drop(columns='customerNo')
)
realclass_t1 = pd.read_csv('data/realclass_t1.csv', sep='|', na_values='?')

In [43]:
# Preview only the first rows instead of rendering the whole frame.
transact_train.head(10)

Unnamed: 0,sessionNo,startHour,startWeekday,duration,cCount,cMinPrice,cMaxPrice,cSumPrice,bCount,bMinPrice,...,onlineStatus,availability,maxVal,customerScore,accountLifetime,payments,age,address,lastOrder,order
0,1,6,5,0.000,1,59.99,59.99,59.99,1,59.99,...,,,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
1,1,6,5,11.940,1,59.99,59.99,59.99,1,59.99,...,y,completely orderable,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
2,1,6,5,39.887,1,59.99,59.99,59.99,1,59.99,...,y,completely orderable,600.0,70.0,21.0,1.0,43.0,1.0,49.0,y
3,2,6,5,0.000,0,,,,0,,...,y,completely orderable,,,,,,,,y
4,2,6,5,15.633,0,,,,0,,...,y,completely orderable,,,,,,,,y
5,2,6,5,26.235,0,,,,0,,...,y,completely orderable,,,,,,,,y
6,2,6,5,71.200,0,,,,0,,...,y,completely orderable,,,,,,,,y
7,2,6,5,94.469,0,,,,0,,...,y,completely orderable,,,,,,,,y
8,3,6,5,181.477,9,29.99,29.99,89.97,1,29.99,...,,,1800.0,475.0,302.0,12.0,45.0,1.0,11.0,y
9,3,6,5,297.018,11,9.99,29.99,109.95,2,9.99,...,,,1800.0,475.0,302.0,12.0,45.0,1.0,11.0,y


### Diminuir a granularidade dos dados

In [13]:
def reduceGranularity(data):
    """Collapse a transaction log to one row per session.

    Each run of consecutive rows sharing the same ``sessionNo`` is treated as
    one session, and only the last row of every run is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``sessionNo`` column. Any index is accepted: rows are
        selected by position, never by index label.

    Returns
    -------
    pandas.DataFrame
        The last row of each run, in the original order, indexed by
        ``sessionNo``. An empty input yields an empty frame. ``data`` itself
        is not modified.
    """
    session = data['sessionNo']
    # A row closes its run when the next row carries a different sessionNo.
    # shift(-1) leaves a missing value after the final row, so that row never
    # compares equal and is always kept.
    is_last_of_run = session.ne(session.shift(-1)).to_numpy()
    # A plain boolean array selects by position, so the frame's index labels
    # are never consulted.
    return data.loc[is_last_of_run].set_index('sessionNo')


In [16]:
# Collapse both transaction sets so that each session is represented by a
# single row (the last row of the session), indexed by sessionNo.
training_data = reduceGranularity(transact_train)
testing_data = reduceGranularity(transact_class)

sessionNo
1            1.0
2            NaN
3            3.0
4            NaN
5            4.0
6            NaN
7            5.0
8            6.0
9            NaN
10           7.0
11           NaN
12           8.0
13           NaN
14           NaN
15           NaN
16           NaN
17           NaN
18           9.0
19          10.0
20          11.0
21          12.0
22           NaN
23           NaN
24          13.0
25           NaN
26           NaN
27          14.0
28          15.0
29           NaN
30          16.0
          ...   
49971    25021.0
49972        NaN
49973    25022.0
49974    25023.0
49975    25024.0
49976    25025.0
49977        NaN
49978        NaN
49979    25026.0
49980    25027.0
49981    25028.0
49982    25029.0
49983    25030.0
49984        NaN
49985    25031.0
49986    25032.0
49987    25033.0
49988        NaN
49989       47.0
49990    25034.0
49991    25035.0
49992        NaN
49993    25036.0
49994        NaN
49995    25037.0
49996        NaN
49997        NaN
4999

In [None]:
# Impute missing values column by column: the mean for numeric columns and
# the most frequent value (mode) for every other column.
for col in training_data:
    column = training_data[col]
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.mean()
    else:
        print("Object", col)
        modes = column.mode()
        if modes.empty:
            # The column holds nothing but missing values, so there is no
            # mode to fill with; leave it as it is.
            continue
        fill_value = modes.iloc[0]
    # Assign the result back to the frame. Calling fillna(inplace=True) on
    # training_data[col] acts on a temporary Series and is not guaranteed to
    # reach the frame (it does not under pandas Copy-on-Write).
    training_data[col] = column.fillna(fill_value)
training_data.head()

In [None]:
# Write the training frame to CSV. to_csv writes the index by default, so the
# sessionNo index set by reduceGranularity becomes the first column.
training_data.to_csv('training_data_mean.csv')