# PO-240: Pré-Processamento

### Importando Bibliotecas

In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

### Importando o Dataset

In [34]:
# Folders and run parameters.
input_folder = 'KaggleDatasets/RAW/'
output_folder = 'KaggleDatasets/PRE/'
filename = 'preprocessado.csv'
number_files = 5  # how many raw files to sample
seed = 42  # fixed so the same files are sampled on every run

# Candidate files: every regular file in input_folder whose name does not
# start with "public" or "sample". The listing is sorted because os.listdir
# returns entries in arbitrary order, which would make the sample below
# irreproducible.
files = []
for entry in sorted(os.listdir(input_folder)):
    if entry.startswith(("public", "sample")):
        continue
    file_path = os.path.join(input_folder, entry)
    if os.path.isfile(file_path):
        files.append(file_path)

if not files:
    raise FileNotFoundError(
        "No input files found in {!r}".format(input_folder)
    )

# Draw number_files distinct files at random (all of them when fewer exist).
# NOTE(review): np.choose is an indexing helper, not a sampler; random
# sampling without replacement is assumed to be the intent - confirm.
rng = np.random.default_rng(seed)
chosen = rng.choice(files, size=min(number_files, len(files)), replace=False)

datasets = [pd.read_csv(file_path) for file_path in chosen]

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating each file's own row labels.
dataset = pd.concat(datasets, sort=False, ignore_index=True)
dataset.head()

Unnamed: 0,id,date,input_0,input_1,input_2,input_3,input_4_1,input_5_1,input_6_1,input_7_1,...,output_7_6,output_8_6,output_9_6,output_10_6,output_11_6,output_12_6,output_13_6,output_14_6,output_15_6,output_16_6
0,27214,2017-06-01,68,0,160,131,0.730351,-0.484791,0.12212,-0.205354,...,2.569752,0.8093,-0.332242,2.000099,1.608055,0.754655,-0.321827,3.504003,1.396752,0.575128
1,27275,2017-06-01,68,1,160,131,-0.279627,-0.484791,0.12212,-0.706161,...,-0.802133,-0.473629,-0.771409,-0.765082,-0.949418,-0.916985,-0.321827,-0.38921,-0.481662,-0.017313
2,27336,2017-06-01,68,2,160,131,-0.606385,-0.484791,-0.725561,-0.706161,...,-0.481001,-0.152897,-0.507909,-0.70865,-0.827634,-0.663706,-0.321827,-0.551427,-0.371167,-0.609755
3,27397,2017-06-01,68,3,160,131,-0.782869,-0.484791,-0.725561,-0.706161,...,-0.802133,-0.473629,-0.771409,-0.990812,-0.990013,-1.068952,-0.321827,-0.713645,-0.592157,-0.609755
4,30056,2017-06-01,75,0,164,135,0.449025,-0.388949,0.274415,-0.026731,...,1.939692,0.14626,-0.409396,1.662065,0.846325,0.567948,-0.236317,2.978473,1.622595,1.049236


### Removendo Dados

In [35]:
# The 'id' column is removed from the frame before further processing.
dataset = dataset.drop(columns=['id'])

### Convertendo a data para o dia do mês

In [36]:
# Replace each "YYYY-MM-DD" date string with its day of the month, parsed in
# one vectorised pass instead of a Python-level call per row.
# NOTE(review): this yields the day of the month (1-31), not a week number;
# if a week number is wanted, `.dt.isocalendar().week` (pandas >= 1.1) is the
# likely replacement - confirm the intent before changing.
dataset['date'] = pd.to_datetime(dataset['date'], format="%Y-%m-%d").dt.day
dataset.head()

Unnamed: 0,date,input_0,input_1,input_2,input_3,input_4_1,input_5_1,input_6_1,input_7_1,input_8_1,...,output_7_6,output_8_6,output_9_6,output_10_6,output_11_6,output_12_6,output_13_6,output_14_6,output_15_6,output_16_6
0,1,68,0,160,131,0.730351,-0.484791,0.12212,-0.205354,1.217061,...,2.569752,0.8093,-0.332242,2.000099,1.608055,0.754655,-0.321827,3.504003,1.396752,0.575128
1,1,68,1,160,131,-0.279627,-0.484791,0.12212,-0.706161,-0.638826,...,-0.802133,-0.473629,-0.771409,-0.765082,-0.949418,-0.916985,-0.321827,-0.38921,-0.481662,-0.017313
2,1,68,2,160,131,-0.606385,-0.484791,-0.725561,-0.706161,-0.638826,...,-0.481001,-0.152897,-0.507909,-0.70865,-0.827634,-0.663706,-0.321827,-0.551427,-0.371167,-0.609755
3,1,68,3,160,131,-0.782869,-0.484791,-0.725561,-0.706161,-0.638826,...,-0.802133,-0.473629,-0.771409,-0.990812,-0.990013,-1.068952,-0.321827,-0.713645,-0.592157,-0.609755
4,1,75,0,164,135,0.449025,-0.388949,0.274415,-0.026731,1.701169,...,1.939692,0.14626,-0.409396,1.662065,0.846325,0.567948,-0.236317,2.978473,1.622595,1.049236


### Tratando os NA

In [37]:
# Parameters (all expressed as percentages)
x = 55  # x%: a column with more than x% missing values is always dropped
y = 35  # y%: above this share of missing values the dispersion test applies
z = 20  # z%: threshold for the dispersion ratio variance * 100 / mean

# Treatment applied to each column, based on its share of missing values (NA):
#   NA > x%                              -> drop the column
#   x% >= NA > y% and ratio > z          -> drop the column
#   any remaining column with NA values  -> fill the gaps with the median
#
# NOTE(review): the second rule was originally described as "variance < z",
# while the test implemented is "variance * 100 / mean > z" - confirm which
# of the two is intended.
# NOTE(review): the ratio divides by the column mean, so a column with a
# negative mean can never exceed z and a mean close to zero inflates the
# ratio - confirm this is acceptable for the data being processed.

total = len(dataset)

# Percentage of missing values per column (same arithmetic order as a
# per-column count * 100 / total, so threshold comparisons are unchanged).
per_na = dataset.isnull().sum() * 100 / total

# numeric_only=True keeps non-numeric columns from raising; after reindexing
# they carry a NaN ratio, which never compares greater than z.
variance = dataset.var(numeric_only=True)
mean = dataset.mean(numeric_only=True)
dispersion = (variance * 100 / mean).reindex(dataset.columns)

drop_mask = (per_na > x) | ((per_na > y) & (dispersion > z))
fill_mask = ~drop_mask & (per_na > 0)

delete_list = drop_mask[drop_mask].index.tolist()
median_list = fill_mask[fill_mask].index.tolist()

dataset = dataset.drop(delete_list, axis=1)

# Medians are taken after the drop, over the surviving columns only.
median = dataset.median(numeric_only=True)
for column in median_list:
    # A non-numeric column has no median; its missing values are left as-is.
    if column in median.index:
        dataset[column] = dataset[column].fillna(median[column])

# Make sure the destination exists before writing the pre-processed file.
os.makedirs(output_folder, exist_ok=True)
dataset.to_csv(os.path.join(output_folder, filename), index=False)