In [5]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# train/test split
from sklearn.model_selection import train_test_split

# preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

plt.style.use('seaborn-pastel')

%matplotlib inline

In [2]:
# Load the raw house-prices table from the project-relative data folder.
df = pd.read_csv(filepath_or_buffer='data/boston_house_prices_raw.csv')

# Divisão de treino e teste

In [6]:
# Hold out 20% of the rows for evaluation. 'price' is the target, every other
# column is a feature; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='price'),
    df['price'],
    test_size=0.2,
    random_state=0,
)

# Missings

In [7]:
# Number of missing values in each training feature.
X_train.isna().sum()

CRIM         0
ZN           0
INDUS        0
CHAS         0
NOX        110
RM          64
AGE          0
DIS          0
RAD         66
TAX         64
PTRATIO      0
B            0
LSTAT        0
dtype: int64

In [8]:
# Number of missing values in each test feature.
X_test.isna().sum()

CRIM         0
ZN           0
INDUS        0
CHAS         0
NOX         28
RM          21
AGE          0
DIS          0
RAD        102
TAX         21
PTRATIO      0
B            0
LSTAT        0
dtype: int64

### Imputação por KNN

In [9]:
# KNN imputer configured to use 5 neighbours.
# NOTE(review): the imputer is fitted on features that have not been scaled
# yet (StandardScaler is applied later in the notebook) - confirm this
# ordering is intended.
imputer = KNNImputer(n_neighbors=5)

# Fit on the training split only, so nothing about the test rows leaks
# into the imputation.
imputer.fit(X_train)

# Apply the fitted imputer to both splits.
train_imputed = imputer.transform(X_train)
test_imputed = imputer.transform(X_test)

# Wrap the imputed values back into DataFrames, keeping the column labels
# (the row index restarts at 0).
X_train = pd.DataFrame(train_imputed, columns=X_train.columns)
X_test = pd.DataFrame(test_imputed, columns=X_test.columns)

# Scaling

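$$x_{norm} = \frac{x - x_{mean}}{x_{std}}$$

onde $x_{mean}$ e $x_{std}$ são a média e o desvio padrão de cada coluna, calculados apenas no conjunto de treino.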

In [10]:
# Standardize the features with StandardScaler. The scaler is fitted on the
# training split only, so the test split is transformed with the training
# statistics (no data leakage).
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [11]:
# Re-attach the feature names after scaling: every column of the raw frame
# except the 'price' target.
feature_names = df.columns.drop('price')
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)

In [12]:
# Give both targets a fresh 0..n-1 index (the old labels are discarded) so
# they line up row-by-row with the rebuilt feature frames.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Salvando dados limpos

In [14]:
# Attach the target to each split. Assignment aligns on the row index, which
# works here because the feature frames and the targets were both re-indexed
# from 0 in the preceding cells.
X_train['target'] = y_train
X_test['target'] = y_test

# Tag every row with the split it belongs to.
# NOTE(review): the labels mix Portuguese ('treino') and English ('test');
# they are left unchanged because readers of the CSV may filter on them.
X_train['set'] = 'treino'
X_test['set'] = 'test'

# Stack train on top of test. ignore_index=True gives df_clean unique row
# labels instead of two overlapping 0..n-1 ranges.
df_clean = pd.concat([X_train, X_test], axis=0, ignore_index=True)

# Write the cleaned data without the row index (index=False is the
# documented boolean form of the argument).
df_clean.to_csv('data/boston_house_prices_clean.csv', index=False)