### Importando bibliotecas

In [32]:
import pandas as pd
import numpy as np

### Carregando dados

In [33]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="Data.csv")
# Bare last expression: the notebook renders the whole frame as a table.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Separar variáveis independentes (X) e variável dependente (y)

In [34]:
# Feature matrix: every column except the last one.
# .copy() detaches X from df, so the column assignment done later during
# imputation writes to an independent frame: no SettingWithCopyWarning on
# pandas versions without copy-on-write, and no doubt about whether df changes.
X = df.iloc[:, :-1].copy()
# Target vector: the last column ("Purchased").
y = df.iloc[:, -1]
# Appending .values would convert these to NumPy arrays, but that drops the
# pandas labels (column names / index), a conversion that is rarely needed nowadays.

### Tratando valores nulos

In [35]:
from sklearn.impute import SimpleImputer

# Numeric columns that contain missing values in this dataset.
numeric_cols = ["Age", "Salary"]

# The gaps in these float columns are NaN, which is SimpleImputer's default
# sentinel. pd.NA only works where scikit-learn special-cases it
# (NOTE(review): older releases may reject it -- confirm for the pinned version).
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Fit and apply in one step: each NaN is replaced by the mean of its column.
# (The columns could also be selected positionally with iloc.)
# NOTE(review): the means are computed on all rows, i.e. before the train/test split.
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


### Aplicando estratégia one-hot encoding no "Country"

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode "Country"; every other column passes through unchanged and is
# placed after the encoded columns.
# sparse_threshold=0 forces a dense NumPy array no matter how many categories
# exist. With the default (0.3), a sparser result would come back as a SciPy
# sparse matrix, which the later scaling step (StandardScaler with centering
# plus slice assignment) is not written for.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), ["Country"])],
    remainder="passthrough",
    sparse_threshold=0,
)
# NOTE: X is rebound from a DataFrame to an array here, so this cell cannot be
# re-run on its own; re-run the cells above first.
X = ct.fit_transform(X)

X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

### Aplicando estratégia label encoding no "Purchased"

In [37]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the target column ...
le = LabelEncoder().fit(y)
# ... then replace every label with its integer code.
y = le.transform(y)

y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [38]:
from sklearn.model_selection import train_test_split

# Train/test split: hold out 20% of the rows for testing.
# The fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [39]:
# Show the training features returned by train_test_split (not yet scaled).
X_train

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04]])

In [40]:
# Show the held-out test features returned by train_test_split (not yet scaled).
X_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.7e+01, 6.7e+04]])

In [41]:
# Show the encoded training labels returned by train_test_split.
y_train

array([0, 1, 0, 0, 1, 1, 0, 1])

In [42]:
# Show the encoded test labels returned by train_test_split.
y_test

array([0, 1])

### Aplicando método de escalonamento dos dados

In [43]:
from sklearn.preprocessing import StandardScaler

# Columns 0-2 hold the one-hot flags and are left untouched; only the columns
# from index 3 onward are standardized.
# The scaler is fitted on the training split only ...
scaler = StandardScaler().fit(X_train[:, 3:])
# ... and the same fitted statistics are then applied to both splits, in place.
X_train[:, 3:] = scaler.transform(X_train[:, 3:])
X_test[:, 3:] = scaler.transform(X_test[:, 3:])

In [44]:
# Show the training features after columns 3 onward were standardized in place.
X_train

array([[ 0.        ,  0.        ,  1.        , -0.19159184, -1.07812594],
       [ 0.        ,  1.        ,  0.        , -0.01411729, -0.07013168],
       [ 1.        ,  0.        ,  0.        ,  0.56670851,  0.63356243],
       [ 0.        ,  0.        ,  1.        , -0.30453019, -0.30786617],
       [ 0.        ,  0.        ,  1.        , -1.90180114, -1.42046362],
       [ 1.        ,  0.        ,  0.        ,  1.14753431,  1.23265336],
       [ 0.        ,  1.        ,  0.        ,  1.43794721,  1.57499104],
       [ 1.        ,  0.        ,  0.        , -0.74014954, -0.56461943]])

In [45]:
# Show the test features after columns 3 onward were transformed with the scaler fitted on X_train.
X_test

array([[ 0.        ,  1.        ,  0.        , -1.46618179, -0.9069571 ],
       [ 1.        ,  0.        ,  0.        , -0.44973664,  0.20564034]])