# Data Preprocessing Tools

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
import sys

# Report the NumPy, pandas and interpreter versions this notebook was run with,
# one per line, so the recorded outputs can be reproduced.
print(np.__version__, pd.__version__, sys.version, sep='\n')

1.20.3
1.5.3
3.8.16 (default, Jan 17 2023, 22:25:28) [MSC v.1916 64 bit (AMD64)]


## Importing the dataset

In [None]:
# Load the raw dataset. Every column except the last is a feature; the last
# column (Purchased) is the target.
df = pd.read_csv('./data/Data.csv')
# to_numpy() is the pandas-recommended replacement for the .values attribute;
# for this mixed-type frame both yield the same object-dtype array.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [None]:
# Display the loaded DataFrame to inspect the raw data.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [None]:
# Boolean mask of the same shape as df: True where a value is missing.
df.isnull()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [None]:
# isna() produces the same missing-value mask as isnull() for this data.
df.isna()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [None]:
# Preview the rows that have no missing values. The result is only displayed,
# not assigned, so df itself still contains the incomplete rows.
df.dropna()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [None]:
# Feature matrix as loaded: Country, Age, Salary (still contains NaN entries).
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Target vector as loaded: the 'Yes'/'No' labels of the Purchased column.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data


Opciones del parámetro `strategy` de `SimpleImputer`:

* Si "mean", reemplazar los valores perdidos usando el promedio a lo largo de cada columna. Solo puede ser utilizado con datos numéricos.

* Si "median", reemplazar los valores perdidos usando la mediana a lo largo de cada columna. Solo puede ser utilizado con datos numéricos.

* Si "most_frequent", reemplazar los valores perdidos usando el valor más frecuente a lo largo de cada columna. Puede ser utilizado con cadenas o datos numéricos. Si hay más de un valor así, solo se devuelve el más pequeño.

* Si "constant", reemplazar los valores perdidos con el valor de relleno. Puede ser utilizado con cadenas o datos numéricos.

El límite superior de un rango nunca se incluye en Python; por eso se escribe `1:3` para seleccionar las columnas 1 y 2.

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaN entries in the numeric columns (1: Age, 2: Salary) with the mean
# of their column. The slice 1:3 excludes its upper bound, so it selects
# columns 1 and 2 only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [None]:
# Feature matrix after imputation: the former NaN entries now hold column means.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country) and pass the remaining columns through
# unchanged. The encoded columns come first in the result, followed by the
# passthrough columns.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [0]),
    ],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Feature matrix after one-hot encoding: three country indicator columns,
# then Age and Salary.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the class labels to integers; for this data 'No' becomes 0 and 'Yes'
# becomes 1. The fitted encoder is kept in `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Target vector after label encoding (integers instead of 'Yes'/'No').
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (2 of the 10 here) as the test set. The fixed
# random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Training features (8 of the 10 rows).
print(X_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Test features (the 2 held-out rows).
print(X_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [None]:
# Training labels, aligned row-by-row with X_train.
print(y_train)

[0 1 0 0 1 1 0 1]


In [None]:
# Test labels, aligned row-by-row with X_test.
print(y_test)

[0 1]


## Feature Scaling (Normalización)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits, so the test set does not
# influence the scaler.
# NOTE(review): all five columns are scaled, including the one-hot country
# columns; confirm that this is intended.
sc = StandardScaler()
sc.fit(X_train)
X_train_S = sc.transform(X_train)
X_test_S = sc.transform(X_test)

In [None]:
# Standardized training features.
print(X_train_S)

[[-0.77459667 -0.57735027  1.29099445 -0.19159184 -1.07812594]
 [-0.77459667  1.73205081 -0.77459667 -0.01411729 -0.07013168]
 [ 1.29099445 -0.57735027 -0.77459667  0.56670851  0.63356243]
 [-0.77459667 -0.57735027  1.29099445 -0.30453019 -0.30786617]
 [-0.77459667 -0.57735027  1.29099445 -1.90180114 -1.42046362]
 [ 1.29099445 -0.57735027 -0.77459667  1.14753431  1.23265336]
 [-0.77459667  1.73205081 -0.77459667  1.43794721  1.57499104]
 [ 1.29099445 -0.57735027 -0.77459667 -0.74014954 -0.56461943]]


In [None]:
# Test features transformed with the scaler fitted on the training split.
print(X_test_S)

[[-0.77459667  1.73205081 -0.77459667 -1.46618179 -0.9069571 ]
 [ 1.29099445 -0.57735027 -0.77459667 -0.44973664  0.20564034]]


In [None]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales each row (sample) on its own rather than each column.
# NOTE(review): Salary dominates every row's magnitude, so the other features
# shrink to near zero in the result; confirm this is the intended scaling.
snc = Normalizer()
X_train_N, X_test_N = snc.fit_transform(X_train), snc.transform(X_test)


In [None]:
# Each row is scaled so that the sum of the squares of its elements equals 1.
print(X_train_N)

[[0.00000000e+00 0.00000000e+00 1.92307639e-05 7.45726288e-04
  9.99999722e-01]
 [0.00000000e+00 1.56794394e-05 0.00000000e+00 6.27177577e-04
  9.99999803e-01]
 [1.38888863e-05 0.00000000e+00 0.00000000e+00 6.11110997e-04
  9.99999813e-01]
 [0.00000000e+00 0.00000000e+00 1.63934394e-05 6.22950699e-04
  9.99999806e-01]
 [0.00000000e+00 0.00000000e+00 2.08333300e-05 5.62499911e-04
  9.99999842e-01]
 [1.26582255e-05 0.00000000e+00 0.00000000e+00 6.07594825e-04
  9.99999815e-01]
 [0.00000000e+00 1.20481906e-05 0.00000000e+00 6.02409529e-04
  9.99999818e-01]
 [1.72413762e-05 0.00000000e+00 0.00000000e+00 6.03448166e-04
  9.99999818e-01]]


In [None]:
# Row-normalized test features.
print(X_test_N)

[[0.00000000e+00 1.85185157e-05 0.00000000e+00 5.55555470e-04
  9.99999846e-01]
 [1.49253709e-05 0.00000000e+00 0.00000000e+00 5.52238722e-04
  9.99999847e-01]]
