Import libraries and dataset

In [149]:
import numpy as np
import matplotlib.pyplot as plt  # pyplot is the module conventionally aliased as `plt`
import pandas as pd

# Read the raw table from the CSV file in the working directory.
dataset = pd.read_csv("Data.csv")

# Every column except the last one goes into the data matrix.
data_matrix = dataset.iloc[:, :-1].values

# The last column becomes the vector of independent terms. Index -1 (instead
# of a hard-coded 3) keeps this selection in step with the `:-1` slice above.
independent_terms = dataset.iloc[:, -1].values

data_matrix, independent_terms

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

Treatment of NAs

In [150]:
from sklearn.impute import SimpleImputer

# Missing (NaN) entries are filled with the mean of their column.
# Only columns 1 and 2 are fitted: column 0 holds strings, which have no mean.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan).fit(data_matrix[:, 1:3])

# Overwrite those columns of the data matrix with their imputed values.
data_matrix[:, 1:3] = imputer.transform(data_matrix[:, 1:3])

pd.DataFrame(data_matrix)

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


Encode categorical data

In [151]:
from sklearn.preprocessing import LabelEncoder

# Encoder that turns category labels into integer codes.
labelencoder_data = LabelEncoder()

# Column 0 holds the country names: learn the distinct names and compute
# their integer codes, then write the codes back over the names.
encoded_countries = labelencoder_data.fit_transform(data_matrix[:, 0])
data_matrix[:, 0] = encoded_countries

pd.DataFrame(data_matrix)

Unnamed: 0,0,1,2
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


In [152]:
# Caveat: integer codes suggest an ordering, yet countries have none. One-hot
# encoding avoids this by replacing the column with one binary ("dummy")
# column per category, so each row carries a single 1 marking its country.

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 and pass the remaining columns through unchanged.
column_transform = ColumnTransformer(
    transformers=[("one_hot_encoder", OneHotEncoder(categories="auto"), [0])],
    remainder="passthrough",
)
data_matrix = column_transform.fit_transform(data_matrix)

# The column of independent terms gets its own label encoder, which maps its
# labels to integer codes.
labelencoder_ind_terms = LabelEncoder()
independent_terms = labelencoder_ind_terms.fit_transform(independent_terms)

pd.DataFrame(data_matrix)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,44.0,72000.0
1,0.0,0.0,1.0,27.0,48000.0
2,0.0,1.0,0.0,30.0,54000.0
3,0.0,0.0,1.0,38.0,61000.0
4,0.0,1.0,0.0,40.0,63777.777778
5,1.0,0.0,0.0,35.0,58000.0
6,0.0,0.0,1.0,38.777778,52000.0
7,1.0,0.0,0.0,48.0,79000.0
8,0.0,1.0,0.0,50.0,83000.0
9,1.0,0.0,0.0,37.0,67000.0
