In [6]:
# Pré-processamento de dados com Scikit-learn
# Fonte: https://colab.research.google.com/github/cwcheng0/practicepython/blob/master/SKLearn_PreProcessing.ipynb
# 2023

from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] 

import pandas as pd
pd.DataFrame(data)


Unnamed: 0,0,1
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [None]:
# Minimize all data
scaler = MinMaxScaler() 
scaler = scaler.fit(data) 
result = scaler.transform(data) 
result

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [None]:
# fit_transform used on the training data so that we can scale the training data and also learn the scaling parameters of that data. 
result_ = scaler.fit_transform(data) 
# learned parameters transformed used to scale our test data 
scaler.inverse_transform(result) 

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [None]:
# Intervalo dos dados [0,1]
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler(feature_range=(5,10)) 
result = scaler.fit_transform(data) 
result

array([[ 5.  ,  5.  ],
       [ 6.25,  6.25],
       [ 7.5 ,  7.5 ],
       [10.  , 10.  ]])

Scaling the values of X to be between 0 and 1, while preserving the original distribution of the data.


In [3]:
# The following lines of code compute the XNOR normalization of the input data X, which transforms the values in X to the range [0, 1]
# by subtracting the minimum value of each column of X from the corresponding values of that column, and dividing the result by 
# the range of values in that column. XNOR normalization is a commonly used technique for feature scaling in machine learning, as 
# it helps to improve the performance and stability of many machine learning algorithms. 


import numpy as np
X = np.array([[-1, 2], [-0.5, 6], [0, 10], [1, 18]])
X_nor = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_nor


array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [4]:
# normalization of the given vector on a numpy array X.
# X.max(axis=0) and X.min(axis=0) compute the maximum and minimum values of each column in the array, respectively.
# X_nor * (X.max(axis=0) - X.min(axis=0)) subtracts the minimum value of each column from the corresponding maximum value, and then multiplies each element in the array by the resulting column-wise difference.
# Finally, X_returned adds back the minimum value of each column to the scaled values, which brings the data back to its original scale.

X_returned = X_nor * (X.max(axis=0) - X.min(axis=0)) + X.min(axis=0)
X_returned

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

In [None]:
## StandardScaler removes the mean and scales each feature/variable to unit variance. 

from sklearn.preprocessing import StandardScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = StandardScaler() 
scaler.fit(data) 


StandardScaler can be influenced by outliers (if they exist in the dataset) since it involves the estimation of the empirical mean and standard deviation of each feature.


In [None]:
scaler.mean_ 

array([-0.125,  9.   ])

In [None]:
scaler.var_ 


array([ 0.546875, 35.      ])

In [None]:
x_std = scaler.transform(data) 
x_std.mean() 

0.0

In [None]:
x_std.std() 

1.0

In [None]:
scaler.fit_transform(data) 

array([[-1.18321596, -1.18321596],
       [-0.50709255, -0.50709255],
       [ 0.16903085,  0.16903085],
       [ 1.52127766,  1.52127766]])

In [None]:
scaler.inverse_transform(x_std) 

array([[-1. ,  2. ],
       [-0.5,  6. ],
       [ 0. , 10. ],
       [ 1. , 18. ]])

Data and Python necessary libraries imports


In [2]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [3]:
# Use PDrive function
#!pip install -U -q PyDrive

In [4]:
# This code imports several Python libraries (pydrive.auth, pydrive.drive, google.colab.auth, and oauth2client.client) 
#and then uses them to authenticate and create a PyDrive client for accessing files on Google Drive.

#from pydrive.auth import GoogleAuth
#from pydrive.drive import GoogleDrive
#from google.colab import auth
#from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client, which is necessary in order to access files on Google Drive.
#auth.authenticate_user()
#gauth = GoogleAuth()
#gauth.credentials = GoogleCredentials.get_application_default()
#drive = GoogleDrive(gauth)

In [7]:
# Check first 5 rows in the modified Titanic file
url = 'https://drive.google.com/file/d/1-18yh7LtOknFxWuC-N_aS-iszY_l-6x3/' # train3.csv
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
data = pd.read_csv(dwn_url)
# Without index_col=0, first row will be different.
data.head()
print(data)

      Age     Sex Embarked Survived
0    22.0    male        S       No
1    38.0  female        C      Yes
2    26.0  female        S      Yes
3    35.0  female        S      Yes
4    35.0    male        S       No
..    ...     ...      ...      ...
886  27.0    male        S       No
887  19.0  female        S      Yes
888   NaN  female        S       No
889  26.0    male        C      Yes
890  32.0    male        Q       No

[891 rows x 4 columns]


In [8]:
# Modificando Titanic
data = data[["Age", "Sex", "Embarked", "Survived"]] #
print(data)

      Age     Sex Embarked Survived
0    22.0    male        S       No
1    38.0  female        C      Yes
2    26.0  female        S      Yes
3    35.0  female        S      Yes
4    35.0    male        S       No
..    ...     ...      ...      ...
886  27.0    male        S       No
887  19.0  female        S      Yes
888   NaN  female        S       No
889  26.0    male        C      Yes
890  32.0    male        Q       No

[891 rows x 4 columns]


In [9]:
# Modified Titanic Sample

data.info()

Age = data.loc[:,"Age"].values.reshape(-1,1) #sklearn
Age[:20]
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer() 
imp_median = SimpleImputer(strategy="median") 
imp_0 = SimpleImputer(strategy="constant",fill_value=0) 
imp_mean = imp_mean.fit_transform(Age) 
imp_median = imp_median.fit_transform(Age)
imp_0 = imp_0.fit_transform(Age)
imp_mean[:20]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


array([[22.        ],
       [38.        ],
       [26.        ],
       [35.        ],
       [35.        ],
       [29.69911765],
       [54.        ],
       [ 2.        ],
       [27.        ],
       [14.        ],
       [ 4.        ],
       [58.        ],
       [20.        ],
       [39.        ],
       [14.        ],
       [55.        ],
       [ 2.        ],
       [29.69911765],
       [31.        ],
       [29.69911765]])

In [10]:
imp_median[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [28.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [28.],
       [31.],
       [28.]])

In [11]:
imp_0[:20]

array([[22.],
       [38.],
       [26.],
       [35.],
       [35.],
       [ 0.],
       [54.],
       [ 2.],
       [27.],
       [14.],
       [ 4.],
       [58.],
       [20.],
       [39.],
       [14.],
       [55.],
       [ 2.],
       [ 0.],
       [31.],
       [ 0.]])

In [12]:
data.loc[:,"Age"] = imp_median
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


In [13]:
#verify Embarked
Embarked = data.loc[:,"Embarked"].values.reshape(-1,1)
imp_mode = SimpleImputer(strategy = "most_frequent")
data.loc[:,"Embarked"] = imp_mode.fit_transform(Embarked)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  891 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 28.0+ KB


In [14]:
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [15]:
# Use Numpy to fill the gap
import pandas as pd
url = 'https://drive.google.com/file/d/1-18yh7LtOknFxWuC-N_aS-iszY_l-6x3/' # train3.csv
file_id = url.split('/')[-2]
dwn_url = 'https://drive.google.com/uc?id=' + file_id
data = pd.read_csv(dwn_url)
data.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,No
1,38.0,female,C,Yes
2,26.0,female,S,Yes
3,35.0,female,S,Yes
4,35.0,male,S,No


In [16]:
# Fill the gap
data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())
#.fillna DataFrame
data.dropna(axis=0,inplace=True)
#.dropna(axis=0) # .dropna(axis=1)
data.describe()

Unnamed: 0,Age
count,889.0
mean,29.315152
std,12.984932
min,0.42
25%,22.0
50%,28.0
75%,35.0
max,80.0


In [22]:
#verify datatype and memory usage 
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       889 non-null    float64
 1   Sex       889 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  889 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.7+ KB


In [24]:
# preprocessing categorical variables for use in machine learning models that require numerical inputs.



# The LabelEncoder class from sklearn.preprocessing is used to encode the labels in y into integers, which can be more easily processed by machine learning algorithms.

from sklearn.preprocessing import LabelEncoder
# extracting the last column of data, which presumably contains the categorical variable y.
y = data.iloc[:,-1]
# creates a new LabelEncoder object.
le = LabelEncoder()
# fiting the LabelEncoder object to the data in y, i.e., it determines the unique classes in y and assigns each class a unique integer label.
le = le.fit(y) 
# transforming the labels in y into the corresponding integer labels obtained during the fitting process.
label = le.transform(y) 
# verify the unique classes in y that were used to determine the integer labels.

le.classes_ 
# verify the transformed integer labels for the categorical variable y.

label 

SyntaxError: ignored

In [None]:
# Verificar quantas classes temos
le.classes_

array(['No', 'Yes'], dtype=object)

In [None]:
le.fit_transform(y) 
le.inverse_transform(label) 

array(['No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes',
       'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes',
       'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No',
       'No', 'Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No', 'Yes',
       'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No',
       'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'Yes', 'No', 'No', 'No', 'Yes', 'Yes', 'No',
       'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'No', 'No',
       'No', 'No', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'No',
       'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'No', 'No',
       'Yes', 'No', 'No', 'Yes', 'No', 'No', 'No', 'No', 'Yes', 'Yes',

In [None]:
data.iloc[:,-1] = label 
data.head()

  data.iloc[:,-1] = label


Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,1
2,26.0,female,S,1
3,35.0,female,S,1
4,35.0,male,S,0


The following code is useful for preprocessing categorical variables for  use in machine learning models that require numerical inputs.


In [2]:
"""
This code uses the scikit-learn library to perform label encoding on a categorical variable y in a pandas DataFrame data.

The LabelEncoder class from sklearn.preprocessing is used to encode the labels in y into integers, which can be more easily processed by machine learning algorithms.

y = data.iloc[:,-1] extracts the last column of data, which presumably contains the categorical variable y.

le = LabelEncoder() creates a new LabelEncoder object.

le = le.fit(y) fits the LabelEncoder object to the data in y, which means it determines the unique classes in y and assigns each class a unique integer label.

label = le.transform(y) transforms the labels in y into the corresponding integer labels obtained during the fitting process.

le.classes_ returns the unique classes in y that were used to determine the integer labels.

label contains the transformed integer labels for the categorical variable y.


"""
from sklearn.preprocessing import LabelEncoder
data.iloc[:,-1] = LabelEncoder().fit_transform(data.iloc[:,-1])
data.head()

NameError: ignored

The OrdinalEncoder class from sklearn.preprocessing is used to transform each categorical column in data to a numerical column with integer values.

In [None]:
from sklearn.preprocessing import OrdinalEncoder
data_ = data.copy()
data_.head()

Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,male,S,0
1,38.0,female,C,1
2,26.0,female,S,1
3,35.0,female,S,1
4,35.0,male,S,0


In [3]:
# OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_ creates a new OrdinalEncoder object 
# and fits it to the categorical data in data_.iloc[:,1:-1]. The .categories_ attribute returns
# the unique categories in each categorical variable as an array of arrays.


OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_

SyntaxError: ignored

In [None]:
#data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1]) creates a new OrdinalEncoder 
#object and fits it to the categorical data in data_.iloc[:,1:-1]. The .fit_transform() method transforms
# the categorical data into an array of integers and assigns the result to the same slice of data_.
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
# data_.head() displays the first five rows of data_ to show the transformed categorical data.
data_.head()


  data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])


Unnamed: 0,Age,Sex,Embarked,Survived
0,22.0,1.0,2.0,0
1,38.0,0.0,0.0,1
2,26.0,0.0,2.0,1
3,35.0,0.0,2.0,1
4,35.0,1.0,2.0,0


In [27]:
#The OneHotEncoder class from sklearn.preprocessing is used to  create a one-hot encoding of the categories in X.
from sklearn.preprocessing import OneHotEncoder

# data.head()
X = data.iloc[:,1:-1]
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

Preprocessing categorical variables for use in ML models that require numerical inputs. The resulting array can be used as input to a machine learning algorithm, where each column represents a unique category and each row represents an instance of the categorical variable.

In [28]:
# categories='auto' specifies that the categories in X should be automatically determined based on the unique values present in X.
# .fit_transform(X) fits the OneHotEncoder object to the data in X and then transforms X into a one-hot encoded array.
# .toarray() converts the resulting sparse matrix from .fit_transform(X) to a dense numpy array.

OneHotEncoder(categories='auto').fit_transform(X).toarray()

pd.DataFrame(enc.inverse_transform(result))

Unnamed: 0,0,1
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
884,male,S
885,female,S
886,female,S
887,male,C


In [None]:
enc.get_feature_names_out()

array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype=object)

In [None]:
result

array([[0., 1., 0., 0., 1.],
       [1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 1.],
       [0., 1., 1., 0., 0.],
       [0., 1., 0., 1., 0.]])

In [None]:
result.shape

(889, 5)

In [None]:
#axis=1,axis=0，
newdata = pd.concat([data,pd.DataFrame(result)],axis=1)
newdata.head()

Unnamed: 0,Age,Sex,Embarked,Survived,0,1,2,3,4
0,22.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,female,C,1.0,1.0,0.0,1.0,0.0,0.0
2,26.0,female,S,1.0,1.0,0.0,0.0,0.0,1.0
3,35.0,female,S,1.0,1.0,0.0,0.0,0.0,1.0
4,35.0,male,S,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
# teste Marcio
#type(newdata)


In [None]:
newdata.drop(["Sex","Embarked"],axis=1,inplace=True)
newdata.columns =["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]
newdata.head()

Unnamed: 0,Age,Survived,Female,Male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,0.0,0.0,1.0,0.0,0.0,1.0
1,38.0,1.0,1.0,0.0,1.0,0.0,0.0
2,26.0,1.0,1.0,0.0,0.0,0.0,1.0
3,35.0,1.0,1.0,0.0,0.0,0.0,1.0
4,35.0,0.0,0.0,1.0,0.0,0.0,1.0


The Binarizer class from sklearn.preprocessing is used to transform the continuous values in X into binary values based on a threshold value. It is useful for preprocessing numerical variables for use in machine learning models that require binary inputs. The resulting binary array can be used as input to a machine learning algorithm, where each value represents a binary feature of the numerical variable.

In [2]:
data_2 = data.copy()
from sklearn.preprocessing import Binarizer

# extracts the first column of data_2, which presumably contains the numerical variable X.
# The .values attribute is used to convert the column to a numpy array, and .reshape(-1,1) is
# used to reshape the array to a single column.
X = data_2.iloc[:,0].values.reshape(-1,1) 

#  creates a new Binarizer object with a threshold value of 30, which means any value in X greater than or equal to 30 will be transformed to 1, and any value less than 30 will be transformed to 0. The .fit_transform(X) method fits the Binarizer object to the data in X and then transforms X into a binary array.
transformer = Binarizer(threshold=30).fit_transform(X)
transformer

NameError: ignored

KBinsDiscretizer is good for preprocessing numerical variables for use in ML models that require discrete inputs. The resulting array can be used as input to a machine learning algorithm, where each bin represents a unique range of values in the numerical variable.

In [None]:
#  The KBinsDiscretizer class from sklearn.preprocessing is used to create k bins of equal width for the values in X.

from sklearn.preprocessing import KBinsDiscretizer
# it extracts the first column of data, which presumably contains the numerical variable X. 
#The .values attribute is used to convert the column to a numpy array, and .reshape(-1,1) is used to reshape the array to a single column.
X = data.iloc[:,0].values.reshape(-1,1)

# Set age into 3 categories
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit_transform(X)[:20]

array([[0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [2.],
       [0.],
       [1.],
       [0.],
       [0.],
       [2.],
       [0.],
       [1.],
       [0.],
       [2.],
       [0.],
       [1.],
       [1.],
       [1.]])

Explore the output

In [None]:
#the fit_transform() method is used to fit the clustering algorithm on the data X and transform it into a cluster assignment for each data point. 
#The set() function is used to extract the unique cluster assignment values and create a set of these values.
#Finally, the resulting set is printed or stored for later use.

set(est.fit_transform(X).ravel())

{0.0, 1.0, 2.0}

In [None]:
# Same
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
est.fit_transform(X)

est.fit_transform(X).toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

A prática foi muito boa para familiarizar e ver passo a passo de como explorar mesmo os dados que possuimos e, a partir disso, criar informações úteis para explorar e então, futuramente, criar um modelo. Ao seguir essas e outras práticas recomendadas, pude ver que o código é eficiente, scikit é muit bom, e ficou com uma leitura muito intuitiva. Já estou aplicando para tarefas do meu trabalho algumas das metodologias usadas nesse notebook e resolvi fazer os comentários em inglês pois assim o faço no trabalho também e ficará fácil de explicar futuramente. 