Importando módulos necessários

In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors, metrics, preprocessing
np.set_printoptions(suppress=True)

Abrindo o arquivo penguins_size.csv como dataframe chamado originalDF

In [2]:
originalDF = pd.read_csv('penguins_size.csv', encoding = "ISO-8859-1")

Codificando os campos que não eram compostos por números

In [3]:
label_encoder = preprocessing.LabelEncoder()

encodedDF = originalDF.copy()
encodedDF['island']= label_encoder.fit_transform(encodedDF['island'])
encodedDF['sex']= label_encoder.fit_transform(encodedDF['sex'])
encodedDF['species']= label_encoder.fit_transform(encodedDF['species'])

encodedDF = encodedDF[~np.isnan(encodedDF).any(axis=1)]
encodedDF.reset_index(drop=True, inplace=True)
encodedDF

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,2
1,0,2,39.5,17.4,186.0,3800.0,1
2,0,2,40.3,18.0,195.0,3250.0,1
3,0,2,36.7,19.3,193.0,3450.0,1
4,0,2,39.3,20.6,190.0,3650.0,2
...,...,...,...,...,...,...,...
337,2,0,47.2,13.7,214.0,4925.0,1
338,2,0,46.8,14.3,215.0,4850.0,1
339,2,0,50.4,15.7,222.0,5750.0,2
340,2,0,45.2,14.8,212.0,5200.0,1


Criando um dataframe padronizado

In [4]:
from sklearn.preprocessing import StandardScaler

scaledArr = StandardScaler().fit_transform(encodedDF.loc[:, ['island','culmen_length_mm','culmen_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']].values)
scaledDF = pd.DataFrame(data = scaledArr, columns = ['island','culmen_length_mm','culmen_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex'])
scaledDF

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,1.852870,-0.884499,0.785449,-1.418347,-0.564142,0.846836
1,1.852870,-0.811126,0.126188,-1.062250,-0.501703,-0.974660
2,1.852870,-0.664380,0.430462,-0.421277,-1.188532,-0.974660
3,1.852870,-1.324737,1.089724,-0.563715,-0.938776,-0.974660
4,1.852870,-0.847812,1.748985,-0.777373,-0.689020,0.846836
...,...,...,...,...,...,...
337,-0.914298,0.601305,-1.750171,0.931890,0.903175,-0.974660
338,-0.914298,0.527932,-1.445897,1.003109,0.809516,-0.974660
339,-0.914298,1.188289,-0.735923,1.501644,1.933419,0.846836
340,-0.914298,0.234440,-1.192335,0.789451,1.246590,-0.974660


Incluindo no dataframe padronizado a espécie

In [5]:
scaledDF = pd.concat([scaledDF, encodedDF[['species']]], axis = 1)
scaledDF

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,species
0,1.852870,-0.884499,0.785449,-1.418347,-0.564142,0.846836,0
1,1.852870,-0.811126,0.126188,-1.062250,-0.501703,-0.974660,0
2,1.852870,-0.664380,0.430462,-0.421277,-1.188532,-0.974660,0
3,1.852870,-1.324737,1.089724,-0.563715,-0.938776,-0.974660,0
4,1.852870,-0.847812,1.748985,-0.777373,-0.689020,0.846836,0
...,...,...,...,...,...,...,...
337,-0.914298,0.601305,-1.750171,0.931890,0.903175,-0.974660,2
338,-0.914298,0.527932,-1.445897,1.003109,0.809516,-0.974660,2
339,-0.914298,1.188289,-0.735923,1.501644,1.933419,0.846836,2
340,-0.914298,0.234440,-1.192335,0.789451,1.246590,-0.974660,2


Separando o rótulo dos componentes, sendo y o rótulo

In [6]:
y = scaledDF.loc[:, ['species']].values
X = scaledDF.loc[:, ['island','culmen_length_mm','culmen_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']].values
X

array([[ 1.85286967, -0.88449874,  0.78544923, -1.41834665, -0.56414208,
         0.84683555],
       [ 1.85286967, -0.81112573,  0.1261879 , -1.06225022, -0.50170305,
        -0.97465979],
       [ 1.85286967, -0.66437972,  0.43046236, -0.42127665, -1.18853234,
        -0.97465979],
       ...,
       [-0.91429814,  1.18828874, -0.73592307,  1.50164406,  1.93341896,
         0.84683555],
       [-0.91429814,  0.23443963, -1.19233476,  0.7894512 ,  1.24658968,
        -0.97465979],
       [-0.91429814,  1.09657248, -0.53307343,  0.86067049,  1.49634578,
         0.84683555]])

Aplicando oversampling na base para balancear o número de classes

In [7]:
from imblearn.over_sampling import RandomOverSampler

OverResampler = RandomOverSampler()

X, y = OverResampler.fit_resample(X, y)

Dividindo X e y, compondo 20% como teste e 80% como restante do dado a ser utilizado para treinar o modelo

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Usando K como 1:

In [9]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [10]:
y_pred = classifier.predict(X_test)

Matriz de confusão com K = 1:

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  0 30]]


Métricas de avaliação precision, recall e f1-score com K = 1:

In [12]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        30

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91



Usando K como 2:

In [13]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [14]:
y_pred = classifier.predict(X_test)

Matriz de confusão com K = 2:

In [15]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  0 30]]


Métricas de avaliação precision, recall e f1-score com K = 2:

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        30

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91



Usando K como 3:

In [17]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [18]:
y_pred = classifier.predict(X_test)

Matriz de confusão com K = 3:

In [19]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  0 30]]


Métricas de avaliação precision, recall e f1-score com K = 3:

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        30

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91



Usando K como 4:

In [21]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=4)
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [22]:
y_pred = classifier.predict(X_test)

Matriz de confusão com K = 4:

In [23]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  0 30]]


Métricas de avaliação precision, recall e f1-score com K = 4:

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        30

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91



Usando K como 5:

In [25]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [26]:
y_pred = classifier.predict(X_test)

Matriz de confusão com K = 5:

In [27]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  0 30]]


Métricas de avaliação precision, recall e f1-score com K = 5:

In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        30

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91



Usando distância de minkowski e K = 1

In [29]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1, metric='minkowski')
classifier.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [30]:
y_pred = classifier.predict(X_test)

Matriz de confusão com distância de minkowski e K = 1

In [31]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  0 30]]


Métricas de avaliação precision, recall e f1-score com distância de minkowski e K = 1

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        33
           2       1.00      1.00      1.00        30

    accuracy                           1.00        91
   macro avg       1.00      1.00      1.00        91
weighted avg       1.00      1.00      1.00        91



Usando distância de chebyshev e K = 1

In [33]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=1, metric='chebyshev')
classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='chebyshev', n_neighbors=1)

In [34]:
y_pred = classifier.predict(X_test)

Matriz de confusão com distância de chebyshev e K = 1

In [35]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[28  0  0]
 [ 0 33  0]
 [ 0  1 29]]


Métricas de avaliação precision, recall e f1-score com distância de chebyshev e K = 1

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       0.97      1.00      0.99        33
           2       1.00      0.97      0.98        30

    accuracy                           0.99        91
   macro avg       0.99      0.99      0.99        91
weighted avg       0.99      0.99      0.99        91



Como podemos observar, usando como K = 1 e 2, o modelo de classificação teve uma performance levemente superior. Além disso, mudando a métrica de distância para minkowski e chebyshev, não foi possível observar uma melhora ou piora.

Redefinindo y e X para fazer a regressao, usando y como body_mass_g

In [37]:
y = scaledDF.loc[:, ['body_mass_g']].values
X = scaledDF.loc[:, ['island','culmen_length_mm','culmen_depth_mm', 'flipper_length_mm', 'sex', 'species']].values

Dividindo o modelo entre train e test:

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Treinando o modelo com K = 1:

In [39]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=1)
regressor.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=1)

In [40]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [41]:
regressor.score(X_test, y_test)

0.7499429591851539

Treinando o modelo com K = 2:

In [42]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=2)
regressor.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=2)

In [43]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [44]:
regressor.score(X_test, y_test)

0.7835490663111013

Treinando o modelo com K = 3:

In [45]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=3)
regressor.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=3)

In [46]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [47]:
regressor.score(X_test, y_test)

0.8132614875511811

Treinando o modelo com K = 4:

In [48]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=4)
regressor.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=4)

In [49]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [50]:
regressor.score(X_test, y_test)

0.8120903694662167

Treinando o modelo com K = 5:

In [51]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)

KNeighborsRegressor()

In [52]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [53]:
regressor.score(X_test, y_test)

0.8242560229165595

Treinando o modelo com K = 5 e usando metrica minkowski:

In [54]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5, metric='minkowski')
regressor.fit(X_train, y_train)

KNeighborsRegressor()

In [55]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [56]:
regressor.score(X_test, y_test)

0.8242560229165595

Treinando o modelo com K = 5 e usando metrica chebyshev:

In [57]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5, metric='chebyshev')
regressor.fit(X_train, y_train)

KNeighborsRegressor(metric='chebyshev')

In [58]:
y_pred = regressor.predict(X_test)

Obtendo score da regressao:

In [59]:
regressor.score(X_test, y_test)

0.8029094904046139

Como podemos observar, usando como K = 5 e 4 e métrica de minkowski, o modelo de regressão teve uma performance levemente superior.