In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from palmerpenguins import load_penguins

In [10]:
penguins = load_penguins()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [11]:
penguins.shape

(344, 8)

In [12]:
print(penguins.isnull().sum())

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64


In [13]:
penguins.dropna(axis=0, how='any', inplace=True)

In [14]:
penguins.shape

(333, 8)

In [97]:
penguins_shuffle=shuffle(penguins, random_state=42)

In [98]:
penguins_shuffle.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
30,Adelie,Dream,39.5,16.7,178.0,3250.0,female,2007
320,Chinstrap,Dream,50.9,17.9,196.0,3675.0,female,2009
79,Adelie,Torgersen,42.1,19.1,195.0,4000.0,male,2008
202,Gentoo,Biscoe,46.6,14.2,210.0,4850.0,female,2008
63,Adelie,Biscoe,41.1,18.2,192.0,4050.0,male,2008


In [100]:
df = penguins_shuffle.copy()
df.iloc[:30, df.columns.get_loc("species")] = ""
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
30,,Dream,39.5,16.7,178.0,3250.0,female,2007
320,,Dream,50.9,17.9,196.0,3675.0,female,2009
79,,Torgersen,42.1,19.1,195.0,4000.0,male,2008
202,,Biscoe,46.6,14.2,210.0,4850.0,female,2008
63,,Biscoe,41.1,18.2,192.0,4050.0,male,2008


In [101]:
X = df.drop(["species", "island", "sex", "year"], axis=1)
y = df["species"]

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30, random_state=42)

In [103]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [104]:
y_pred = model.predict(X_test)

In [105]:
y_pred

array(['Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Adelie', 'Gentoo',
       'Adelie', 'Chinstrap', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo',
       'Gentoo', 'Adelie', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo',
       'Adelie', 'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie',
       'Chinstrap', 'Adelie', 'Gentoo', 'Adelie', 'Chinstrap', 'Adelie'],
      dtype=object)

In [106]:
acertos = (y_pred == y_test).sum()
total_previsoes = len(y_pred)

acuracia = acertos / total_previsoes * 100

print(f'Acurácia: {acuracia:.2f}%')

Acurácia: 86.67%


In [107]:
df['species'][:30] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['species'][:30] = y_pred


In [108]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
30,Adelie,Dream,39.5,16.7,178.0,3250.0,female,2007
320,Adelie,Dream,50.9,17.9,196.0,3675.0,female,2009
79,Chinstrap,Torgersen,42.1,19.1,195.0,4000.0,male,2008
202,Adelie,Biscoe,46.6,14.2,210.0,4850.0,female,2008
63,Adelie,Biscoe,41.1,18.2,192.0,4050.0,male,2008
...,...,...,...,...,...,...,...,...
195,Gentoo,Biscoe,49.6,15.0,216.0,4750.0,male,2008
77,Adelie,Torgersen,37.2,19.4,184.0,3900.0,male,2008
112,Adelie,Biscoe,39.7,17.7,193.0,3200.0,female,2009
281,Chinstrap,Dream,45.2,17.8,198.0,3950.0,female,2007


In [109]:
df["species"][:30]

30        Adelie
320       Adelie
79     Chinstrap
202       Adelie
63        Adelie
307       Gentoo
292       Adelie
187    Chinstrap
219       Gentoo
204       Adelie
81        Gentoo
14        Gentoo
330       Gentoo
132       Adelie
276       Gentoo
138       Gentoo
120       Gentoo
152       Gentoo
82        Adelie
286       Gentoo
115       Gentoo
143    Chinstrap
326       Gentoo
206       Adelie
6      Chinstrap
116       Adelie
272       Gentoo
334       Adelie
169    Chinstrap
333       Adelie
Name: species, dtype: object

In [110]:
penguins_shuffle["species"][:30]

30        Adelie
320    Chinstrap
79        Adelie
202       Gentoo
63        Adelie
307    Chinstrap
292    Chinstrap
187       Gentoo
219       Gentoo
204       Gentoo
81        Adelie
14        Adelie
330    Chinstrap
132       Adelie
276    Chinstrap
138       Adelie
120       Adelie
152       Gentoo
82        Adelie
286    Chinstrap
115       Adelie
143       Adelie
326    Chinstrap
206       Gentoo
6         Adelie
116       Adelie
272       Gentoo
334    Chinstrap
169       Gentoo
333    Chinstrap
Name: species, dtype: object