In [None]:
from sklearn import tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
df = pd.read_csv('penguins.csv')

In [None]:
df.head()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


In [8]:
df.shape

(344, 9)

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,344.0,342.0,342.0,342.0,342.0,344.0
mean,172.5,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,99.448479,5.459584,1.974793,14.061714,801.954536,0.818356
min,1.0,32.1,13.1,172.0,2700.0,2007.0
25%,86.75,39.225,15.6,190.0,3550.0,2007.0
50%,172.5,44.45,17.3,197.0,4050.0,2008.0
75%,258.25,48.5,18.7,213.0,4750.0,2009.0
max,344.0,59.6,21.5,231.0,6300.0,2009.0


In [12]:
df.isnull()

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,True,True,True,True,True,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
339,False,False,False,False,False,False,False,False,False
340,False,False,False,False,False,False,False,False,False
341,False,False,False,False,False,False,False,False,False
342,False,False,False,False,False,False,False,False,False


In [24]:
df = df.fillna(df.median(numeric_only=True))

In [25]:
df = pd.get_dummies(df)

In [26]:
df

Unnamed: 0.1,Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,1,39.10,18.7,181.0,3750.0,2007,1,0,0,0,0,1,0,1
1,2,39.50,17.4,186.0,3800.0,2007,1,0,0,0,0,1,1,0
2,3,40.30,18.0,195.0,3250.0,2007,1,0,0,0,0,1,1,0
3,4,44.45,17.3,197.0,4050.0,2007,1,0,0,0,0,1,0,0
4,5,36.70,19.3,193.0,3450.0,2007,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,340,55.80,19.8,207.0,4000.0,2009,0,1,0,0,1,0,0,1
340,341,43.50,18.1,202.0,3400.0,2009,0,1,0,0,1,0,1,0
341,342,49.60,18.2,193.0,3775.0,2009,0,1,0,0,1,0,0,1
342,343,50.80,19.0,210.0,4100.0,2009,0,1,0,0,1,0,0,1


In [39]:
df = df.drop(['Unnamed: 0'], axis=1)

In [40]:
df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.10,18.7,181.0,3750.0,1,0,0,0,0,1,0,1
1,39.50,17.4,186.0,3800.0,1,0,0,0,0,1,1,0
2,40.30,18.0,195.0,3250.0,1,0,0,0,0,1,1,0
3,44.45,17.3,197.0,4050.0,1,0,0,0,0,1,0,0
4,36.70,19.3,193.0,3450.0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
339,55.80,19.8,207.0,4000.0,0,1,0,0,1,0,0,1
340,43.50,18.1,202.0,3400.0,0,1,0,0,1,0,1,0
341,49.60,18.2,193.0,3775.0,0,1,0,0,1,0,0,1
342,50.80,19.0,210.0,4100.0,0,1,0,0,1,0,0,1


In [43]:
X = df.drop(['species_Adelie', \
           'species_Chinstrap', \
           'species_Gentoo', \
           ], axis=1)

In [44]:
X

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.10,18.7,181.0,3750.0,0,0,1,0,1
1,39.50,17.4,186.0,3800.0,0,0,1,1,0
2,40.30,18.0,195.0,3250.0,0,0,1,1,0
3,44.45,17.3,197.0,4050.0,0,0,1,0,0
4,36.70,19.3,193.0,3450.0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
339,55.80,19.8,207.0,4000.0,0,1,0,0,1
340,43.50,18.1,202.0,3400.0,0,1,0,1,0
341,49.60,18.2,193.0,3775.0,0,1,0,0,1
342,50.80,19.0,210.0,4100.0,0,1,0,0,1


In [51]:
y = df[['species_Adelie', \
           'species_Chinstrap', \
           'species_Gentoo', \
       ]  ]

In [52]:
y

Unnamed: 0,species_Adelie,species_Chinstrap,species_Gentoo
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
339,0,1,0
340,0,1,0
341,0,1,0
342,0,1,0


In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [56]:
rf = RandomForestClassifier(min)

rf = RandomForestClassifier()

In [None]:
parametrs = { 'n_estimators' : 10, 'max_depth' : range(2, 15), 'min_samples_split': range(1, 50, 10), 'min_samples_leaf': range(1,50, 10)  }

In [None]:
grid_search_cv = GridSearchCV(rf, parametrs, cv=5)

In [None]:
grid_search_cv.fit(X_train, y_train)

In [None]:
grid_search_cv.best_params_

In [None]:
y_pred = grid_search_cv.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)