In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the penguin measurements CSV (path is relative to the notebook's
# working directory) and display the resulting frame.
df = pd.read_csv(filepath_or_buffer='penguins_size.csv')
df

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,FEMALE
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,MALE
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,FEMALE


In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [4]:
# Number of rows per species (the column later used as the prediction target).
df['species'].value_counts()

Adelie       152
Gentoo       124
Chinstrap     68
Name: species, dtype: int64

In [5]:
# Number of rows recorded for each island.
df['island'].value_counts()

Biscoe       168
Dream        124
Torgersen     52
Name: island, dtype: int64

In [6]:
# Count each recorded sex label. value_counts() skips NaN by default, so
# missing entries are not listed here; useful for spotting stray labels.
df['sex'].value_counts()

MALE      168
FEMALE    165
.           1
Name: sex, dtype: int64

In [7]:
# Discard rows whose sex is recorded as the placeholder '.'.
# Rows where sex is NaN compare as "not equal" and are therefore kept here.
df = df.loc[df['sex'].ne('.')]
df.shape

(343, 7)

In [8]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [9]:
# Keep only fully observed rows: drop any row containing at least one NaN.
df = df.dropna(axis=0, how='any')

In [11]:
# Confirm the (rows, columns) count after the incomplete rows were removed.
df.shape

(333, 7)

In [12]:
# Target: the species label.
y = df['species']
# Features: every other column, with the non-numeric ones one-hot encoded.
# drop_first=True omits the first level of each encoded column.
x = pd.get_dummies(df.drop(columns='species'), drop_first=True)

In [13]:
# train & test
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [15]:
# modeling & evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Fit a random forest with default hyperparameters on the training split
# (fit() returns the fitted estimator, so the calls can be chained).
model = RandomForestClassifier(random_state=0).fit(x_train, y_train)

# Predictions on both splits
train_ypred = model.predict(x_train)
test_ypred = model.predict(x_test)

# 1. train & test accuracy
print('train accuracy:', accuracy_score(y_true=y_train, y_pred=train_ypred))
print('test accuracy:', accuracy_score(y_true=y_test, y_pred=test_ypred))

# 2. mean 5-fold cross-validation score over the full feature set
score = cross_val_score(estimator=model, X=x, y=y, cv=5)
print('cross validation score:', score.mean())

train accuracy: 1.0
test accuracy: 0.98
cross validation score: 0.9849841700587969


In [16]:
# Raw importance score of each feature from the fitted forest,
# one value per column of x (in the same order as x.columns).
model.feature_importances_

array([0.33079568, 0.19658812, 0.24734647, 0.08977377, 0.10703511,
       0.0210393 , 0.00742154])

In [18]:
# Pair each importance score with its feature name so the table is readable.
# NOTE(review): the column label's spelling is kept unchanged so the saved output still matches.
pd.DataFrame({'feture importance': model.feature_importances_}, index=x.columns)

Unnamed: 0,feture importance
culmen_length_mm,0.330796
culmen_depth_mm,0.196588
flipper_length_mm,0.247346
body_mass_g,0.089774
island_Dream,0.107035
island_Torgersen,0.021039
sex_MALE,0.007422


In [19]:
# hyper parameter tuning
from sklearn.model_selection import GridSearchCV

# Try every forest size from 1 to 100 trees, scored by 5-fold CV accuracy
# computed on the training split only.
param_grid = {'n_estimators': [*range(1, 101)]}
estimator = RandomForestClassifier(random_state=0)
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(x_train, y_train)
grid.best_params_

{'n_estimators': 8}

In [20]:
# Random forest refit with the hyperparameters selected by the grid search.
# The values are read from grid.best_params_ rather than copied by hand, so
# this cell stays in sync if the data or the search grid ever changes.
model = RandomForestClassifier(**grid.best_params_, random_state=0)
model.fit(x_train, y_train)

# Predictions on both splits with the tuned model
train_ypred = model.predict(x_train)
test_ypred = model.predict(x_test)

# Same evaluation as for the baseline model, for a like-for-like comparison
print('train accuracy:', accuracy_score(y_train, train_ypred))
print('test accuracy:', accuracy_score(y_test, test_ypred))

# Mean 5-fold cross-validation score over the full feature set
score = cross_val_score(model, x, y, cv=5)
print('cross validation score:', score.mean())

train accuracy: 1.0
test accuracy: 0.99
cross validation score: 0.9819990954319312
