In [22]:
import pandas as pd 
import numpy as np

import warnings
warnings.simplefilter('ignore')

In [23]:
df = pd.read_csv(r"E:\Data Science by SRK\Machine_learning\Classification\penguin_data\cleaned_dataset_penguin.csv")

In [24]:
df

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,gender_MALE
0,0,39.1,18.7,181.0,3750.0,False,True,True
1,0,39.5,17.4,186.0,3800.0,False,True,False
2,0,40.3,18.0,195.0,3250.0,False,True,False
3,0,36.7,19.3,193.0,3450.0,False,True,False
4,0,39.3,20.6,190.0,3650.0,False,True,True
...,...,...,...,...,...,...,...,...
329,2,47.2,13.7,214.0,4925.0,False,False,False
330,2,46.8,14.3,215.0,4850.0,False,False,False
331,2,50.4,15.7,222.0,5750.0,False,False,True
332,2,45.2,14.8,212.0,5200.0,False,False,False


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    int64  
 1   culmen_length_mm   334 non-null    float64
 2   culmen_depth_mm    334 non-null    float64
 3   flipper_length_mm  334 non-null    float64
 4   body_mass_g        334 non-null    float64
 5   island_Dream       334 non-null    bool   
 6   island_Torgersen   334 non-null    bool   
 7   gender_MALE        334 non-null    bool   
dtypes: bool(3), float64(4), int64(1)
memory usage: 14.2 KB


In [26]:
df.isnull().sum()

species              0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
island_Dream         0
island_Torgersen     0
gender_MALE          0
dtype: int64

# X and y

In [27]:
X = pd.get_dummies(df.drop('species', axis = 1), drop_first = True)
y = df['species']

# Train_Test_split

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = True)

# Modelling and Evaluation

In [30]:
# Random Forest classifire with default parameters
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = True)
model.fit(X_train, y_train)

# Prediction
ypred_train = model.predict(X_train)
ypred_test = model.predict(X_test)

# Evaluation
from sklearn.metrics import accuracy_score
print("Train accuracy : ", accuracy_score(ypred_train, y_train))
print("Test accuracy : ", accuracy_score(ypred_test, y_test))

from sklearn.model_selection import cross_val_score
print("cross_val_score : ", cross_val_score(model, X, y, cv=5).mean())

Train accuracy :  1.0
Test accuracy :  0.9850746268656716
cross_val_score :  0.991044776119403


# Hyper_parameter Tuning

In [31]:
from sklearn.model_selection import GridSearchCV

# model
estimator = RandomForestClassifier(random_state = True)

# Parameters (which you want to tune and identify the best)
param_grid = {"n_estimators" : list(range(1, 10))}

grid = GridSearchCV(estimator, param_grid, scoring = 'accuracy', cv= 5)

grid.fit(X_train, y_train)

grid.best_params_

{'n_estimators': 8}

# Importance of each features

In [34]:
grid.best_estimator_.feature_importances_


array([0.23407027, 0.1931781 , 0.28195195, 0.12682326, 0.12523007,
       0.0341873 , 0.00455905])

In [37]:
feats_imp = pd.DataFrame(data = grid.best_estimator_.feature_importances_, 
                         index = X.columns, 
                         columns = ['Feature Importance'])
important_feats = feats_imp[feats_imp['Feature Importance'] > 0]
important_features_list = important_feats.index.to_list()
important_features_list

['culmen_length_mm',
 'culmen_depth_mm',
 'flipper_length_mm',
 'body_mass_g',
 'island_Dream',
 'island_Torgersen',
 'gender_MALE']

# Final Random Forest model
**with best HPT AND important features**

In [39]:
X_imp = X[important_features_list]

# Train_Test_Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = True)


# Model with best HPT AND important features
final_rf_model = RandomForestClassifier()
final_rf_model.fit(X_train, y_train)

# Prediction
ypred_train = final_rf_model.predict(X_train)
ypred_test = final_rf_model.predict(X_test)

# Evaluation
from sklearn.metrics import accuracy_score
print("Train accuracy : ", accuracy_score(ypred_train, y_train))
print("Test accuracy : ", accuracy_score(ypred_test, y_test))

from sklearn.model_selection import cross_val_score
print("cross_val_score : ", cross_val_score(model, X, y, cv=5).mean())

Train accuracy :  1.0
Test accuracy :  1.0
cross_val_score :  0.991044776119403
