## Random Forest Practical with Sample Penguins Data

The penguins dataset, available through Seaborn, contains data on various physical measurements of penguins from different species and islands. 

Importing the Libraries

In [162]:
import pandas as pd
import numpy as np
import seaborn as sns

Load Data Set

In [163]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


Our Task is to classify these into their correct species.
So we may have some data preparation to do
We'll need to convert the data to a format that's easier to work with

In [164]:
df.shape
#Output (rows,columns)

(344, 7)

In [165]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [166]:
#checking for nulls
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [167]:
#drop null values - no replacement
df.dropna(inplace=True)

In [168]:
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

# Feature Engineering
we need to convert object to numeric
### One Hot encoding transforming categorical data into numeric

In [169]:
df.head(10)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
12,Adelie,Torgersen,41.1,17.6,182.0,3200.0,Female
13,Adelie,Torgersen,38.6,21.2,191.0,3800.0,Male
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,Male


In [170]:
df.sex.unique()

array(['Male', 'Female'], dtype=object)

In [171]:
sex_dummies = pd.get_dummies(df['sex'], prefix='sex', drop_first=True)

In [172]:
sex_dummies.head()

Unnamed: 0,sex_Male
0,True
1,False
2,False
4,False
5,True


In [173]:
island_dummies = pd.get_dummies(df['island'], prefix='island', drop_first=True)
island_dummies.head()

Unnamed: 0,island_Dream,island_Torgersen
0,False,True
1,False,True
2,False,True
4,False,True
5,False,True


Concatenate new data to original

In [174]:
new_data = pd.concat([df,sex_dummies,island_dummies], axis=1)

In [175]:
new_data.head(20)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,sex_Male,island_Dream,island_Torgersen
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,True,False,True
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,False,False,True
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,False,False,True
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,False,False,True
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male,True,False,True
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female,False,False,True
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male,True,False,True
12,Adelie,Torgersen,41.1,17.6,182.0,3200.0,Female,False,False,True
13,Adelie,Torgersen,38.6,21.2,191.0,3800.0,Male,True,False,True
14,Adelie,Torgersen,34.6,21.1,198.0,4400.0,Male,True,False,True


Drop Repeated Columns

In [176]:
new_data.drop(['sex','island'],axis=1,inplace=True)
new_data.head(10)

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_Male,island_Dream,island_Torgersen
0,Adelie,39.1,18.7,181.0,3750.0,True,False,True
1,Adelie,39.5,17.4,186.0,3800.0,False,False,True
2,Adelie,40.3,18.0,195.0,3250.0,False,False,True
4,Adelie,36.7,19.3,193.0,3450.0,False,False,True
5,Adelie,39.3,20.6,190.0,3650.0,True,False,True
6,Adelie,38.9,17.8,181.0,3625.0,False,False,True
7,Adelie,39.2,19.6,195.0,4675.0,True,False,True
12,Adelie,41.1,17.6,182.0,3200.0,False,False,True
13,Adelie,38.6,21.2,191.0,3800.0,True,False,True
14,Adelie,34.6,21.1,198.0,4400.0,True,False,True


Separate Target Variable

In [177]:
Y = new_data.species
Y.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [178]:
Y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [179]:
# map function to convert categorical values to numeric
# save to reuse
results = ['Adelie', 'Chinstrap','Gentoo']
Y = Y.map({'Adelie' : 0, 'Chinstrap':1,'Gentoo' : 2})
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

Drop Target Variable

In [180]:
new_data.drop('species', inplace=True,axis=1)
new_data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_Male,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,True,False,True
1,39.5,17.4,186.0,3800.0,False,False,True
2,40.3,18.0,195.0,3250.0,False,False,True
4,36.7,19.3,193.0,3450.0,False,False,True
5,39.3,20.6,190.0,3650.0,True,False,True


In [181]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bill_length_mm     333 non-null    float64
 1   bill_depth_mm      333 non-null    float64
 2   flipper_length_mm  333 non-null    float64
 3   body_mass_g        333 non-null    float64
 4   sex_Male           333 non-null    bool   
 5   island_Dream       333 non-null    bool   
 6   island_Torgersen   333 non-null    bool   
dtypes: bool(3), float64(4)
memory usage: 14.0 KB


In [182]:
 # Convert boolean columns to integers (if needed)
# df['sex_Male'] = df['sex_Male'].astype(int)
# df['island_Dream'] = df['island_Dream'].astype(int)
# df['island_Torgersen'] = df['island_Torgersen'].astype(int)


In [183]:
X = new_data

# Training and Testing the Model

In [184]:
from sklearn.model_selection import train_test_split
# Splitting data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# Checking the shapes of the splits
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (233, 7)
X_test: (100, 7)
y_train: (233,)
y_test: (100,)


# Training the Random Forest Classifier
The random forest classifier is trained on the training data as follows:


In [185]:
from sklearn.ensemble import RandomForestClassifier

# Initializing and training the Random Forest Classifier
classifier_entropy = RandomForestClassifier(
    n_estimators=5,  # Number of trees
    criterion='entropy',  # Split quality metric
    random_state=0
)

classifier_entropy.fit(X_train, y_train)


In [186]:
# Making predictions
y_pred_entropy = classifier_entropy.predict(X_test)

In [187]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluation
print("=== Entropy Criterion ===")
print("Accuracy (Entropy):", accuracy_score(y_test, y_pred_entropy))
print("Confusion Matrix (Entropy):\n", confusion_matrix(y_test, y_pred_entropy))
print("Classification Report (Entropy):\n", classification_report(y_test, y_pred_entropy))

=== Entropy Criterion ===
Accuracy (Entropy): 0.98
Confusion Matrix (Entropy):
 [[48  0  0]
 [ 1 15  0]
 [ 1  0 35]]
Classification Report (Entropy):
               precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       1.00      0.94      0.97        16
           2       1.00      0.97      0.99        36

    accuracy                           0.98       100
   macro avg       0.99      0.97      0.98       100
weighted avg       0.98      0.98      0.98       100



In [188]:
# Training the Random Forest Classifier using Gini
classifier_gini = RandomForestClassifier(
    n_estimators=2,  # Number of trees
    criterion='gini',  # Split quality metric
    random_state=0
)

classifier_gini.fit(X_train, y_train)




In [189]:
# Making predictions
y_pred_gini = classifier_gini.predict(X_test)

# Evaluation
print("=== Gini Criterion ===")
print("Accuracy (Gini):", accuracy_score(y_test, y_pred_gini))
print("Confusion Matrix (Gini):\n", confusion_matrix(y_test, y_pred_gini))
print("Classification Report (Gini):\n", classification_report(y_test, y_pred_gini))

=== Gini Criterion ===
Accuracy (Gini): 0.96
Confusion Matrix (Gini):
 [[48  0  0]
 [ 3 13  0]
 [ 1  0 35]]
Classification Report (Gini):
               precision    recall  f1-score   support

           0       0.92      1.00      0.96        48
           1       1.00      0.81      0.90        16
           2       1.00      0.97      0.99        36

    accuracy                           0.96       100
   macro avg       0.97      0.93      0.95       100
weighted avg       0.96      0.96      0.96       100



In [190]:
from sklearn.preprocessing import LabelEncoder as label_encoder

def user_input_prediction():
    print("Enter the following feature values:")
    try:
        bill_length_mm = float(input("Bill Length (mm): "))
        bill_depth_mm = float(input("Bill Depth (mm): "))
        flipper_length_mm = float(input("Flipper Length (mm): "))
        body_mass_g = float(input("Body Mass (g): "))
        sex = input("Sex (Male/Female): ").strip().lower()
        island = input("Island (Dream/Biscoe/Torgersen): ").strip().lower()
        
        # Encode the inputs
        sex_Male = 1 if sex == 'male' else 0
        island_Dream = 1 if island == 'dream' else 0
        island_Torgersen = 1 if island == 'torgersen' else 0
        
        # Create input DataFrame
        input_data = pd.DataFrame({
            'bill_length_mm': [bill_length_mm],
            'bill_depth_mm': [bill_depth_mm],
            'flipper_length_mm': [flipper_length_mm],
            'body_mass_g': [body_mass_g],
            'sex_Male': [sex_Male],
            'island_Dream': [island_Dream],
            'island_Torgersen': [island_Torgersen]
        })
        
        # Predict the species
        prediction = classifier_gini.predict(input_data)
        print(prediction)
        
        print(f"\nPredicted Penguin Species: {results[prediction]}")
    except Exception as e:
        print(f"Error: {e}")

In [191]:
user_input_prediction()

Enter the following feature values:
[0]
Error: only integer scalar arrays can be converted to a scalar index


In [192]:
X_test.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex_Male,island_Dream,island_Torgersen
62,37.6,17.0,185.0,3600.0,False,False,False
60,35.7,16.9,185.0,3150.0,False,False,False
283,54.3,15.7,231.0,5650.0,True,False,False
107,38.2,20.0,190.0,3900.0,True,False,False
65,41.6,18.0,192.0,3950.0,True,False,False
