### Importing libraries

In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [39]:
# Load the penguin measurements from a relative CSV path and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="penguins.csv")
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181,3750,Male
1,Adelie,Torgersen,39.5,17.4,186,3800,Female
2,Adelie,Torgersen,40.3,18.0,195,3250,Female
3,Adelie,Torgersen,36.7,19.3,193,3450,Female
4,Adelie,Torgersen,39.3,20.6,190,3650,Male


In [40]:
# (rows, columns) of the loaded frame.
df.shape

(333, 7)

In [41]:
# Per-column dtype and non-null count, to spot missing values and non-numeric columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    int64  
 5   body_mass_g        333 non-null    int64  
 6   sex                333 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 18.3+ KB


In [42]:
# Number of missing values in each column.
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

## Feature Engineering

### One-Hot Encoding: Transforming categorical data into numeric

In [43]:
# Distinct categories in the `sex` column.
df['sex'].unique()

array(['Male', 'Female'], dtype=object)

In [44]:
# Preview the full one-hot encoding of `sex`: one indicator column per category.
pd.get_dummies(data=df['sex']).head(n=5)


Unnamed: 0,Female,Male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True


In [45]:
# Keep a single indicator column for `sex`; drop_first removes the first category's column.
sex = pd.get_dummies(data=df['sex'], drop_first=True)
sex.head(n=5)

Unnamed: 0,Male
0,True
1,False
2,False
3,False
4,True


We dropped the first dummy column because a single column is sufficient to determine the sex of each penguin: it is either Male (True) or not (False), which means Female.

### Applying One Hot Encoding to island

In [48]:
# Distinct categories in the `island` column.
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [49]:
# Preview the full one-hot encoding of `island`: one indicator column per island.
pd.get_dummies(data=df['island']).head(n=5)

Unnamed: 0,Biscoe,Dream,Torgersen
0,False,False,True
1,False,False,True
2,False,False,True
3,False,False,True
4,False,False,True


In [51]:
# One-hot encode `island`, dropping the first category's column so the rest stay non-redundant.
island = pd.get_dummies(data=df['island'], drop_first=True)
island.head(n=5)

Unnamed: 0,Dream,Torgersen
0,False,True
1,False,True
2,False,True
3,False,True
4,False,True


### Concatenate the above dataframe to the original df

In [57]:
# Append the island and sex indicator columns to the original frame, side by side.
new_data = pd.concat(objs=[df, island, sex], axis='columns')

In [58]:
# Preview the combined frame: original columns followed by the new indicator columns.
new_data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,Male
0,Adelie,Torgersen,39.1,18.7,181,3750,Male,False,True,True
1,Adelie,Torgersen,39.5,17.4,186,3800,Female,False,True,False
2,Adelie,Torgersen,40.3,18.0,195,3250,Female,False,True,False
3,Adelie,Torgersen,36.7,19.3,193,3450,Female,False,True,False
4,Adelie,Torgersen,39.3,20.6,190,3650,Male,False,True,True


### Drop the repeated columns

In [None]:
# Drop the original text columns now that their one-hot encoded versions are present.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError on missing columns.
new_data = new_data.drop(columns=['sex', 'island'], errors='ignore')

In [64]:
# Confirm that the `sex` and `island` text columns are gone.
new_data.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,Adelie,39.1,18.7,181,3750,False,True,True
1,Adelie,39.5,17.4,186,3800,False,True,False
2,Adelie,40.3,18.0,195,3250,False,True,False
3,Adelie,36.7,19.3,193,3450,False,True,False
4,Adelie,39.3,20.6,190,3650,False,True,True


### Create separate target variable

In [66]:
# Target variable: the species label of each penguin.
# Read from the untouched `df` (same rows and index as `new_data`) so this cell still
# works when re-run after `species` has been dropped from `new_data` further down.
y = df['species']
y.head()

0    Adelie
1    Adelie
2    Adelie
3    Adelie
4    Adelie
Name: species, dtype: object

In [68]:
# Distinct labels present in the target.
y.unique()

array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object)

In [69]:
# Encode the species labels as integer class codes for the classifier.
# Mapping from df['species'] (rather than re-mapping `y` itself) keeps the cell idempotent:
# re-running `y = y.map(...)` on already-encoded values would silently turn every label into NaN.
species_codes = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}
y = df['species'].map(species_codes)
# Series.map yields NaN for labels missing from the dict; fail loudly instead of training on NaN.
assert y.notna().all(), "unmapped species label found"

In [71]:
# Preview the encoded target values.
y.head()

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64

### Dropping the Target variable : species

In [None]:
# Remove the target column so only features remain in `new_data`.
# Reassigning with errors='ignore' (instead of inplace=True) makes the cell re-runnable:
# executing it again is a no-op rather than a KeyError for the already-removed column.
new_data = new_data.drop(columns=['species'], errors='ignore')


In [74]:
# Preview the remaining feature columns.
new_data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,Male
0,39.1,18.7,181,3750,False,True,True
1,39.5,17.4,186,3800,False,True,False
2,40.3,18.0,195,3250,False,True,False
3,36.7,19.3,193,3450,False,True,False
4,39.3,20.6,190,3650,False,True,True


In [75]:
# Feature matrix. Note: X is bound to the same DataFrame object as new_data (no copy is made).
X = new_data

### Splitting the dataset into Training and Test Data

In [77]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [78]:
# Report the shape of each split, one "<name> <shape>" line per array.
print('\n'.join(
    f'{label} {part.shape}'
    for label, part in [
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    ]
))

X_train (233, 7)
X_test (100, 7)
y_train (233,)
y_test (100,)


### Training Random Forest Classification on Training set

In [128]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 7 trees split on entropy; random_state is fixed for repeatable results.
classifier = RandomForestClassifier(
    criterion='entropy',
    n_estimators=7,
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

### Predict the Test Results

In [129]:
# Predict an encoded species label for every row of the held-out test set.
y_pred = classifier.predict(X=X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 2, 1, 2, 2, 1, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0,
       1, 2, 1, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0,
       0, 0, 0, 0, 2, 0, 2, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0,
       2, 2, 2, 1, 2, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 2, 2, 1, 2, 2, 2, 0,
       0, 1, 1, 0, 2, 0, 1, 0, 2, 0, 2, 2], dtype=int64)

### Confusion Matrix

In [130]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [131]:
# Confusion matrix of true labels against predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[48  0  0]
 [ 0 18  0]
 [ 0  0 34]]


In [132]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

1.0


In [133]:
# Per-class precision, recall, F1 and support (classes are the encoded species codes).
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        48
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        34

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



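### Try with a different number of trees and the Gini criterion

Note: the cells in this section rebind `classifier` and `y_pred`, so run them top to bottom, in order; otherwise the predictions and accuracy shown may belong to the previous model.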

In [125]:
from sklearn.ensemble import RandomForestClassifier

# Second experiment: 5 trees split on Gini impurity instead of 7 trees split on entropy.
# NOTE: this rebinds `classifier`, so any cell run after this one evaluates this model.
classifier = RandomForestClassifier(
    criterion='gini',
    n_estimators=5,
    random_state=0,
)
classifier.fit(X=X_train, y=y_train)

In [134]:
# Predict with whichever model `classifier` currently refers to; this overwrites the earlier y_pred.
y_pred = classifier.predict(X=X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 2, 1, 2, 2, 1, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0,
       1, 2, 1, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0,
       0, 0, 0, 0, 2, 0, 2, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0,
       2, 2, 2, 1, 2, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 2, 2, 1, 2, 2, 2, 0,
       0, 1, 1, 0, 2, 0, 1, 0, 2, 0, 2, 2], dtype=int64)

In [127]:
# Accuracy of the most recently computed y_pred on the test set.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.99
