In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Allow up to 500 rows when a frame is rendered, then load the penguins data set
pd.options.display.max_rows = 500
penguin_df = pd.read_csv('penguins.csv')
penguin_df.head(100)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,2007


In [3]:
# Show only the rows whose species is Gentoo
penguin_df.loc[penguin_df['species'].eq('Gentoo')]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
152,Gentoo,Biscoe,46.1,13.2,211.0,4500.0,female,2007
153,Gentoo,Biscoe,50.0,16.3,230.0,5700.0,male,2007
154,Gentoo,Biscoe,48.7,14.1,210.0,4450.0,female,2007
155,Gentoo,Biscoe,50.0,15.2,218.0,5700.0,male,2007
156,Gentoo,Biscoe,47.6,14.5,215.0,5400.0,male,2007
157,Gentoo,Biscoe,46.5,13.5,210.0,4550.0,female,2007
158,Gentoo,Biscoe,45.4,14.6,211.0,4800.0,female,2007
159,Gentoo,Biscoe,46.7,15.3,219.0,5200.0,male,2007
160,Gentoo,Biscoe,43.3,13.4,209.0,4400.0,female,2007
161,Gentoo,Biscoe,46.8,15.4,215.0,5150.0,male,2007


In [4]:
# Frequency of each value in the categorical columns.
# The loop variables are deliberately not called `x`/`y`: `y` is the name used
# for the target vector further down, and re-running this cell after that
# point would silently overwrite it with a value_counts() Series.
for column_name in ['species', 'island', 'sex', 'year']:
    category_counts = penguin_df[column_name].value_counts()
    print(category_counts)

species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64
island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64
sex
male      168
female    165
Name: count, dtype: int64
year
2009    120
2008    114
2007    110
Name: count, dtype: int64


In [5]:
# Count the missing values in each column
penguin_df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [6]:
# Impute missing values: each numeric measurement gets its column mean,
# and the sex column gets its most frequent value (the mode).
for measurement in ('bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'):
    column_mean = penguin_df[measurement].mean()
    penguin_df[measurement] = penguin_df[measurement].fillna(column_mean)

most_common_sex = penguin_df['sex'].mode()[0]
penguin_df['sex'] = penguin_df['sex'].fillna(most_common_sex)

In [7]:
# Re-count the missing values per column after imputation
penguin_df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

In [8]:
# Display the data frame after the missing values have been filled in
penguin_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,male,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,male,2007
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,male,2007


In [9]:
# Feature matrix: every column except the target (species) and the survey year
X = penguin_df.drop(columns=['species', 'year'])
X

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,male
1,Torgersen,39.5,17.4,186.0,3800.0,female
2,Torgersen,40.3,18.0,195.0,3250.0,female
3,Torgersen,43.92193,17.15117,200.915205,4201.754386,male
4,Torgersen,36.7,19.3,193.0,3450.0,female
5,Torgersen,39.3,20.6,190.0,3650.0,male
6,Torgersen,38.9,17.8,181.0,3625.0,female
7,Torgersen,39.2,19.6,195.0,4675.0,male
8,Torgersen,34.1,18.1,193.0,3475.0,male
9,Torgersen,42.0,20.2,190.0,4250.0,male


In [10]:
# Encode the categorical feature columns as integers.
# Each column gets its own LabelEncoder so that both fitted mappings stay
# available afterwards (e.g. to encode new inputs the same way, or to call
# inverse_transform). Re-fitting one shared encoder would discard the island
# mapping as soon as the sex column is fitted.
island_encoder = LabelEncoder()
sex_encoder = LabelEncoder()
X['island'] = island_encoder.fit_transform(X['island'])
X['sex'] = sex_encoder.fit_transform(X['sex'])

# Backward-compatible alias: `Le` previously ended up fitted on the sex column.
Le = sex_encoder

In [11]:
# Inspect the features after the categorical columns have been label-encoded
X

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,2,39.1,18.7,181.0,3750.0,1
1,2,39.5,17.4,186.0,3800.0,0
2,2,40.3,18.0,195.0,3250.0,0
3,2,43.92193,17.15117,200.915205,4201.754386,1
4,2,36.7,19.3,193.0,3450.0,0
5,2,39.3,20.6,190.0,3650.0,1
6,2,38.9,17.8,181.0,3625.0,0
7,2,39.2,19.6,195.0,4675.0,1
8,2,34.1,18.1,193.0,3475.0,1
9,2,42.0,20.2,190.0,4250.0,1


In [12]:
# Standardise the four numeric measurement columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split made later in this notebook, so test rows influence the scaling —
# consider fitting on the training rows only.
numeric_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
Sc = StandardScaler()
X[numeric_cols] = Sc.fit_transform(X[numeric_cols])

In [13]:
# Preview only: the cast drops the fractional part of the standardised values,
# and the result is not assigned, so X itself keeps its float columns.
X.astype(int)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,2,0,0,-1,0,1
1,2,0,0,-1,0,0
2,2,0,0,0,-1,0
3,2,0,0,0,0,1
4,2,-1,1,0,0,0
5,2,0,1,0,0,1
6,2,0,0,-1,0,0
7,2,0,1,0,0,1
8,2,-1,0,0,0,1
9,2,0,1,0,0,1


In [14]:
# Target vector: the species label of each row
y = penguin_df['species']

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((275, 6), (69, 6), (275,), (69,))

### Modelling Using Logistic Regression

In [16]:
# Fit a logistic regression classifier (default settings) on the training split
lg = LogisticRegression().fit(Xtrain, ytrain)

In [17]:
# Persist the fitted logistic regression model so the Streamlit app can load it.
# NOTE(review): only the classifier is written; the fitted scaler (Sc) and the
# label encoders are not saved here — confirm the app applies the same
# preprocessing before calling predict.
joblib.dump(lg, 'trained_penguinlg.joblib')

['trained_penguinlg.joblib']

In [18]:
# Predict the species for the held-out test rows and show the predictions
ypred = lg.predict(Xtest)
ypred

array(['Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Chinstrap', 'Chinstrap',
       'Chinstrap', 'Gentoo', 'Adelie', 'Gentoo', 'Chinstrap', 'Adelie',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Adelie', 'Gentoo',
       'Chinstrap', 'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Adelie',
       'Gentoo', 'Gentoo', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Adelie',
       'Chinstrap', 'Gentoo', 'Adelie', 'Gentoo', 'Chinstrap',
       'Chinstrap', 'Chinstrap', 'Adelie', 'Adelie', 'Adelie', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Chinstrap',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo',
       'Gentoo', 'Gentoo', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap',
       'Gentoo', 'Adelie', 'Adelie', 'Adelie'], dtype=object)

In [19]:
# Fraction of test rows the logistic regression model labels correctly
accuracy_score(y_true=ytest, y_pred=ypred)

1.0

In [20]:
# Per-class precision, recall and F1 for the logistic regression model
report_lg = classification_report(y_true=ytest, y_pred=ypred)
print(report_lg)

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        32
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       1.00      1.00      1.00        24

    accuracy                           1.00        69
   macro avg       1.00      1.00      1.00        69
weighted avg       1.00      1.00      1.00        69



In [21]:
# Confusion matrix for the logistic regression model (rows: true, columns: predicted)
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[32,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 24]], dtype=int64)

### Modelling Using GradientBoostingClassifier

In [22]:
# Fit a gradient-boosted tree ensemble on the same training split.
# random_state is pinned (to the same seed used for train_test_split) so that
# a Restart & Run All reproduces the same fitted model and metrics; without it
# the estimator draws a fresh random state on every run.
gbc = GradientBoostingClassifier(random_state=42)
gbc = gbc.fit(Xtrain, ytrain)

In [23]:
# Predict the species for the held-out test rows with the gradient boosting model
ypred_gbc = gbc.predict(Xtest)

In [24]:
# Fraction of test rows the gradient boosting model labels correctly
accuracy_score(y_true=ytest, y_pred=ypred_gbc)

0.9855072463768116

In [25]:
# Per-class precision, recall and F1 for the gradient boosting model
report_gbc = classification_report(y_true=ytest, y_pred=ypred_gbc)
print(report_gbc)

              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        32
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       0.96      1.00      0.98        24

    accuracy                           0.99        69
   macro avg       0.99      0.99      0.99        69
weighted avg       0.99      0.99      0.99        69



In [26]:
# Confusion matrix for the gradient boosting model (rows: true, columns: predicted)
confusion_matrix(y_true=ytest, y_pred=ypred_gbc)

array([[31,  0,  1],
       [ 0, 13,  0],
       [ 0,  0, 24]], dtype=int64)