# Penguins dataset

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
filename = 'penguins.csv'
df = pd.read_csv(filename)

### EDA - Exploratory Data Analysis

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 24.3+ KB


### Data Cleaning

In [54]:
df.drop(['rowid', 'year'], axis = 1, inplace=True)

In [55]:
df.dropna(inplace=True)

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


### Target and Features columns

In [57]:
target_name = 'sex'

feature_names = list(df.columns)
feature_names.remove(target_name)

feature_names

['species',
 'island',
 'bill_length_mm',
 'bill_depth_mm',
 'flipper_length_mm',
 'body_mass_g']

In [58]:
df_target = df[target_name].copy()
df_features = df[feature_names].copy()

In [59]:
df_features

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0
...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0


In [60]:
df_target

0        male
1      female
2      female
4      female
5        male
        ...  
339      male
340    female
341      male
342      male
343    female
Name: sex, Length: 333, dtype: object

### One-hot encoding

In [71]:
df_features = pd.get_dummies(df_features, drop_first=True)
df_features

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Chinstrap,species_Gentoo,island_Dream,island_Torgersen
0,0.254545,0.666667,0.152542,0.291667,0.0,0.0,0.0,1.0
1,0.269091,0.511905,0.237288,0.305556,0.0,0.0,0.0,1.0
2,0.298182,0.583333,0.389831,0.152778,0.0,0.0,0.0,1.0
4,0.167273,0.738095,0.355932,0.208333,0.0,0.0,0.0,1.0
5,0.261818,0.892857,0.305085,0.263889,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
339,0.861818,0.797619,0.593220,0.361111,1.0,0.0,1.0,0.0
340,0.414545,0.595238,0.508475,0.194444,1.0,0.0,1.0,0.0
341,0.636364,0.607143,0.355932,0.298611,1.0,0.0,1.0,0.0
342,0.680000,0.702381,0.644068,0.388889,1.0,0.0,1.0,0.0


### Scaling numeric columns

In [72]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df_features)
df_features[df_features.columns] = scaler.transform(df_features)

In [73]:
df_features

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Chinstrap,species_Gentoo,island_Dream,island_Torgersen
0,0.254545,0.666667,0.152542,0.291667,0.0,0.0,0.0,1.0
1,0.269091,0.511905,0.237288,0.305556,0.0,0.0,0.0,1.0
2,0.298182,0.583333,0.389831,0.152778,0.0,0.0,0.0,1.0
4,0.167273,0.738095,0.355932,0.208333,0.0,0.0,0.0,1.0
5,0.261818,0.892857,0.305085,0.263889,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...
339,0.861818,0.797619,0.593220,0.361111,1.0,0.0,1.0,0.0
340,0.414545,0.595238,0.508475,0.194444,1.0,0.0,1.0,0.0
341,0.636364,0.607143,0.355932,0.298611,1.0,0.0,1.0,0.0
342,0.680000,0.702381,0.644068,0.388889,1.0,0.0,1.0,0.0


### Split in Training and Test dataset

In [74]:
from sklearn.model_selection import train_test_split
df_train_features, df_test_features, df_train_target, df_test_target = \
    train_test_split(df_features, df_target, test_size = 0.3)

In [75]:
print( df_train_features.shape )
print( df_train_target.shape )
print( df_test_features.shape )
print( df_test_target.shape )

(233, 8)
(233,)
(100, 8)
(100,)


In [77]:
df_test = pd.concat([df_test_features, df_test_target], axis = 1)

### Classification

In [93]:
def calc_accuracy(bool_series):
    n_true, n_false = bool_series.value_counts()
    accuracy = (n_true / (n_true + n_false)) * 100
    return round(accuracy)

In [95]:
method = '3NN'

from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(df_train_features, df_train_target)

df_test[method + '_predicted'] = classifier.predict(df_test_features)
df_test[method + '_correct'] = df_test[method + '_predicted'] == df_test[target_name]

accuracy = calc_accuracy(df_test[method + '_correct'])
print('accuracy ', method, accuracy)

accuracy  3NN 88


In [102]:
method = 'RF'

from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(df_train_features, df_train_target)

df_test[method + '_predicted'] = classifier.predict(df_test_features)
df_test[method + '_correct'] = df_test[method + '_predicted'] == df_test[target_name]

accuracy = calc_accuracy(df_test[method + '_correct'])
print(f'Accuracy for {method}: {accuracy}%')

Accuracy for RF: 94%


In [None]:
method = 'NB'

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(df_train_features, df_train_target)

df_test[method + '_predicted'] = classifier.predict(df_test_features)
df_test[method + '_correct'] = df_test[method + '_predicted'] == df_test[target_name]

accuracy = calc_accuracy(df_test[method + '_correct'])
print(f'Accuracy for {method}: {accuracy}%')

In [100]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

classifier_methods = {
    'NB': GaussianNB,
    '3NN': KNeighborsClassifier
}

for method, classifier in classifier_methods.items():

    classifier.fit(df_train_features, df_train_target)

    df_test[method + '_predicted'] = classifier.predict(df_test_features)
    df_test[method + '_correct'] = df_test[method + '_predicted'] == df_test[target_name]

    accuracy = calc_accuracy(df_test[method + '_correct'])
    print(f'Accuracy for {method}: {accuracy}%')


TypeError: GaussianNB.fit() missing 1 required positional argument: 'y'