# Import all necessary libreries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif

# Load the dataset

In [14]:
data = pd.read_csv('data\penguins_size.csv')
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# FEATURES ENGINEERING
* # Imputation

1. This involved detecting of missing value
        .The best way to correct missing value is by imputation 
        For the numerical, mean should be imputed to replace the missing value
        For the categorical, MODE that is the common frequent value should be used



2. For an invalid value
        .This can be drop or replace. E.g the sex column have a (.) value which doesn't represent the real value of either male or female

In [15]:
# detect missing value using pandas function .isnull
print(data.isnull().sum())

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64


##### In our dataset only species and island have no missing value. Other have missing value of 2 and sex is 10

* mode is used to replace the missing value in sex
* mean for other

In [16]:
#imputation 

data['culmen_length_mm'].fillna((data['culmen_length_mm'].mean()), inplace=True)
data['culmen_depth_mm'].fillna((data['culmen_depth_mm'].mean()), inplace=True)
data['flipper_length_mm'].fillna((data['flipper_length_mm'].mean()), inplace=True)
data['body_mass_g'].fillna((data['body_mass_g'].mean()), inplace=True)

data['sex'].fillna((data['sex'].value_counts().index[0]), inplace=True)



In [17]:
data.reset_index()
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


### For an invalid value

For example, we know that for the ‘sex‘ feature we can have two values: FEMALE and MALE. We can check if we have values other than this:

In [18]:
data.loc[(data['sex'] != 'FEMALE') & (data['sex'] != 'MALE')]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [19]:
# this row can be drop
data = data.drop([336])
data.reset_index()

Unnamed: 0,index,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,MALE
1,1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,FEMALE
2,2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,FEMALE
3,3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,MALE
4,4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,FEMALE
...,...,...,...,...,...,...,...,...
338,339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,MALE
339,340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,FEMALE
340,341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,MALE
341,342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,FEMALE


In [20]:
data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,MALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


# Categorical Encoding

Since categorical variable are discrete values, encoding have to be done 

There are several ways of encoding the dataset

1. Label Encoding
2. One-Hot Encoding
3. Count Encoding
4. Target Encoding
5. Leave One Out Target Encoding

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 343 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            343 non-null    object 
 1   island             343 non-null    object 
 2   culmen_length_mm   343 non-null    float64
 3   culmen_depth_mm    343 non-null    float64
 4   flipper_length_mm  343 non-null    float64
 5   body_mass_g        343 non-null    float64
 6   sex                343 non-null    object 
dtypes: float64(4), object(3)
memory usage: 21.4+ KB


In [23]:
# let’s extract those variables from our dataset into a separate 
# variable and mark them as categorical type

data['species'] = data['species'].astype('category')
data['island'] = data['island'].astype('category')
data['sex'] = data['sex'].astype('category')

data.dtypes

species              category
island               category
culmen_length_mm      float64
culmen_depth_mm       float64
flipper_length_mm     float64
body_mass_g           float64
sex                  category
dtype: object

In [24]:
categorical_data = data.drop(['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'], axis=1)

categorical_data.head()

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,MALE
1,Adelie,Torgersen,FEMALE
2,Adelie,Torgersen,FEMALE
3,Adelie,Torgersen,MALE
4,Adelie,Torgersen,FEMALE


#### 1. Label Encoding

Label encoding is converting each categorical value into some number. For example, the ‘species‘ feature contains 3 categories. We can assign value 0 to Adelie, 1 to Gentoo and 2 to Chinstrap.

In [25]:
categorical_data["species_cat"] = categorical_data["species"].cat.codes
categorical_data["island_cat"] = categorical_data["island"].cat.codes
categorical_data["sex_cat"] = categorical_data["sex"].cat.codes

categorical_data.head()

Unnamed: 0,species,island,sex,species_cat,island_cat,sex_cat
0,Adelie,Torgersen,MALE,0,2,1
1,Adelie,Torgersen,FEMALE,0,2,0
2,Adelie,Torgersen,FEMALE,0,2,0
3,Adelie,Torgersen,MALE,0,2,1
4,Adelie,Torgersen,FEMALE,0,2,0


As you can see, we added three new features each containing encoded categorical features. From the first five instances, we can see that species category Adelie is encoded with value 0, island category Torgensesn is encoded with value 2 and sex categories FEMALE and MALE are encoded with values 0 and 1 respectively.

#### 2. One-Hot Encoding

his is one of the most popular categorical encoding techniques. It spreads the values in a feature to multiple flag features and assigns values 0 or 1 to them. This binary value represents the relationship between non-encoded and encoded features.

In [29]:
encoded_spicies = pd.get_dummies(categorical_data['species'])
encoded_island = pd.get_dummies(categorical_data['island'])
encoded_sex = pd.get_dummies(categorical_data['sex'])


categorical_data = categorical_data.join(encoded_spicies)
categorical_data = categorical_data.join(encoded_island)
categorical_data = categorical_data.join(encoded_sex)

In [30]:
categorical_data

Unnamed: 0,species,island,sex,species_cat,island_cat,sex_cat,Adelie,Chinstrap,Gentoo,Biscoe,Dream,Torgersen,FEMALE,MALE
0,Adelie,Torgersen,MALE,0,2,1,1,0,0,0,0,1,0,1
1,Adelie,Torgersen,FEMALE,0,2,0,1,0,0,0,0,1,1,0
2,Adelie,Torgersen,FEMALE,0,2,0,1,0,0,0,0,1,1,0
3,Adelie,Torgersen,MALE,0,2,1,1,0,0,0,0,1,0,1
4,Adelie,Torgersen,FEMALE,0,2,0,1,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,MALE,2,0,1,0,0,1,1,0,0,0,1
340,Gentoo,Biscoe,FEMALE,2,0,0,0,0,1,1,0,0,1,0
341,Gentoo,Biscoe,MALE,2,0,1,0,0,1,1,0,0,0,1
342,Gentoo,Biscoe,FEMALE,2,0,0,0,0,1,1,0,0,1,0


#### 3. Count Encoding

Count encoding is converting each categorical value to its frequency, ie. the number of times it appears in the dataset. For example, if the ‘species‘ feature contains 6 occurrences of class Adelie we will replace every Adelie value with the number 6.

In [31]:
categorical_data = data.drop(['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g'], axis=1)

species_count = categorical_data['species'].value_counts()
island_count = categorical_data['island'].value_counts()
sex_count = categorical_data['sex'].value_counts()

categorical_data['species_count_enc'] = categorical_data['species'].map(species_count)
categorical_data['island_count_enc'] = categorical_data['island'].map(island_count)
categorical_data['sex_count_enc'] = categorical_data['sex'].map(sex_count)

categorical_data

Unnamed: 0,species,island,sex,species_count_enc,island_count_enc,sex_count_enc
0,Adelie,Torgersen,MALE,152,52,178
1,Adelie,Torgersen,FEMALE,152,52,165
2,Adelie,Torgersen,FEMALE,152,52,165
3,Adelie,Torgersen,MALE,152,52,178
4,Adelie,Torgersen,FEMALE,152,52,165
...,...,...,...,...,...,...
339,Gentoo,Biscoe,MALE,123,167,178
340,Gentoo,Biscoe,FEMALE,123,167,165
341,Gentoo,Biscoe,MALE,123,167,178
342,Gentoo,Biscoe,FEMALE,123,167,165
