In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Step 1 — Load and Inspect Data

In [17]:
df=sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [19]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

Step 2 — Handle Missing Values

2.1 identify columns with missing values

In [20]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

2.2 Drop columns with >60% missing values

In [21]:
threshold=0.6*len(df)
df=df.dropna(axis=1,thresh=threshold)

2.3 impute numerix columns (median)

In [22]:
num_cols=df.select_dtypes(include=['float64','int64']).columns
df[num_cols]=df[num_cols].fillna(df[num_cols].median())

2.4 impute categorical columns (mode)

In [23]:
cat_cols=df.select_dtypes(include='object').columns
df[cat_cols]=df[cat_cols].fillna(df[cat_cols].mode().iloc[0])

verify

In [24]:
df.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

Step 3 — Handle Outliers (IQR Method)

In [25]:
def cap_outliers(col):
    Q1=col.quantile(0.25)
    Q3=col.quantile(0.75)
    IQR=Q3-Q1
    lower=Q1-1.5*IQR
    upper=Q3+1.5*IQR
    return np.clip(col,lower,upper)

for col in num_cols:
    df[col]=cap_outliers(df[col])

Step 4 — Feature Scaling

Use Standardization

In [26]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
df[num_cols]=scaler.fit_transform(df[num_cols])

Step 5 — Feature Engineering

5.1 Create new features

In [27]:
df['bill_ratio'] = df['bill_length_mm'] / df['bill_depth_mm']

df['mass_per_flipper'] = df['body_mass_g'] / df['flipper_length_mm']

In [28]:
df.replace([np.inf, -np.inf],np.nan,inplace=True)
df.fillna(df.median(),inplace=True)

TypeError: Cannot convert [['Adelie' 'Adelie' 'Adelie' ... 'Gentoo' 'Gentoo' 'Gentoo']
 ['Torgersen' 'Torgersen' 'Torgersen' ... 'Biscoe' 'Biscoe' 'Biscoe']
 ['Male' 'Female' 'Female' ... 'Male' 'Female' 'Male']] to numeric

5.2 Encode categorical variables (non-ordinal : one-hot )

In [15]:
df = pd.get_dummies(df,columns=cat_cols, drop_first=True)

Step 6 — Feature Selection

Separate features & target

In [None]:
X=df.drop('species_Gentoo',axis=1)
y=df['species_Gentoo']

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

selector=SelectKBest(score_func=f_classif,k="all")
selector.fit(X,y)

feature_scores=pd.DataFrame({
    'Feature':X.columns,
    "Score":selector.scores_
}).sort_values(by='Score',ascending=False)

feature_scores

Unnamed: 0,Feature,Score
2,flipper_length_mm,1015.066355
1,bill_depth_mm,709.37642
3,body_mass_g,676.564593
7,island_Dream,159.235465
0,bill_length_mm,108.65273
6,species_Chinstrap,55.151163
8,island_Torgersen,38.157807
4,bill_ratio,2.68166
5,mass_per_flipper,0.51812
9,sex_Male,0.109575


In [None]:
selected_features=feature_scores[
    feature_scores['Score']>feature_scores["Score"].mean()
    ]['Feature']

X_selected=X[selected_features]

Step 7 - verify Final Dataset