1. Load Data (sns.load_dataset("titanic"))
2. Impute Missing Values:

    Fill age with median.
    Fill embarked with mode.
3. Categorical Encoding:

    One-hot encode sex, embarked.
4. Feature Splitting:

    Split name into first_name and title using string operations.
5. Discretization:

    Bin age into categories (e.g., child, adult, senior).
6. Scaling:

    Use MinMaxScaler on fare.
7. Handling Outliers:

    Clip extreme fare values above the 95th percentile.
8. Variable Transformation:

    Apply log transformation to fare for skewness reduction.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# 1. Load data.
df = sns.load_dataset('titanic')

# 2. Impute missing values: median for numeric age, mode for categorical
# embarked. The filled column is assigned back rather than using
# df[col].fillna(..., inplace=True): that chained in-place form is deprecated
# in pandas and has no effect on df under copy-on-write.
df['age'] = df['age'].fillna(df['age'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# 3. Categorical encoding: one-hot encode sex and embarked, dropping the first
# level of each to avoid redundant columns.
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)

# 4. Feature splitting: derive title and first_name from "Last, Title. First ..."
# using plain string operations. Guarded because the step only makes sense when
# a 'name' column exists; the earlier implementation was left commented out,
# presumably because this copy of the dataset has no such column.
if 'name' in df.columns:
    after_comma = df['name'].str.split(',', n=1).str[1]
    title_and_rest = after_comma.str.split('.', n=1)
    df['title'] = title_and_rest.str[0].str.strip().fillna('Unknown')
    df['first_name'] = title_and_rest.str[1].str.split().str[0].fillna('Unknown')

# 5. Discretization: child < 18 <= adult < 60 <= senior.
# The lowest edge must be 0; starting the bins at 18 left every passenger under
# 18 without a category and shifted the labels onto the wrong age ranges.
labels = ['child', 'adult', 'senior']
bins = [0, 18, 60, np.inf]
df['age_category'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# 6. Scaling: min-max scale the raw fare into [0, 1].
scaler = MinMaxScaler()
df['fare_scaled'] = scaler.fit_transform(df[['fare']]).ravel()

# 7. Handling outliers: cap fares at the 95th percentile.
fare_95 = df['fare'].quantile(0.95)
df['fare_clipped'] = df['fare'].clip(upper=fare_95)

# 8. Variable transformation: log-transform fare to reduce skew.
# log1p (log(1 + x)) is used instead of log so a fare of 0 stays finite.
df['fare_log'] = np.log1p(df['fare'])

df.head()

Part 1: Feature Engineering Task 

Dataset: Use Titanic or any dataset with mixed types.

Tasks:
	1.	Impute missing values (numeric and categorical separately).

	2.	Apply one categorical encoding technique.

	3.	Discretize a numeric column into bins.
	
	4.	Split a datetime feature into day/month/year.

Part 2: Feature Selection Task 

Dataset: Use the output from Part 1.

Tasks:
	
	1.	Apply a Filter method (e.g., SelectKBest).
	
	2.	Apply a Wrapper method (e.g., RFE with Logistic Regression).
	
	3.	Apply an Embedded method (e.g., feature importance using RandomForest).
	
	4.	Briefly compare the selected features.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns

# Load the dataset
df = sns.load_dataset('titanic')

# View initial info
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()


In [None]:
from sklearn.impute import SimpleImputer

# Column names are lowercase ('age', 'embarked'), matching how this same
# dataset is indexed elsewhere in the notebook; the capitalised names raised
# KeyError.

# Numeric imputation: replace missing ages with the median age.
num_imputer = SimpleImputer(strategy='median')
# fit_transform returns an (n, 1) array; ravel() flattens it to one column.
df['age'] = num_imputer.fit_transform(df[['age']]).ravel()

# Categorical imputation: replace missing ports with the most frequent one.
cat_imputer = SimpleImputer(strategy='most_frequent')
df['embarked'] = cat_imputer.fit_transform(df[['embarked']]).ravel()



In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# The columns are lowercase in this dataset; 'Sex'/'Embarked' raised KeyError.
df = pd.get_dummies(df, columns=['sex', 'embarked'], drop_first=True)


In [None]:
# Discretize fare into quartiles; labels=False stores the integer bin index.
# The source column is lowercase 'fare' ('Fare' raised KeyError); the output
# column keeps its original name.
df['Fare_bin'] = pd.qcut(df['fare'], q=4, labels=False)


In [None]:
import numpy as np

# Create a fake datetime feature (random boarding dates).
# A seeded generator keeps the synthetic dates, and therefore every feature
# selection result computed from them, identical from run to run.
rng = np.random.default_rng(42)
day_offsets = rng.integers(0, 30, size=df.shape[0])  # 0..29 days after the base date
df['Boarded_date'] = pd.to_datetime('1912-04-01') + pd.to_timedelta(day_offsets, unit='D')

# Extract day/month/year.
# NOTE: with offsets of 0..29 days from 1912-04-01, month and year are the same
# for every row; only the day varies.
df['Boarded_day'] = df['Boarded_date'].dt.day
df['Boarded_month'] = df['Boarded_date'].dt.month
df['Boarded_year'] = df['Boarded_date'].dt.year

# Drop the fake datetime now that its parts have been extracted.
df = df.drop(columns='Boarded_date')


In [None]:
# Show the first rows of the engineered frame as plain text.
preview = df.head()
print(preview)


In [None]:

from sklearn.feature_selection import SelectKBest, chi2

# Build the feature matrix and target used by the feature-selection cells;
# X and y were referenced here without ever being defined.
# NOTE(review): assumes the target column of this dataset is 'survived' - confirm.
target_column = 'survived'
y = df[target_column]
# Keep only numeric and boolean (dummy) columns so every estimator can consume
# X, and cast to float so the matrix has a single dtype.
X = (
    df.drop(columns=[target_column])
    .select_dtypes(include=['number', 'bool'])
    .astype(float)
)
# chi2 rejects NaN; this fill is a no-op when imputation has already been done.
X = X.fillna(X.median())

# Select top 5 features.
# chi2 also requires non-negative inputs - presumably satisfied by the counts,
# fares, ages and dummy columns here; verify if new features are added.
selector = SelectKBest(score_func=chi2, k=5)
X_new_filter = selector.fit_transform(X, y)

# Selected feature names
selected_filter_features = X.columns[selector.get_support()]
print("Filter method selected:", list(selected_filter_features))


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Wrapper method: recursive feature elimination around logistic regression,
# keeping five features. X and y must already exist from an earlier cell.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=5).fit(X, y)

# get_support() returns the same boolean mask that is stored in rfe.support_.
kept_mask = rfe.get_support()
selected_wrapper_features = X.columns[kept_mask]
print("Wrapper method selected:", list(selected_wrapper_features))


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Embedded method: rank features by random-forest importance.
# A fixed random_state makes the forest, and so the top-5 ranking, reproducible;
# without it the selected set could change between runs.
# X and y must already exist from an earlier cell.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Feature importance ranking, highest first; keep the five strongest.
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)
selected_embedded_features = feat_imp.head(5).index.tolist()
print("Embedded method selected:", selected_embedded_features)


📘 Part 1: Data Preprocessing and Feature Engineering

Dataset: Use any dataset with mixed data types (e.g., Titanic, House Prices, etc.)

Tasks:

Identify and handle missing values separately for numerical and categorical features.

Apply an appropriate encoding strategy to convert categorical variables to numerical ones.

Normalize or standardize at least one numerical feature.

Create new date-based features (like weekday, quarter, etc.) from an existing datetime column.

📘 Part 2: Feature Selection Techniques

Dataset: Use the preprocessed output from Part 1.

Tasks:

Apply a Filter-based feature selection method (e.g., f_classif or mutual_info_classif).

Apply a Wrapper-based method using RFE with a Decision Tree classifier.

Apply an Embedded method using Lasso or any regularization technique.

Compare the selected features across the three methods and briefly discuss any differences.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

# List the example datasets that seaborn can load by name.
available_datasets = sns.get_dataset_names()
print(available_datasets)

In [None]:
# Load the penguins example dataset into df.
df = sns.load_dataset(name='penguins')

In [None]:
# Display the first five raw rows.
df.head(n=5)

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Discard rows with no recorded sex, then one-hot encode sex, dropping the
# first level.
df = pd.get_dummies(
    df.dropna(subset=['sex']),
    columns=['sex'],
    drop_first=True,
)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the species names as integer class labels.
le = LabelEncoder()

# LabelEncoder expects a 1-D array of labels. Passing the 2-D frame
# df[['species']] made scikit-learn emit a DataConversionWarning and flatten it
# implicitly, so the Series is passed directly.
df['species_encoded'] = le.fit_transform(df['species'])

In [None]:
# Display the first five rows after encoding.
df.head(n=5)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize body mass (zero mean, unit variance) into a new column.
scaler = StandardScaler()

body_mass = df[['body_mass_g']]
df['scaled_body_mass'] = scaler.fit(body_mass).transform(body_mass)



In [None]:
# Print the standardized body-mass column.
scaled_mass = df['scaled_body_mass']
print(scaled_mass)

In [None]:
# Display the first five rows after scaling.
df.head(n=5)

In [None]:
# Target: the species label.
y = df['species']

# Features: numeric and boolean (dummy) columns, cast to float.
# 'species_encoded' is the target itself in integer form, so it is removed from
# X; leaving it in leaks the answer into every selector. Boolean columns are
# included explicitly because include='number' alone skips them.
# NOTE: body_mass_g and scaled_body_mass carry the same information at
# different scales, so selectors may pick either one.
X = (
    df.select_dtypes(include=['number', 'bool'])
    .drop(columns=['species_encoded'], errors='ignore')
    .astype(float)
)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Filter method: score each feature with the ANOVA F-test and keep the top five.
filter_selector = SelectKBest(score_func=f_classif, k=5).fit(X, y)

# Boolean mask of the retained columns, mapped back to their names.
kept_mask = filter_selector.get_support()
filter_selected = X.columns[kept_mask]

print("🔹 Filter-based selected features:", filter_selected.tolist())

In [None]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Wrapper method: recursive feature elimination driven by a decision tree,
# keeping five features.
# A fixed random_state makes the tree, and so the elimination order,
# reproducible between runs.
model = DecisionTreeClassifier(random_state=42)
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X, y)

# rfe.support_ is the boolean mask of retained columns.
wrapper_selected = X.columns[rfe.support_]

print(list(wrapper_selected))

In [None]:
from sklearn.linear_model import LassoCV

# Embedded method: L1-regularised regression with cross-validated alpha.
# X_train / y_train were never defined, so the model is fitted on the existing
# feature matrix X. Lasso needs a numeric target, so the integer-encoded
# species is used; the encoded target is dropped from the features in case X
# still contains it.
# NOTE(review): this treats class ids as a regression target and the features
# are on different scales - consider standardizing, or an L1-penalised
# classifier, if the ranking matters.
lasso_features = X.drop(columns=['species_encoded'], errors='ignore')
lasso_target = df['species_encoded']

lasso_cv = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5)
lasso_cv.fit(lasso_features, lasso_target)

print("Best alpha:", lasso_cv.alpha_)

# Features whose coefficient was not shrunk to exactly zero are the ones kept.
embedded_selected = lasso_features.columns[lasso_cv.coef_ != 0]
print("Embedded method selected:", list(embedded_selected))
