In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_iris
from ucimlrepo import fetch_ucirepo
from sklearn.datasets import load_breast_cancer


In [None]:
# Q1
# Target transformation (log transformation)
# Load the California housing dataset and apply a log transformation to the
# target variable (MedHouseVal), with the goal of bringing its distribution
# closer to normal. Both distributions are plotted for comparison.

housing_data = fetch_california_housing()
y = housing_data.target
y_log = np.log1p(y)  # log transformation: log(1 + y), element-wise

# One 50-bin histogram per series: the raw target first, then the transformed one.
for values, x_label, title in (
    (y, "MedHouseVal", "Original Distribution of MedHouseVal"),
    (y_log, "log MedHouseVal", "Log-Transformed Distribution of MedHouseVal"),
):
    plt.hist(values, bins=50)
    plt.xlabel(x_label)
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()


In [None]:
# Q2
# Categorical feature encoding (one-hot encoding)
# One-hot encode the 'embarked' feature of the Titanic dataset using
# pandas.get_dummies().

df = sns.load_dataset('titanic')

# Preview the raw categorical column before it is encoded.
embarked_preview = df['embarked'].head()
print(embarked_preview)

# Only 'embarked' is expanded into indicator columns; other columns pass through.
df_encoded = pd.get_dummies(data=df, columns=['embarked'])
encoded_preview = df_encoded.head()
print(encoded_preview)

In [None]:
# Q3
# Handling missing values (imputation)
# Inject missing values into one numerical column of the iris dataset, then
# fill them in with that column's median.

iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
target_col = "sepal length (cm)"


def _print_missing_counts(header, frame):
    """Print ``header`` followed by the per-column count of missing values."""
    print(header)
    print(frame.isna().sum())


# Blank out 10 distinct, randomly chosen rows (seeded so the choice is repeatable).
np.random.seed(42)
missing_indices = np.random.choice(df.index, size=10, replace=False)
df.loc[missing_indices, target_col] = np.nan

_print_missing_counts("Amount of missing values:", df)

# Median imputation, fitted on and applied to the affected column only.
imputer = SimpleImputer(strategy="median")
imputed_column = imputer.fit_transform(df[[target_col]])
df[[target_col]] = imputed_column

_print_missing_counts("Amount of missing values after imputation:", df)

In [None]:
# Q4
# Outlier detection (Z-score)
# Detect and remove outliers from the California housing features using the
# Z-score method: a row is kept only if every feature has |z| <= 3.

housing_data = fetch_california_housing()
df = pd.DataFrame(data=housing_data.data, columns=housing_data.feature_names)
print('Shape before outlier removal:', df.shape)

z_limit = 3
z_scores = np.abs(zscore(df))

# Row-wise mask: True where all per-feature |z| values are within the limit.
rows_within_limit = (z_scores <= z_limit).all(axis=1)
no_outliers_df = df[rows_within_limit]
print("Shape after outlier removal:", no_outliers_df.shape)



In [None]:
# Q5
# Feature scaling (standardization vs normalization)
# Scale the wine quality features twice: once with StandardScaler
# (standardization) and once with MinMaxScaler (normalization).


def _scale_features(scaler, features):
    """Fit ``scaler`` on ``features``; return the scaled array and a DataFrame
    of it that reuses the original column names."""
    scaled = scaler.fit_transform(features)
    return scaled, pd.DataFrame(scaled, columns=features.columns)


wine_data = fetch_ucirepo(id=186)

X = wine_data.data.features
y = wine_data.data.targets
for heading, frame in (('Features:', X), ('Targets:', y)):
    print(heading)
    print(frame.head())

standard_scaler = StandardScaler()
X_standardized, X_standardized_df = _scale_features(standard_scaler, X)
print('Standardized Features:')
print(X_standardized_df.head())

minmax_scaler = MinMaxScaler()
X_normalized, X_normalized_df = _scale_features(minmax_scaler, X)
print('Normalized Features:')
print(X_normalized_df.head())


In [None]:
# Q6
# Binning (discretization)
# Apply binning to the Age feature in the diabetes dataset.

df = pd.read_csv('../datasets/diabetes.csv')

# Left-closed intervals so the edges agree with the labels:
#   [0, 30)   -> "Under 30"     (age 30 is NOT under 30)
#   [30, 65)  -> "Middle-aged"
#   [65, inf) -> "Senior(65+)"  (age 65 IS a senior)
# pd.cut defaults to right=True, which would give (0, 30], (30, 65], (65, 100]
# and put age 30 in "Under 30" and age 65 in "Middle-aged".
# The open-ended top edge keeps every age >= 65 (including 100 and above) in
# the senior bin instead of turning it into NaN.
bins = [0, 30, 65, np.inf]
labels = ["Under 30", "Middle-aged", "Senior(65+)"]
df["Age_binned"] = pd.cut(
    df["Age"],
    bins=bins,
    labels=labels,
    right=False
)
print(df["Age_binned"].value_counts())

In [None]:
# Q7
# Feature extraction (Principal Component Analysis)
# Project the breast cancer features onto their first two principal
# components and plot the result, coloured by the target class.

cancer_data = load_breast_cancer()
X = pd.DataFrame(data=cancer_data.data, columns=cancer_data.feature_names)
y = cancer_data.target

# PCA is scale-sensitive, so the features are standardized first.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Reduce the dataset to 2 dimensions.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

first_component, second_component = X_pca[:, 0], X_pca[:, 1]
plt.scatter(first_component, second_component, c=y, cmap="coolwarm")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("PCA Breast Cancer Dataset")
plt.show()


In [None]:
# Q8
# Feature selection (Recursive Feature Elimination)
# Use Recursive Feature Elimination (RFE) to select the most important (top 5)
# features for predicting wine quality.

wine_data = fetch_ucirepo(id=186)

X = wine_data.data.features
y = wine_data.data.targets
print('Original:')
print(X.columns)

# RFE with a linear model ranks features by the magnitude of their fitted
# coefficients. Those magnitudes are only comparable when all features share
# a common scale, so the features are standardized first; otherwise the
# ranking reflects each feature's units rather than its predictive weight.
X_wine_scaled = pd.DataFrame(
    StandardScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)

model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=5)
rfe.fit(X_wine_scaled, y)
print('Selected features:')
# support_ is a boolean mask aligned with the (unchanged) column order of X.
print(X.columns[rfe.support_])

In [None]:
# Q9
# Feature engineering with time series data
# Create new calendar features from the timestamp columns of the air quality
# dataset, for use in predicting the air quality in a given city.

air_quality = fetch_ucirepo(id=360)

# Work on a copy: adding columns directly to `air_quality.data.features`
# would mutate the fetched dataset object and make re-running this cell
# operate on an already-modified frame.
df = air_quality.data.features.copy()

# Combine the separate Date and Time columns into a single timestamp.
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
# NOTE(review): the date format is inferred by pandas; if the expected layout
# (day-first vs month-first) is known, pass it explicitly via `format=` — TODO confirm.
df["Datetime"] = pd.to_datetime(
    df["Date"].astype(str) + " " + df["Time"].astype(str),
    errors="coerce"
)

# Surface coerced failures rather than letting them pass silently as NaT.
unparsed_count = int(df["Datetime"].isna().sum())
if unparsed_count:
    print(f"Warning: {unparsed_count} rows have an unparseable Date/Time (set to NaT)")

# Calendar features derived from the combined timestamp.
df["day_of_week"] = df["Datetime"].dt.dayofweek
df["hour"] = df["Datetime"].dt.hour
df["month"] = df["Datetime"].dt.month
print(df[["Datetime", "day_of_week", "hour", "month"]].head())

In [None]:
# Q10
# Feature transformation (polynomial features)
# Use polynomial features to enhance a linear regression model for predicting
# house prices on the California housing dataset. A plain linear regression is
# fitted first as a baseline, then a degree-2 polynomial model is fitted on
# the same train/test split so the two sets of metrics are comparable.

housing_data = fetch_california_housing()

X = housing_data.data
y = housing_data.target

# A fixed random_state makes the split, and therefore the reported metrics,
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Baseline: linear regression on the original features ---
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# This heading previously read "Polynomial Regression (Degree 2)", which
# mislabelled the baseline results.
print("\nLinear Regression (Baseline)")
print("MSE:", mse)
print("R Squared:", r2)

# --- Degree-2 polynomial expansion followed by linear regression ---
# The expansion is fitted on the training data only and then applied to the
# test data.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)

y_pred_poly = model_poly.predict(X_test_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print("\nPolynomial Regression (Degree 2)")
print("MSE:", mse_poly)
print("R Squared:", r2_poly)