# Feature Engineering: A Comprehensive Guide with Examples and ML Terminology

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from category_encoders import BinaryEncoder, TargetEncoder

# Pin NumPy's global random state so every random draw below is repeatable
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## Creating Sample Data

In [None]:
# Build a synthetic 1,000-row table that the rest of the guide operates on.
# np.random.randint excludes its upper bound, e.g. ages fall in 18..69.
n_rows = 1000
data = pd.DataFrame({
    'age': np.random.randint(18, 70, n_rows),
    'income': np.random.randint(20000, 200000, n_rows),
    'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_rows),
    'job_category': np.random.choice(['Entry', 'Mid', 'Senior', 'Executive'], n_rows),
    # Stored as float up front so NaN can be inserted below without an
    # implicit int -> float upcast (deprecated in recent pandas releases).
    'credit_score': np.random.randint(300, 850, n_rows).astype(float),
    'has_car': np.random.choice([True, False], n_rows),
    'favorite_color': np.random.choice(['Blue', 'Red', 'Green', 'Yellow', 'Purple'], n_rows)
})

# Add some missing values: replace=False guarantees 50 *distinct* rows,
# otherwise the same row can be drawn twice and fewer than 50 end up missing.
missing_rows = np.random.choice(data.index, 50, replace=False)
data.loc[missing_rows, 'credit_score'] = np.nan

print("Original Data:")
print(data.head())
print("\nData Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line.
data.info()

## 1. Feature Creation

In [None]:
# Derived ratio feature: income divided by age, computed row by row
data['income_per_age'] = data['income'].div(data['age'])
print(data.loc[:, ['age', 'income', 'income_per_age']].head())

## 2. Feature Transformation

In [None]:
# Natural-log transform of income, stored as a new column
data['log_income'] = np.log(data['income'])

# Side-by-side histograms (with KDE overlay) of the raw and log-transformed values
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(data['income'], kde=True, ax=ax_raw)
ax_raw.set_title('Income Distribution')
sns.histplot(data['log_income'], kde=True, ax=ax_log)
ax_log.set_title('Log Income Distribution')
fig.tight_layout()
plt.show()

# Summary statistics of both columns for a numeric comparison
print(data.loc[:, ['income', 'log_income']].describe())

## 3. Handling Categorical Variables

### a. One-Hot Encoding

In [None]:
# One indicator column per education level.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4; False requests a dense ndarray.
onehot = OneHotEncoder(sparse_output=False)
education_encoded = onehot.fit_transform(data[['education']])
education_df = pd.DataFrame(
    education_encoded,
    columns=onehot.get_feature_names_out(['education']),
    index=data.index,  # keep rows aligned with `data` for later joins
)
print(education_df.head())

### b. Label Encoding

In [None]:
# Map each job_category label to an integer code.
# NOTE: the codes follow the encoder's own class ordering, not job seniority.
le = LabelEncoder()
le.fit(data['job_category'])
data['job_category_encoded'] = le.transform(data['job_category'])
print(data.loc[:, ['job_category', 'job_category_encoded']].head())

### c. Binary Encoding

In [None]:
# Encode favorite_color as a small set of binary digit columns
be = BinaryEncoder(cols=['favorite_color'])
be.fit(data['favorite_color'])
color_binary = be.transform(data['favorite_color'])
print(color_binary.head())

### d. Target Encoding

In [None]:
# Encode education using income as the target the encoder learns from
te = TargetEncoder(cols=['education'])
education_values, income_target = data['education'], data['income']
data['education_target_encoded'] = te.fit_transform(education_values, income_target)
print(data.loc[:, ['education', 'education_target_encoded']].head())

## 4. Feature Scaling

In [None]:
# Standardize age: learn its mean/std first, then apply the transform
scaler = StandardScaler()
scaler.fit(data[['age']])
data['scaled_age'] = scaler.transform(data[['age']])
print(data.loc[:, ['age', 'scaled_age']].describe())

## 5. Handling Missing Values

In [None]:
# Fill missing credit scores with the mean of the observed values,
# keeping the original column untouched for comparison
imputer = SimpleImputer(strategy='mean')
imputer.fit(data[['credit_score']])
data['credit_score_imputed'] = imputer.transform(data[['credit_score']])

# Print summary statistics before and after imputation
for heading, column in (
    ("Original credit_score:", 'credit_score'),
    ("\nImputed credit_score:", 'credit_score_imputed'),
):
    print(heading)
    print(data[column].describe())

## 6. Feature Selection

In [None]:
# Candidate predictors and the regression target used to score them
feature_columns = ['age', 'income', 'credit_score_imputed']
X = data[feature_columns]
y = data['income_per_age']

# Keep the two features with the highest univariate F-scores against y
selector = SelectKBest(score_func=f_regression, k=2)
selector.fit(X, y)
X_selected = selector.transform(X)

# get_support() is a boolean mask over X's columns marking the kept features
selected_features = [
    name for name, keep in zip(X.columns, selector.get_support()) if keep
]
print("Selected features:", selected_features)

## 7. Dimensionality Reduction

### a. Principal Component Analysis (PCA)

In [None]:
# PCA picks the directions of largest variance, so the features must be on a
# comparable scale first: on raw values income (tens of thousands) would
# swamp age and credit score and the first component would just be income.
X_scaled = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the samples projected onto the first two principal components
plt.figure(figsize=(10, 8))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
plt.title('PCA of Numeric Features')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.show()

# Fraction of the (standardized) total variance captured by each component
print("Explained variance ratio:", pca.explained_variance_ratio_)

### b. Truncated SVD

In [None]:
# TruncatedSVD does not center or scale its input, so the features are
# standardized explicitly; otherwise the large-magnitude income column
# would determine both components on its own.
X_scaled = StandardScaler().fit_transform(X)

svd = TruncatedSVD(n_components=2, random_state=42)
X_svd = svd.fit_transform(X_scaled)

# Scatter of the samples in the two-component SVD space
plt.figure(figsize=(10, 8))
plt.scatter(X_svd[:, 0], X_svd[:, 1], alpha=0.5)
plt.title('Truncated SVD of Numeric Features')
plt.xlabel('First SVD Component')
plt.ylabel('Second SVD Component')
plt.show()

print("Explained variance ratio:", svd.explained_variance_ratio_)

### c. t-SNE

In [None]:
# t-SNE works from pairwise distances between samples; standardizing keeps
# the large-magnitude income column from dominating those distances.
X_scaled = StandardScaler().fit_transform(X)

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Scatter of the two-dimensional t-SNE embedding
plt.figure(figsize=(10, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5)
plt.title('t-SNE of Numeric Features')
plt.xlabel('First t-SNE Component')
plt.ylabel('Second t-SNE Component')
plt.show()

print("t-SNE does not provide an explained variance ratio.")

## Final Processed Data

In [None]:
# Show the first rows and the column summary of the fully processed table
print("Final Processed Data:")
print(data.head())
print("\nFinal Data Info:")
# DataFrame.info() writes its report directly and returns None; wrapping it
# in print() would append a stray "None" line to the output.
data.info()