In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


# Generate a seeded synthetic classification problem and wrap the feature
# matrix in a DataFrame whose columns are labelled feature_0 .. feature_{n-1}.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    random_state=42,
)
X = pd.DataFrame(X, columns=[f'feature_{col_idx}' for col_idx in range(X.shape[1])])

# Preview the first rows of the untouched feature matrix.
print("Original Data:", X.head(), sep="\n")


# Add two engineered columns, each a linear combination of two existing
# features. Each new column is named after the columns it is built from:
# the sum uses feature_0 and feature_1 (the previous label referred to
# features 20 and 21, which do not exist in the 20-column matrix).
X['feature_0_1_sum'] = X['feature_0'] + X['feature_1']
X['feature_10_11_diff'] = X['feature_10'] - X['feature_11']


# Preview the frame again so the two new columns are visible.
print("\nData with Engineered Features:")
print(X.head())


# Project the current feature matrix onto its first five principal components
# and show the first five projected rows.
# random_state is pinned (matching the seed used elsewhere in this script)
# because PCA's default svd_solver='auto' may select the randomized solver
# for a matrix of this size, depending on the scikit-learn version; without a
# seed the printed components could differ between runs.
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X)
print("\nPCA Components:")
print(X_pca[:5])


# Fit a seeded 100-tree random forest on every column currently in X.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Pair each column name with the forest's importance score and rank the
# table from most to least important.
importances = clf.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)

print("\nFeature Importances:", feature_importances, sep="\n")


# Recursive feature elimination: drop one feature per round (step=1) until
# ten remain, using the random forest as the ranking estimator.
selector = RFE(clf, n_features_to_select=10, step=1).fit(X, y)

# support_ is a boolean mask over X's columns marking the features kept.
selected_features = X.columns[selector.support_]
print("\nSelected Features using RFE:", selected_features, sep="\n")

# Restrict the frame to the retained columns and preview the result.
X_selected = X.loc[:, selected_features]
print("\nOptimized Feature Set Data:", X_selected.head(), sep="\n")

Original Data:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   1.470848  -0.360450  -0.591602  -0.728228   0.941690   1.065964   
1   4.513369  -2.227103  -1.140747   2.018263  -2.238358  -0.497370   
2  -2.355643   2.218601  -1.603269   0.873394   0.401483   0.717264   
3  -1.596198  -0.857427   1.772434  -0.639361   1.419409  -0.438525   
4   2.840049  -2.489600  -0.844902  -1.594362  -4.688517   0.459637   

   feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  \
0   0.017832  -0.596184   1.840712  -1.497093   -1.202672   -0.603968   
1   0.714550   0.938883  -2.395169   0.159837    0.133942    1.461499   
2  -0.859399  -1.042190  -2.175965   0.980231    1.498546    0.544434   
3   0.281949   2.345145   1.006230   0.389135    0.238566   -1.025051   
4   0.913607  -1.143505   1.263937  -2.040928    0.675664    4.176424   

   feature_12  feature_13  feature_14  feature_15  feature_16  feature_17  \
0    2.899256    0.037567   -1.249523    0