In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.experimental import enable_iterative_imputer
import warnings
warnings.filterwarnings('ignore')

In [105]:
# Load the cleaned datasets
train_clean = pd.read_csv('dataset/train_cleaned.csv')
test_clean = pd.read_csv('dataset/test_cleaned.csv')

### Missing value imputation

#### Temperature

In [106]:
train_clean.isna().sum()

id                       0
temperature            972
irradiance             985
humidity               120
panel_age              981
maintenance_count     1003
soiling_ratio          991
voltage                970
current                954
module_temperature     962
cloud_coverage         974
wind_speed             114
pressure               134
string_id                0
error_code            5795
installation_type     4920
efficiency               0
dtype: int64

In [107]:
# Count missing values
missing_temp = train_clean['temperature'].isna()
missing_module_temp = train_clean['module_temperature'].isna()

# Check overlap
both_missing = missing_temp & missing_module_temp
only_temp_missing = missing_temp & (~missing_module_temp)
only_module_missing = (~missing_temp) & missing_module_temp

print(f"Total rows with missing temperature: {missing_temp.sum()}")
print(f"Total rows with missing module_temperature: {missing_module_temp.sum()}")
print(f"Rows where BOTH are missing: {both_missing.sum()}")
print(f"Rows where ONLY temperature is missing: {only_temp_missing.sum()}")
print(f"Rows where ONLY module_temperature is missing: {only_module_missing.sum()}")

Total rows with missing temperature: 972
Total rows with missing module_temperature: 962
Rows where BOTH are missing: 31
Rows where ONLY temperature is missing: 941
Rows where ONLY module_temperature is missing: 931


In [108]:
from sklearn.linear_model import LinearRegression

# Function to impute one column from another using regression
def impute_from_another(df, target_col, predictor_col):
    df_copy = df.copy()
    
    # Prepare training data
    train = df_copy[df_copy[target_col].notna() & df_copy[predictor_col].notna()]
    model = LinearRegression()
    model.fit(train[[predictor_col]], train[target_col])

    # Predict missing values
    missing_mask = df_copy[target_col].isna() & df_copy[predictor_col].notna()
    predicted_values = model.predict(df_copy.loc[missing_mask, [predictor_col]])
    
    # Fill missing values
    df_copy.loc[missing_mask, target_col] = predicted_values
    print(f"Imputed {missing_mask.sum()} missing values in '{target_col}' using '{predictor_col}'")

    return df_copy

# First impute temperature from module_temperature
df = impute_from_another(train_clean, 'temperature', 'module_temperature')
test_df = impute_from_another(test_clean, 'temperature', 'module_temperature')

# Then impute module_temperature from temperature (if needed)
df = impute_from_another(df, 'module_temperature', 'temperature')
test_df = impute_from_another(test_df, 'module_temperature', 'temperature')

Imputed 941 missing values in 'temperature' using 'module_temperature'
Imputed 541 missing values in 'temperature' using 'module_temperature'
Imputed 931 missing values in 'module_temperature' using 'temperature'
Imputed 537 missing values in 'module_temperature' using 'temperature'


In [109]:
df.isna().sum()

id                       0
temperature             31
irradiance             985
humidity               120
panel_age              981
maintenance_count     1003
soiling_ratio          991
voltage                970
current                954
module_temperature      31
cloud_coverage         974
wind_speed             114
pressure               134
string_id                0
error_code            5795
installation_type     4920
efficiency               0
dtype: int64

In [110]:
corr = df[['temperature', 'module_temperature']].corr().iloc[0, 1]
print(f"Correlation between temperature and module_temperature: {corr:.4f}")

Correlation between temperature and module_temperature: 0.9719


In [111]:
df[df['irradiance'].isna()]

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type,efficiency
12,12,2.682375,,16.319750,11.084578,4.0,0.782356,20.927677,1.861089,8.004642,15.243883,0.467009,1016.505323,B2,E02,,0.576775
31,31,1.348561,,98.595111,0.221669,7.0,0.459757,0.000000,1.237910,9.558745,77.078146,1.150128,1013.048070,B2,E02,dual-axis,0.511868
40,40,47.973606,,22.058749,16.093618,4.0,0.428969,47.819612,3.264913,52.238395,80.213642,2.785403,,B2,E01,tracking,0.515999
68,68,29.307608,,99.898344,1.732981,3.0,0.468616,21.474244,1.247087,36.027634,65.807281,7.651904,1011.861027,D4,,dual-axis,0.484732
168,169,15.546451,,48.107634,13.902809,2.0,0.716255,41.747717,1.002628,23.726328,71.205079,14.211828,1008.848629,C3,E01,,0.527169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19480,19938,24.427234,,77.702432,4.047269,5.0,0.466077,13.701232,3.211816,29.662117,73.437600,8.229808,1015.616723,C3,E01,fixed,0.000000
19491,19949,38.850955,,39.843715,8.457577,3.0,0.508990,,,44.613603,94.762371,11.281793,998.887874,C3,,fixed,0.488913
19495,19953,14.976377,,90.535240,13.823331,2.0,0.930509,9.106015,1.084803,17.341767,9.478544,0.060479,1007.823292,B2,,tracking,0.650654
19509,19967,20.422967,,90.701671,23.983481,4.0,0.893158,44.418868,1.913165,30.382020,,0.424010,1019.911345,D4,E01,tracking,0.635271


MICE imputer for irradiance, voltage, current, panel age, cloud coverage, maintanence count

In [112]:
from sklearn.experimental import enable_iterative_imputer  # Needed to enable IterativeImputer
from sklearn.impute import IterativeImputer
import pandas as pd

# List of columns to apply MICE on
mice_cols = ['irradiance', 'voltage', 'current', 'panel_age', 'cloud_coverage', 'maintenance_count','soiling_ratio']

# Subset of the dataframe to impute
mice_data = df[mice_cols]
mice_data_test = test_df[mice_cols]

# Create the imputer
mice_imputer = IterativeImputer(random_state=42, max_iter=10, sample_posterior=False)

# Fit and transform
mice_imputed_array = mice_imputer.fit_transform(mice_data)
mice_imputed_array_test = mice_imputer.transform(mice_data_test)

# Convert back to DataFrame
mice_imputed_df = pd.DataFrame(mice_imputed_array, columns=mice_cols, index=df.index)
mice_imputed_df_test = pd.DataFrame(mice_imputed_array_test, columns=mice_cols, index=test_df.index)

# Replace the original columns in df with the imputed ones
df[mice_cols] = mice_imputed_df
test_df[mice_cols] = mice_imputed_df_test

print(f"MICE imputation completed on columns: {', '.join(mice_cols)}")

MICE imputation completed on columns: irradiance, voltage, current, panel_age, cloud_coverage, maintenance_count, soiling_ratio


In [113]:
df.isna().sum()

id                       0
temperature             31
irradiance               0
humidity               120
panel_age                0
maintenance_count        0
soiling_ratio            0
voltage                  0
current                  0
module_temperature      31
cloud_coverage           0
wind_speed             114
pressure               134
string_id                0
error_code            5795
installation_type     4920
efficiency               0
dtype: int64

In [114]:
from sklearn.impute import KNNImputer
import pandas as pd

# Columns to apply KNN imputer on
knn_cols = ['wind_speed', 'pressure', 'temperature', 'module_temperature', 'humidity']

# Subset the data
knn_data = df[knn_cols]
knn_data_test = test_df[knn_cols]

# Initialize KNN Imputer
knn_imputer = KNNImputer(n_neighbors=5, weights='uniform')  # You can change n_neighbors if needed

# Fit and transform the data
knn_imputed_array = knn_imputer.fit_transform(knn_data)
knn_imputed_array_test = knn_imputer.transform(knn_data_test)

# Convert to DataFrame and update original df
knn_imputed_df = pd.DataFrame(knn_imputed_array, columns=knn_cols, index=df.index)
knn_imputed_df_test = pd.DataFrame(knn_imputed_array_test, columns=knn_cols, index=test_df.index)
df[knn_cols] = knn_imputed_df
test_df[knn_cols] = knn_imputed_df_test

print(f"KNN imputation completed on columns: {', '.join(knn_cols)}")


KNN imputation completed on columns: wind_speed, pressure, temperature, module_temperature, humidity


In [115]:
df.isna().sum()

id                       0
temperature              0
irradiance               0
humidity                 0
panel_age                0
maintenance_count        0
soiling_ratio            0
voltage                  0
current                  0
module_temperature       0
cloud_coverage           0
wind_speed               0
pressure                 0
string_id                0
error_code            5795
installation_type     4920
efficiency               0
dtype: int64

In [116]:
df['error_code'] = df['error_code'].fillna('NO_ERROR')
test_df['error_code'] = test_df['error_code'].fillna('NO_ERROR')

In [117]:
# Get mode (most frequent value)
mode_installation_type = df['installation_type'].mode()[0]

# Fill missing values with mode
df['installation_type'] = df['installation_type'].fillna(mode_installation_type)
test_df['installation_type'] = test_df['installation_type'].fillna(mode_installation_type)

In [118]:
df.isna().sum()

id                    0
temperature           0
irradiance            0
humidity              0
panel_age             0
maintenance_count     0
soiling_ratio         0
voltage               0
current               0
module_temperature    0
cloud_coverage        0
wind_speed            0
pressure              0
string_id             0
error_code            0
installation_type     0
efficiency            0
dtype: int64

In [119]:
test_df.isna().sum()

id                    0
temperature           0
irradiance            0
humidity              0
panel_age             0
maintenance_count     0
soiling_ratio         0
voltage               0
current               0
module_temperature    0
cloud_coverage        0
wind_speed            0
pressure              0
string_id             0
error_code            0
installation_type     0
dtype: int64

In [120]:
df.sample(10)

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type,efficiency
1908,1966,14.593308,559.597094,61.934894,27.435067,4.0,0.765379,31.805476,0.17338,15.634162,59.133347,12.076287,1008.918304,B2,E02,dual-axis,0.576055
2634,2711,47.106343,676.633128,81.815202,15.705831,2.0,0.595191,7.18001,3.730048,50.198628,43.082266,4.167525,1018.838899,A1,E00,dual-axis,0.512392
1281,1319,9.220482,797.603391,58.196623,22.795528,5.0,0.425558,22.002977,0.113044,17.010675,57.045243,7.039565,1026.120874,D4,E01,tracking,0.512032
7805,7996,11.847353,1010.123354,31.733338,34.033587,6.0,0.748286,70.84302,3.145319,14.272717,10.352696,14.935356,1006.867932,C3,NO_ERROR,dual-axis,0.6961
8951,9167,52.279963,406.460851,31.770413,30.218076,6.0,0.74901,26.01434,1.306009,61.684088,0.977997,1.984053,1016.791999,B2,E01,fixed,0.467972
12503,12793,28.512617,475.655021,48.644517,9.77529,6.0,0.694073,36.786456,0.74521,31.698215,87.909721,3.291386,1007.946149,D4,E02,tracking,0.546812
16669,17065,34.871924,599.25619,37.591281,17.529867,1.0,0.694957,16.36,5.414236,32.471193,70.508406,13.298716,1009.714489,B2,E00,dual-axis,0.559717
14507,14847,24.621439,433.247615,85.268474,22.375491,6.0,0.756108,24.983959,1.223686,25.798223,84.616351,10.338249,1005.070743,A1,E02,tracking,0.478092
14572,14912,30.828414,741.198309,71.446614,29.519182,2.0,0.694273,29.730585,2.455758,34.800479,32.948339,1.983784,1024.490758,D4,NO_ERROR,fixed,0.586548
6532,6691,19.003952,609.74316,24.434707,4.054841,2.0,0.79849,28.90259,0.292833,26.852388,59.934491,9.305369,1022.39141,B2,E02,tracking,0.671128


In [121]:
df.describe()

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,efficiency
count,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0,19542.0
mean,10003.926875,24.904875,514.922185,50.049263,17.503515,4.009304,0.699328,16.445494,1.729406,29.918104,49.754641,7.408708,1012.985282,0.514547
std,5772.075772,11.739286,231.908971,28.634418,9.838813,1.948388,0.16774,17.539145,1.133221,12.099398,28.10419,4.320534,10.01259,0.138346
min,0.0,-1.345699,0.633327,0.010714,0.001264,0.0,0.400149,0.0,5.4e-05,0.0,0.000244,0.001277,970.087365,0.0
25%,5006.25,16.859068,353.496992,25.366157,9.22111,3.0,0.559335,0.0,0.821033,21.551461,26.508603,3.685623,1006.226721,0.450694
50%,10005.5,24.719956,504.998102,50.223352,17.499696,4.0,0.699075,13.310627,1.593866,29.858189,49.748436,7.384527,1012.906673,0.518481
75%,14996.75,32.794953,667.09259,74.371458,25.819418,5.0,0.839928,26.069038,2.457625,38.089743,73.583077,11.131396,1019.74661,0.592263
max,19999.0,60.0,1537.810349,99.995202,34.998379,15.0,0.999949,494.279016,7.315597,65.071839,99.999267,14.999448,1052.865715,0.987066


In [122]:
df.to_csv('dataset/imputed/train_imputed.csv', index=False)
test_df.to_csv('dataset/imputed/test_imputed.csv', index=False)

### Domain-Specific Feature Creation

In [89]:
def create_solar_features(df):
    """Create domain-specific features for solar panel analysis"""
    df_engineered = df.copy()
    
    # Power calculation (P = V * I)
    if 'voltage' in df.columns and 'current' in df.columns:
        df_engineered['power_output'] = df_engineered['voltage'] * df_engineered['current']
    
    # Temperature difference (module vs ambient)
    if 'module_temperature' in df.columns and 'temperature' in df.columns:
        df_engineered['temp_difference'] = (df_engineered['module_temperature'] - 
                                          df_engineered['temperature'])
    
    # Performance ratio (considering irradiance and temperature effects)
    if 'irradiance' in df.columns and 'temperature' in df.columns:
        # Normalized irradiance (relative to standard test conditions: 1000 W/m²)
        df_engineered['irradiance_normalized'] = df_engineered['irradiance'] / 1000
        
        # Temperature coefficient effect (typical -0.4%/°C)
        df_engineered['temp_coefficient_effect'] = 1 - 0.004 * (df_engineered['temperature'] - 25)
    
    # Soiling impact on expected performance
    if 'soiling_ratio' in df.columns and 'irradiance' in df.columns:
        df_engineered['expected_irradiance_clean'] = (df_engineered['irradiance'] / 
                                                     df_engineered['soiling_ratio'])
        df_engineered['soiling_loss'] = (df_engineered['expected_irradiance_clean'] - 
                                        df_engineered['irradiance'])
    
    # Weather interaction features
    if 'cloud_coverage' in df.columns and 'irradiance' in df.columns:
        df_engineered['irradiance_cloud_ratio'] = (df_engineered['irradiance'] / 
                                                  (100 - df_engineered['cloud_coverage'] + 1))
    
    # Aging effects
    if 'panel_age' in df.columns:
        # Typical degradation rate: 0.5-0.8% per year
        df_engineered['age_degradation_factor'] = 1 - (0.006 * df_engineered['panel_age'])
        df_engineered['age_category'] = pd.cut(df_engineered['panel_age'], 
                                              bins=[0, 2, 5, 10, float('inf')],
                                              labels=['New', 'Young', 'Mature', 'Old'])
    
    # Maintenance effectiveness
    if 'maintenance_count' in df.columns and 'panel_age' in df.columns:
        df_engineered['maintenance_frequency'] = (df_engineered['maintenance_count'] / 
                                                 (df_engineered['panel_age'] + 1))
    
    # Environmental stress factors
    if 'humidity' in df.columns and 'temperature' in df.columns:
        # Heat index approximation
        df_engineered['environmental_stress'] = (df_engineered['humidity'] * 
                                               df_engineered['temperature'] / 100)
    
    # Wind cooling effect
    if 'wind_speed' in df.columns and 'module_temperature' in df.columns:
        df_engineered['wind_cooling_effect'] = df_engineered['wind_speed'] * 2  # Simplified model
        df_engineered['effective_module_temp'] = (df_engineered['module_temperature'] - 
                                                 df_engineered['wind_cooling_effect'])
    
    return df_engineered

# Apply feature engineering
train_engineered = create_solar_features(df)
test_engineered = create_solar_features(df)

print("New features created:")
new_features = set(train_engineered.columns) - set(df.columns)
for feature in new_features:
    print(f"- {feature}")

New features created:
- irradiance_cloud_ratio
- soiling_loss
- environmental_stress
- maintenance_frequency
- expected_irradiance_clean
- power_output
- temp_coefficient_effect
- wind_cooling_effect
- irradiance_normalized
- age_degradation_factor
- age_category
- temp_difference
- effective_module_temp


In [92]:
train_engineered

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,...,temp_coefficient_effect,expected_irradiance_clean,soiling_loss,irradiance_cloud_ratio,age_degradation_factor,age_category,maintenance_frequency,environmental_stress,wind_cooling_effect,effective_module_temp
0,0,7.817315,576.179270,41.243087,32.135501,4.0,0.803199,37.403527,1.963787,13.691147,...,1.068731,717.355436,141.176166,14.963381,0.807187,Old,0.120716,3.224102,25.649824,-11.958677
1,1,24.785727,240.003973,1.359648,19.977460,8.0,0.479456,21.843315,0.241473,27.545096,...,1.000857,500.575786,260.571813,4.199636,0.880135,Old,0.381362,0.336999,24.024087,3.521009
2,2,46.652695,687.612799,91.265368,1.496401,4.0,0.822398,48.222882,4.191800,43.363708,...,0.913389,836.106607,148.493808,13.391385,0.991022,New,1.602307,42.577754,3.628800,39.734909
3,3,53.339567,735.141179,96.190955,18.491582,3.0,0.837529,46.295748,0.960567,57.720436,...,0.886642,877.750499,142.609320,21.854143,0.889051,Old,0.153913,51.307839,17.472518,40.247918
4,4,5.575374,12.241203,27.495073,30.722697,6.0,0.551833,0.000000,0.898062,6.786263,...,1.077699,22.182814,9.941612,0.125721,0.815664,Old,0.189139,1.532953,1.045368,5.740895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19537,19995,16.868428,624.845067,93.530318,14.393967,3.0,0.738911,12.147711,3.005355,26.206810,...,1.032526,845.629465,220.784398,6.294591,0.913636,Old,0.194882,15.777095,25.188245,1.018565
19538,19996,53.415061,296.970303,93.985714,25.997012,2.0,0.513061,0.000000,0.532119,65.000000,...,0.886340,578.820651,281.850348,8.149271,0.844018,Old,0.074082,50.202526,1.953982,63.046018
19539,19997,2.442727,660.328019,37.968918,32.818396,9.0,0.548602,13.047950,4.075498,11.584869,...,1.090229,1203.656537,543.328518,15.260690,0.803090,Old,0.266127,0.927477,9.501874,2.082994
19540,19998,16.660075,632.760700,43.014702,19.063517,4.0,0.700757,0.000000,1.068906,21.149351,...,1.033360,902.967922,270.207222,27.660085,0.885619,Old,0.199367,7.166282,22.608317,-1.458966


In [94]:
# Drop original columns that are no longer needed -> information is now captured in engineered features

train_engineered.drop(columns=['id', 'voltage','current', 'temperature','module_temperature', 'irradiance', 'wind_speed', 'panel_age', 'cloud_coverage', 'soiling_ratio', 'maintenance_count', 'humidity'], inplace=True, errors='ignore')
test_engineered.drop(columns=['id', 'voltage','current', 'temperature','module_temperature', 'irradiance', 'wind_speed', 'panel_age', 'cloud_coverage', 'soiling_ratio', 'maintenance_count', 'humidity'], inplace=True, errors='ignore')

In [97]:
test_engineered

Unnamed: 0,pressure,string_id,error_code,installation_type,efficiency,power_output,temp_difference,irradiance_normalized,temp_coefficient_effect,expected_irradiance_clean,soiling_loss,irradiance_cloud_ratio,age_degradation_factor,age_category,maintenance_frequency,environmental_stress,wind_cooling_effect,effective_module_temp
0,1018.866505,A1,NO_ERROR,tracking,0.562096,73.452561,5.873832,0.576179,1.068731,717.355436,141.176166,14.963381,0.807187,Old,0.120716,3.224102,25.649824,-11.958677
1,1025.623854,D4,E00,dual-axis,0.396447,5.274577,2.759369,0.240004,1.000857,500.575786,260.571813,4.199636,0.880135,Old,0.381362,0.336999,24.024087,3.521009
2,1010.922654,C3,E00,tracking,0.573776,202.140687,-3.288987,0.687613,0.913389,836.106607,148.493808,13.391385,0.991022,New,1.602307,42.577754,3.628800,39.734909
3,1021.846663,A1,NO_ERROR,dual-axis,0.629009,44.470168,4.380870,0.735141,0.886642,877.750499,142.609320,21.854143,0.889051,Old,0.153913,51.307839,17.472518,40.247918
4,1008.555958,B2,E00,fixed,0.341874,0.000000,1.210889,0.012241,1.077699,22.182814,9.941612,0.125721,0.815664,Old,0.189139,1.532953,1.045368,5.740895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19537,1018.374467,B2,E02,tracking,0.664907,36.508185,9.338381,0.624845,1.032526,845.629465,220.784398,6.294591,0.913636,Old,0.194882,15.777095,25.188245,1.018565
19538,1016.081102,D4,E00,fixed,0.354070,0.000000,11.584939,0.296970,0.886340,578.820651,281.850348,8.149271,0.844018,Old,0.074082,50.202526,1.953982,63.046018
19539,1009.684461,D4,NO_ERROR,tracking,0.419734,53.176896,9.142142,0.660328,1.090229,1203.656537,543.328518,15.260690,0.803090,Old,0.266127,0.927477,9.501874,2.082994
19540,1006.673875,A1,E00,tracking,0.661963,0.000000,4.489275,0.632761,1.033360,902.967922,270.207222,27.660085,0.885619,Old,0.199367,7.166282,22.608317,-1.458966


In [100]:
train_engineered.isna().sum()

pressure                     0
string_id                    0
error_code                   0
installation_type            0
efficiency                   0
power_output                 0
temp_difference              0
irradiance_normalized        0
temp_coefficient_effect      0
expected_irradiance_clean    0
soiling_loss                 0
irradiance_cloud_ratio       0
age_degradation_factor       0
age_category                 0
maintenance_frequency        0
environmental_stress         0
wind_cooling_effect          0
effective_module_temp        0
dtype: int64

In [101]:
train_engineered.to_csv('dataset/train_engineered.csv', index=False)
test_engineered.to_csv('dataset/test_engineered.csv', index=False)
print("Feature engineering completed and saved to 'dataset/train_engineered.csv' and 'dataset/test_engineered.csv'.")

Feature engineering completed and saved to 'dataset/train_engineered.csv' and 'dataset/test_engineered.csv'.


### Feature Selection Strategy

In [4]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load data
df = pd.read_csv('dataset/train_engineered.csv')
X = df.drop(columns=['efficiency'], errors='ignore')
y = df['efficiency']

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include='number').columns.tolist()

# Column transformer: scale numeric, encode categorical
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(drop='first'), categorical_cols)
])

# Feature selector
selector = SelectKBest(score_func=f_regression, k=10)

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', selector)
])

# Fit pipeline
pipeline.fit(X, y)

# Get transformed feature names
ohe = pipeline.named_steps['preprocessor'].named_transformers_['cat']
encoded_cat_features = ohe.get_feature_names_out(categorical_cols)
all_feature_names = numeric_cols + list(encoded_cat_features)

# Get scores and map to feature names
scores = selector.scores_
feature_scores = pd.Series(scores, index=all_feature_names)

# Sort and show top scores
sorted_scores = feature_scores.sort_values(ascending=False)
print("\nTop feature scores:\n")
print(sorted_scores.head(20))



Top feature scores:

irradiance_normalized         8308.476914
expected_irradiance_clean     1819.499986
power_output                  1120.855762
age_degradation_factor         693.356100
age_category_Old               404.824034
irradiance_cloud_ratio         265.600623
maintenance_frequency          253.292553
age_category_Young             114.056072
environmental_stress           113.958273
age_category_New               113.823313
temp_coefficient_effect         45.908522
effective_module_temp           28.831057
soiling_loss                     6.999853
temp_difference                  3.391808
error_code_E01                   2.356931
string_id_C3                     2.249496
installation_type_tracking       2.066514
error_code_NO_ERROR              1.891768
pressure                         0.598145
wind_cooling_effect              0.287245
dtype: float64
