In [None]:
import os
os.environ['NUMBA_CUDA_SUPPORTED'] = "0"
os.environ["TSFRESH_NO_Numba"] = "1"
import pandas as pd
import numpy as np
from scipy import stats, signal, fft
from scipy.stats import skew, kurtosis
import librosa
from tsfresh.feature_extraction import feature_calculators
import gc
from tqdm.auto import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

# Directory holding the per-event input CSVs (read as event_XX.csv by the extraction loop below).
input_dir = '/kaggle/input/earthquake/split_events'
# Directory that receives one Parquet feature file per event; created here if it does not exist.
output_dir = '/kaggle/working/feature_created/'
os.makedirs(output_dir, exist_ok=True)

print("Setup Complete.")

# --- Feature function 1: Basic Statistics ---
def calculate_basic_statistics(data):
    """Return descriptive statistics of a 1-D sample as a dict.

    Args:
        data: array-like of numeric samples.

    Returns:
        dict with keys 'mean', 'median', 'min', 'max', 'range', 'std',
        'variance', 'q25', 'q75', 'iqr', 'skewness' and 'kurtosis'.
        'std' and 'variance' use the NumPy defaults; 'skewness' and
        'kurtosis' use the scipy.stats defaults.
    """
    lowest = np.min(data)
    highest = np.max(data)
    lower_quartile = np.percentile(data, 25)
    upper_quartile = np.percentile(data, 75)
    return {
        'mean': np.mean(data),
        'median': np.median(data),
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
        'std': np.std(data),
        'variance': np.var(data),
        'q25': lower_quartile,
        'q75': upper_quartile,
        'iqr': upper_quartile - lower_quartile,
        'skewness': skew(data),
        'kurtosis': kurtosis(data),
    }

# --- Feature function 2: Rolling Statistics ---
def calculate_rolling_statistics(data, window=1000):
    """Summarise the rolling mean and rolling std of a 1-D sample.

    Args:
        data: array-like of numeric samples.
        window: rolling window length in samples; windows at the start of
            the series are allowed to be shorter (min_periods=1).

    Returns:
        dict with the mean and population std (ddof=0) of the rolling-mean
        track ('rolling_mean_mean', 'rolling_mean_std') and of the
        rolling-std track ('rolling_std_mean', 'rolling_std_std').
    """
    windowed = pd.Series(data).rolling(window=window, min_periods=1)
    mean_track = windowed.mean()
    std_track = windowed.std()
    return {
        'rolling_mean_mean': mean_track.mean(),
        'rolling_mean_std': mean_track.std(ddof=0),
        'rolling_std_mean': std_track.mean(),
        'rolling_std_std': std_track.std(ddof=0),
    }

# --- Feature function 3: Signal Features ---
def calculate_signal_features(data):
    """Compute zero-crossing and peak features of a 1-D sample.

    Args:
        data: 1-D numeric array.

    Returns:
        dict with
        'zero_crossing_rate': number of positions where the sign of
            consecutive samples differs, divided by len(data);
        'num_peaks': count of local maxima whose height is at least the
            sample mean;
        'peak_mean_height': mean height of those peaks, or 0 if none.
    """
    sign_change_count = np.count_nonzero(np.diff(np.sign(data)))
    peak_positions, peak_info = signal.find_peaks(data, height=np.mean(data))
    peak_count = len(peak_positions)
    if peak_count > 0:
        mean_height = np.mean(peak_info['peak_heights'])
    else:
        mean_height = 0
    return {
        'zero_crossing_rate': sign_change_count / len(data),
        'num_peaks': peak_count,
        'peak_mean_height': mean_height,
    }

# --- Feature function 4: Spectral Features ---
def calculate_spectral_features(data, sr=4000000, n_mfcc=13):
    """Compute MFCC means and the dominant FFT component of a 1-D sample.

    Args:
        data: 1-D NumPy array of samples (cast to float32 for librosa).
        sr: sample rate passed to librosa and used to scale the FFT
            frequency axis.
        n_mfcc: number of MFCC coefficients to compute.

    Returns:
        dict with 'mfcc_{i}_mean' for i in range(n_mfcc) (mean of each
        coefficient over frames), 'fft_magnitude_1' (largest FFT magnitude)
        and 'fft_frequency_1' (absolute frequency of that bin).
    """
    features = {}
    mfccs = librosa.feature.mfcc(y=data.astype(np.float32), sr=sr, n_mfcc=n_mfcc, n_mels=40)
    for i in range(n_mfcc):
        features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])

    fft_values = np.abs(fft.fft(data))
    fft_freq = fft.fftfreq(len(data), 1/sr)
    # Only the single largest bin is needed, so argmax replaces a full sort
    # of the spectrum. For real input the mirrored +/- frequency bins share
    # the same magnitude and absolute frequency, so the result is unchanged.
    # NOTE(review): the mean is not removed before the FFT, so for a signal
    # with a non-zero mean the 0 Hz bin may be the one selected - confirm
    # that this is the intended "dominant frequency".
    dominant_bin = np.argmax(fft_values)
    features['fft_magnitude_1'] = fft_values[dominant_bin]
    features['fft_frequency_1'] = np.abs(fft_freq[dominant_bin])
    return features

# --- Feature function 5: Selected Tsfresh Features ---
def calculate_selected_tsfresh(x):
    """Compute a hand-picked subset of tsfresh feature calculators.

    Args:
        x: 1-D numeric array holding one segment.

    Returns:
        dict with keys 'tsf_abs_energy', 'tsf_mean_abs_change',
        'tsf_binned_entropy', 'tsf_autocorrelation_lag10',
        'tsf_longest_strike_above_mean' and 'tsf_ar_coefficient_1'.
        'tsf_ar_coefficient_1' is NaN when the AR fit raises.

    NOTE: this function used to contain further calculations (FFT
    coefficients, Welch density, CWT coefficient, approximate entropy)
    placed after the `return` statement. They never executed, so they have
    been removed; the returned dict is unchanged.
    """
    features = {
        'tsf_abs_energy': feature_calculators.abs_energy(x),
        'tsf_mean_abs_change': feature_calculators.mean_abs_change(x),
        'tsf_binned_entropy': feature_calculators.binned_entropy(x, max_bins=10),
        'tsf_autocorrelation_lag10': feature_calculators.autocorrelation(x, lag=10),
        'tsf_longest_strike_above_mean': feature_calculators.longest_strike_above_mean(x),
    }
    try:
        features['tsf_ar_coefficient_1'] = feature_calculators.ar_coefficient(x, [{"coeff": 1, "k": 10}])[0][1]
    except Exception:
        # Deliberate best-effort: a failed AR fit must not abort the whole
        # segment, so the feature is recorded as missing instead.
        features['tsf_ar_coefficient_1'] = np.nan
    return features

def extract_comprehensive_features(segment_data, target):
    """Build one feature row for a segment by merging all feature groups.

    Args:
        segment_data: object exposing `.values` (e.g. a pandas Series) with
            the segment's samples.
        target: value stored under 'time_to_failure' in the returned row.

    Returns:
        dict combining the outputs of the basic, rolling, signal, spectral
        and tsfresh feature functions, plus 'time_to_failure'.
    """
    samples = segment_data.values
    feature_groups = (
        calculate_basic_statistics,
        calculate_rolling_statistics,
        calculate_signal_features,
        calculate_spectral_features,
        calculate_selected_tsfresh,
    )
    row = {}
    for compute_group in feature_groups:
        row.update(compute_group(samples))
    row['time_to_failure'] = target
    return row

def segment_generator(cycle_df, segment_len=150000):
    """Yield (acoustic segment, target) pairs, walking a cycle from its end.

    Full segments of `segment_len` rows are cut backwards from the last row,
    so the first pair yielded is the one closest to the end of the cycle.
    If rows are left over at the start of the frame, one extra segment is
    yielded: those leftover rows, left-padded with their (rounded) median up
    to `segment_len` samples.

    Args:
        cycle_df: DataFrame with 'acoustic_data' and 'time_to_failure'
            columns.
        segment_len: number of samples per segment.

    Yields:
        (pandas Series of acoustic samples, 'time_to_failure' of the
        segment's last row).
    """
    n_samples = len(cycle_df)
    for end in range(n_samples, segment_len - 1, -segment_len):
        segment_df = cycle_df.iloc[end - segment_len:end]
        yield segment_df['acoustic_data'], segment_df['time_to_failure'].iloc[-1]

    rem_len = n_samples % segment_len
    if rem_len > 0:
        leftover_df = cycle_df.iloc[:rem_len]
        # The median of an even number of integers can end in .5. Building an
        # int16 Series from such a float raises, which previously aborted the
        # whole event, so round to the nearest integer first.
        padding_value = int(np.round(leftover_df['acoustic_data'].median()))
        padding = pd.Series(np.full(segment_len - rem_len, padding_value, dtype=np.int16))
        padded_segment = pd.concat([padding, leftover_df['acoustic_data']], ignore_index=True)
        yield padded_segment, leftover_df['time_to_failure'].iloc[-1]

# Extract features for every event CSV and write one Parquet file per event.
start_total_time = time.time()
print("\nStarting comprehensive feature extraction for all 17 event files...")

# Number of events whose feature file was actually written, so the final
# summary does not claim success for skipped or failed events.
saved_files = 0

for cycle_id in range(1, 18):
    start_cycle_time = time.time()
    input_file = os.path.join(input_dir, f'event_{cycle_id:02d}.csv')

    if not os.path.exists(input_file):
        print(f"Warning: File not found for event {cycle_id}, skipping. Path: {input_file}")
        continue

    print(f"\n--- Processing Event {cycle_id}/{17} ---")

    try:
        cycle_df = pd.read_csv(input_file)
        print(f"  Data loaded ({len(cycle_df):,} rows).")

        feature_rows = []
        # Ceiling division: full segments plus one padded partial segment, if any.
        num_segments = (len(cycle_df) + 149999) // 150000
        for seg_data, target in tqdm(segment_generator(cycle_df), total=num_segments, desc=f"  Event {cycle_id} Segments"):
            features = extract_comprehensive_features(seg_data, target)
            features['cycle_id'] = cycle_id
            feature_rows.append(features)

        # Free the raw event before building the feature frame.
        del cycle_df
        gc.collect()

        # segment_generator walks from the end of the event backwards, so
        # reverse the rows to number segments from the start of the event.
        cycle_feature_df = pd.DataFrame(feature_rows)
        cycle_feature_df = cycle_feature_df.iloc[::-1].reset_index(drop=True)
        cycle_feature_df['segment_in_cycle_id'] = range(len(cycle_feature_df))

        output_file = os.path.join(output_dir, f'event_{cycle_id:02d}.parquet')
        cycle_feature_df.to_parquet(output_file, index=False)
        saved_files += 1

        end_cycle_time = time.time()
        print(f"  Finished processing Event {cycle_id}, generated {len(feature_rows)} segments.")
        print(f"  Time taken for this event: {(end_cycle_time - start_cycle_time):.2f} seconds.")
        print(f"  Feature file saved to: {output_file}")

    except Exception as e:
        # Top-level boundary: report the failure and continue with the next event.
        print(f"An error occurred while processing Event {cycle_id}: {e}")

end_total_time = time.time()
print(f"\nTotal time taken: {(end_total_time - start_total_time):.2f} seconds.")
print(f"{saved_files} of 17 feature files are saved in the directory: {output_dir}")

In [None]:
import pandas as pd
import os
from tqdm.auto import tqdm

# Concatenate the per-event Parquet feature files into one training table.
input_dir = '/kaggle/working/feature_created/'
output_dir = '/kaggle/working/'
final_output_file = os.path.join(output_dir, 'final_features_comprehensive.parquet')

all_dfs_list = []
print("Starting to combine all 17 feature files...")

for i in tqdm(range(1, 18), desc="Combining Files"):
    file_path = os.path.join(input_dir, f'event_{i:02d}.parquet')
    try:
        df_part = pd.read_parquet(file_path)
        all_dfs_list.append(df_part)
    except FileNotFoundError:
        print(f"Warning: File not found for event {i}, skipping.")

if all_dfs_list:
    final_df = pd.concat(all_dfs_list, ignore_index=True)
    print(f"\nSuccessfully combined all parts. Total rows: {len(final_df)}")

    print(f"Saving final combined data to Parquet file: {final_output_file}")
    final_df.to_parquet(final_output_file, index=False)
    print("Final file saved successfully.")

    # The summary lives inside this branch because final_df only exists
    # when at least one file was loaded; outside it raised NameError.
    print("\n--- Final DataFrame Info ---")
    final_df.info()
    print("\n--- First 5 rows ---")
    print(final_df.head())
else:
    print("No intermediate files found to combine.")

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
import seaborn as sns

# Rank features by LightGBM importance averaged over grouped CV folds,
# then plot and list the top 30.
feature_file = '/kaggle/working/final_features_comprehensive.parquet'
df = pd.read_parquet(feature_file)
# Missing feature values are replaced by 0 before modelling.
df = df.fillna(0)

# Everything except the target and the two bookkeeping id columns is a feature.
X = df.drop(columns=['time_to_failure', 'cycle_id', 'segment_in_cycle_id'])
y = df['time_to_failure']
# Folds are grouped by cycle_id, so one event's segments stay on one side of a split.
groups = df['cycle_id']

n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# One row per feature; one column per fold is added inside the loop.
feature_importances = pd.DataFrame(index=X.columns)

print(f"Starting {n_splits}-Fold GroupKFold to calculate feature importances...")
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    print(f"--- Fold {fold+1}/{n_splits} ---")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    model = lgb.LGBMRegressor(
        objective='mae',
        n_estimators=1000,
        learning_rate=0.05,
        n_jobs=-1,
        random_state=42
    )
    
    # Early stopping monitors MAE on this fold's validation split.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='mae',
              callbacks=[lgb.early_stopping(100, verbose=False)])
    
    feature_importances[f'fold_{fold+1}'] = model.feature_importances_

# Average across the fold columns and sort the most important features first.
feature_importances['average'] = feature_importances.mean(axis=1)
feature_importances = feature_importances.sort_values(by='average', ascending=False)

plt.figure(figsize=(12, 10))
sns.barplot(x='average', y=feature_importances.head(30).index, data=feature_importances.head(30))
plt.title('Top 30 Feature Importances (Averaged over GroupKFold)')
plt.xlabel('Average Importance')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

top_30_features = feature_importances.head(30).index.tolist()
print("\n--- Top 30 Most Important Features ---")
print(top_30_features)

# Among the top-30 features, drop every feature whose absolute correlation
# with a higher-ranked feature exceeds 0.95.
top_features_df = df[top_30_features]
correlation_matrix = top_features_df.corr().abs()

# Keep only the strict upper triangle: each column then holds its correlation
# with the features ranked above it, and every pair is inspected once.
upper_tri = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)
to_drop = upper_tri.columns[(upper_tri > 0.95).any(axis=0)].tolist()

print(f"\nFound {len(to_drop)} features to drop due to high correlation (>0.95):")
print(to_drop)

final_feature_list = list(filter(lambda name: name not in to_drop, top_30_features))

print(f"\n--- Final Selected Features ({len(final_feature_list)} features) ---")
print(final_feature_list)

# Re-load the combined features for the final training set. NaNs are filled
# with 0 exactly as for the frame used during feature selection, so X_final
# is consistent with the data the selection was based on.
df_final_train = pd.read_parquet(feature_file).fillna(0)

X_final = df_final_train[final_feature_list]
y_final = df_final_train['time_to_failure']
groups_final = df_final_train['cycle_id']

print(f"Shape of X_final: {X_final.shape}")

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed NumPy and TensorFlow before any model is built.
np.random.seed(42)
tf.random.set_seed(42)
feature_file = '/kaggle/input/feature-created/final_features_comprehensive.parquet'
# Missing feature values are replaced by 0.
df_final_train = pd.read_parquet(feature_file).fillna(0)
# Hard-coded list of the feature columns used by both models.
final_feature_list = [
    'mfcc_5_mean', 'mfcc_11_mean', 'mfcc_10_mean', 'skewness', 'tsf_binned_entropy', 
    'mfcc_9_mean', 'mfcc_7_mean', 'mfcc_6_mean', 'peak_mean_height', 'mean', 
    'zero_crossing_rate', 'mfcc_8_mean', 'mfcc_4_mean', 'mfcc_12_mean', 'rolling_mean_std', 
    'mfcc_0_mean', 'num_peaks', 'tsf_autocorrelation_lag10', 'mfcc_3_mean', 'mfcc_2_mean', 
    'kurtosis', 'rolling_std_std', 'tsf_mean_abs_change', 'tsf_ar_coefficient_1', 
    'tsf_longest_strike_above_mean'
]
X = df_final_train[final_feature_list]
y = df_final_train['time_to_failure']
# cycle_id is the grouping key for GroupKFold in the training loop.
groups = df_final_train['cycle_id']

# Keyword arguments passed to lgb.LGBMRegressor in every fold.
lgb_params = {
    'objective': 'mae', 
    'metric': 'mae', 
    'n_estimators': 10000, 
    'learning_rate': 0.01,
    'feature_fraction': 0.7, 
    'bagging_fraction': 0.7, 
    'bagging_freq': 1,
    'lambda_l1': 0.3, 
    'lambda_l2': 0.3, 
    'num_leaves': 31, 
    'verbose': -1,
    'n_jobs': -1, 
    'seed': 42, 
    'boosting_type': 'gbdt',
}

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one encoder block: self-attention, then a pointwise feed-forward part.

    Args:
        inputs: Keras tensor the block is applied to.
        head_size: `key_dim` of the multi-head attention layer.
        num_heads: number of attention heads.
        ff_dim: number of filters of the first kernel-size-1 convolution.
        dropout: dropout rate used inside attention and after each sub-layer.

    Returns:
        Keras tensor whose last dimension equals that of `inputs`.
    """
    # Self-attention part: attention -> dropout -> layer norm, then residual add.
    attended = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads, dropout=dropout)(inputs, inputs)
    attended = layers.Dropout(dropout)(attended)
    attended = layers.LayerNormalization(epsilon=1e-6)(attended)
    res = attended + inputs

    # Feed-forward part: two kernel-size-1 convolutions; the second projects
    # back to the input width so the residual can be added.
    input_width = inputs.shape[-1]
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=input_width, kernel_size=1)(hidden)
    projected = layers.LayerNormalization(epsilon=1e-6)(projected)
    return projected + res

def build_transformer_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units, dropout=0, mlp_dropout=0):
    """Build and compile a Transformer-encoder regressor.

    Args:
        input_shape: per-sample input shape passed to `keras.Input`.
        head_size, num_heads, ff_dim, dropout: forwarded to every
            `transformer_encoder` block.
        num_transformer_blocks: number of stacked encoder blocks.
        mlp_units: iterable of widths for the ReLU dense layers of the head.
        mlp_dropout: dropout rate applied after each head layer.

    Returns:
        A `keras.Model` with one linear output unit, compiled with MAE loss,
        an MAE metric and Adam (learning rate 1e-3).
    """
    inputs = keras.Input(shape=input_shape)
    encoded = inputs
    for _ in range(num_transformer_blocks):
        encoded = transformer_encoder(encoded, head_size, num_heads, ff_dim, dropout)

    # NOTE(review): with data_format="channels_first" the pooling averages
    # over the last axis. The caller in this file passes input_shape
    # (1, n_features), so this appears to collapse every sample to a single
    # averaged value before the MLP head - confirm that this is intended.
    hidden = layers.GlobalAveragePooling1D(data_format="channels_first")(encoded)
    for width in mlp_units:
        hidden = layers.Dense(width, activation="relu")(hidden)
        hidden = layers.Dropout(mlp_dropout)(hidden)
    prediction = layers.Dense(1)(hidden)

    model = keras.Model(inputs, prediction)
    optimizer = keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(loss="mae", optimizer=optimizer, metrics=["mae"])
    return model

# Grouped 5-fold CV: train one LightGBM and one Transformer model per fold,
# collect out-of-fold (OOF) predictions and keep the fitted models and
# scalers in memory for later inference.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# OOF prediction buffers, filled fold by fold at the validation indices.
oof_lgb = np.zeros(len(X))
oof_transformer = np.zeros(len(X))
lgb_models = []
transformer_models = []
scalers = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    print(f"===================== FOLD {fold+1}/{n_splits} =====================")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    # --- Train and store the LightGBM model ---
    print("--- Training LightGBM model... ---")
    model_lgb = lgb.LGBMRegressor(**lgb_params)
    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mae', callbacks=[lgb.early_stopping(200, verbose=False)])
    oof_lgb[val_idx] = model_lgb.predict(X_val)
    lgb_models.append(model_lgb)
    
    # --- Train and store the Transformer model ---
    print("--- Training Transformer model... ---")
    # The scaler is fit on the training split only and kept for inference.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    scalers.append(scaler)
    
    # Reshape (n_samples, n_features) -> (n_samples, 1, n_features) for the Keras model.
    X_train_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
    X_val_reshaped = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
    
    model_transformer = build_transformer_model(
        input_shape=X_train_reshaped.shape[1:], head_size=256, num_heads=4, ff_dim=4,
        num_transformer_blocks=4, mlp_units=[128], mlp_dropout=0.45, dropout=0.3,
    )
    # Stop after 20 epochs without val_mae improvement and restore the best weights.
    early_stopping = keras.callbacks.EarlyStopping(monitor="val_mae", patience=20, mode="min", restore_best_weights=True)
    model_transformer.fit(X_train_reshaped, y_train, epochs=200, batch_size=64, validation_data=(X_val_reshaped, y_val), callbacks=[early_stopping], verbose=0)
    oof_transformer[val_idx] = model_transformer.predict(X_val_reshaped).flatten()
    transformer_models.append(model_transformer) # keep the trained model in the list

    print(f"Fold {fold+1} MAE (LGBM): {mean_absolute_error(y_val, oof_lgb[val_idx]):.4f}")
    print(f"Fold {fold+1} MAE (Transformer): {mean_absolute_error(y_val, oof_transformer[val_idx]):.4f}")
    gc.collect()

# Equal-weight blend of the two models' OOF predictions.
oof_ensemble = 0.5 * oof_lgb + 0.5 * oof_transformer
print(f"Overall OOF MAE (LGBM): {mean_absolute_error(y, oof_lgb):.4f}")
print(f"Overall OOF MAE (Transformer): {mean_absolute_error(y, oof_transformer):.4f}")
print(f"Overall OOF MAE (Ensemble 0.5/0.5): {mean_absolute_error(y, oof_ensemble):.4f}")

print(f"\nTraining complete. {len(lgb_models)} LGBM models, {len(transformer_models)} Transformer models, and {len(scalers)} scalers are saved in memory.")

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import gc
from scipy import stats, signal, fft
from scipy.stats import skew, kurtosis
import librosa
from tsfresh.feature_extraction import feature_calculators
from sklearn.metrics import mean_absolute_error
import time
import warnings
import os

# Grid-search the LightGBM/Transformer blend weight that minimises OOF MAE.
best_w = 0
# Start from +inf so the first candidate is always accepted, whatever the
# scale of the target (a fixed sentinel of 100 fails if MAE exceeds it).
min_mae = np.inf
# linspace yields exactly 101 weights from 0.00 to 1.00; arange with a
# float step is subject to rounding at the end point.
for w in np.linspace(0.0, 1.0, 101):
    oof_ens = w * oof_lgb + (1 - w) * oof_transformer
    mae = mean_absolute_error(y, oof_ens)
    if mae < min_mae:
        min_mae = mae
        best_w = w
print(f"Optimal weight for LGBM: {best_w:.2f}")
print(f"Optimal weight for Transformer: {1-best_w:.2f}")
print(f"Best possible OOF MAE with these models: {min_mae:.4f}")

# The sample submission supplies the test segment ids (its index).
submission = pd.read_csv('/kaggle/input/testing/sample_submission.csv', index_col='seg_id')
test_files = submission.index.tolist()

predictions = []

def extract_comprehensive_features(segment_data, target):
    """Build one feature row for a segment by merging all feature groups.

    Args:
        segment_data: object exposing `.values` (e.g. a pandas Series) with
            the segment's samples.
        target: value stored under 'time_to_failure' in the returned row.

    Returns:
        dict combining the outputs of the basic, rolling, signal, spectral
        and tsfresh feature functions, plus 'time_to_failure'.
    """
    samples = segment_data.values
    feature_groups = (
        calculate_basic_statistics,
        calculate_rolling_statistics,
        calculate_signal_features,
        calculate_spectral_features,
        calculate_selected_tsfresh,
    )
    row = {}
    for compute_group in feature_groups:
        row.update(compute_group(samples))
    row['time_to_failure'] = target
    return row

def calculate_basic_statistics(data):
    """Return descriptive statistics of a 1-D sample as a dict.

    Args:
        data: array-like of numeric samples.

    Returns:
        dict with keys 'mean', 'median', 'min', 'max', 'range', 'std',
        'variance', 'q25', 'q75', 'iqr', 'skewness' and 'kurtosis'.
        'std' and 'variance' use the NumPy defaults; 'skewness' and
        'kurtosis' use the scipy.stats defaults.
    """
    lowest = np.min(data)
    highest = np.max(data)
    lower_quartile = np.percentile(data, 25)
    upper_quartile = np.percentile(data, 75)
    return {
        'mean': np.mean(data),
        'median': np.median(data),
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
        'std': np.std(data),
        'variance': np.var(data),
        'q25': lower_quartile,
        'q75': upper_quartile,
        'iqr': upper_quartile - lower_quartile,
        'skewness': skew(data),
        'kurtosis': kurtosis(data),
    }

def calculate_rolling_statistics(data, window=1000):
    """Summarise the rolling mean and rolling std of a 1-D sample.

    Args:
        data: array-like of numeric samples.
        window: rolling window length in samples; windows at the start of
            the series are allowed to be shorter (min_periods=1).

    Returns:
        dict with the mean and population std (ddof=0) of the rolling-mean
        track ('rolling_mean_mean', 'rolling_mean_std') and of the
        rolling-std track ('rolling_std_mean', 'rolling_std_std').
    """
    windowed = pd.Series(data).rolling(window=window, min_periods=1)
    mean_track = windowed.mean()
    std_track = windowed.std()
    return {
        'rolling_mean_mean': mean_track.mean(),
        'rolling_mean_std': mean_track.std(ddof=0),
        'rolling_std_mean': std_track.mean(),
        'rolling_std_std': std_track.std(ddof=0),
    }

def calculate_signal_features(data):
    """Compute zero-crossing and peak features of a 1-D sample.

    Args:
        data: 1-D numeric array.

    Returns:
        dict with
        'zero_crossing_rate': number of positions where the sign of
            consecutive samples differs, divided by len(data);
        'num_peaks': count of local maxima whose height is at least the
            sample mean;
        'peak_mean_height': mean height of those peaks, or 0 if none.
    """
    sign_change_count = np.count_nonzero(np.diff(np.sign(data)))
    peak_positions, peak_info = signal.find_peaks(data, height=np.mean(data))
    peak_count = len(peak_positions)
    if peak_count > 0:
        mean_height = np.mean(peak_info['peak_heights'])
    else:
        mean_height = 0
    return {
        'zero_crossing_rate': sign_change_count / len(data),
        'num_peaks': peak_count,
        'peak_mean_height': mean_height,
    }

def calculate_spectral_features(data, sr=4000000, n_mfcc=13):
    """Compute MFCC means and the dominant FFT component of a 1-D sample.

    Args:
        data: 1-D NumPy array of samples (cast to float32 for librosa).
        sr: sample rate passed to librosa and used to scale the FFT
            frequency axis.
        n_mfcc: number of MFCC coefficients to compute.

    Returns:
        dict with 'mfcc_{i}_mean' for i in range(n_mfcc) (mean of each
        coefficient over frames), 'fft_magnitude_1' (largest FFT magnitude)
        and 'fft_frequency_1' (absolute frequency of that bin).
    """
    features = {}
    mfccs = librosa.feature.mfcc(y=data.astype(np.float32), sr=sr, n_mfcc=n_mfcc, n_mels=40)
    for i in range(n_mfcc):
        features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])

    fft_values = np.abs(fft.fft(data))
    fft_freq = fft.fftfreq(len(data), 1/sr)
    # Only the single largest bin is needed, so argmax replaces a full sort
    # of the spectrum. For real input the mirrored +/- frequency bins share
    # the same magnitude and absolute frequency, so the result is unchanged.
    # NOTE(review): the mean is not removed before the FFT, so for a signal
    # with a non-zero mean the 0 Hz bin may be the one selected - confirm
    # that this is the intended "dominant frequency".
    dominant_bin = np.argmax(fft_values)
    features['fft_magnitude_1'] = fft_values[dominant_bin]
    features['fft_frequency_1'] = np.abs(fft_freq[dominant_bin])
    return features

def calculate_selected_tsfresh(x):
    """Compute a hand-picked subset of tsfresh feature calculators.

    Args:
        x: 1-D numeric array holding one segment.

    Returns:
        dict with keys 'tsf_abs_energy', 'tsf_mean_abs_change',
        'tsf_binned_entropy', 'tsf_autocorrelation_lag10',
        'tsf_longest_strike_above_mean' and 'tsf_ar_coefficient_1'.
        'tsf_ar_coefficient_1' is NaN when the AR fit raises.

    NOTE: this function used to contain further calculations (FFT
    coefficients, Welch density, CWT coefficient, approximate entropy)
    placed after the `return` statement. They never executed, so they have
    been removed; the returned dict is unchanged.
    """
    features = {
        'tsf_abs_energy': feature_calculators.abs_energy(x),
        'tsf_mean_abs_change': feature_calculators.mean_abs_change(x),
        'tsf_binned_entropy': feature_calculators.binned_entropy(x, max_bins=10),
        'tsf_autocorrelation_lag10': feature_calculators.autocorrelation(x, lag=10),
        'tsf_longest_strike_above_mean': feature_calculators.longest_strike_above_mean(x),
    }
    try:
        features['tsf_ar_coefficient_1'] = feature_calculators.ar_coefficient(x, [{"coeff": 1, "k": 10}])[0][1]
    except Exception:
        # Deliberate best-effort: a failed AR fit must not abort the whole
        # segment, so the feature is recorded as missing instead.
        features['tsf_ar_coefficient_1'] = np.nan
    return features

def segment_generator(cycle_df, segment_len=150000):
    """Yield (acoustic segment, target) pairs, walking a cycle from its end.

    Full segments of `segment_len` rows are cut backwards from the last row,
    so the first pair yielded is the one closest to the end of the cycle.
    If rows are left over at the start of the frame, one extra segment is
    yielded: those leftover rows, left-padded with their (rounded) median up
    to `segment_len` samples.

    Args:
        cycle_df: DataFrame with 'acoustic_data' and 'time_to_failure'
            columns.
        segment_len: number of samples per segment.

    Yields:
        (pandas Series of acoustic samples, 'time_to_failure' of the
        segment's last row).
    """
    n_samples = len(cycle_df)
    for end in range(n_samples, segment_len - 1, -segment_len):
        segment_df = cycle_df.iloc[end - segment_len:end]
        yield segment_df['acoustic_data'], segment_df['time_to_failure'].iloc[-1]

    rem_len = n_samples % segment_len
    if rem_len > 0:
        leftover_df = cycle_df.iloc[:rem_len]
        # The median of an even number of integers can end in .5. Building an
        # int16 Series from such a float raises, which previously aborted the
        # whole event, so round to the nearest integer first.
        padding_value = int(np.round(leftover_df['acoustic_data'].median()))
        padding = pd.Series(np.full(segment_len - rem_len, padding_value, dtype=np.int16))
        padded_segment = pd.concat([padding, leftover_df['acoustic_data']], ignore_index=True)
        yield padded_segment, leftover_df['time_to_failure'].iloc[-1]

# Predict every test segment with the per-fold models and blend the two
# model families using the weight found on the OOF predictions.
for seg_id in tqdm(test_files, desc="Predicting on test segments"):
    test_df = pd.read_csv(f'/kaggle/input/testing/test/{seg_id}.csv')

    # The target is unknown at inference time; 0 is a placeholder that is
    # dropped by the column selection below.
    test_features = extract_comprehensive_features(test_df['acoustic_data'], target=0)
    # Training data had NaNs replaced by 0, so apply the same fill here.
    # Otherwise a NaN feature (e.g. a failed AR fit) would reach the scaler
    # and the Transformer, which were fit on filled data.
    X_test = pd.DataFrame([test_features])[final_feature_list].fillna(0)

    fold_preds_lgb = []
    fold_preds_transformer = []

    for model_lgb, model_transformer, scaler in zip(lgb_models, transformer_models, scalers):
        fold_preds_lgb.append(model_lgb.predict(X_test)[0])

        # Same preprocessing as in training: the fold's scaler, then
        # reshape to (1, 1, n_features).
        X_test_scaled = scaler.transform(X_test)
        X_test_reshaped = X_test_scaled.reshape((1, 1, X_test.shape[1]))
        pred_transformer = model_transformer.predict(X_test_reshaped, verbose=0)[0][0]
        fold_preds_transformer.append(pred_transformer)

    avg_pred_lgb = np.mean(fold_preds_lgb)
    avg_pred_transformer = np.mean(fold_preds_transformer)

    final_prediction = best_w * avg_pred_lgb + (1 - best_w) * avg_pred_transformer
    predictions.append(final_prediction)
    gc.collect()

submission['time_to_failure'] = predictions
submission.to_csv('submission.csv')

print("\nSubmission file created!")
print("-First 5 predictions-")
print(submission.head())