In [1]:
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List, Union, Optional
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
import gc
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Type aliases for enhanced clarity and Pylance compatibility
ArrayLike = Union[np.ndarray, pd.Series]
FloatArray = np.ndarray  # Explicit float array type
PredictionArray = np.ndarray  # Standardized prediction output

In [3]:


print("🚀 Type-Safe Advanced CTR Prediction with Temporal Validation & Stacking")
print("=" * 85)

# =============================================
# 1. TYPE-SAFE DATA LOADING WITH VALIDATION
# =============================================

def load_data_in_chunks(filepath: str, chunksize: int = 1000000) -> pd.DataFrame:
    """
    Type-safe chunked data loading with comprehensive validation
    
    Args:
        filepath: Path to compressed CSV file
        chunksize: Number of rows per chunk for memory optimization
        
    Returns:
        Concatenated DataFrame with validated structure
    """
    chunks: List[pd.DataFrame] = []
    total_rows: int = 0
    
    for chunk in pd.read_csv(filepath, chunksize=chunksize, compression='gzip'):
        chunks.append(chunk)
        total_rows += len(chunk)
        print(f"Loaded {total_rows:,} rows...")
            
    consolidated_df: pd.DataFrame = pd.concat(chunks, ignore_index=True)
    return consolidated_df

print("📊 Loading temporal training data with type validation...")
train_df: pd.DataFrame = load_data_in_chunks('ctr_train.csv.gz')

print(f"Training set shape: {train_df.shape}")
print(f"Temporal range: {train_df['hour'].min()} - {train_df['hour'].max()}")

# Explicit type casting for numerical stability
ctr: float = float(train_df['click'].mean())
print(f"Base CTR: {ctr:.4f} ({ctr*100:.2f}%)")
print(f"Class imbalance ratio: {(1.0-ctr)/ctr:.1f}:1")

🚀 Type-Safe Advanced CTR Prediction with Temporal Validation & Stacking
📊 Loading temporal training data with type validation...
Loaded 1,000,000 rows...
Loaded 2,000,000 rows...
Loaded 3,000,000 rows...
Loaded 4,000,000 rows...
Loaded 5,000,000 rows...
Loaded 6,000,000 rows...
Loaded 7,000,000 rows...
Loaded 8,000,000 rows...
Loaded 9,000,000 rows...
Loaded 10,000,000 rows...
Loaded 11,000,000 rows...
Loaded 12,000,000 rows...
Loaded 13,000,000 rows...
Loaded 14,000,000 rows...
Loaded 15,000,000 rows...
Loaded 16,000,000 rows...
Loaded 17,000,000 rows...
Loaded 18,000,000 rows...
Loaded 19,000,000 rows...
Loaded 20,000,000 rows...
Loaded 21,000,000 rows...
Loaded 22,000,000 rows...
Loaded 23,000,000 rows...
Loaded 24,000,000 rows...
Loaded 25,000,000 rows...
Loaded 26,000,000 rows...
Loaded 27,000,000 rows...
Loaded 28,000,000 rows...
Loaded 29,000,000 rows...
Loaded 30,000,000 rows...
Loaded 31,000,000 rows...
Loaded 32,000,000 rows...
Loaded 33,000,000 rows...
Loaded 34,000,000 rows

In [4]:
# =============================================
# 2. ROBUST TEMPORAL FEATURE ENGINEERING
# =============================================

def create_comprehensive_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Type-safe comprehensive temporal feature engineering with cyclical encoding
    and interaction terms optimized for CTR prediction tasks
    
    Args:
        df: Input DataFrame with 'hour' column in YYMMDDHH format
        
    Returns:
        Enhanced DataFrame with engineered temporal features
    """
    df_enhanced: pd.DataFrame = df.copy()
    
    # Primary temporal decomposition with explicit int conversion
    hour_series: pd.Series = df_enhanced['hour'].astype(np.int64)
    df_enhanced['hour_of_day'] = (hour_series % 100).astype(np.int32)
    df_enhanced['day'] = ((hour_series // 100) % 100).astype(np.int32)
    df_enhanced['month'] = ((hour_series // 10000) % 100).astype(np.int32)
    df_enhanced['year'] = (hour_series // 1000000).astype(np.int32)
    
    # Direct day of week calculation (assumes correct format)
    datetime_series = pd.to_datetime(hour_series.astype(str), format='%y%m%d%H')
    df_enhanced['day_of_week'] = datetime_series.dt.dayofweek.astype(np.int32)
    
    # Cyclical encoding for temporal periodicity preservation
    hour_of_day_float: FloatArray = df_enhanced['hour_of_day'].astype(np.float64).to_numpy()
    day_of_week_float: FloatArray = df_enhanced['day_of_week'].astype(np.float64).to_numpy()
    
    df_enhanced['hour_sin'] = np.sin(2.0 * np.pi * hour_of_day_float / 24.0).astype(np.float32)
    df_enhanced['hour_cos'] = np.cos(2.0 * np.pi * hour_of_day_float / 24.0).astype(np.float32)
    df_enhanced['dow_sin'] = np.sin(2.0 * np.pi * day_of_week_float / 7.0).astype(np.float32)
    df_enhanced['dow_cos'] = np.cos(2.0 * np.pi * day_of_week_float / 7.0).astype(np.float32)
    
    # Business logic features with explicit boolean conversion
    weekend_mask: pd.Series = (df_enhanced['day_of_week'] >= 5)
    business_hour_mask: pd.Series = (
        (df_enhanced['hour_of_day'] >= 9) & 
        (df_enhanced['hour_of_day'] <= 17) &
        (~weekend_mask)
    )
    
    df_enhanced['is_weekend'] = weekend_mask.astype(np.int8)
    df_enhanced['is_business_hour'] = business_hour_mask.astype(np.int8)
    
    # Time period categorization with robust binning
    hour_bins: List[float] = [-0.1, 6.0, 12.0, 18.0, 24.0]
    hour_labels: List[int] = [0, 1, 2, 3]  # night, morning, day, evening
    
    df_enhanced['time_period'] = pd.cut(
        hour_of_day_float, 
        bins=hour_bins, 
        labels=hour_labels,
        include_lowest=True
    ).fillna(0).astype(np.int8)
    
    new_features: int = len([c for c in df_enhanced.columns if c not in df.columns])
    print(f"✅ Created {new_features} type-safe temporal features")
    return df_enhanced

# Apply temporal feature engineering with type safety
train_df = create_comprehensive_temporal_features(train_df)

✅ Created 12 type-safe temporal features


In [5]:
# =============================================
# 3. TYPE-SAFE FREQUENCY ENCODING IMPLEMENTATION
# =============================================

def frequency_encoding_with_smoothing(
    train_df: pd.DataFrame, 
    val_df: pd.DataFrame, 
    test_df: pd.DataFrame,
    high_card_cols: List[str], 
    smoothing_factor: float = 10.0
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Type-safe frequency encoding with Laplace smoothing for high-cardinality
    categorical features, preventing overfitting on rare categories
    
    Args:
        train_df: Training DataFrame
        val_df: Validation DataFrame  
        test_df: Test DataFrame
        high_card_cols: List of high-cardinality column names
        smoothing_factor: Laplace smoothing parameter (α)
        
    Returns:
        Tuple of enhanced DataFrames with frequency-encoded features
    """
    print(f"🔄 Applying type-safe frequency encoding with α={smoothing_factor} smoothing...")
    
    encoded_dfs: Dict[str, pd.DataFrame] = {
        'train': train_df.copy(), 
        'val': val_df.copy(), 
        'test': test_df.copy()
    }
    
    for col in high_card_cols:
        if col not in train_df.columns:
            continue
            
        # Type-safe frequency computation on training data only
        col_series: pd.Series = train_df[col].astype(str)
        freq_map: Dict[str, int] = col_series.value_counts().to_dict()
        total_count: int = len(train_df)
        vocab_size: int = len(freq_map)
        
        # Laplace smoothing with explicit float conversion
        def smooth_frequency(value: str) -> float:
            raw_freq: int = freq_map.get(str(value), 0)
            smoothed: float = (float(raw_freq) + smoothing_factor) / (
                float(total_count) + smoothing_factor * float(vocab_size)
            )
            return smoothed
        
        # Default frequency for unseen values
        unseen_freq: float = smoothing_factor / (
            float(total_count) + smoothing_factor * float(vocab_size)
        )
        
        # Apply frequency encoding to all datasets with type consistency
        for df_name, df in encoded_dfs.items():
            col_values: pd.Series = df[col].astype(str)
            frequency_array: FloatArray = np.array([
                smooth_frequency(val) for val in col_values
            ], dtype=np.float32)
            
            encoded_dfs[df_name][f'{col}_freq'] = frequency_array
            
        print(f"  {col}: {vocab_size:,} unique values → frequency encoded")
    
    return encoded_dfs['train'], encoded_dfs['val'], encoded_dfs['test']


In [6]:
# =============================================
# 4. TYPE-SAFE CTR AGGREGATION FEATURES
# =============================================

def create_ctr_aggregation_features(
    train_df: pd.DataFrame, 
    val_df: pd.DataFrame, 
    test_df: pd.DataFrame,
    target_col: str = 'click', 
    min_samples: int = 50
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Create type-safe historical CTR aggregation features with temporal safety
    to prevent data leakage while capturing categorical-target interactions
    
    Args:
        train_df: Training DataFrame
        val_df: Validation DataFrame
        test_df: Test DataFrame
        target_col: Name of binary target column
        min_samples: Minimum samples required for reliable CTR estimation
        
    Returns:
        Tuple of DataFrames enhanced with CTR-based features
    """
    print(f"📈 Creating type-safe CTR aggregation features (min_samples={min_samples})...")
    print(f"  Train temporal range: {train_df['hour'].min()} - {train_df['hour'].max()}")
    print(f"  Val temporal range: {val_df['hour'].min()} - {val_df['hour'].max()}")
    print(f"  Test temporal range: {test_df['hour'].min()} - {test_df['hour'].max()}")
    
    categorical_cols: List[str] = [
        'site_category', 'app_category', 'device_type', 'device_conn_type',
        'banner_pos', 'hour_of_day', 'day_of_week', 'time_period'
    ]
    
    train_enhanced: pd.DataFrame = train_df.copy()
    val_enhanced: pd.DataFrame = val_df.copy()
    test_enhanced: pd.DataFrame = test_df.copy()
    
    # Global CTR with explicit float conversion
    global_ctr: float = float(train_df[target_col].mean())
    print(f"  Global CTR baseline: {global_ctr:.4f}")
    
    for col in categorical_cols:
        if col not in train_df.columns:
            continue
            
        # Type-safe CTR statistics computation
        ctr_stats: pd.DataFrame = (
            train_df.groupby(col)[target_col]
            .agg(['count', 'mean', 'std'])
            .reset_index()
        )
        ctr_stats.columns = [col, f'{col}_count', f'{col}_ctr', f'{col}_ctr_std']
        
        # Filter by minimum sample requirement with type safety
        count_mask: pd.Series = ctr_stats[f'{col}_count'] >= min_samples
        reliable_stats: pd.DataFrame = ctr_stats[count_mask].copy()
        
        # Create type-safe mapping dictionaries
        ctr_map: Dict[Union[str, int, float], float] = dict(
            zip(reliable_stats[col], reliable_stats[f'{col}_ctr'].astype(float))
        )
        count_map: Dict[Union[str, int, float], int] = dict(
            zip(reliable_stats[col], reliable_stats[f'{col}_count'].astype(int))
        )
        std_map: Dict[Union[str, int, float], float] = dict(
            zip(reliable_stats[col], reliable_stats[f'{col}_ctr_std'].fillna(0.0).astype(float))
        )
        
        # Apply to all datasets with explicit type handling
        datasets: List[Tuple[pd.DataFrame, str]] = [
            (train_enhanced, 'train'), 
            (val_enhanced, 'val'), 
            (test_enhanced, 'test')
        ]
        
        for df, df_name in datasets:
            # Type-safe mapping with fallback handling
            ctr_values: FloatArray = np.array([
                ctr_map.get(val, global_ctr) for val in df[col]
            ], dtype=np.float32)
            
            count_values: np.ndarray = np.array([
                count_map.get(val, 0) for val in df[col]
            ], dtype=np.int32)
            
            std_values: FloatArray = np.array([
                std_map.get(val, 0.0) for val in df[col]
            ], dtype=np.float32)
            
            df[f'{col}_historical_ctr'] = ctr_values
            df[f'{col}_sample_count'] = count_values
            df[f'{col}_ctr_std'] = std_values
            
            # Confidence score with logarithmic scaling
            df[f'{col}_ctr_confidence'] = np.log1p(count_values.astype(np.float32))
        
        reliable_categories: int = len(reliable_stats)
        total_categories: int = len(ctr_stats)
        print(f"  {col}: {reliable_categories}/{total_categories} categories with sufficient samples")
    
    return train_enhanced, val_enhanced, test_enhanced

In [7]:
# =============================================
# 5. TYPE-SAFE TEMPORAL DATA SPLITTING
# =============================================

print("\n🕒 IMPLEMENTING TYPE-SAFE TEMPORAL DATA SPLITTING")
print("=" * 55)

# Sort by temporal order with explicit type validation
train_df_sorted: pd.DataFrame = train_df.sort_values('hour').reset_index(drop=True)

# Temporal split with precise indexing
split_idx: int = int(len(train_df_sorted) * 0.8)

train_temporal: pd.DataFrame = train_df_sorted.iloc[:split_idx].copy()
val_temporal: pd.DataFrame = train_df_sorted.iloc[split_idx:].copy()

# Type-safe temporal integrity verification
train_max_time: int = int(train_temporal['hour'].max())
val_min_time: int = int(val_temporal['hour'].min())

print(f"📊 Temporal split verification:")
print(f"  Training: {train_temporal['hour'].min()} → {train_max_time}")
print(f"  Validation: {val_min_time} → {val_temporal['hour'].max()}")
leakage_status: str = '✅ No leakage' if train_max_time <= val_min_time else '❌ LEAKAGE DETECTED'
print(f"  Temporal gap: {leakage_status}")

# CTR distribution analysis with type safety
train_ctr: float = float(train_temporal['click'].mean())
val_ctr: float = float(val_temporal['click'].mean())
relative_diff: float = abs(train_ctr - val_ctr) / train_ctr * 100.0

print(f"📈 CTR distribution stability:")
print(f"  Training CTR: {train_ctr:.4f}")
print(f"  Validation CTR: {val_ctr:.4f}")
print(f"  Relative difference: {relative_diff:.2f}%")


🕒 IMPLEMENTING TYPE-SAFE TEMPORAL DATA SPLITTING
📊 Temporal split verification:
  Training: 14102100 → 14102823
  Validation: 14102823 → 14103023
  Temporal gap: ✅ No leakage
📈 CTR distribution stability:
  Training CTR: 0.1715
  Validation CTR: 0.1632
  Relative difference: 4.84%


In [8]:
# =============================================
# 6. TYPE-SAFE TEST DATA LOADING AND PREPROCESSING
# =============================================

print("\n📥 Loading test data with type-safe preprocessing...")
test_df: pd.DataFrame = pd.read_csv('ctr_test.csv')
test_df = create_comprehensive_temporal_features(test_df)

print(f"Test set shape: {test_df.shape}")
print(f"Test temporal range: {test_df['hour'].min()} → {test_df['hour'].max()}")



📥 Loading test data with type-safe preprocessing...
✅ Created 7 type-safe temporal features
Test set shape: (40032, 37)
Test temporal range: 14102100 → 14102101


In [9]:
# =============================================
# 7. FEATURE ENGINEERING PIPELINE EXECUTION
# =============================================

# High-cardinality categorical features for frequency encoding
high_cardinality_features: List[str] = ['device_id', 'site_id', 'device_ip', 'app_id', 'device_model']

# Apply frequency encoding with type safety
train_freq, val_freq, test_freq = frequency_encoding_with_smoothing(
    train_temporal, val_temporal, test_df, 
    high_cardinality_features, 
    smoothing_factor=10.0
)

# Apply CTR aggregation features with type safety
train_final, val_final, test_final = create_ctr_aggregation_features(
    train_freq, val_freq, test_freq
)

print(f"\n📏 Final feature matrix dimensions:")
print(f"  Training: {train_final.shape}")
print(f"  Validation: {val_final.shape}")
print(f"  Test: {test_final.shape}")

# Prepare model features
def prepare_model_features(train_df, val_df, test_df):
    exclude_cols = {'idx', 'id', 'click', 'hour'}
    feature_cols = [col for col in train_df.columns if col not in exclude_cols]
    
    X_train = train_df.loc[:, feature_cols]
    y_train = train_df['click']
    
    X_val = val_df.loc[:, feature_cols]
    y_val = val_df['click']
    
    X_test = test_df.loc[:, feature_cols]

    dtypes = X_train.dtypes.to_dict()
    nunique_map = X_train.nunique(dropna=False).to_dict()
    
    engineered_suffixes = ('_freq', '_ctr', '_std', '_confidence', '_count', '_sin', '_cos')
    always_categorical = {'is_weekend', 'is_business_hour', 'time_period'}

    categorical_features = []
    threshold = 0.5 * len(X_train)

    for col in feature_cols:
        if col.endswith(engineered_suffixes):
            continue
        if col in always_categorical:
            categorical_features.append(col)
        elif dtypes[col] == 'object' or nunique_map[col] < threshold:
            categorical_features.append(col)

    print(f"📋 Feature preparation complete:")
    print(f"  Total features: {len(feature_cols)}")
    print(f"  Categorical features: {len(categorical_features)}")
    print(f"  Numerical features: {len(feature_cols) - len(categorical_features)}")

    for df in (X_train, X_val, X_test):
        for col in categorical_features:
            if df[col].dtype.name != 'category':
                df[col] = df[col].astype('category')

    return X_train, y_train, X_val, y_val, X_test, categorical_features

X_train, y_train, X_val, y_val, X_test, categorical_features = prepare_model_features(
    train_final, val_final, test_final
)

# Memory cleanup
to_delete = [
    'train_df', 'train_df_sorted', 'train_temporal', 'val_temporal',
    'train_freq', 'val_freq', 'test_freq', 'train_final', 'val_final'
]

for var in to_delete:
    globals()[var] = None

gc.collect()

print(f"🧠 Memory cleanup completed. Training on {len(X_train):,} samples.")


🔄 Applying type-safe frequency encoding with α=10.0 smoothing...
  device_id: 2,215,224 unique values → frequency encoded
  site_id: 4,550 unique values → frequency encoded
  device_ip: 5,559,975 unique values → frequency encoded
  app_id: 8,092 unique values → frequency encoded
  device_model: 8,034 unique values → frequency encoded
📈 Creating type-safe CTR aggregation features (min_samples=50)...
  Train temporal range: 14102100 - 14102823
  Val temporal range: 14102823 - 14103023
  Test temporal range: 14102100 - 14102101
  Global CTR baseline: 0.1715
  site_category: 20/26 categories with sufficient samples
  app_category: 25/36 categories with sufficient samples
  device_type: 4/5 categories with sufficient samples
  device_conn_type: 4/4 categories with sufficient samples
  banner_pos: 7/7 categories with sufficient samples
  hour_of_day: 24/24 categories with sufficient samples
  day_of_week: 7/7 categories with sufficient samples
  time_period: 4/4 categories with sufficient sa

In [11]:
# =============================================
# 9. TYPE-SAFE 5-FOLD CROSS-VALIDATION AND BASE MODEL TRAINING
# =============================================

print("\n🌟 TYPE-SAFE 5-FOLD CROSS-VALIDATION AND BASE MODEL TRAINING")
print("=" * 60)

# Calculate class balance
neg_count: int = int((y_train == 0).sum())
pos_count: int = int((y_train == 1).sum())
scale_pos_weight: float = float(neg_count) / float(pos_count)

# Base LightGBM parameters
lgb_base_params: Dict[str, Union[str, int, float, bool]] = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'min_data_in_leaf': 100,
    'min_gain_to_split': 0.02,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_bin': 255,
    'num_threads': 4,
    'scale_pos_weight': scale_pos_weight,
    'verbose': -1,
    'seed': 42
}

# Base CatBoost parameters
catboost_base_params: Dict[str, Union[int, float, str, bool]] = {
    'iterations': 2000,
    'l2_leaf_reg': 3.0,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_seed': 42,
    'early_stopping_rounds': 50,
    'use_best_model': True,
    'task_type': 'CPU',
    'thread_count': 4,
    'verbose': 0
}

# Hyperparameter grids
lgb_param_grid = {
    'num_leaves': [63, 127],
    'learning_rate': [0.01, 0.02]
}

catboost_param_grid = {
    'depth': [6, 8],
    'learning_rate': [0.01, 0.02]
}

# Initialize arrays for out-of-fold predictions
lgb_oof_pred = np.zeros(len(X_train), dtype=np.float64)
catboost_oof_pred = np.zeros(len(X_train), dtype=np.float64)
val_oof_pred = np.zeros(len(X_val), dtype=np.float64)
lgb_test_pred = np.zeros(len(X_test), dtype=np.float64)
catboost_test_pred = np.zeros(len(X_test), dtype=np.float64)

# 5-fold cross-validation with TimeSeriesSplit
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

print("🔧 Performing 5-fold cross-validation for base models...")

# Store best parameters for final models
best_lgb_params: Optional[Dict[str, Union[str, int, float, bool]]] = None
best_lgb_val_auc: float = 0.0
best_catboost_params: Optional[Dict[str, Union[int, float, str, bool]]] = None
best_catboost_val_auc: float = 0.0

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train), 1):
    print(f"\nFold {fold}/{n_splits}")
    
    # Split data
    X_fold_train = X_train.iloc[train_idx]
    y_fold_train = y_train.iloc[train_idx]
    X_fold_val = X_train.iloc[val_idx]
    y_fold_val = y_train.iloc[val_idx]
    
    # LightGBM tuning
    fold_best_lgb_params = None
    fold_best_lgb_val_auc = 0.0
    
    for num_leaves in lgb_param_grid['num_leaves']:
        for learning_rate in lgb_param_grid['learning_rate']:
            print(f"  LightGBM: Testing num_leaves={num_leaves}, learning_rate={learning_rate}")
            lgb_params_tune = lgb_base_params.copy()
            lgb_params_tune['num_leaves'] = num_leaves
            lgb_params_tune['learning_rate'] = learning_rate
            
            train_lgb = lgb.Dataset(
                X_fold_train,
                label=y_fold_train.astype(np.int8),
                categorical_feature=categorical_features,
                free_raw_data=True
            )
            val_lgb = lgb.Dataset(
                X_fold_val,
                label=y_fold_val.astype(np.int8),
                categorical_feature=categorical_features,
                reference=train_lgb,
                free_raw_data=True
            )
            
            lgb_model_tune = lgb.train(
                lgb_params_tune,
                train_lgb,
                valid_sets=[train_lgb, val_lgb],
                valid_names=['train', 'eval'],
                num_boost_round=1000,
                callbacks=[
                    lgb.early_stopping(stopping_rounds=50, verbose=False),
                    lgb.log_evaluation(period=0)
                ]
            )
            
            lgb_val_pred_tune = np.asarray(lgb_model_tune.predict(X_fold_val, num_iteration=lgb_model_tune.best_iteration), dtype=np.float64)
            val_auc = roc_auc_score(y_fold_val, lgb_val_pred_tune)
            print(f"    Validation AUC: {val_auc:.6f}")
            
            if val_auc > fold_best_lgb_val_auc:
                fold_best_lgb_val_auc = val_auc
                fold_best_lgb_params = lgb_params_tune
    
    if fold_best_lgb_params is None:
        raise ValueError(f"No best LightGBM parameters found for fold {fold}")
    
    # Update global best if better
    if fold_best_lgb_val_auc > best_lgb_val_auc:
        best_lgb_val_auc = float(fold_best_lgb_val_auc)
        best_lgb_params = fold_best_lgb_params
    
    # Train LightGBM with best fold parameters for OOF predictions
    train_lgb = lgb.Dataset(
        X_fold_train,
        label=y_fold_train.astype(np.int8),
        categorical_feature=categorical_features,
        free_raw_data=True
    )
    val_lgb = lgb.Dataset(
        X_fold_val,
        label=y_fold_val.astype(np.int8),
        categorical_feature=categorical_features,
        reference=train_lgb,
        free_raw_data=True
    )
    
    lgb_model = lgb.train(
        fold_best_lgb_params,
        train_lgb,
        valid_sets=[train_lgb, val_lgb],
        valid_names=['train', 'eval'],
        num_boost_round=1400,
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0)
        ]
    )
    
    lgb_oof_pred[val_idx] = np.asarray(lgb_model.predict(X_fold_val, num_iteration=lgb_model.best_iteration), dtype=np.float64)
    lgb_test_pred += np.asarray(lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration), dtype=np.float64) / n_splits
    
    # CatBoost tuning
    fold_best_catboost_params = None
    fold_best_catboost_val_auc = 0.0
    
    for depth in catboost_param_grid['depth']:
        for learning_rate in catboost_param_grid['learning_rate']:
            print(f"  CatBoost: Testing depth={depth}, learning_rate={learning_rate}")
            catboost_params_tune = catboost_base_params.copy()
            catboost_params_tune['depth'] = depth
            catboost_params_tune['learning_rate'] = learning_rate
            
            catboost_model_tune = CatBoostClassifier(**catboost_params_tune)
            catboost_model_tune.fit(
                X_fold_train, y_fold_train,
                cat_features=categorical_features,
                eval_set=(X_fold_val, y_fold_val),
                verbose=0
            )
            
            catboost_val_pred_tune = np.asarray(catboost_model_tune.predict_proba(X_fold_val)[:, 1], dtype=np.float64)
            val_auc = roc_auc_score(y_fold_val, catboost_val_pred_tune)
            print(f"    Validation AUC: {val_auc:.6f}")
            
            if val_auc > fold_best_catboost_val_auc:
                fold_best_catboost_val_auc = val_auc
                fold_best_catboost_params = catboost_params_tune
    
    if fold_best_catboost_params is None:
        raise ValueError(f"No best CatBoost parameters found for fold {fold}")
    
    # Update global best if better
    if fold_best_catboost_val_auc > best_catboost_val_auc:
        best_catboost_val_auc = float(fold_best_catboost_val_auc)
        best_catboost_params = fold_best_catboost_params
    
    # Train CatBoost with best fold parameters for OOF predictions
    catboost_model = CatBoostClassifier(**fold_best_catboost_params)
    catboost_model.fit(
        X_fold_train, y_fold_train,
        cat_features=categorical_features,
        eval_set=(X_fold_val, y_fold_val),
        verbose=0
    )
    
    catboost_oof_pred[val_idx] = np.asarray(catboost_model.predict_proba(X_fold_val)[:, 1], dtype=np.float64)
    catboost_test_pred += np.asarray(catboost_model.predict_proba(X_test)[:, 1], dtype=np.float64) / n_splits

# Generate validation predictions for meta-model
# (Moved to after final base model training to ensure lgb_model and catboost_model are defined)
# lgb_val_pred = np.asarray(lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration), dtype=np.float64)
# catboost_val_pred = np.asarray(catboost_model.predict_proba(X_val)[:, 1], dtype=np.float64)
# val_oof_pred = np.column_stack((lgb_val_pred, catboost_val_pred))

# Memory cleanup after cross-validation
gc.collect()



🌟 TYPE-SAFE 5-FOLD CROSS-VALIDATION AND BASE MODEL TRAINING
🔧 Performing 5-fold cross-validation for base models...

Fold 1/5
  LightGBM: Testing num_leaves=63, learning_rate=0.01


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import time, gc
from typing import Dict, Union, Optional

# === Настройка ===
n_splits = 5  # полноценная кросс-валидация
sample_frac = 1.0  # работаем с полным датасетом

# 🧪 Подсэмплирование
X_train_sample = X_train.sample(frac=sample_frac, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

print(f"✅ Training data: {X_train_sample.shape}")

# ⚖️ Баланс классов
neg_count = int((y_train_sample == 0).sum())
pos_count = int((y_train_sample == 1).sum())
scale_pos_weight = float(neg_count) / float(pos_count)

# 💡 Преобразование категориальных признаков для CatBoost
categorical_features = X_train_sample.select_dtypes(include=['category', 'object']).columns.tolist()
le_dict = {}
for col in categorical_features:
    le = LabelEncoder()
    str_values = X_train_sample[col].astype(str).tolist()
    encoded_values = le.fit_transform(str_values)
    encoded_values = np.asarray(encoded_values, dtype=np.int32)
    X_train_sample[col] = encoded_values
    le_dict[col] = le

# 🎯 Приведение float признаков к float32
float_cols = X_train_sample.select_dtypes(include=['float64']).columns
X_train_sample[float_cols] = X_train_sample[float_cols].astype(np.float32)

# 📊 LightGBM базовые параметры
lgb_base_params: Dict[str, Union[str, int, float, bool]] = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'min_data_in_leaf': 100,
    'min_gain_to_split': 0.02,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_bin': 255,
    'num_threads': 4,
    'scale_pos_weight': scale_pos_weight,
    'verbose': -1,
    'seed': 42
}

# 🐱 CatBoost базовые параметры
catboost_base_params: Dict[str, Union[int, float, str, bool]] = {
    'iterations': 1000,
    'l2_leaf_reg': 3.0,
    'bootstrap_type': 'Bernoulli',
    'subsample': 0.8,
    'scale_pos_weight': scale_pos_weight,
    'eval_metric': 'AUC',
    'loss_function': 'Logloss',
    'random_seed': 42,
    'early_stopping_rounds': 50,
    'use_best_model': True,
    'task_type': 'CPU',
    'thread_count': 4,
    'verbose': 0
}

# 📦 OOF предсказания
lgb_oof_pred = np.zeros(len(X_train_sample), dtype=np.float32)
catboost_oof_pred = np.zeros(len(X_train_sample), dtype=np.float32)

# ⚙️ Кросс-валидация
tscv = TimeSeriesSplit(n_splits=n_splits)
print("\n🌟 TYPE-SAFE 5-FOLD CV + BASE MODELS TRAINING")
print("=" * 60)

# 💾 Лучшая модель
best_lgb_auc, best_cat_auc = 0.0, 0.0
best_lgb_params, best_cat_params = None, None

global_start = time.time()

for fold, (train_idx, val_idx) in enumerate(tscv.split(X_train_sample), 1):
    print(f"\n🔁 Fold {fold}/{n_splits}")
    fold_start = time.time()

    X_tr, X_val = X_train_sample.iloc[train_idx].copy(), X_train_sample.iloc[val_idx].copy()
    y_tr, y_val = y_train_sample.iloc[train_idx], y_train_sample.iloc[val_idx]

    # === LIGHTGBM ===
    print("🟩 LightGBM training...")
    lgb_train = lgb.Dataset(X_tr, label=y_tr.astype(float))
    lgb_valid = lgb.Dataset(X_val, label=y_val.astype(float))

    lgb_model = lgb.train(
        lgb_base_params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=['train', 'val'],
        callbacks=[
            lgb.early_stopping(50, verbose=False),
            lgb.log_evaluation(100)
        ]
    )

    pred = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
    pred = np.asarray(pred, dtype=np.float32).ravel()
    auc = roc_auc_score(y_val, pred)
    print(f"    ✅ LightGBM AUC: {auc:.5f} | Time: {time.time() - fold_start:.1f}s")

    if auc > best_lgb_auc:
        best_lgb_auc = auc
        best_lgb_params = lgb_base_params.copy()

    lgb_oof_pred[val_idx] = pred

    # === CATBOOST ===
    print("🐱 CatBoost training...")
    cat_model = CatBoostClassifier(**catboost_base_params)
    cat_model.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        cat_features=[X_tr.columns.get_loc(col) for col in categorical_features if col in X_tr.columns],
        verbose=0
    )

    pred = cat_model.predict_proba(X_val)[:, 1]
    pred = np.asarray(pred, dtype=np.float32).ravel()
    auc = roc_auc_score(y_val, pred)
    print(f"    ✅ CatBoost AUC: {auc:.5f} | Time: {(time.time() - fold_start) / 60:.2f} min")

    if auc > best_cat_auc:
        best_cat_params = catboost_base_params.copy()
        best_cat_auc = auc

    catboost_oof_pred[val_idx] = pred

    print(f"⏱ Fold {fold} done in {(time.time() - fold_start)/60:.2f} min")

print("\n🏁 All folds completed")
print(f"🟩 Best LightGBM AUC: {best_lgb_auc:.5f}")
print(f"🐱 Best CatBoost AUC: {best_cat_auc:.5f}")
print(f"⏱ Total time: {(time.time() - global_start)/60:.2f} min")

gc.collect()

✅ Training data: (32311148, 70)

🌟 TYPE-SAFE 5-FOLD CV + BASE MODELS TRAINING

🔁 Fold 1/5
🟩 LightGBM training...
[100]	train's auc: 0.746482	val's auc: 0.74535
[200]	train's auc: 0.752802	val's auc: 0.75125


In [None]:
# =============================================
# 10. TYPE-SAFE FINAL BASE MODEL TRAINING
# =============================================

print("\n🌟 TYPE-SAFE FINAL BASE MODEL TRAINING")
print("=" * 45)

# Train final LightGBM model
if best_lgb_params is None:
    raise ValueError("No best LightGBM parameters found during tuning")
train_lgb = lgb.Dataset(
    X_train,
    label=y_train.astype(np.int8),
    categorical_feature=categorical_features,
    free_raw_data=True
)
val_lgb = lgb.Dataset(
    X_val,
    label=y_val.astype(np.int8),
    categorical_feature=categorical_features,
    reference=train_lgb,
    free_raw_data=True
)

lgb_model = lgb.train(
    best_lgb_params,
    train_lgb,
    valid_sets=[train_lgb, val_lgb],
    valid_names=['train', 'eval'],
    num_boost_round=1400,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=True),
        lgb.log_evaluation(period=200)
    ]
)

# Train final CatBoost model
if best_catboost_params is None:
    raise ValueError("No best CatBoost parameters found during tuning")
catboost_model = CatBoostClassifier(**best_catboost_params)
catboost_model.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_val, y_val),
    verbose=200
)
# Base model predictions for evaluation
lgb_train_pred = np.asarray(lgb_model.predict(X_train, num_iteration=lgb_model.best_iteration), dtype=np.float64)
lgb_val_pred = np.asarray(lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration), dtype=np.float64)
catboost_train_pred = np.asarray(catboost_model.predict_proba(X_train)[:, 1], dtype=np.float64)
catboost_val_pred = np.asarray(catboost_model.predict_proba(X_val)[:, 1], dtype=np.float64)
val_oof_pred = np.column_stack((lgb_val_pred, catboost_val_pred))
catboost_val_pred = np.asarray(catboost_model.predict_proba(X_val)[:, 1], dtype=np.float64)

# AUC for base models
lgb_train_auc = roc_auc_score(y_train, lgb_train_pred)
lgb_val_auc = roc_auc_score(y_val, lgb_val_pred)
catboost_train_auc = roc_auc_score(y_train, catboost_train_pred)
catboost_val_auc = roc_auc_score(y_val, catboost_val_pred)

print(f"\n📊 Base Model Performance:")
print(f"  LightGBM Training AUC: {lgb_train_auc:.6f}")
print(f"  LightGBM Validation AUC: {lgb_val_auc:.6f}")
print(f"  CatBoost Training AUC: {catboost_train_auc:.6f}")
print(f"  CatBoost Validation AUC: {catboost_val_auc:.6f}")

In [None]:
# =============================================
# 11. TYPE-SAFE META-MODEL TRAINING
# =============================================

print("\n🎭 TYPE-SAFE META-MODEL TRAINING")
print("=" * 45)

# Prepare meta-features (OOF predictions)
meta_train_features = np.column_stack((lgb_oof_pred, catboost_oof_pred))
meta_val_features = np.column_stack((lgb_val_pred, catboost_val_pred))

# Train logistic regression meta-model
meta_model = LogisticRegression(random_state=42, max_iter=1000)
meta_model.fit(meta_train_features, y_train)

# Meta-model predictions
meta_train_pred = np.asarray(meta_model.predict_proba(meta_train_features)[:, 1], dtype=np.float64)
meta_val_pred = np.asarray(meta_model.predict_proba(meta_val_features)[:, 1], dtype=np.float64)

# AUC for meta-model
meta_train_auc = roc_auc_score(y_train, meta_train_pred)
meta_val_auc = roc_auc_score(y_val, meta_val_pred)

print(f"\n📊 Meta-Model Performance:")
print(f"  Training AUC: {meta_train_auc:.6f}")
print(f"  Validation AUC: {meta_val_auc:.6f}")
print(f"  Overfitting gap: {meta_train_auc - meta_val_auc:.6f}")

# Performance comparison
individual_aucs: List[float] = [float(lgb_val_auc), float(catboost_val_auc)]
best_individual: float = max(individual_aucs)
improvement: float = float(meta_val_auc - best_individual)

print(f"\n📈 Model comparison summary:")
print(f"  LightGBM solo: {lgb_val_auc:.6f}")
print(f"  CatBoost solo: {catboost_val_auc:.6f}")
print(f"  Meta-Model: {meta_val_auc:.6f}")
print(f"  Best improvement: +{improvement:.6f}")


In [None]:
# =============================================
# 12. TYPE-SAFE FINAL TEST PREDICTIONS
# =============================================

print("\n🎯 GENERATING TYPE-SAFE FINAL TEST PREDICTIONS")
print("=" * 50)

# Generate test predictions from base models
lgb_test_pred = np.asarray(lgb_test_pred, dtype=np.float64)
catboost_test_pred = np.asarray(catboost_test_pred, dtype=np.float64)

# Prepare meta-features for test set
meta_test_features = np.column_stack((lgb_test_pred, catboost_test_pred))

# Meta-model test predictions
ensemble_test_pred: PredictionArray = np.asarray(meta_model.predict_proba(meta_test_features)[:, 1], dtype=np.float64)

print(f"📊 Test prediction statistics:")
print(f"  Mean prediction: {float(ensemble_test_pred.mean()):.6f}")

In [None]:
# =============================================
# 13. SUBMISSION PREPARATION
# =============================================

print("\n📄 PREPARING SUBMISSION FILES")
print("=" * 35)

# Load original test file to maintain structure
test_original = pd.read_csv('ctr_test.csv')
test_original['click'] = ensemble_test_pred

# Save updated test file
test_original.to_csv('ctr_test.csv', index=False)
print("✅ Updated ctr_test.csv with ensemble predictions")

# Prepare submission file
submission_df = pd.read_csv('ctr_sample_submission.csv')
submission_mapping = dict(zip(test_original['idx'], ensemble_test_pred))

submission_df['click'] = submission_df['idx'].map(submission_mapping)

# Validate submission completeness
missing_predictions = submission_df['click'].isna().sum()
if missing_predictions > 0:
    print(f"⚠️ Warning: {missing_predictions} missing predictions filled with mean")
    submission_df['click'].fillna(ensemble_test_pred.mean(), inplace=True)

submission_df.to_csv('ctr_sample_submission.csv', index=False)
print("✅ Final submission file ready: ctr_sample_submission.csv")

In [None]:
# =============================================
# 14. FEATURE IMPORTANCE ANALYSIS
# =============================================

print("\n🔍 FEATURE IMPORTANCE ANALYSIS")
print("=" * 35)

# LightGBM feature importance
lgb_importance = lgb_model.feature_importance(importance_type='gain')
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'lgb_importance': lgb_importance,
    'catboost_importance': catboost_model.feature_importances_
})

# Use meta-model coefficients for ensemble importance
meta_coefficients = np.abs(meta_model.coef_[0])
feature_importance_df['ensemble_importance'] = (
    meta_coefficients[0] * feature_importance_df['lgb_importance'] +
    meta_coefficients[1] * feature_importance_df['catboost_importance']
)

top_features = feature_importance_df.nlargest(15, 'ensemble_importance')

print("🏆 Top 15 most important features:")
for i, (_, row) in enumerate(top_features.iterrows(), 1):
    print(f"  {i:2d}. {row['feature']:<25} → {row['ensemble_importance']:.1f}")

In [None]:
# =============================================
# 15. FINAL PERFORMANCE SUMMARY
# =============================================

print(f"\n{'🎉 FINAL PERFORMANCE SUMMARY'}")
print("=" * 50)

expected_points = 100 * max(0.0, float(meta_val_auc - 0.60)) / 0.40

print(f"📊 Validation Results:")
print(f"  Final Meta-Model AUC: {meta_val_auc:.6f}")
print(f"  Expected Competition Points: {expected_points:.1f}/100")

if meta_val_auc >= 0.80:
    print("🏆 EXCEPTIONAL RESULT! Target AUC ≥ 0.80 achieved")
elif meta_val_auc >= 0.75:
    print("🎯 EXCELLENT RESULT! Strong competitive performance")
elif meta_val_auc >= 0.70:
    print("✅ SOLID RESULT! Significant improvement achieved")
else:
    print("⚠️ MODERATE IMPROVEMENT. Consider additional feature engineering")

print(f"\n🔧 Key improvements implemented:")
print(f"  ✅ Temporal 5-fold cross-validation with TimeSeriesSplit")
print(f"  ✅ Frequency encoding with Laplace smoothing")
print(f"  ✅ CTR-based aggregation features with leakage checks")
print(f"  ✅ Advanced temporal feature engineering (simplified)")
print(f"  ✅ LightGBM + CatBoost with stacking via logistic regression meta-model")
print(f"  ✅ Hyperparameter tuning for base models")

print(f"\n🚀 Ready for submission! Expected significant improvement in leaderboard AUC.")