In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from xgboost import XGBRegressor
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')


In [3]:
print(">>> Загрузка данных...")
df = pd.read_csv('hackathon_income_train.csv', delimiter=';')
df = df.drop(columns=['w'])
df_test = pd.read_csv('hackathon_income_test.csv', delimiter=';')

print(f"Тренировочные данные: {df.shape}")
print(f"Тестовые данные: {df_test.shape}")

>>> Загрузка данных...
Тренировочные данные: (76786, 223)
Тестовые данные: (73214, 222)


In [4]:
def fix_comma_numbers(series):
    if series.dtype == 'object':
        try:
            return series.str.replace(',', '.').astype(float)
        except:
            return series
    return series


df['target'] = fix_comma_numbers(df['target'])

In [5]:
print(">>> Удаляем колонки с пропусками > 25%...")
missing_threshold = 0.25

missing_train = df.isnull().mean()
missing_test = df_test.isnull().mean()

cols_to_drop_missing = set(missing_train[missing_train > missing_threshold].index) | set(missing_test[missing_test > missing_threshold].index)
df_clean = df.drop(columns=cols_to_drop_missing)
df_test_clean = df_test.drop(columns=cols_to_drop_missing)

print(f"После удаления: train {df_clean.shape}, test {df_test_clean.shape}")

>>> Удаляем колонки с пропусками > 25%...
После удаления: train (76786, 83), test (73214, 82)


In [6]:
print(">>> Удаляем колонку dt...")
if 'dt' in df_clean.columns:
    df_clean = df_clean.drop(columns=['dt'])
    df_test_clean = df_test_clean.drop(columns=['dt'])
    print("Колонка dt удалена")

>>> Удаляем колонку dt...
Колонка dt удалена


In [7]:
median_columns = [
    'gender', 'incomeValueCategory', 'curbal_usd_amt_cm_avg', 'pil',
    'bki_total_auto_cnt', 'blacklist_flag', 'bki_total_oth_cnt', 'hdb_ovrd_sum',
    'avg_fdep_db_turn', 'hdb_bki_total_ip_cnt', 'days_to_last_transaction',
    'acard', 'other_credits_count', 'winback_cnt', 'bki_active_auto_cnt',
    'loanacc_rur_amt_cm_avg', 'avg_fdep_cr_turn', 'client_active_flag',
    'nonresident_flag', 'days_after_last_request', 'loanacc_rur_amt_curr_v2',
    'hdb_bki_total_active_products', 'hdb_bki_total_micro_cnt',
    'hdb_bki_active_pil_cnt', 'hdb_bki_total_pil_cnt', 'accountsalary_out_flag',
    'express_rur_amt_cm_avg', 'loanacc_rur_amt_cm_avg_inc_v2',
    'cred_dda_rur_amt_3m_avg'
]


median_columns = [col for col in median_columns if col in df_clean.columns]

In [8]:
categorical_columns = ['gender', 'adminarea', 'city_smart_name', 'addrref', 'incomeValueCategory']

for col in categorical_columns:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna('Unknown')
        df_test_clean[col] = df_test_clean[col].fillna('Unknown')

In [9]:
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

In [10]:
cols_to_keep = ['target', 'id']
for col in cols_to_keep:
    if col in numeric_cols:
        numeric_cols.remove(col)
    if col in categorical_cols:
        categorical_cols.remove(col)

In [11]:
numeric_median = [col for col in median_columns if col in numeric_cols]
numeric_mean = [col for col in numeric_cols if col not in median_columns]


if numeric_median:
    imputer_median = SimpleImputer(strategy='median')
    df_clean[numeric_median] = imputer_median.fit_transform(df_clean[numeric_median])
    df_test_clean[numeric_median] = imputer_median.transform(df_test_clean[numeric_median])

if numeric_mean:
    imputer_mean = SimpleImputer(strategy='mean')
    df_clean[numeric_mean] = imputer_mean.fit_transform(df_clean[numeric_mean])
    df_test_clean[numeric_mean] = imputer_mean.transform(df_test_clean[numeric_mean])

In [12]:
def fix_comma_numbers_df(df):
    df_processed = df.copy()
    for col in df_processed.columns:
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            continue
        try:
            sample_values = df_processed[col].dropna().head(10)
            has_commas = any(isinstance(val, str) and ',' in str(val) and '.' not in str(val) for val in sample_values)
            if has_commas:
                df_processed[col] = df_processed[col].astype(str).str.replace(',', '.')
                df_processed[col] = pd.to_numeric(df_processed[col], errors='coerce')
        except:
            continue
    return df_processed

df_clean = fix_comma_numbers_df(df_clean)
df_test_clean = fix_comma_numbers_df(df_test_clean)

In [13]:
print(">>> Кодируем категориальные признаки LabelEncoder...")
categorical_to_encode = ['gender', 'adminarea', 'city_smart_name', 'addrref', 'incomeValueCategory']
label_encoders = {}

for col in categorical_to_encode:
    if col in df_clean.columns:
        le = LabelEncoder()
        combined = pd.concat([df_clean[col], df_test_clean[col]], axis=0)
        le.fit(combined)

        df_clean[col] = le.transform(df_clean[col])
        df_test_clean[col] = le.transform(df_test_clean[col])
        label_encoders[col] = le

>>> Кодируем категориальные признаки LabelEncoder...


In [14]:
lower_bound = df_clean['target'].quantile(0.10)
upper_bound = df_clean['target'].quantile(0.90)


outliers_mask = (df_clean['target'] < lower_bound) | (df_clean['target'] > upper_bound)
df_clean = df_clean[~outliers_mask]

In [15]:
test_ids = df_test_clean['id'].copy()

X = df_clean.drop(columns=['id', 'target'])
y = df_clean['target']
X_test = df_test_clean.drop(columns=['id'])



X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

print(f">>> Разделение данных: train {X_train.shape}, val {X_val.shape}")

>>> Разделение данных: train (49142, 80), val (12286, 80)


In [16]:
def objective_lgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 5.0),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': -1
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return mean_absolute_error(y_val, y_pred)

def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.1, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 5.0),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }

    model = XGBRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return mean_absolute_error(y_val, y_pred)

In [17]:
print(">>> LightGBM...")
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(objective_lgb, n_trials=20, show_progress_bar=True)
print(f"Лучший LightGBM MAE: {study_lgb.best_value:.4f}")

[I 2025-11-30 08:49:28,011] A new study created in memory with name: no-name-37776712-11ba-43ad-b953-8f880a045777


>>> LightGBM...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-11-30 08:50:50,750] Trial 0 finished with value: 21673.494304244072 and parameters: {'n_estimators': 746, 'learning_rate': 0.16135461656495767, 'num_leaves': 74, 'max_depth': 8, 'min_child_samples': 27, 'subsample': 0.7904798124742021, 'colsample_bytree': 0.7568602241742043, 'reg_alpha': 0.521452694926056, 'reg_lambda': 1.772483649474284}. Best is trial 0 with value: 21673.494304244072.
[I 2025-11-30 08:51:20,380] Trial 1 finished with value: 21272.92430227974 and parameters: {'n_estimators': 811, 'learning_rate': 0.14702479570711596, 'num_leaves': 45, 'max_depth': 5, 'min_child_samples': 50, 'subsample': 0.9164731265042098, 'colsample_bytree': 0.7651826039497057, 'reg_alpha': 3.3649548743248805, 'reg_lambda': 1.496687759502884}. Best is trial 1 with value: 21272.92430227974.
[I 2025-11-30 08:51:29,276] Trial 2 finished with value: 21896.184884773353 and parameters: {'n_estimators': 253, 'learning_rate': 0.018913318047895325, 'num_leaves': 86, 'max_depth': 4, 'min_child_samples

In [18]:
print(">>> XGBoost...")
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=20, show_progress_bar=True)
print(f"Лучший XGBoost MAE: {study_xgb.best_value:.4f}")

[I 2025-11-30 08:55:20,852] A new study created in memory with name: no-name-af249cac-b89e-4ad0-acf5-1f8d8391d5e0


>>> XGBoost...


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-11-30 08:55:54,745] Trial 0 finished with value: 21955.73820271563 and parameters: {'n_estimators': 769, 'learning_rate': 0.184925545607946, 'max_depth': 7, 'min_child_weight': 7, 'subsample': 0.8389541770294406, 'colsample_bytree': 0.7104992374474425, 'reg_alpha': 0.6625161144884351, 'reg_lambda': 3.264657347809156}. Best is trial 0 with value: 21955.73820271563.
[I 2025-11-30 08:56:03,517] Trial 1 finished with value: 21129.92649916767 and parameters: {'n_estimators': 813, 'learning_rate': 0.07873497680550569, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.8461389189196974, 'colsample_bytree': 0.9839848722307061, 'reg_alpha': 3.2838477630667837, 'reg_lambda': 4.993327349676533}. Best is trial 1 with value: 21129.92649916767.
[I 2025-11-30 08:56:13,251] Trial 2 finished with value: 21105.058371305393 and parameters: {'n_estimators': 859, 'learning_rate': 0.15868041643375588, 'max_depth': 3, 'min_child_weight': 8, 'subsample': 0.9330276663400492, 'colsample_bytree': 0.93

In [19]:
best_params_lgb = study_lgb.best_params.copy()
model_lgb = lgb.LGBMRegressor(**best_params_lgb)
model_lgb.fit(X, y)
print("LightGBM обучен")

best_params_xgb = study_xgb.best_params.copy()
model_xgb = XGBRegressor(**best_params_xgb)
model_xgb.fit(X, y)
print("XGBoost обучен")

LightGBM обучен
XGBoost обучен


In [20]:
models = {
    'LightGBM': model_lgb,
    'XGBoost': model_xgb
}

best_model_name = None
best_cv_score = float('inf')

for name, model in models.items():
    cv_scores = -cross_val_score(model, X, y,
                               scoring='neg_mean_absolute_error',
                               cv=5, n_jobs=1)
    cv_mae = np.mean(cv_scores)
    cv_std = np.std(cv_scores)

    print(f"{name}: CV MAE = {cv_mae:.4f} +/- {cv_std:.4f}")

    if cv_mae < best_cv_score:
        best_cv_score = cv_mae
        best_model_name = name

print(f"\n>>> Лучшая модель: {best_model_name} с MAE = {best_cv_score:.4f}")

LightGBM: CV MAE = 21035.6599 +/- 128.4611
XGBoost: CV MAE = 20970.2194 +/- 126.0982

>>> Лучшая модель: XGBoost с MAE = 20970.2194


In [21]:
y_pred_lgb = model_lgb.predict(X_test)
y_pred_xgb = model_xgb.predict(X_test)


weights = {
    'LightGBM': 0.6,
    'XGBoost': 0.4
}

y_pred_ensemble = (weights['LightGBM'] * y_pred_lgb +
                   weights['XGBoost'] * y_pred_xgb)


predictions = {
    'LightGBM': y_pred_lgb,
    'XGBoost': y_pred_xgb,
    'Ensemble': y_pred_ensemble
}

In [22]:
best_single_pred = predictions[best_model_name]
submission_single = pd.DataFrame({
    'id': test_ids,
    'target': best_single_pred
})
submission_single_filename = f'submission_{best_model_name}_final.csv'
submission_single.to_csv(submission_single_filename, index=False, sep=',')

In [23]:
submission_ensemble = pd.DataFrame({
    'id': test_ids,
    'target': predictions['Ensemble']
})
submission_ensemble_filename = 'submission_ensemble_lgb_xgb.csv'
submission_ensemble.to_csv(submission_ensemble_filename, index=False, sep=',')