In [1]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import LabelEncoder
import optuna
from sklearn.model_selection import KFold
import seaborn as sns

def weighted_mean_absolute_error(y_true, y_pred, weights):
    return (weights * np.abs(y_true - y_pred)).mean()

df = pd.read_csv(r"C:\Users\lllku\Downloads\Telegram Desktop\data.csv")
feature_description = pd.read_csv(r"C:\Users\lllku\Downloads\Telegram Desktop\features_description.csv")
feature_digital_pr = feature_description[feature_description['описание'] == 'данные цифрового профиля']
df = df.drop(columns='dt')

  df = pd.read_csv(r"C:\Users\lllku\Downloads\Telegram Desktop\data.csv")


In [2]:
categorical_inf = []
for column in df.columns:
    if len(df[column].unique()) <= 106:
        categorical_inf.append({'column': column, "uniq": df[column].nunique()})
categorical_inf.append({'column': 'dp_ewb_last_employment_position', "uniq": df['dp_ewb_last_employment_position'].nunique()})
categorical_inf.append({'column': 'dp_address_unique_regions', "uniq": df['dp_address_unique_regions'].nunique()})
categorical = pd.DataFrame(categorical_inf)
categorical = categorical.drop(index=0)

In [3]:
for column in categorical['column']:
    if df[column].isna().any():
        df[column] = df[column].fillna(-1)
for cat in categorical['column']:
    df[cat] = df[cat].astype('category')
categorical

Unnamed: 0,column,uniq
1,adminarea,85
2,age,71
3,incomeValueCategory,12
4,avg_loan_cnt_with_insurance,51
5,city_smart_name,105
6,mob_cnt_days,90
7,pil,12
8,addrref,62
9,bki_total_auto_cnt,15
10,blacklist_flag,2


In [4]:
categorical_over_106 = []
for column in df.select_dtypes('object').columns:
    if df[column].nunique() > 106:
        categorical_over_106.append(column)
for i in categorical_over_106:
    if i[:3] == 'hdb' or i[:3] == 'bki':
        df[i] = pd.to_numeric(df[i], errors='coerce')
df['gender'] = df['gender'].apply(lambda x: 1 if x == 'Мужской' else 0)
df['adminarea'] = df['adminarea'].apply(lambda x: 'nan' if x == -1 else x)
df['city_smart_name'] = df['city_smart_name'].apply(lambda x: 'nan' if x == -1 else x)
df['addrref'] = df['addrref'].apply(lambda x: 'nan' if x == -1 else x)
df['dp_ewb_last_employment_position'] = df['dp_ewb_last_employment_position'].apply(lambda x: 'nan' if x == -1 else x)

In [5]:
for cat in categorical['column']:
    df[cat] = df[cat].astype(str)
    le = LabelEncoder()
    df[cat] = le.fit_transform(df[cat])
df = df.drop(columns=['hdb_bki_total_pil_max_limit', 'hdb_bki_total_cc_max_limit', 'bki_active_auto_cnt', 'hdb_bki_total_micro_cnt', 'hdb_bki_active_pil_max_limit', 'bki_total_ip_max_outstand', 'bki_total_active_products', 'hdb_bki_other_active_ip_outstanding', 'hdb_bki_total_max_limit'])


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76786 entries, 0 to 76785
Columns: 214 entries, id to first_salary_income
dtypes: float64(167), int64(47)
memory usage: 125.4 MB


In [7]:
X = df.drop(columns=['target', 'w'])
y = df[['target', 'w']]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = xgb.XGBRegressor(max_depth=6, n_estimators=200, device='cuda', tree_method='hist')
model.fit(x_train, y_train['target'])


In [8]:
imp_score = model.feature_importances_
feature_names = X.columns


feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': imp_score
}).sort_values('importance', ascending=False)
feature_importance_df

Unnamed: 0,feature,importance
8,turn_cur_cr_avg_v2,0.168759
2,salary_6to12m_avg,0.051109
1,turn_cur_cr_avg_act_v2,0.044025
11,dp_ils_avg_salary_1y,0.026091
90,avg_fdep_db_turn,0.018135
...,...,...
18,turn_cur_db_avg_v2,0.000000
81,turn_fdep_db_avg_v2,0.000000
130,dp_ils_cnt_changes_1y,0.000000
127,dp_ewb_dismissal_due_contract_violation_by_lb_cnt,0.000000


In [None]:

def objective(trial):
    params = {
        'objective': trial.suggest_categorical("objective", ['reg:squarederror', 'reg:absoluteerror']),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        "early_stopping_rounds": trial.suggest_int('early_stopping_rounds', 10, 50),
        'enable_categorical': True,
        'random_state': 42,
    }
    
    X_trial = X[feature_importance_df['feature'][:trial.suggest_int('top_n_features', 3, 150)]]
    
    X_train, X_test, y_train, y_test = train_test_split(X_trial, y, test_size=0.2, random_state=42)
    
    model = xgb.XGBRegressor(**params, device='cuda', tree_method='hist')
    model.fit(X_train, y_train["target"], eval_set=[(X_test, y_test["target"])], verbose=False)
    

    predict = model.predict(X_test)
    return weighted_mean_absolute_error(y_test["target"], predict, y_test["w"])


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5, n_jobs=-1)

# Results
print("Best trial:")
trial = study.best_trial
print(f"MAPE loss: {trial.value:.4f}")


[I 2025-11-30 13:45:23,608] A new study created in memory with name: no-name-718d8a6f-138d-4669-a875-f4678c7f8423
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-11-30 13:45:45,005] Trial 0 finished with value: 78217.155250204 and parameters: {'objective': 'reg:absoluteerror', 'n_estimators': 563, 'max_depth': 10, 'learning_rate': 2.6847234527941337e-05, 'subsample': 0.3836167639132803, 'colsample_bytree': 0.7600223619451026, 'reg_alpha': 0.020183048735562148, 'reg_lambda': 2.103294143722945e-06, 'early_stopping_rounds': 10, 'top_n_features': 142}. Best is trial 0 with value: 78217.155250204.
[W 2025-11-30 13:45:48,365] Trial 1 failed with parameters: {'objective': 'reg:squarederror', 'n_estimators': 1955, 'max_depth': 5, 'learning_rate': 0.00015030912794077573, 'subsample': 0.6753049345232042, 'colsample_bytree': 0.902653045657817, 'reg_alpha': 8.7

KeyboardInterrupt: 

Trial 83 finished with value: 28426.461984512833 and parameters: {'objective': 'reg:squarederror', 'n_estimators': 1848, 'max_depth': 10, 'learning_rate': 0.00806762475574465, 'subsample': 0.9908354562103298, 'colsample_bytree': 0.7908909626596169, 'reg_alpha': 2.5739114786207488e-08, 'reg_lambda': 0.7766280043102454, 'early_stopping_rounds': 45, 'top_n_features': 89}.

In [None]:
best_params = {'objective': 'reg:squarederror', 'n_estimators': 1848, 'max_depth': 10, 'learning_rate': 0.00806762475574465, 'subsample': 0.9908354562103298, 'colsample_bytree': 0.7908909626596169, 'reg_alpha': 2.5739114786207488e-08, 'reg_lambda': 0.7766280043102454, 'early_stopping_rounds': 45}
def wmae(y_true, y_pred, *, sample_weight):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    sample_weight = np.array(sample_weight)
    return (sample_weight * np.abs(y_true - y_pred)).mean()

In [None]:
model_test = xgb.XGBRegressor(**best_params, device='cuda', tree_method='hist')
X_trial = X[feature_importance_df['feature'][:89]]#115 best_params

kf = KFold(n_splits=8, shuffle=True, random_state=42)
scores = []

# for fold, (train_idx, val_idx) in enumerate(kf.split(X_trial)):
#     X_tr = X_trial.iloc[train_idx]
#     X_val = X_trial.iloc[val_idx]
#     y_tr = y['target'][train_idx]
#     y_val = y['target'][val_idx]
#     w_tr = y['w'][train_idx]
#     w_val = y['w'][val_idx]

#     model_test = xgb.XGBRegressor(**best_params, device='cuda', tree_method='hist')
#     model_test.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_val, y_val)], sample_weight_eval_set=[w_val], verbose=False)

#     pred = model_test.predict(X_val)
#     wmae_score = np.sum(w_val * np.abs(y_val - pred)) / np.sum(w_val)
#     scores.append(wmae_score)
        
#     print(f"Fold {fold+1}: WMAE = {wmae_score:.6f} (best_iter: {model_test.best_iteration})")

# print(f"\nFinal CV Weighted MAE: {np.mean(scores):.6f} ± {np.std(scores):.5f}")
# print(f"Best mean score: {np.mean(scores):.6f}")
# X_train, X_test, y_train, y_test = train_test_split(X_trial, y, test_size=0.2, random_state=42)
# model_test.fit(X_train, y_train["target"], eval_set=[(X_test, y_test["target"])], verbose=False)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_trial, y, test_size=0.2, random_state=42)
model_test.fit(X_train, y_train["target"], eval_set=[(X_test, y_test["target"])], verbose=False)



In [None]:


df_test = pd.read_csv(r"F:\Browser dwnlds\hackathon_income_test.csv", sep=';', decimal=',')
feature_digital_pr = feature_description[feature_description['описание'] == 'данные цифрового профиля']


# info = []
# for column in df.columns:
#     info.append([column, sum(pd.notna(df[column]))/ ])

df_test

  df_test = pd.read_csv(r"F:\Browser dwnlds\hackathon_income_test.csv", sep=';', decimal=',')


Unnamed: 0,id,dt,turn_cur_cr_avg_act_v2,salary_6to12m_avg,hdb_bki_total_max_limit,dp_ils_paymentssum_avg_12m,hdb_bki_total_cc_max_limit,incomeValue,gender,avg_cur_cr_turn,...,total_sum,dp_ils_uniq_companies_1y,avg_6m_travel,avg_6m_government_services,hdb_bki_active_cc_max_overdue,total_rur_amt_cm_avg_period_days_ago_v2,label_Above_1M_share_r1,transaction_category_supermarket_sum_cnt_d15,max_balance_rur_amt_1m_af,first_salary_income
0,0,2024-08-31,805319.38,,61137.47,,60000.0,159999.0,Женский,69740.0,...,,,0.0,0.0,,80228.0,0.000000,1.0,,
1,1,2024-10-31,306240.00,,949500.0,,230000.0,108834.0,Мужской,63513.0,...,0.00,,0.0,0.0,0.0,24888.0,0.000000,,,
2,3,2024-09-30,164908.73,,178000.0,,178000.0,59203.0,Женский,132.0,...,38630.63,,0.0,0.0,2363.9,223.0,,,,
3,9,2024-10-31,2374846.42,,25500.0,126247.448359,4999.0,180906.0,Женский,290339.0,...,,3.0,800.0,0.0,,25734.0,0.000000,7.0,0.0,
4,11,2024-11-30,735902.71,47828.145621,60000.0,,60000.0,24922.0,Мужской,76924.0,...,0.00,,0.0,0.0,0.0,1214.0,0.000000,15.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73209,149981,2024-09-30,531284.21,,288500.0,,,64721.0,Мужской,279544.0,...,0.00,,0.0,233.0,,94798.0,,4.0,,
73210,149985,2024-09-30,173804.50,,90000.0,,24999.28,38860.0,Женский,31813.0,...,,,0.0,98.0,30232.0,16785.0,0.000000,,,
73211,149989,2024-10-31,1076401.61,,148000.0,,148000.0,97840.0,Мужской,4233.0,...,,,,,1507.42,524.0,,,,
73212,149995,2024-11-30,2729721.41,,659000.0,132028.008254,160000.0,156088.0,Мужской,187073.0,...,500.00,2.0,11394.0,182.0,0.0,18505.0,0.013333,25.0,1028.0,


In [None]:
for column in categorical['column']:
    if df_test[column].isna().any():
        df_test[column] = df_test[column].fillna(-1)
for cat in categorical['column']:
    df_test[cat] = df_test[cat].astype('category')

In [None]:
categorical_over_106 = []
for column in df_test.select_dtypes('object').columns:
    if df_test[column].nunique() > 106:
        categorical_over_106.append(column)
for i in categorical_over_106:
    if i[:3] == 'hdb' or i[:3] == 'bki':
        df_test[i] = pd.to_numeric(df_test[i], errors='coerce')
df_test['gender'] = df_test['gender'].apply(lambda x: 1 if x == 'Мужской' else 0)

In [None]:
df_test = df_test.drop(columns='dt')
df_test['adminarea'] = df_test['adminarea'].apply(lambda x: 'nan' if x == -1 else x)
df_test['city_smart_name'] = df_test['city_smart_name'].apply(lambda x: 'nan' if x == -1 else x)
df_test['addrref'] = df_test['addrref'].apply(lambda x: 'nan' if x == -1 else x)
df_test['dp_ewb_last_employment_position'] = df_test['dp_ewb_last_employment_position'].apply(lambda x: 'nan' if x == -1 else x)
for cat in categorical['column']:
    df_test[cat] = df_test[cat].astype(str)
    le = LabelEncoder()
    df_test[cat] = le.fit_transform(df_test[cat])
df_test = df_test.drop(columns=['hdb_bki_total_pil_max_limit', 'hdb_bki_total_cc_max_limit', 'bki_active_auto_cnt', 'hdb_bki_total_micro_cnt', 'hdb_bki_active_pil_max_limit', 'bki_total_ip_max_outstand', 'bki_total_active_products', 'hdb_bki_other_active_ip_outstanding', 'hdb_bki_total_max_limit'])



In [None]:
X = df_test
ids = X['id']
X = X[feature_importance_df['feature'][:89]]
predictions = model_test.predict(X)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [None]:
submission = pd.DataFrame({'id': ids, 'target': predictions})
submission.to_csv(index=False, path_or_buf=r"C:\Users\lllku\OneDrive\Документы\submissons.csv")


In [None]:
predictions[:5]

array([63434.062, 59554.375, 40637.5  , 93998.35 , 39516.71 ],
      dtype=float32)