In [6]:
# !pip install faiss-imputer

In [7]:
# !pip install --upgrade tensorflow==2.13

# **Imports**

In [8]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np

# import featuretools as ft
# from faiss_imputer import FaissImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split
import optuna

# from sklearn.kernel_ridge import KernelRidge
# from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, ARDRegression, BayesianRidge, SGDRegressor, PoissonRegressor
from catboost import CatBoostRegressor, CatBoostClassifier, Pool

import tensorflow as tf
import tensorflow.keras as keras

from sklearn.metrics import accuracy_score, f1_score

import time
from typing import TypeAlias, Union

import gc


import warnings
# warnings.filterwarnings('ignore')

In [9]:
# tf.config.list_physical_devices()

In [10]:
featureList: TypeAlias = list[str]
ndarray: TypeAlias = Union[pd.Series, np.ndarray, list, tuple]

# **Metric**

In [11]:
def weighted_mean_absolute_error(y_true: ndarray, y_pred: ndarray, weights: ndarray) -> float:

    '''
    Parameters
    ----------
    y_true: ndarray
        Ground truth
    y_pred: ndarray
        Array of predictions

    Returns
    -------
    rmsle: float
        Weighted mean absolute error

    References
    ----------
    .. [1] https://kaggle-metrics.readthedocs.io/en/latest/_modules/kaggle_metrics/regression.html

    '''
    
    return (weights * np.abs(y_true - y_pred)).mean()

# **Reading Data**

In [12]:
train_df = pd.read_csv('/kaggle/input/income-prediction-alfa-campus/train.csv', sep=";", decimal=",", encoding="windows-1251")
test_df = pd.read_csv('/kaggle/input/income-prediction-alfa-campus/test.csv', sep=";", decimal=",", encoding="windows-1251")

# train_df = pd.read_csv('/kaggle/input/train-df2/train_df_2', sep=",", decimal=".")
# test_df = pd.read_csv('/kaggle/input/test-df2/test_df_2', sep=",", decimal=".")
usd_rub = pd.read_csv('/kaggle/input/d/crn4tww/usd-rub/USD_RUB.csv')
infl_prime_rate = pd.read_excel('/kaggle/input/inflation-cb-prime-rate/inflation_CB_prime_rate.xlsx')

train_df.index = train_df.client_id
test_df.index = test_df.client_id

# train_df.drop(['client_id', 'Unnamed: 0'], axis='columns', inplace=True)
# test_df.drop(['client_id', 'Unnamed: 0'], axis='columns', inplace=True)

train_df.shape, test_df.shape

  warn("Workbook contains no default style, apply openpyxl's default")


((205962, 235), (37183, 233))

# **Merging Other Data**

I've gathered some data from central bank and other financial institution websites that I think would be a good fit 

**Inflation and Central Bank Prime Rate**

In [13]:
infl_prime_rate.columns = ['feature_date', 'prime_rate', 'infl', 'tmp']
infl_prime_rate = infl_prime_rate[['feature_date', 'prime_rate', 'infl']]
infl_prime_rate.feature_date = ['2023-09-30', '2023-08-31', '2023-07-31', '2023-06-30', 
                                '2023-05-31', '2023-04-30', '2023-03-31', '2023-02-28', 
                                '2023-01-31', '2022-12-31', '2022-11-30', '2022-10-31', '2022-09-30']

**UsdRub**

In [14]:
usd_rub = usd_rub[['Дата', 'Цена']]
usd_rub.columns = ['feature_date', 'usd_price']

# inputting some missing values
usd_rub.loc[len(usd_rub.index)] = ['30.09.2023', 97.9675]
usd_rub.loc[len(usd_rub.index)] = ['31.12.2022', 69.8200]
usd_rub.loc[len(usd_rub.index)] = ['30.04.2023', 81.1025]

**Money Supply**

In [15]:
date = ['2023-09-30', '2023-08-31', '2023-07-31', '2023-06-30', '2023-05-31', '2023-04-30', '2023-03-31', '2023-02-28', '2023-01-31', '2022-12-31', '2022-11-30', '2022-10-31', '2022-09-30']
amount = [18559.5, 18395.0, 17998.0, 17538.9, 17349.8, 16855.8, 16616.3, 16274.8, 16454.7, 15665.9, 15596.4, 15265.4, 14335.4]

amount_of_money = pd.DataFrame(np.transpose([date, amount]), columns=['feature_date', 'amount_of_money'])

**Merging**

In [16]:
usd_rub['feature_date'] = pd.to_datetime(usd_rub['feature_date'], format='%d.%m.%Y')
infl_prime_rate['feature_date'] = pd.to_datetime(infl_prime_rate['feature_date'], format="%Y-%m-%d")
amount_of_money['feature_date'] = pd.to_datetime(amount_of_money['feature_date'], format="%Y-%m-%d")

amount_of_money.amount_of_money = amount_of_money.amount_of_money.astype('float64')
usd_rub.usd_price = usd_rub.usd_price.str.replace(',', '.').astype('float64')

test_df.feature_date = pd.to_datetime(test_df.feature_date, format="%Y-%m-%d")
train_df.feature_date = pd.to_datetime(train_df.feature_date, format="%Y-%m-%d")

In [17]:
train_df = pd.merge(train_df, usd_rub, on='feature_date', how='left')
train_df = pd.merge(train_df, infl_prime_rate, on='feature_date', how='left')
train_df = pd.merge(train_df, amount_of_money, on='feature_date', how='left')

test_df = pd.merge(test_df, usd_rub, on='feature_date', how='left')
test_df = pd.merge(test_df, infl_prime_rate, on='feature_date', how='left')
test_df = pd.merge(test_df, amount_of_money, on='feature_date', how='left')

In [18]:
# another missing value
test_df.usd_price = 97.9675

In [19]:
# train_df.to_csv('train_df.csv')
# test_df.to_csv('test_df.csv')

# **FEATURE ENGEENIRING**

While analyzing the data, I noticed that due to the large number of values denoting essentially the same thing in different columns, the data gets very corrupted

**Base Transformations**

In [None]:
# Convert all characters in columns to lower case to avoid duplicates that differ only in case
def lower_positions(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    for feat in cols:
        df[feat] = df[feat].str.lower()
    return df


def del_null_features(df: pd.DataFrame, a: Union[int, float]) -> (pd.DataFrame, featureList):
    """
    df: DataFrame
    a: alpha, fraction threshold below which the column is to be deleted
    """
    ln = len(df.index)
    feat_to_del = [] 
    
    for col in df.columns:
        if df[col].isnull().sum() / ln > a:
            feat_to_del.append(col)
    
    # Returns two values to keep the possibility to retry modifications.
    return df.drop(feat_to_del, axis='columns'), feat_to_del

def remove_correlated_features(df: pd.DataFrame, threshold: float) -> (pd.DataFrame, list):
    """
    df: DataFrame 
    threshold : Correlation threshold for feature removal
    """
    # Calculate the absolute correlation matrix
    corr_matrix = df.corr().abs()
    
    features_to_del = []
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold and corr_matrix.columns[j] not in features_to_del:
                colname = corr_matrix.columns[i] 
                features_to_del.append(colname)
    
    df_dropped = df.drop(columns=features_to_del)
    
    return df_dropped, features_to_del

# This function replaces a specified word within a string with another word.
def replace_words(word, search_word, replace_word):
    if search_word in word:
        return replace_word
    else:
        return word

In [None]:
cols_to_low = ['main_last_position_ccode', 'main_pre_last_position_ccode', 'part_last_position_ccode', 'part_pre_last_position_ccode', 'model', 'brand']

test_df = lower_positions(test_df, cols_to_low)
train_df = lower_positions(train_df, cols_to_low)

train_df, features_to_del_na = del_null_features(train_df, 0.9)
test_df = test_df.drop(features_to_del_na, axis='columns')

In [None]:
cat_features = ['addrref', 'main_last_position_ccode', 'main_pre_last_position_ccode',
                'oldest_campaignsegment_ccode_for_nss', 'oldest_campaignsegment_ccode_for_pil', 
                'part_last_position_ccode', 'part_pre_last_position_ccode', 'segment', 'brand', 'model']

In [None]:
num_features = train_df.drop(cat_features + ['feature_date', 'target', 'w'], axis='columns').columns

train_df[num_features] = train_df[num_features].astype('float64')
test_df[num_features] = test_df[num_features].astype('float64')

In [None]:
train_df[cat_features] = train_df[cat_features].fillna('no_data')
test_df[cat_features] = test_df[cat_features].fillna('no_data')

In [None]:
# 'pairs' is a dictionary where each key is a brand name and the value is a list of various spellings and common misspellings of that brand.
pairs = {
    'lada': ['lada', 'лада', 'granta', 'uaz', 'уаз', 'volga', 'волга', 'москвич', 'богдан', 'bogdan'],
    'bmw': ['bmw', 'бмв'], 
    'benz': ['mercedes', 'benz', 'мерседес', 'бенз', 'daimler', 'даймлер'],
    'seat': ['seat', 'сеат'], 
    'audi': ['audi', 'ауди'],
    'rolls-royce': ['rolls-royce', 'роллс', 'ройс'],
    'ducati': ['ducati', 'дукати'],
    'kia': ['kia', 'киа', 'киa'],
    'hyindai': ['hyindai', 'хенде', 'хендай', 'huyndai', 'hyundai', 'хундай', 'hyndai', 'хендэ'],
    'suzuki': ['suzuki', 'сузуки'],
    'datsun': ['datsun', 'датсун', 'дацун'],
    'toyota': ['toyota', 'тойота', 'лендмарк', 'landmark'],
    'honda': ['honda', 'хонда'],
    'ford': ['ford', 'форд'],
    'mustang': ['mustang', 'мустанг'],
    'kawasaki': ['kawasaki', 'кавасаки'],
    'chevrolet': ['chevrolet', 'chevy', 'шевроле'],
    'nissan': ['nissan', 'ниссан', 'niss'],
    'volkswagen': ['volkswagen', 'vw', 'фольксваген'],
    'subaru': ['subaru', 'субару'],
    'mazda': ['mazda', 'мазда'],
    'ferrari': ['ferrari', 'феррари'],
    'lamborghini': ['lamborghini', 'ламборгини'],
    'porsche': ['porsche', 'порше'],
    'aston martin': ['aston martin', 'астон мартин'],
    'bentley': ['bentley', 'бентли'],
    'bugatti': ['bugatti', 'бугатти'],
    'maserati': ['maserati', 'мазерати'],
    'peugeot': ['peugeot', 'пежо'],
    'citroen': ['citroen', 'ситроен'],
    'renault': ['renault', 'рено'],
    'ssangyoung': ['ssang', 'ссанг'],
    'volvo': ['volvo', 'вольво'],
    'skoda': ['skoda', 'шкода'],
    'fiat': ['fiat', 'фиат'],
    'alfa romeo': ['alfa romeo', 'альфа ромео'],
    'jaguar': ['jaguar', 'ягуар'],
    'land rover': ['land rover', 'ленд ровер', 'ланд ровер'],
    'smart': ['smart', 'смарт'],
    'lincoln': ['lincoln', 'линкольн'],
    'mitsubishi': ['mitsubishi', 'митсубиси', 'мицубиси'],
    'cadillac': ['cadillac', 'кадиллак'],
    'geely': ['geely', 'джили', 'джели'],
    'cherry': ['cherry', 'чери', 'chery', 'черри', 'a21', 'a15', 'а21', 'а15'],
    'great wall': ['great wall', 'великая стена', 'греат уолл', 'грейт волл'],
    'byd': ['byd', 'бид'],
    'dongfeng': ['dongfeng', 'донгфенг', 'донг фенг', 'dfm'],
    'changan': ['changan', 'чанган'],
    'faw': ['faw', 'фав'],
    'reliant': ['reliant'],
    'cfmoto': ['cfmoto', 'cf moto'],
    'vortex': ['vortex estina', 'вортекс'],
    'suv': ['suv'],
    'сст': ['сст'],
    'сеаз': ['сеаз'],
    'атлетик': ['атлетик'],
    'белава': ['белава', 'belava'],  # современная версия старой газели  
    'кр': ['кр'],
    'adria': ['adria'],
    'расер': ['расер', 'racer', 'райсер'],
    'сб трейлер': ['сб трейлер', 'трейлер'],
    'stels': ['стелс', 'stels'],
    'USSR_MOTO': ['спутник', 'sputnik', 'ява', 'иж', ],
    'vmc ventomatic motus company': ['vmc'],
    'schmitz': ['schmitz', 'шмитц'],
    'yamaha': ['ямаха', 'yamaha', 'fjr'],
    'maxus': ['максус', 'maxus'],
    'dodge': ['dodge', 'додж', 'ram', 'рам'],
    'скиф': ['скиф'],  # прицеп или мотоцикл стелс хз
    'кс': ['кс'],
    'газель': ['2834bf', '2834de', '278814', '2834bj', '2747', '2818', '278813', '33022', '32361', 'l4h2m2', '3009', '172452', '2704',
               '38787', '3010', '3227', 'ткм', '2834', '5789', '172422', 'l3h2', '172412'],
    'mini cooper': ['мини купер', 'mini cooper', 'mini', 'cooper', 'мини'],
    'диса': ['диса', 'индеец'],  # armed cars 
    'skygo': ['skygo'],  # moto
    '3at6tl': ['3at6tl'],
    'cool_american_moto': ['харлей девидсон', 'harley davidson', 'харлей', 'индиан', 'indian'],
    'iveco': ['iveco', 'ивеко'],
    'bockmann': ['bockmann'],
    'man': ['man', 'ман'],
    'бизнес': ['бизнес'],
    'gwm': ['gwm'],
    'baic': ['baic'],
    'hafei': ['hafei'],
    'howo': ['howo', 'хово'],
    'triumph ': ['триумф', 'triumph'],
    'nexus': ['nexus'],
    'suvt11': ['suvt11'],
    'ноэмз': ['ноэмз'],
    'daf': ['даф', 'daf'],
    'алк': ['алк'],
    '3009z6': ['3009z6'],
    'балтмоторс': ['балтмоторс'],
    'ровер': ['ровер'],
    '3702': ['3702'],
    'tabbert': ['tabbert'],
    'jj': ['jj'],
    '96093р': ['96093р'],
    'луидор': ['луидор'],
    'brilliance': ['бриллианс', 'brilliance'],
    'tatra': ['татра', 'tatra'],
    'ravon': ['ravon', 'равон'],
    'kaiyi': ['kaiyi', 'каи'],
    'рендерс': ['рендерс'],
    'acura': ['акура', 'acura'],
    'omoda': ['omoda', 'омода'],
    '69364s': ['69364s'], # бетономешалка
    '829440': ['829440', '82944с'], # трейлер для гидроцикла
    'daewoo': ['дэо', 'даэу', 'daewoo', 'дэу'],
    'regulmoto': ['regulmoto'],
    'tonar': ['тонар', 'tonar'],
    'кузбасс': ['кузбасс'],
    'metaco': ['metaco'],
    'chrysler': ['крайслер', 'chrysler'],
    'scania': ['scania', 'скания'],
    'trailer-small': ['821305', '7194b1', '7194f2', '71621', '829450', '716104', '718201', '7197', '8213', 'мзса', 
                      'кзап', '7149', '2747', '7171', 'прицеп', '850701', '716103', 'humbaur', '7187', '8501', '858018',
                      '8294', '716100', 'viking force', '7843', 'багем'], # trailer
    'zeekr': ['zeekr'],
    'газ': ['газ', 'саз', 'паз', 'тпз', 'маз', 'камаз', 'зил', 'вис', 'ваз', 'заз', 'урал', 'ммз', 'кмз', 'га3'],
    'range rover': ['range', 'rover', 'рендж', 'ренж', 'ровер'],
    'doninvest': ['донинвест', 'doninvest'],
    'хайма': ['хайма', 'haima'],
    'baltmotors': ['baltmotors'],
    'sym': ['sym'], 
    'opel': ['opel', 'опель'], 
    'jeep': ['jeep', 'джип'],
    'infiniti': ['infiniti', 'инфинити'],
    'haval': ['haval', 'хавал'],
    'lexus': ['lexus', 'лексус'],
    'pontiac': ['pontiac', 'понтиак'],
    'genesis': ['genesis', 'дженезис', 'генезис'],
    'jac': ['jac'],
    'lifan': ['lifan', 'лифан'],
    'exeed': ['exeed', 'эксид', 'ексид'],
    'lixiang': ['li', 'lixiang'],
    'tesla': ['tesla', 'тесла'],
    'isuzu': ['isuzu', 'исузу'],
    'moto': ['kayo', 'ktm', 'ктм', 'husqvarna', 'motoland', 'bajaj', 'баджадж', 'wy150t', 'nanfang', 'avantis', 'bs250', 
             'motard', 'мотоциклы', 'wanqiang'],
    'voyah': ['voyah', 'воях'],
    'agusta': ['agusta'],
    'jmc': ['jx1032db'],
    'saab': ['saab', 'сааб'],
    'trailer-big': ['чмзап', 'ламберет', 'lamberet', 'фрехауф', 'fruehauf', 'сеспель', 'kogel', 'сзап', 'krone', 'вилтон', 'wielton'],
    'freightliner': ['фрейтлайнер', 'freightliner', 'лкт'],
    'tractor': ['русич'],
    'cct': ['cct'],
    'trailer-house': ['avento', 'elddis'],
    'лав': ['лав']
    }

# This loop iterates over each brand name and its variations to standardize the brand names in the dataset.
for mark in pairs.keys():
    for bad_mark in pairs[mark]:
        
        # Applying standartization
        train_df['brand'] = train_df['brand'].apply(lambda x: replace_words(x, bad_mark, mark))
        test_df['brand'] = test_df['brand'].apply(lambda x: replace_words(x, bad_mark, mark))
        
# len([a for a in list(np.flip(train_df['brand'].unique())) if a not in pairs.keys()]), len(pairs), [a for a in list(np.flip(train_df['brand'].unique())) if a not in pairs.keys()]

In [None]:
# Combine test and train to create features in the same way.
df = pd.concat([train_df, test_df], axis='rows', ignore_index=True)

In [None]:
# Functions for splitting last month of train data
# will also use them for CatBoost data
last_month_train_split = lambda df, drop_col=['target', 'w', 'feature_date']: df[df['feature_date'] != df['feature_date'].max()].drop(drop_col, axis='columns')
last_month_test_split = lambda df, drop_col=['target', 'w', 'feature_date']: df[df['feature_date'] == df['feature_date'].max()].drop(drop_col, axis='columns')
last_month_train_col_split = lambda df, col: df[df['feature_date'] != df['feature_date'].max()][col]
last_month_test_col_split = lambda df, col: df[df['feature_date'] == df['feature_date'].max()][col]

# this data will also be used further for CatBoost model
y_train, y_test = last_month_train_col_split(train_df, 'target'), last_month_test_col_split(train_df, 'target') 
w_train, w_test = last_month_train_col_split(train_df, 'w'), last_month_test_col_split(train_df, 'w')

# **Creating New Features**

In [None]:
# List of features to be classified into quantiles based on their distribution.
features_to_quantile_classify = ['amount_by_category_30d__summarur_amt__sum__cashflowcategory_name__bilety_na_kontserty_i_v_teatry',
 'amount_by_category_30d__summarur_amt__sum__cashflowcategory_name__brokerskie_uslugi', 'amount_by_category_30d__summarur_amt__sum__cashflowcategory_name__hosting',
 'amount_by_category_30d__summarur_amt__sum__cashflowcategory_name__spa_sauny_bani', 'amount_by_category_30d__summarur_amt__sum__cashflowcategory_name__tovary_dlja_detej',
 'amount_by_category_30d__summarur_amt__sum__cashflowcategory_name__turisticheskie_agenstva', 'amount_by_category_90d__summarur_amt__sum__cashflowcategory_name__marketplejsy',
 'amount_by_category_90d__summarur_amt__sum__cashflowcategory_name__nalogi', 'amount_by_category_90d__summarur_amt__sum__cashflowcategory_name__ohota_i_rybalka', 
 'amount_by_category_90d__summarur_amt__sum__cashflowcategory_name__prochie_bilety', 'avg_3m_hotels',
 'avg_6m_building_services', 'avg_6m_money_transactions', 'avg_6m_personal_services', 'avg_6m_transportation',
 'avg_by_category__amount__sum__cashflowcategory_name__detskie_igrushki', 'avg_by_category__amount__sum__cashflowcategory_name__investitsii',
 'avg_by_category__amount__sum__cashflowcategory_name__odezhda_dlja_beremennyh', 'avg_by_category__amount__sum__cashflowcategory_name__vydacha_nalichnyh_v_bankomate',
 'avg_by_category__amount__sum__cashflowcategory_name__zdorove', 'avg_by_category__amount__sum__cashflowcategory_name__zooparki',
 'by_category__amount__sum__eoperation_type_name__perevod_mezhdu_svoimi_schetami', 'by_category__amount__sum__eoperation_type_name__platezh_cherez_vidzhet_moj_mobilnyj',
 'by_category__amount__sum__eoperation_type_name__pokupka_paja', 'by_category__amount__sum__eoperation_type_name__vneshnij_perevod_rur',
 'cred_dda_rur_amt_3m_avg', 'curr_rur_amt_cm_avg', 'hdb_bki_active_cc_cnt', 'hdb_bki_active_ip_max_outstand', 'hdb_bki_active_micro_max_outstand',
 'hdb_bki_active_pil_max_overdue', 'hdb_bki_other_active_auto_month_payments_sum', 'hdb_bki_total_cc_max_limit', 'hdb_bki_total_ip_cnt',
 'hdb_bki_total_max_limit', 'hdb_bki_total_max_overdue_sum', 'hdb_bki_total_pil_max_limit', 'hdb_outstand_sum', 'max_cc_largest_max_limit_actoff_30d',
 'max_pil_largest_max_limit_actoff_90d', 'min_cc_max_el_actoff_90d', 'min_cc_max_pilot_score_actoff_180d',
 'min_cc_max_ul_actoff_180_30_inc', 'min_cc_offer_lifetime_days_actoff_90d', 'min_clip_max_ul_actoff_90_30_inc', 'min_max_limit', 'min_pil_max_score_actoff_180d', 'min_pil_max_ul_actoff_90d', 
 'mob_cnt_days', 'mob_cover_days', 'mortgage_main_3', 'mortgage_main_90', 'prod_anketa_cc_7', 'prod_anketa_pil_refin_30', 'prod_page_pil_90', 'prod_page_pil_clicks_90', 'prod_page_sa_30', 'prod_page_travel_30',
 'product_auto_loan_ratio_hy', 'product_brokerskoe_obsluzhivanie_voc_features_12m_cnt_mark', 'product_brokerskoe_obsluzhivanie_voc_features_36m_mark_eq_2_flag',
 'product_brokerskoe_obsluzhivanie_voc_features_36m_voc_with_expert_cnt', 'product_brokerskoe_obsluzhivanie_voc_features_3m_mark_eq_3_flag', 'product_brokerskoe_obsluzhivanie_voc_features_3m_min_mark',
 'product_debetovye_karty_voc_features_36m_mark_in_1_3_flag', 'product_debetovye_karty_voc_features_36m_min_mark', 'product_debetovye_karty_voc_features_36m_voc_not_null_flag',
 'product_debetovye_karty_voc_features_full_mark_eq_2_flag', 'product_debetovye_karty_voc_features_full_mark_not_null_flag', 'product_debetovye_karty_voc_features_full_min_mark',
 'product_debetovye_karty_voc_features_full_voc_with_expert_cnt', 'product_depozity_voc_features_full_voc_without_marks_portion', 'product_ipoteka_voc_features_12m_mark_eq_5_flag',
 'product_ipoteka_voc_features_36m_mark_eq_1_flag', 'product_ipoteka_voc_features_3m_mark_in_4_5_flag', 'product_ipoteka_voc_features_3m_mark_not_null_flag', 'product_ipoteka_voc_features_full_mark_eq_3_flag',
 'product_kreditnye_karty_voc_features_12m_avg_mark', 'product_kreditnye_karty_voc_features_36m_mark_eq_5_flag', 'product_kreditnye_karty_voc_features_36m_min_mark', 'product_kreditnye_karty_voc_features_36m_voc_with_expert_cnt',
 'product_kreditnye_karty_voc_features_3m_mark_eq_1_flag', 'product_kreditnye_karty_voc_features_full_mark_not_null_flag', 'product_nakopitelnye_scheta_voc_features_36m_mark_not_null_flag',
 'product_other_comm_cur_status_hy', 'product_other_depos_ratio_hy', 'product_zarplatnaja_karta_voc_features_36m_mark_eq_5_flag', 'product_zarplatnaja_karta_voc_features_36m_voc_without_marks_portion',
 'product_zarplatnaja_karta_voc_features_3m_voc_with_expert_cnt', 'product_zarplatnyj_proekt_voc_features_12m_mark_eq_3_flag', 'product_zarplatnyj_proekt_voc_features_36m_avg_mark', 'product_zarplatnyj_proekt_voc_features_36m_cnt_mark',
 'product_zarplatnyj_proekt_voc_features_3m_mark_in_1_3_flag', 'prof_cc_prof', 'profit_income_out_rur_amt_9m', 'turn_cc_cr_max_v2', 'turn_cc_db_min_v2', 'turn_cc_db_sum_v2',
 'turn_other_cr_avg_v2', 'turn_other_db_max_v2', 'unique_countries1', 'vcu_other_rate_max_2avg_prop', 'voc_features_12m_voc_without_marks_portion',
 'voc_features_36m_mark_eq_1_flag', 'voc_features_36m_max_mark', 'voc_features_full_mark_eq_3_flag', 'worksalary_rur_amt',]

def car_age_classificator(x):
    if x <= 2000:
        return 'very_old'
    elif 2000 < x <= 2010:
        return 'old'
    elif 2010 < x <= 2018:
        return 'mid'
    elif 2018 < x:
        return 'new'
    
def default_quantile_classify(x, col, v_25, v_50, v_75):
    if x <= v_25:
        return 1 # '0-25'
    elif v_25 < x <= v_50:
        return 2 # '25-50'
    elif v_50 < x <= v_75:
        return 3 # '50-75'
    elif v_75 < x:
        return 4 # '75-100'  

# Function to apply aggregation functions to a DataFrame grouped by a specific column.
def apply_agg_funcs(df, col, col_val, funcs, name):
    for func in funcs:
        # Group the DataFrame by 'col' and aggregate 'col_val' using 'func'.
        tmp = df.groupby(col)[col_val].agg(func).to_frame().reset_index()
        # Rename the columns to reflect the aggregation function applied.
        tmp.columns = [col, f'{func}_{name}']
        # Merge the aggregated results back into the original DataFrame.
        df = pd.merge(df, tmp, left_on=col, right_on=col, how='outer')
    
    return df


def feat_engeeniring(df):    
    df['month'] = pd.DatetimeIndex(df['feature_date']).month
    df['province'] = df['addrref'].str.contains('область').astype('int64')
    
    df = apply_agg_funcs(df, 'segment', 'target', ['mean', 'std', 'min', 'max', ], 'target_by_segment')
    df = apply_agg_funcs(df, 'month', 'target', ['mean', 'std', 'min', 'max', ], 'target_by_month')
    df = apply_agg_funcs(df, 'addrref', 'target', ['mean', 'std', 'min', 'max', ], 'target_by_addrref')
    df = apply_agg_funcs(df, 'brand', 'target', ['mean', 'std', 'min', 'max', ], 'target_by_brand')
    
    # cars
    df['car_age_category'] = df['productionyear'].apply(car_age_classificator)
    
    # unique countries - all possible values - 1, 2, 0 no need to specify
    
    # quatiles splitting
    print(len(features_to_quantile_classify))
    i = 0
    for feat in features_to_quantile_classify:
        v_25 = df[feat].quantile(0.25)
        v_50 = df[feat].quantile(0.50)
        v_75 = df[feat].quantile(0.75)
        # Classify each value into a quantile.
        tmp = df[feat].apply(lambda x: default_quantile_classify(x, df[feat], v_25, v_50, v_75))
        if tmp.isnull().sum() != len(tmp):
            df[f'{feat}_by_quantiles'] = tmp
            df = apply_agg_funcs(df, f'{feat}_by_quantiles', 'target', ['mean', 'std', 'min', 'max', ], f'target_by_{feat}')
        
        print(i, end=' ')
        i += 1
        
    return df

In [None]:
train_df[['avg_percents_inc', 'target']].sort_values(by='avg_percents_inc', ascending=False)

Unnamed: 0,avg_percents_inc,target
174962,3000000.0,150000.00
54104,1750000.0,350000.00
159591,1447002.0,62066.77
112787,1000000.0,91009.20
114617,583333.0,293935.88
...,...,...
205918,,50000.00
205925,,63523.00
205949,,55000.00
205953,,25000.00


In [None]:
df = feat_engeeniring(df)

  df['month'] = pd.DatetimeIndex(df['feature_date']).month
  df['province'] = df['addrref'].str.contains('область').astype('int64')


107
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 

In [None]:
df, features_to_del_na = del_null_features(df, 0.94)

In [None]:
df.target = df.target.fillna(64)

In [None]:
train_df, test_df =  df[df['target'] != 64], df[df['target'] == 64].drop(['w', 'target'], axis='columns')

In [None]:
cat_features = train_df.select_dtypes(include=['category', 'object']).columns.to_list()

In [None]:
# Filling missing values
train_df[cat_features] = train_df[cat_features].fillna('no_data')
test_df[cat_features] = test_df[cat_features].fillna('no_data')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[cat_features] = train_df[cat_features].fillna('no_data')


In [None]:
del df
gc.collect()

In [None]:
'''X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(train_df.drop(['target', 'w', 'client_id', 'feature_date'], axis='columns'), 
                                                                     train_df.target, train_df.w, test_size=0.2, random_state=42) '''

X_train = last_month_train_split(train_df)
X_test = last_month_test_split(train_df)

# **Creating Embeddings For Categorical Features**

Сreate embeddings for some categorical data, to make it easier for models to understand them.

In [None]:
features_to_create_embeddings = ['addrref', 'main_last_position_ccode', 'main_pre_last_position_ccode', 'part_last_position_ccode', 'part_pre_last_position_ccode', 'model', 'brand']

cat_encoders = {}

for feat in features_to_create_embeddings:
    i = 0
    tmp = {}
    for val in train_df[feat].unique():
        tmp[val] = i
        
        i += 1
        
    cat_encoders[feat] = tmp
    
# After this code block, 'cat_encoders' will contain a dictionary for each feature,
# where each unique value in the feature column is mapped to a unique integer. 
# Then we can pass it through neural network with embeddings layer.

In [None]:
X_train_embeddings_model = []
X_test_embeddings_model = []

for feat in features_to_create_embeddings:
    # Map each categorical value to its corresponding integer encoding for the sets.
    X_train_embeddings_model.append(X_train[feat].map(cat_encoders[feat]).values)
    X_test_embeddings_model.append(X_test[feat].map(cat_encoders[feat]).values)
    
# X_train_embeddings_model.append(X_train[num_features].values)
# X_test_embeddings_model.append(X_test[num_features].values)

In [None]:
EMB_SIZE = 16

In [None]:
input_layers = []
emb_layers = []
flat_layers = []

# Create input layers for each categorical feature to be embedded.
i = 0
for feat in features_to_create_embeddings:
    # Each input layer takes a single value (shape=(1,)).
    input_layers.append(keras.Input(shape=(1, ), name=f'cat_input{i}'))
    
    i += 1

# Create embedding layers corresponding to each input layer.
# Or else we can create only one embedding layer.
# for all features. 
i = 0
for feat in features_to_create_embeddings:
    emb_layers.append(keras.layers.Embedding(len(cat_encoders[feat]), EMB_SIZE, name=f'cat_emb{i}')(input_layers[i]))
    
    i += 1

# i = 0
# for feat in features_to_create_embeddings:
#     flat_layers.append(keras.layers.Flatten()(emb_layers[i]))
#     
#     i += 1
#     
# numeric_input = keras.Input(shape=(len(num_features), ), name='num_input')
# dense_numeric = keras.layers.Dense(128, activation='linear')(numeric_input)
    
concat_layer = keras.layers.Concatenate()(emb_layers)  # flat_layers + [dense_numeric]
dense1 = keras.layers.Dense(128, activation='linear')(concat_layer)
dense2 = keras.layers.Dense(64, activation='linear')(dense1)
dense3 = keras.layers.Dense(32, activation='linear')(dense2)
output = keras.layers.Dense(1, activation='linear')(dense3)

emb_model = tf.keras.Model(input_layers, output)  #  + [numeric_input]

emb_model.summary()

: 

In [None]:
emb_model.compile(loss='mean_absolute_error', optimizer='adam')

# Set up an EarlyStopping callback for training. This will monitor the validation loss,
# and stop the training process if the model stops improving for a set number of epochs.
es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                   mode='min',
                                   verbose=1,
                                   patience=10,
                                   restore_best_weights=True)

In [None]:
history = emb_model.fit(X_train_embeddings_model, 
                        y_train, 
                        validation_data=(X_test_embeddings_model, y_test), 
                        epochs=1000, 
                        batch_size=64, 
                        verbose=1,
                        callbacks=[es])

**Extracting categorical embeddings**

In [None]:
emb_dfs = []
i = 0
for feat in features_to_create_embeddings:
    # Create a dataframe mapping each category to its corresponding index value.
    cat_encoder = pd.DataFrame(cat_encoders[feat].items(), columns=[feat, f'{feat}_index'])
    # Retrieve the weights of the embedding layer for the current feature and convert it to a dataframe.
    # The 'reset_index' function is used to create a column with the index values.
    emb_dfs.append(pd.DataFrame(emb_model.get_layer(f"cat_emb{i}").get_weights()[0]).reset_index())
    emb_dfs[i] = emb_dfs[i].add_prefix(f'{feat}_')
    # Merge the category-index mapping dataframe with the embedding dataframe on the index column.
    # This associates each category with its embedding vector.
    emb_dfs[i] = pd.merge(cat_encoder, emb_dfs[i], how='inner', on=f'{feat}_index')
    
    i += 1

In [None]:
# Saving
i = 0
for feat in features_to_create_embeddings:
    emb_dfs[i].to_csv(f'{feat}_emb.csv')
    
    i += 1

**Merging Embeddings With Dataset**

In [None]:
i = 0
for feat in features_to_create_embeddings:
    X_train = pd.merge(X_train, emb_dfs[i], left_on=feat, right_on=feat, how='inner').drop([f'{feat}_index'], axis='columns')
    X_test = pd.merge(X_test, emb_dfs[i], left_on=feat, right_on=feat, how='inner').drop([f'{feat}_index'], axis='columns')
    test_df = pd.merge(test_df, emb_dfs[i], left_on=feat, right_on=feat, how='inner').drop([f'{feat}_index'], axis='columns')
    
    i += 1

# **Train**

The first model will be used to remove unimportant features.

In [None]:
train_data = Pool(X_train.drop(['client_id'], axis='columns'), y_train, cat_features=cat_features)
test_data = Pool(X_test.drop(['client_id'], axis='columns'), y_test, cat_features=cat_features)

In [None]:
best_params = {'n_estimators': 2667, 'depth': 13, 'l2_leaf_reg': 5.0, 'min_child_samples': 64, 'grow_policy': 'Depthwise', 'random_state': 1, 'bootstrap_type': 'MVS', 'bagging_temperature': 2.6, 'random_strength': 8.5, "border_count": 508}

model = CatBoostRegressor(**best_params) # task_type='GPU'
model.fit(train_data, verbose=10)

0:	learn: 88268.2693941	total: 3.26s	remaining: 2h 25m 3s
10:	learn: 88193.8733361	total: 29.5s	remaining: 1h 58m 41s
20:	learn: 88115.1963103	total: 54.3s	remaining: 1h 53m 55s
30:	learn: 88049.2190558	total: 1m 16s	remaining: 1h 48m 38s
40:	learn: 87974.4506820	total: 1m 41s	remaining: 1h 48m 14s
50:	learn: 87903.4034267	total: 2m 5s	remaining: 1h 47m 42s
60:	learn: 87827.2495997	total: 2m 30s	remaining: 1h 47m
70:	learn: 87756.5714064	total: 2m 54s	remaining: 1h 46m 14s
80:	learn: 87687.0089084	total: 3m 18s	remaining: 1h 45m 37s
90:	learn: 87626.3116601	total: 3m 39s	remaining: 1h 43m 42s
100:	learn: 87555.8030334	total: 4m 3s	remaining: 1h 43m 1s
110:	learn: 87489.2140435	total: 4m 27s	remaining: 1h 42m 42s
120:	learn: 87414.1327888	total: 4m 53s	remaining: 1h 42m 52s
130:	learn: 87351.0832588	total: 5m 16s	remaining: 1h 42m 13s
140:	learn: 87287.3336454	total: 5m 39s	remaining: 1h 41m 20s
150:	learn: 87213.3497210	total: 6m 5s	remaining: 1h 41m 32s
160:	learn: 87155.5741600	total

<catboost.core.CatBoostRegressor at 0x7b2656775ba0>

In [None]:
# Create a DataFrame to hold feature importances.
# The DataFrame is constructed by transposing an array that combines feature names (excluding 'client_id') and their importance scores.
feature_importances = pd.DataFrame(np.transpose([X_train.drop(['client_id'], axis='columns').columns.to_list(), list(model.feature_importances_)]), columns=['feature', 'importance']).sort_values('importance', ascending=False)
feature_importances['importance'] = feature_importances['importance'].astype('float64')
# Identify features to delete based on a threshold for importance.
features_to_del = feature_importances['feature'][feature_importances['importance'] < 0.2]

In [None]:
feature_importances.head(60)

Unnamed: 0,feature,importance
14,avg_debet_turn_rur,2.046479
119,total_rur_amt_cm_avg_div_v2,1.963483
111,profit_income_out_rur_amt_9m,1.756351
47,commission_outcome_rur_amt,1.678452
114,smsInWavg6m,1.649215
84,percent_outcome_rur_amt,1.611895
60,hdb_bki_total_max_limit,1.591851
51,curr_rur_amt_cm_avg,1.547303
110,prof_cc_prof,1.476052
13,avg_cnt_daily_transactions_90d,1.455585


In [None]:
len(features_to_del)

In [None]:
# Drop
X_train = X_train.drop(features_to_del, axis='columns')
X_test = X_test.drop(features_to_del, axis='columns')
test_df = test_df.drop(features_to_del, axis='columns')

In [None]:
cat_features = X_train.select_dtypes(include=['category', 'object']).columns.to_list()

Next we add several classifiers to help regression model. They will classify if price in some range.

In [None]:
# All that I can't pass to the function.
y_tr = y_train >= 80000 
y_tr = y_tr.astype('int64')

# weights = y_tr.replace(0, y_tr.mean()).replace(1, 1 - y_tr.mean())
train = Pool(X_train, y_tr, cat_features=cat_features)  # weight=weights

y_tst = y_test >= 80000 

# Optimizing hyperparameters for classifier models with optuna
def objective(trial):
    param = {
        'depth': trial.suggest_int('depth', 5, 13, 1),
        'l2_leaf_reg': trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.0, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [16, 32, 64, 128]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        'border_count': trial.suggest_int('border_count', 254, 508, step=100),
        'grow_policy': trial.suggest_categorical('grow_policy', ['Depthwise', 'SymmetricTree', 'Lossguide']),
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000, step=500),
        'task_type': 'CPU',
        'verbose': 200
    }
    
    print(f'\n----\ntrial {trial.number}\nparameters: {trial.params}')
    model = CatBoostClassifier(**param)
    model.fit(train)
        
    preds = model.predict(X_test)
    
    f1 = f1_score(y_tst, preds)
    
    print(f'f1: {f1}')
    return f1

In [None]:
study_classifiers = optuna.create_study(direction='maximize')
study_classifiers.optimize(objective, n_trials=200)

In [None]:
# Best params
param = {'depth': 8, 'l2_leaf_reg': 5.0, 'min_child_samples': 16, 'task_type': 'CPU', 'random_state': 42, 'random_strength': 8.5, 'verbose': 200}

In [None]:
def train_income_classifier(train_df, target, income, cat_features, above=True, test_df=None, test_target=None):
    """
        Trains a CatBoostClassifier that predicts whether income is above or below a 
    certain amount.
    """
    # Define the binary classification target based on the 'income' threshold.
    y_tr = target >= income if above else target <= income
    y_tr = y_tr.astype('int64')  # Convert the boolean series to integers (0 or 1).
    
    # Print the proportion of positive class in the target.
    print('mean:', y_tr.mean())
    
    # To reduce class imbalance.
    # 1v.
    # weight = 1 / (len(y_train) - y_tr.sum() / (y_tr.sum()))
    # weights = y_tr.replace(0, weight)
    # 2v.
    weights = y_tr.replace(0, y_tr.mean()).replace(1, 1 - y_tr.mean())
    train = Pool(train_df, y_tr, weight=weights, cat_features=cat_features)
    
    # Print calculated weights for each class.
    print('weights for 0:', y_tr.mean(), '|| weights for 1:', 1 - y_tr.mean())
    print('income:', income)
    
    # Record the start time for training.
    st_time = time.time()
    model = CatBoostClassifier(**param)
    model.fit(train)
    print('time:', (st_time - time.time()) / 60)
    
    preds = model.predict(train_df)
    
    print("train\naccuracy:", accuracy_score(y_tr, preds), '|| f1:', f1_score(y_tr, preds))
    
    # If test data is provided, evaluate the model on it.
    if type(test_df) != type(None) and type(test_target) != type(None):
        y_test = test_target >= income if above else test_target <= income
        
        preds = model.predict(test_df)
        
        print("test\naccuracy:", accuracy_score(y_test, preds), '|| f1:', f1_score(y_test, preds))
    
    return model


def train_income_classifier2(train_df, target, income, cat_features, test_df=None, test_target=None):
    """
        Trains a CatBoostClassifier that predicts whether income is above or below a 
    certain amount.
    """
    # Define the binary classification target based on a range around the 'income'.
    y_tr = (target >= income) & (target <= income + 50000)
    y_tr = y_tr.astype('int64')  # Convert the boolean series to integers (0 or 1).
    
    # Print the proportion of positive class in the target.
    print('mean:', y_tr.mean())
    
    # To reduce class imbalance.
    # 1v.
    # weight = 1 / (len(y_train) - y_tr.sum() / (y_tr.sum()))
    # weights = y_tr.replace(0, weight)
    # 2v.
    weights = y_tr.replace(0, y_tr.mean()).replace(1, 1 - y_tr.mean())
    train = Pool(train_df, y_tr, weight=weights, cat_features=cat_features)
    
    # Print calculated weights for each class.
    print('weights for 0:', y_tr.mean(), '|| weights for 1:', 1 - y_tr.mean())
    print('income:', income)
    
    # Record the start time for training.
    st_time = time.time()
    model = CatBoostClassifier(**param)
    model.fit(train)
    print('time:', (st_time - time.time()) / 60)
    
    preds = model.predict(train_df)
    
    print("train\naccuracy:", accuracy_score(y_tr, preds), '|| f1:', f1_score(y_tr, preds))
    
    # If test data is provided, evaluate the model on it.
    if type(test_df) != type(None) and type(test_target) != type(None):
        y_test = (test_target >= income) & (test_target <= income + 50000)
        
        preds = model.predict(test_df)
        
        print("test\naccuracy:", accuracy_score(y_test, preds), '|| f1:', f1_score(y_test, preds))
    
    return model

In [None]:
# checkpoint data7
# X_train.to_csv('X_train.csv')
# X_test.to_csv('X_test.csv')
# y_train.to_csv('y_train.csv')
# y_test.to_csv('y_test.csv')
# w_train.to_csv('w_train.csv')
# w_test.to_csv('w_test.csv')
# test_df.to_csv('test_df.csv')

In [None]:
# to restore data
X_train = pd.read_csv('/kaggle/input/data7ds/X_train.csv')
X_test = pd.read_csv('/kaggle/input/data7ds/X_test.csv')
test_df = pd.read_csv('/kaggle/input/data7ds/test_df(1).csv')
w_test = pd.read_csv('/kaggle/input/data7ds/w_test.csv')
w_train = pd.read_csv('/kaggle/input/data7ds/w_train.csv')
y_test = pd.read_csv('/kaggle/input/data7ds/y_test.csv')
y_train = pd.read_csv('/kaggle/input/data7ds/y_train.csv')

X_train = X_train.drop(['Unnamed: 0'], axis='columns')
X_test = X_test.drop(['Unnamed: 0'], axis='columns')
test_df = test_df.drop(['Unnamed: 0'], axis='columns')
w_test = w_test.drop(['Unnamed: 0'], axis='columns')
w_train = w_train.drop(['Unnamed: 0'], axis='columns')
y_test = y_test.drop(['Unnamed: 0'], axis='columns')
y_train = y_train.drop(['Unnamed: 0'], axis='columns')

In [None]:
X_train = X_train.drop(X_train[X_train.columns[172:]].columns, axis='columns')
X_test = X_test.drop(X_test[X_test.columns[172:]].columns, axis='columns')

In [None]:
new_X_train = X_train.drop(['client_id'], axis='columns').copy()
new_X_test = X_test.drop(['client_id'], axis='columns').copy()
new_test_df = test_df.drop(['client_id'], axis='columns').copy()

i = 0
for inc in np.linspace(20000, 220000, 11):
    # Train a binary classifier to predict if income is above the current threshold 'inc'
    model = train_income_classifier(X_train, y_train, inc, cat_features, test_df=X_test, test_target=y_test)
    # Predict on the data and create a new feature indicating if income is above the threshold.
    new_X_train[f'above_{inc}'] = model.predict(X_train)
    new_X_test[f'above_{inc}'] = model.predict(X_test)
    new_test_df[f'above_{inc}'] = model.predict(test_df.drop(['feature_date'], axis='columns'))
    
    # model = train_income_classifier(X_train, y_train, rev, cat_features, above=False, test_df=X_test, test_target=y_test)
    # new_X_train[f'below_{rev}'] = model.predict(X_train)
    # new_X_test[f'below_{rev}'] = model.predict(X_test)
    
    i += 1
    print('trial:', i)
    print('\n')

X_train = new_X_train.copy()
X_test = new_X_test.copy()
test_df = new_test_df.copy()

mean: 0.9674405184659091
weights for 0: 0.9674405184659091 || weights for 1: 0.03255948153409094
income: 20000.0
0:	learn: 0.6928257	total: 786ms	remaining: 13m 5s
200:	learn: 0.6489167	total: 2m 16s	remaining: 9m 1s
400:	learn: 0.6077066	total: 4m 30s	remaining: 6m 43s
600:	learn: 0.5527420	total: 6m 47s	remaining: 4m 30s
800:	learn: 0.4984944	total: 9m 6s	remaining: 2m 15s
999:	learn: 0.4507710	total: 11m 26s	remaining: 0us
time: -11.463851209481557
train
accuracy: 0.9220858487215909 || f1: 0.9581435555025635
test
accuracy: 0.886782189758334 || f1: 0.9399645638468829
trial: 1


mean: 0.8289850408380682
weights for 0: 0.8289850408380682 || weights for 1: 0.17101495916193177
income: 40000.0
0:	learn: 0.6931054	total: 757ms	remaining: 12m 36s
200:	learn: 0.6850693	total: 2m 12s	remaining: 8m 47s
400:	learn: 0.6773218	total: 4m 23s	remaining: 6m 32s
600:	learn: 0.6645379	total: 6m 37s	remaining: 4m 24s
800:	learn: 0.6508562	total: 8m 52s	remaining: 2m 12s
999:	learn: 0.6375846	total: 11m

In [None]:
new_X_train = X_train.copy()
new_X_test = X_test.copy()
new_test_df = test_df.copy()

i = 0
for inc in np.linspace(20000, 100000, 4):
    # Train a binary classifier to predict if income is above the current threshold 'inc'
    model = train_income_classifier2(X_train, y_train, inc, cat_features, test_df=X_test, test_target=y_test)
    # Predict on the data and create a new feature indicating if income is above the threshold.
    new_X_train[f'between_{inc}'] = model.predict(X_train)
    new_X_test[f'between_{inc}'] = model.predict(X_test)
    new_test_df[f'between_{inc}'] = model.predict(test_df.drop(['feature_date'], axis='columns'))
    
    
    # model = train_income_classifier(X_train, y_train, rev, cat_features, above=False, test_df=X_test, test_target=y_test)
    # new_X_train[f'below_{rev}'] = model.predict(X_train)
    # new_X_test[f'below_{rev}'] = model.predict(X_test)
    i += 1
    print('trial:', i)
    print('\n')

X_train = new_X_train.copy()
X_test = new_X_test.copy()
test_df = new_test_df.copy()

In [None]:
# X_test = pd.read_csv('/kaggle/input/alfa3ds/X_test_df3.csv')
# X_train = pd.read_csv('/kaggle/input/alfa3ds/X_train_df3.csv')
# test_df = pd.read_csv('/kaggle/input/alfa3ds/test_df3.csv')

In [None]:
# y_test, w_test = X_test['target'], X_test['w']
# X_test = X_test.drop(['Unnamed: 0', 'target', 'w'], axis='columns')

# y_train, w_train = X_train['target'], X_train['w']
# X_train = X_train.drop(['Unnamed: 0', 'target', 'w'], axis='columns')

In [None]:
# pd.concat([X_train, w_train, y_train], axis='columns').to_csv('X_train_df3.csv')
# pd.concat([X_test, w_test, y_test], axis='columns').to_csv('X_test_df3.csv')
# test_df.to_csv('test_df3.csv')

In [None]:
del new_X_train
del new_X_test
gc.collect()

In [None]:
del train_df
gc.collect()

In [None]:
# Get the text features
text_features = ['main_last_position_ccode', 'main_pre_last_position_ccode', 'part_last_position_ccode', 'part_pre_last_position_ccode']

for feat in text_features:
    cat_features.remove(feat)

In [None]:
# X_train = pd.read_csv("/kaggle/input/alfadata4/X_train.csv").drop(['Unnamed: 0'], axis='columns')
# X_test = pd.read_csv("/kaggle/input/alfadata4/X_test.csv").drop(['Unnamed: 0'], axis='columns')
# test_df = pd.read_csv("/kaggle/input/alfadata4/test_df(1).csv").drop(['Unnamed: 0'], axis='columns')
# w_test = pd.read_csv("/kaggle/input/alfadata4/w_test.csv").drop(['Unnamed: 0'], axis='columns')
# w_train = pd.read_csv("/kaggle/input/alfadata4/w_train.csv").drop(['Unnamed: 0'], axis='columns')
# y_test = pd.read_csv("/kaggle/input/alfadata4/y_test.csv").drop(['Unnamed: 0'], axis='columns')
# y_train = pd.read_csv("/kaggle/input/alfadata4/y_train.csv").drop(['Unnamed: 0'], axis='columns')

In [None]:
train_data = Pool(X_train, y_train, cat_features=cat_features, sample_weight=w_test)# text_features=text_features
test_data = Pool(X_test, y_test, cat_features=cat_features, sample_weight=w_test)# text_features=text_features

Main model training and optimizing w optuna

In [None]:
def objective(trial):
    param = {
        'depth': trial.suggest_int('depth', 5, 13, step=1),
        'l2_leaf_reg': trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.0, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [16, 32, 64, 128]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
        'random_strength': trial.suggest_float("random_strength", 1.5, 14.5, step=2.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ['Depthwise', 'SymmetricTree', 'Lossguide']),
        # 'task_type': 'GPU',
        'border_count': trial.suggest_int('border_count', 254, 508, step=100),
        # 'loss_function': 'MAE',
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000, step=500),
        'verbose': 500
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)
    '''    elif param["grow_policy"] == "SymmetricTree":
        param["boosting_type"] = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),'''
        
    print(f'\n----\ntrial {trial.number}\nparameters: {trial.params}')
    model = CatBoostRegressor(**param)
    model.fit(X_train, y_train, cat_features=cat_features, sample_weight=w_train)
    
    preds = model.predict(X_test)
    loss = weighted_mean_absolute_error(preds, y_test['target'].to_list(), w_test['w'].to_list())
    
    
    print(f'loss: {loss}')
    return loss

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=400)

In [None]:
study.best_params

In [None]:
best_params = {'n_estimators': 3400, 'depth': 13, 'l2_leaf_reg': 5.0, 'min_child_samples': 64, 'grow_policy': 'Depthwise', 'random_state': 1, 'bootstrap_type': 'MVS', 'bagging_temperature': 2.6, 'random_strength': 8.5, "border_count": 508}

In [None]:
# Training on the best params we picked with optuna.
model = CatBoostRegressor(**best_params)
model.fit(X_train, y_train, cat_features=cat_features, sample_weight=w_train) # train_df.drop(['target', 'w', 'client_id', 'feature_date'] + features_to_del.to_list(), axis='columns'), train_df['target'], cat_features=cat_features

0:	learn: 151415.6691118	total: 2.9s	remaining: 2h 44m 3s
1:	learn: 149539.8393394	total: 5.02s	remaining: 2h 22m 5s
2:	learn: 147892.3438233	total: 7.08s	remaining: 2h 13m 40s
3:	learn: 146239.5690083	total: 8.65s	remaining: 2h 2m 28s
4:	learn: 144646.6690984	total: 10.4s	remaining: 1h 57m 41s
5:	learn: 143281.1441591	total: 12.3s	remaining: 1h 55m 34s
6:	learn: 141823.7067157	total: 14.4s	remaining: 1h 56m 11s
7:	learn: 140529.1692718	total: 16.2s	remaining: 1h 54m 41s
8:	learn: 139092.5126713	total: 17.9s	remaining: 1h 52m 39s
9:	learn: 137915.7477548	total: 20.2s	remaining: 1h 54m 21s
10:	learn: 136605.2505352	total: 22.1s	remaining: 1h 53m 28s
11:	learn: 135545.6538808	total: 24.1s	remaining: 1h 53m 30s
12:	learn: 134516.9874115	total: 25.5s	remaining: 1h 50m 39s
13:	learn: 133470.7162335	total: 28.1s	remaining: 1h 53m 22s
14:	learn: 132530.8218144	total: 29.7s	remaining: 1h 51m 46s
15:	learn: 131624.0169535	total: 31.3s	remaining: 1h 50m 29s
16:	learn: 130606.5486245	total: 32.9s

<catboost.core.CatBoostRegressor at 0x7b26569ad540>

In [None]:
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
w_train.to_csv('w_train.csv')
w_test.to_csv('w_test.csv')
test_df.to_csv('test_df.csv')

In [None]:
weighted_mean_absolute_error(model.predict(X_test), y_test, w_test)

63702.768084822266

In [None]:
print("train", weighted_mean_absolute_error(model.predict(X_train), y_train, w_train), "\ntest", \
weighted_mean_absolute_error(model.predict(X_test), y_test, w_test))

train 10333.178898643944 
test 63702.768084822266


In [None]:
# Making predictions.
test_df['predict'] = model.predict(test_df.drop(['feature_date'] + features_to_del.to_list(), axis='columns'))

In [None]:
# Saving predictions
cl_id = pd.read_csv('/kaggle/input/income-prediction-alfa-campus/test.csv', sep=";", decimal=",", encoding="windows-1251").client_id
test_df['client_id'] = cl_id.to_list()
test_df[['client_id','predict']].set_index('client_id').to_csv("commit.csv", sep=",", decimal=".")