In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
import re
from collections import Counter
import numpy as np
import warnings
from sklearn.model_selection import KFold
warnings.filterwarnings("ignore")
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
import gradio as gr

In [3]:
# Load the hackathon test set into the module-level frame `df_test`, which every later cell reads
# and reassigns. The file is ';'-delimited; on_bad_lines='skip' silently drops rows that fail to
# parse, so the loaded row count can be lower than the raw file's.
# NOTE(review): '/content/...' is an absolute path (the launch output below reports a Colab
# session) — confirm the file has been uploaded there before running.
df_test = pd.read_csv('/content/hackathon_income_test.csv', delimiter=';', on_bad_lines='skip')

In [4]:
# Combined (derived) features
def create_features(df):
    """Return a copy of ``df`` with derived ratio, difference and flag columns appended.

    The input frame is not modified. Every ratio maps a zero denominator to NaN
    before dividing, so those ratios yield NaN instead of +/-inf.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw turnover, salary, credit-bureau (``hdb_bki_*``),
        spending-category, ``age``, ``gender`` and ``incomeValue`` columns read
        below.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended in the order they are
        computed here.

    Raises
    ------
    KeyError
        If any of the required source columns is missing.
    """
    df_new = df.copy()

    def ratio(numerator, denominator):
        """Divide two Series, turning zero denominators into NaN first."""
        return numerator / denominator.replace(0, np.nan)

    # Frequently reused source columns.
    salary = df_new['salary_6to12m_avg']
    credit_avg_act = df_new['turn_cur_cr_avg_act_v2']
    credit_sum = df_new['turn_cur_cr_sum_v2']
    debit_sum = df_new['turn_cur_db_sum_v2']
    max_limit = df_new['hdb_bki_total_max_limit']
    cc_max_limit = df_new['hdb_bki_total_cc_max_limit']
    pil_max_limit = df_new['hdb_bki_total_pil_max_limit']
    total_products = df_new['hdb_bki_total_products']
    age = df_new['age']
    income = df_new['incomeValue']
    category = 'avg_by_category__amount__sum__cashflowcategory_name__'

    # --- Turnover vs. income / credit vs. debit ---
    df_new['credit_to_income_ratio_12m'] = ratio(credit_avg_act, salary)
    df_new['debit_to_credit_ratio_12m'] = ratio(debit_sum, credit_sum)
    df_new['credit_debit_diff_3m'] = df_new['avg_cur_cr_turn'] - df_new['avg_cur_db_turn']

    # --- Relative growth ("active" average vs. plain average, minus 1) ---
    df_new['credit_growth_6m'] = ratio(credit_avg_act, df_new['turn_cur_cr_avg_v2']) - 1
    df_new['debit_growth_6m'] = ratio(df_new['turn_cur_db_avg_act_v2'], df_new['turn_cur_db_avg_v2']) - 1
    df_new['salary_change_ratio'] = ratio(salary, df_new['dp_ils_avg_salary_1y']) - 1

    # --- Turnover relative to credit limits ---
    df_new['avg_credit_to_max_limit'] = ratio(credit_avg_act, max_limit)
    df_new['avg_cc_to_max_limit'] = ratio(credit_avg_act, cc_max_limit)
    df_new['used_credit_ratio'] = ratio(credit_sum, cc_max_limit + pil_max_limit)
    # NOTE(review): this denominator is NOT zero-guarded, so debit_growth_6m == -1 yields +/-inf.
    # Left unchanged to keep the feature identical to what the models were presumably trained on.
    df_new['credit_debit_growth_ratio'] = (df_new['credit_growth_6m'] + 1) / (df_new['debit_growth_6m'] + 1)

    # --- Spending-category shares of total debit turnover ---
    df_new['supermk_share'] = ratio(
        df_new[category + 'supermarkety'] + df_new[category + 'gipermarkety'],
        debit_sum,
    )
    df_new['product_share'] = ratio(df_new[category + 'produkty'], debit_sum)
    df_new['trip_share'] = ratio(df_new[category + 'puteshestvija'], debit_sum)

    # NOTE(review): adds a share (supermk_share) to a raw amount (produkty); 'product_share' was
    # probably intended. Left unchanged pending confirmation against the training pipeline.
    df_new['essentials_expense'] = df_new['supermk_share'] + df_new[category + 'produkty']
    df_new['luxury_expense'] = ratio(df_new['avg_6m_hotels'], debit_sum) + ratio(df_new['avg_6m_travel'], debit_sum)

    # --- Activity ---
    df_new['transaction_count_ratio'] = ratio(
        df_new['transaction_category_supermarket_sum_cnt_d15'],
        df_new['transaction_category_supermarket_sum_cnt_m2'],
    )
    # presumably mob_cover_days is measured over a 90-day window — TODO confirm
    df_new['mob_usage_ratio'] = df_new['mob_cover_days'] / 90
    df_new['cash_withdrawal_freq'] = ratio(df_new[category + 'vydacha_nalichnyh_v_bankomate'], debit_sum)

    df_new['inactive_flag'] = (df_new['days_to_last_transaction'] > 30).astype(int)

    # --- Declared income vs. demographics ---
    df_new['income_per_age'] = ratio(income, age)
    df_new['salary_ratio_to_region'] = ratio(income, df_new['per_capita_income_rur_amt'])
    # Fix: the original compared gender to 'M' only. preprocess_data in this notebook encodes
    # gender as 'Женский' -> 0 / 'Мужской' -> 1 and then stringifies it ('0.0' / '1.0'), so the
    # comparison never matched and the feature was always 0. Accept the legacy 'M' as well as the
    # raw and preprocessed encodings of "male".
    gender_normalized = df_new['gender'].astype(str).str.strip().str.lower()
    is_male = gender_normalized.isin(['m', 'м', 'male', 'мужской', '1', '1.0'])
    df_new['gender_income_ratio'] = is_male.astype(int) * income

    df_new['credit_debit_ratio_12m'] = ratio(credit_sum, debit_sum)
    df_new['credit_debit_ratio_3m'] = ratio(df_new['avg_cur_cr_turn'], df_new['avg_cur_db_turn'])

    df_new['avg_credit_per_product'] = ratio(credit_sum, total_products)

    df_new['overdue_ratio'] = ratio(df_new['hdb_bki_total_max_overdue_sum'], max_limit)

    df_new['active_products_per_age'] = ratio(df_new['hdb_bki_total_active_products'], age)

    df_new['spending_to_salary_ratio'] = ratio(debit_sum, salary)

    # --- Credit capacity ---
    df_new['total_credit_limit'] = max_limit + cc_max_limit + pil_max_limit
    df_new['credit_utilization_ratio'] = ratio(credit_sum, df_new['total_credit_limit'])
    # Same formula as 'spending_to_salary_ratio'; kept because the column name is part of the output.
    df_new['debit_to_salary_ratio'] = ratio(debit_sum, salary)
    df_new['high_value_transaction_ratio'] = ratio(df_new['avg_cur_db_turn'], df_new['turn_cur_db_avg_act_v2'])

    df_new['supermk_to_total_ratio'] = ratio(df_new[category + 'supermarkety'], debit_sum)
    df_new['rest_to_total_ratio'] = ratio(df_new[category + 'kafe'], debit_sum)

    # NOTE(review): divides a column whose name says "avg" by 6 — confirm the intended unit.
    df_new['avg_salary_per_month'] = salary / 6

    # --- Assets, loans and products relative to salary ---
    df_new['assets_to_income_ratio'] = ratio(df_new['total_rur_amt_cm_avg'], salary)
    df_new['loan_per_income_ratio'] = ratio(df_new['loan_cur_amt'], salary)
    df_new['active_loan_count_ratio'] = ratio(df_new['avg_loan_cnt_with_insurance'], total_products)

    df_new['income_per_credit_product'] = ratio(salary, total_products)
    df_new['age_squared'] = age ** 2

    df_new['mobile_activity_income_ratio'] = ratio(df_new['mob_cover_days'], salary)

    return df_new

In [5]:
# Role -> substrings that identify it in a lower-cased job title.
# Order matters: roles are tested top to bottom and the first one with a matching keyword wins.
_ROLE_KEYWORDS = {
    'руководитель': ['директор', 'руководитель', 'начальник', 'управляющий', 'заведующий'],
    'специалист': ['специалист', 'эксперт', 'аналитик', 'программист', 'разработчик', 'тестировщик', 'юрисконсульт', 'юрист', 'логист'],
    'менеджер': ['менеджер'],
    'бухгалтер': ['бухгалтер', 'экономист'],
    'водитель': ['водитель', 'экспедитор', 'автомобиля', 'машинист', 'курьер', 'тракторист'],
    'продавец': ['продавец', 'кассир', 'консультант', 'стажер', 'товаровед', 'контролер', 'мерчандайзер'],
    'рабочий': ['рабочий', 'комплектовщик', 'грузчик', 'кладовщик', 'упаковщик', 'монтажник', 'электромонтажник', 'электрогазосварщик', 'слесарь', 'маляр', 'механик', 'бетонщик', 'автослесарь', 'электромонтер', 'электрик', 'прораб', 'сантехник', 'столяр', 'разнорабочий'],
    'администратор': ['администратор', 'секретарь'],
    'инженер': ['инженер', 'техник', 'технолог', 'архитектор'],
    'оператор': ['оператор', 'диспетчер', 'колл'],
    'врач': ['врач', 'сестра', 'брат', 'дерматолог', 'фармацевт', 'санитарка', 'санитар', 'ревизор', 'ассистент', 'стоматолог', 'терапевт', 'рентгенолог', 'фельдшер'],
    'учитель': ['учитель', 'воспитатель', 'педагог', 'преподаватель', 'тренер'],
    'повар': ['повар', 'пекарь', 'кондитер', 'шеф', 'бармен', 'официант'],
    'уборщик-охранник': ['уборщик', 'дворник', 'горничная', 'уборщица', 'охранник', 'сторож', 'вахтер', 'гардеробщик'],
    'студент': ['студент'],
    'пенсионер': ['пенсионер'],
}
_SENIOR_MARKERS = ('главный', 'ведущий', 'старший', 'head', 'chief')
_JUNIOR_MARKERS = ('младший', 'помощник', 'assistant', 'junior')
# String forms a missing title can take once the column has been cast to str.
_MISSING_POSITION_TOKENS = frozenset({'', 'nan', 'none'})
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_WHITESPACE_RE = re.compile(r'\s+')
# "it" only as a standalone word, so titles like "digital" or "security" do not count as IT.
_IT_WORD_RE = re.compile(r'\bit\b')


def smart_position_grouping_with_age(row):
    """Group a free-text job title into a coarse category, using age as a fallback.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'dp_ewb_last_employment_position'`` (job title, may be
        missing) and ``'age'`` (number, may be NaN).

    Returns
    -------
    str
        * Title missing (NaN/None, or the strings ``''``/``'nan'``/``'none'``
          in any case): ``'студент_стандартный'`` for age <= 25,
          ``'пенсионер_стандартный'`` for age >= 60, otherwise ``'не указано'``.
        * Otherwise ``'<role>_<level>'``, or ``'<role>_<level>_<direction>'``
          for managers/specialists with a recognised direction. ``role`` is
          ``'другое'`` when no keyword matches.

    Raises
    ------
    KeyError
        If ``row`` lacks either of the two keys.
    """
    position = row['dp_ewb_last_employment_position']
    age = row['age']

    text = '' if pd.isna(position) else str(position).lower().strip()

    if text in _MISSING_POSITION_TOKENS:
        if pd.notna(age):
            # The age boundaries for "student" / "pensioner" can be tuned here.
            if age <= 25:
                return 'студент_стандартный'
            elif age >= 60:
                return 'пенсионер_стандартный'
        return 'не указано'

    # Replace punctuation with spaces and collapse whitespace so "it-специалист" -> "it специалист".
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)

    main_role = 'другое'
    for role, keywords in _ROLE_KEYWORDS.items():
        if any(keyword in text for keyword in keywords):
            main_role = role
            break

    if any(marker in text for marker in _SENIOR_MARKERS):
        level = 'старший'
    elif any(marker in text for marker in _JUNIOR_MARKERS):
        level = 'младший'
    else:
        level = 'стандартный'

    # A direction is only assigned to managers and specialists.
    direction = ''
    if main_role in ('менеджер', 'специалист'):
        if 'по продажам' in text or 'sales' in text:
            direction = 'продажи'
        elif _IT_WORD_RE.search(text) or 'информационн' in text or 'программист' in text:
            direction = 'it'
        elif 'маркетинг' in text:
            direction = 'маркетинг'
        elif 'логистик' in text or 'склад' in text:
            direction = 'логистика'

    if direction:
        return f"{main_role}_{level}_{direction}"
    return f"{main_role}_{level}"


In [6]:
def preprocess_data(df):
    """Return a cleaned copy of the raw hackathon frame.

    Steps, in order:

    1. Every non-``float64`` column is parsed as a float with a decimal comma
       (``'1,5'`` -> ``1.5``); columns that cannot be parsed are left as they are.
    2. If ``target`` exists, add ``target_log`` (``log1p``) and ``is_target_bin``
       (1 where ``target_log`` is at or above its 75% quantile, else 0).
    3. ``gender`` is mapped ``'Женский'`` -> 0 / ``'Мужской'`` -> 1; then it and
       the other text columns are cast to lower-cased strings (missing values
       therefore become the string ``'nan'``, and gender ends up as
       ``'0.0'``/``'1.0'``/``'nan'``).
    4. ``period_last_act_ad`` and ``dt`` are parsed as dates and expanded into
       ``*_day``, ``*_month`` and ``*_dayofweek`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The preprocessed copy.

    Raises
    ------
    ValueError
        If a ``target`` column is present but not numeric.
    """
    df = df.copy()

    # 1. Decimal-comma numeric coercion.
    for col in df.columns:
        if df[col].dtypes != 'float64':
            try:
                df[col] = df[col].astype(str).str.replace(',', '.', regex=False).astype('float')
            except (ValueError, TypeError):
                # Not a numeric column (text, dates, ...): keep it unchanged.
                continue

    # 2. Target transforms.
    if 'target' in df.columns:
        # The loop above already converts a parseable target; repeating the conversion outside
        # the try block makes a non-numeric target fail loudly instead of passing through.
        df['target'] = df['target'].astype(str).str.replace(',', '.', regex=False).astype('float')

        # Log-transform the target.
        df['target_log'] = np.log1p(df['target'])
        threshold = df['target_log'].quantile(0.75)  # 75% quantile
        df['is_target_bin'] = np.where(df['target_log'] >= threshold, 1, 0)

    # 3. Categorical / text columns.
    if 'gender' in df.columns:
        df['gender'] = df['gender'].map({'Женский': 0, 'Мужской': 1}).astype('float')
    # NOTE: 'gender' is in this list too, so the 0/1 floats above become the strings
    # '0.0' / '1.0' / 'nan'. Kept as-is because downstream code sees that representation.
    for col in ['gender', 'adminarea', 'city_smart_name', 'dp_ewb_last_employment_position', 'addrref']:
        if col in df.columns:
            df[col] = df[col].astype(str).str.lower()

    # 4. Date parts.
    if 'period_last_act_ad' in df.columns:
        # Fix: Series.map with a dict turns every value that is NOT a key into NaN, so the
        # original `.map({'1677-09-01': None})` blanked the whole column. Only the sentinel
        # date is nulled here; real dates are kept.
        raw_dates = df['period_last_act_ad']
        raw_dates = raw_dates.mask(raw_dates == '1677-09-01')
        # errors='coerce': any other unparseable or out-of-range value becomes NaT instead of raising.
        df['period_last_act_ad'] = pd.to_datetime(raw_dates, errors='coerce')
        df['period_last_act_ad_day'] = df['period_last_act_ad'].dt.day
        df['period_last_act_ad_month'] = df['period_last_act_ad'].dt.month
        df['period_last_act_ad_dayofweek'] = df['period_last_act_ad'].dt.dayofweek

    if 'dt' in df.columns:
        df['dt'] = pd.to_datetime(df['dt'])
        df['dt_day'] = df['dt'].dt.day
        df['dt_month'] = df['dt'].dt.month
        df['dt_dayofweek'] = df['dt'].dt.dayofweek

    return df


# Run the feature pipeline on the test set. Each step reassigns the module-level `df_test`,
# so this cell is not idempotent: re-run the read_csv cell first before executing it again.
df_test = preprocess_data(df_test)


# Append the derived ratio/flag features.
df_test = create_features(df_test)

# Row-wise job grouping; it reads the raw job-title column, so it must run before that column is dropped.
df_test['job'] = df_test.apply(smart_position_grouping_with_age, axis=1)


# The raw job title is no longer needed once 'job' exists.
if 'dp_ewb_last_employment_position' in df_test.columns:
    df_test = df_test.drop(columns='dp_ewb_last_employment_position')


# NOTE(review): `cols_to_drop` is assembled here but never passed to `drop` anywhere in the
# visible notebook. Do not apply it blindly: the next cell merges on 'id', which is in this list.
cols_to_drop = ['is_target_bin', 'target', 'target_log', 'w', 'id', 'period_last_act_ad', 'dt']
if 'dp_address_unique_regions' in df_test.columns:
    cols_to_drop.append('dp_address_unique_regions')




In [21]:
# Attach precomputed model outputs to the test frame by client id.
# NOTE(review): this cell is numbered In [21] right after In [6], so it was run out of sequence;
# the prediction CSVs are presumably written by cells/notebooks not shown here — confirm.
table = pd.read_csv('/content/predictions_two_models (3).csv')
table_1 = pd.read_csv('/content/predictions_0_1 (1).csv')
# merge() defaults to an inner join, so clients without a row in a prediction file are dropped.
# The displayed result has 'prediction_x' / 'prediction_y', which is what pandas' default
# suffixes produce when both sides carry a same-named non-key column; presumably 'prediction_y'
# comes from `table_1` — verify, since the UI below reads 'prediction_y'.
# Not idempotent: running this cell twice merges the prediction columns in again.
df_test = df_test.merge(table, on = 'id')
df_test = df_test.merge(table_1, on = 'id')
df_test

Unnamed: 0,id,dt,turn_cur_cr_avg_act_v2,salary_6to12m_avg,hdb_bki_total_max_limit,dp_ils_paymentssum_avg_12m,hdb_bki_total_cc_max_limit,incomeValue,gender,avg_cur_cr_turn,...,assets_to_income_ratio,loan_per_income_ratio,active_loan_count_ratio,income_per_credit_product,age_squared,mobile_activity_income_ratio,job,prediction_x,prediction_y,pred
0,0.0,2024-08-31,805319.38,,61137.47,,60000.00,159999.0,0.0,69740.0,...,,,,,3481.0,,не указано,58908.996819,58908.996819,0
1,1.0,2024-10-31,306240.00,,949500.00,,230000.00,108834.0,1.0,63513.0,...,,,0.200000,,2116.0,,не указано,41638.781169,41638.781169,0
2,3.0,2024-09-30,164908.73,,178000.00,,178000.00,59203.0,0.0,132.0,...,,,0.038462,,484.0,,студент_стандартный,32197.452531,32197.452531,0
3,9.0,2024-10-31,2374846.42,,25500.00,126247.448359,4999.00,180906.0,0.0,290339.0,...,,,,,1296.0,,менеджер_стандартный,73348.977795,73348.977795,0
4,11.0,2024-11-30,735902.71,47828.145621,60000.00,,60000.00,24922.0,1.0,76924.0,...,0.024442,,,11957.036405,400.0,0.000016,студент_стандартный,47866.083595,47866.083595,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73209,149981.0,2024-09-30,531284.21,,288500.00,,,64721.0,1.0,279544.0,...,,,0.500000,,3721.0,,пенсионер_стандартный,49501.060767,49501.060767,0
73210,149985.0,2024-09-30,173804.50,,90000.00,,24999.28,38860.0,0.0,31813.0,...,,,,,1024.0,,не указано,30896.463069,30896.463069,0
73211,149989.0,2024-10-31,1076401.61,,148000.00,,148000.00,97840.0,1.0,4233.0,...,,,,,625.0,,студент_стандартный,54409.193668,54409.193668,0
73212,149995.0,2024-11-30,2729721.41,,659000.00,132028.008254,160000.00,156088.0,1.0,187073.0,...,,,0.100000,,676.0,,врач_стандартный,50192.230195,50192.230195,0


In [22]:
# Columns shown on the client card when the row's 'pred' value equals 1
# (see get_user_info_gradio, which picks between the two lists).
most_importante_1= ['turn_cur_cr_avg_act_v2',
'incomeValue',
'avg_debet_turn_rur',
'by_category__amount__sum__eoperation_type_name__perevod_po_nomeru_telefona',
'total_credit_limit']
# Columns shown on the client card for any other 'pred' value.
most_importante_0 = ['turn_cur_cr_avg_act_v2',
'dp_ils_accpayment_avg_12m',
'turn_cur_cr_max_v2',
'dp_ils_avg_salary_1y',
'avg_debet_turn_rur']

# Human-readable (Russian) description for every column named in the two lists above;
# rendered next to the value on the client card. Columns without an entry get an empty description.
feature_descriptions = {
    "turn_cur_cr_avg_act_v2": "Средний текущий кредитовый оборот по текущим счетам за 12 месяцев",
    "incomeValue": "Значение дохода абонента",
    "avg_debet_turn_rur": "Средний дебетовый оборот по всем счетам за 3 месяца",
    "by_category__amount__sum__eoperation_type_name__perevod_po_nomeru_telefona": "Средняя сумма электронных операций в категории - Перевод по номеру телефона за месяц",
    "total_credit_limit": "Общий кредитный лимит клиента",
    "dp_ils_accpayment_avg_12m": "Средние платежи за последние 12 месяцев",
    "dp_ils_avg_salary_1y": "Средняя зарплата за последний год",
    "turn_cur_cr_max_v2": "Максимальный кредитовый оборот по текущим счетам за 12 месяцев"
}

In [32]:
def get_user_info_gradio(user_id):
    """Build the HTML client card shown in the Gradio UI.

    Looks the client up in the module-level ``df_test`` by ``id`` and renders
    basic attributes, the predicted income (``prediction_y``), the features
    listed in ``most_importante_1`` / ``most_importante_0`` (chosen by the
    row's ``pred`` value) and a product recommendation based on the
    predicted income.

    Parameters
    ----------
    user_id : str or int
        Client id as typed into the textbox; must be parseable by ``int``.

    Returns
    -------
    str
        HTML fragment: the client card, or a short bold error message when
        the id is invalid or not present in ``df_test``.
    """
    # Local import so the block does not depend on the notebook's import cell.
    from html import escape

    # Predicted-income boundaries (RUB) between the three product bundles.
    low_income_limit = 50000
    mid_income_limit = 150000

    def esc(value):
        """Stringify and HTML-escape a data value before it is interpolated into markup."""
        return escape(str(value))

    try:
        user_id = int(user_id)
    except (TypeError, ValueError):
        # None, empty or non-integer input.
        return "<b>Введите корректный ID</b>"

    user_df = df_test[df_test["id"] == user_id]
    if user_df.empty:
        return "<b>Пользователь не найден</b>"

    user = user_df.iloc[0]  # Series

    pred = user.get("pred", 1)
    important_features = most_importante_1 if pred == 1 else most_importante_0

    # Basic information
    blacklist_flag = user.get("blacklist_flag", 0)
    gender = user.get("gender", "N/A")
    age = user.get("age", "N/A")
    addrref = user.get("addrref", "N/A")
    job = user.get("job", "N/A")
    prediction = round(user.get("prediction_y", 0), 2)  # strictly from the column

    # NaN is truthy, so a missing flag used to render as red; treat it as "not flagged".
    is_blacklisted = bool(pd.notna(blacklist_flag) and blacklist_flag)
    flag_color = "red" if is_blacklisted else "green"
    # str() guards against non-string values (e.g. NaN), which have no .lower().
    # NOTE(review): the address value is used as a CSS colour name — confirm that is intended.
    addrref_color = esc(str(addrref).lower())

    # HTML for the important features
    feature_rows = []
    for feat in important_features:
        value = user.get(feat, "N/A")
        desc = feature_descriptions.get(feat, "")
        feature_rows.append(
            f"<div style='margin:3px 0; padding:3px 5px; border-left:3px solid #333;'>"
            f"<b>{esc(feat)}:</b> {esc(value)} — {esc(desc)}</div>"
        )
    features_html = (
        "<div style='margin-top:10px;'><b>Важные признаки:</b>"
        + "".join(feature_rows)
        + "</div>"
    )

    # Product recommendations in a separate box
    if prediction < low_income_limit:
        products = ["Карта с кешбэком на продукты", "Микрозайм", "Депозит"]
    elif prediction <= mid_income_limit:
        products = ["Классическая карта", "Депозит", "Потребительский кредит"]
    else:
        # Also reached when the prediction is NaN, because both comparisons above are False.
        products = ["Премиальная карта", "Ипотека", "Инвестиционный счёт"]

    product_rows = "".join(
        f"<div style='margin:5px 0; padding:5px 8px; background:#e0e0e0; border-radius:5px; width:fit-content;'>{prod}</div>"
        for prod in products
    )
    products_html = (
        "<div style='margin-top:15px; padding:10px; background:#f9f9f9; border:2px solid #ccc; border-radius:8px; max-width:400px;'>"
        "<b>Рекомендации по продуктам:</b>"
        + product_rows
        + "</div>"
    )

    # Client card
    card_html = f"""
    <div style='background: linear-gradient(to bottom, #fefcea, #f1da36); padding:15px; border-radius:10px; max-width:600px;'>
        <h3>ID клиента: {user_id}</h3>
        <span style='color:{flag_color}; font-weight:bold'>
        Флаг черного списка: {esc(blacklist_flag)}</span> | Пол: {esc(gender)} | Возраст: {esc(age)} |
        <span style='color:{addrref_color}'>{esc(addrref)}</span> | Профессия: {esc(job)}<br>
        <h2 style='color:black;'>Доход: {prediction} ₽</h2>
        {features_html}
        {products_html}
    </div>
    """

    return card_html

# Gradio interface: one textbox for the client id, one button, and an HTML pane for the card.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            user_input = gr.Textbox(label="Введите ID клиента", placeholder="ID клиента")
            btn = gr.Button("Показать данные")

    output_card = gr.HTML()

    # Clicking the button renders the card returned by get_user_info_gradio for the typed id.
    btn.click(get_user_info_gradio, inputs=user_input, outputs=output_card)

# NOTE(review): per the launch output below, on a hosted notebook this automatically sets
# share=True and publishes a public URL, so the client data served by the card is reachable
# by anyone holding the link — confirm that is acceptable.
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2e1fec7fa7eae0d74a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


