## 1.1 - Расчет аналитических весов

**Цель:** Разработать аналитические веса для рейтингов

При расчете весов рейтингов используется трехкомпонентный расчет, где каждый компонент отвечает за определенный аспект корректировки влияния рейтинга на модель. Такая структура позволяет независимо контролировать временные эффекты, продолжительность действия и балансировку классов.
- Учет количества расчетов в день рейтинга
- Учет длины валидного периода
- Учет дисбаланса классов

### 1.1.1 - Библиотеки

In [None]:
import pandas as pd

import yaml

import warnings
warnings.filterwarnings("ignore")

import yaml

import plotly.graph_objects as go
import plotly.subplots as sp


### 1.1.2 - Данные

In [None]:
# Load the notebook configuration from the YAML file one directory up.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open('../CONFIGS.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

In [None]:
# Shortcuts to the two configuration sections read in this notebook.
PATHS = config['data_paths']
PARAMS = config['calibration_params']

# Parquet locations used below: two inputs and the output weights table.
filepath = dict(
    clean_data=PATHS['other']['clean_data'],
    valid_dates=PATHS['ratings']['valid_dates'],
    rating_weights=PATHS['ratings']['weights'],
)

# Calibration targets taken from the 'calibration_params' section.
annual_drs = PARAMS['annual_drs']
ttc_ratio = PARAMS['central_tendency']['TTC']
pit_ratio = PARAMS['central_tendency']['PIT']

# (year, quarter) pairs from the config rendered as labels such as '2020Q1'.
pit_quarters = [f'{y}Q{q}' for y, q in PARAMS['pit_quarters']]

In [None]:
# annual_default_pcts = config['calibration_params']['annual_drs']
# ttc_pct = config['calibration_params']['central_tendency']['TTC']
# pit_pct = config['calibration_params']['central_tendency']['PIT']
# annual_drs = {y: p / 100 for y, p in list(annual_default_pcts.items()) if p >= 0 and p<=100}
# ttc_ratio = ttc_pct / 100
# pit_ratio = pit_pct / 100

In [None]:
# Ratings table; the assignment timestamp is reduced to its calendar date
# and converted back to a datetime column.
data = pd.read_parquet(filepath['clean_data'])
data['rating_assignment_date'] = pd.to_datetime(
    data['rating_assignment_date'].dt.date
)

# Validity dates table; its own copy of the assignment date is dropped so the
# column is not duplicated when the tables are merged later.
valid_dates = pd.read_parquet(filepath['valid_dates']).drop(
    columns=['rating_assignment_date']
)

In [None]:
# Plot colour for each weight column (hex RGB).
colors = dict(
    weight_day='#FF6B6B',
    weight_1='#4ECDC4',
    weight_2='#45B7D1',
    W='#96CEB4',
)

### 1.1.3 - Функции

In [None]:
def get_missing_pct(df):
    """Return the percentage of missing values for columns that have any.

    The result is a one-column frame named ``missing_pct``, indexed by column
    name, rounded to two decimals and sorted from most to least missing.
    Columns without missing values are omitted.
    """
    missing_share = df.isna().sum() / df.shape[0] * 100
    ranked = round(missing_share, 2).sort_values(ascending=False)
    return ranked[ranked > 0].to_frame(name='missing_pct')

In [None]:
def show_example(data, example_client=706021):
    """Return the rows of ``data`` whose ``client_id`` equals ``example_client``."""
    is_example = data.eval('client_id == @example_client')
    return data.loc[is_example]

In [None]:
def calculate_daily_weights(df):
    """Compute a weight for every distinct rating assignment date.

    A rating counts as active on a date when it was assigned on or before
    that date and its ``valid_date`` is strictly later. The weight of a date
    is the reciprocal of the number of active ratings, or 0 when there are
    none.

    Returns a frame with columns ``rating_assignment_date`` and
    ``weight_day``, one row per distinct date in ascending order.
    """
    assigned = df['rating_assignment_date']
    expires = df['valid_date']

    def _weight_for(day):
        n_active = int(((assigned <= day) & (expires > day)).sum())
        return 1 / n_active if n_active > 0 else 0

    rows = [
        {'rating_assignment_date': day, 'weight_day': _weight_for(day)}
        for day in assigned.sort_values().unique()
    ]
    return pd.DataFrame(rows)


def normalize_weights_by_year(weights_df, target_distribution=None):
    """Rescale daily weights so that each year's weights sum to a target share.

    ``target_distribution`` maps year -> desired sum of ``weight_day`` for
    that year. When omitted, every year present in the data gets an equal
    share of 1. Years whose weights sum to a non-positive value, or that are
    absent from the mapping, are left unscaled.

    The input frame is not modified; a rescaled copy is returned.
    """
    result = weights_df.copy()
    result['year'] = result['rating_assignment_date'].dt.year
    years_present = result['year'].unique()

    if target_distribution is None:
        n_years = len(years_present)
        target_distribution = {year: 1.0 / n_years for year in years_present}

    result['weight_day_normalized'] = result['weight_day'].copy()

    for current_year in years_present:
        in_year = result['year'] == current_year
        year_total = result.loc[in_year, 'weight_day'].sum()

        # Nothing to rescale for an empty or untargeted year.
        if not year_total > 0 or current_year not in target_distribution:
            continue

        factor = target_distribution[current_year] / year_total
        result.loc[in_year, 'weight_day_normalized'] *= factor

    # Replace the raw weights and discard the helper columns.
    result['weight_day'] = result['weight_day_normalized']
    return result.drop(['weight_day_normalized', 'year'], axis=1)


def calculate_validity_period_weight(df):
    """Compute ``weight_1``: summed daily weights over each rating's validity period.

    For every row, ``weight_day`` is summed over all rows of ``df`` whose
    ``rating_assignment_date`` lies in the closed interval
    [row's ``rating_assignment_date``, row's ``valid_date``].

    The result is written to ``df['weight_1']`` as a float column in one
    assignment. Per-element chained assignment (``df.weight_1.iloc[i] = ...``)
    is avoided because it may write to a temporary copy.

    Returns the new ``weight_1`` column.
    """
    assigned = df['rating_assignment_date']
    day_weights = df['weight_day']

    # NOTE: one full scan per row, i.e. quadratic in the number of rows.
    totals = [
        day_weights[(assigned >= period_start) & (assigned <= period_end)].sum()
        for period_start, period_end in zip(assigned, df['valid_date'])
    ]

    # Assign positionally so a non-unique index cannot trigger alignment.
    df['weight_1'] = pd.Series(totals, index=df.index, dtype='float64').to_numpy()
    return df['weight_1']


def apply_outlier_filter(df: pd.DataFrame, col: str, quantile_range=(0.005, 0.995)) -> pd.DataFrame:
    """Keep the rows whose ``col`` value lies within the given quantile band.

    ``quantile_range`` is a (lower, upper) pair of quantile levels; both
    resulting bounds are inclusive.
    """
    low_q, high_q = quantile_range[0], quantile_range[1]
    values = df[col]
    keep = values.between(values.quantile(low_q), values.quantile(high_q))
    return df[keep]


def get_yearly_stats(df, target_default_rates):
    """Summarise defaults per assignment year.

    Side effect: adds ``year`` and ``target_dr`` columns to ``df`` in place.
    ``target_dr`` is looked up in ``target_default_rates`` by year and is
    missing for years absent from the mapping; it is not part of the
    returned table.

    Returns one row per year with ``defaults_count`` (sum of ``target``),
    ``total_count`` (number of non-null targets) and ``default_rate`` (mean).
    """
    df['year'] = df['rating_assignment_date'].dt.year
    df['target_dr'] = df['year'].map(target_default_rates.get)

    # Per-year statistics of the target flag.
    per_year = df.groupby('year').agg(
        defaults_count=('target', 'sum'),
        total_count=('target', 'count'),
        default_rate=('target', 'mean'),
    )
    return per_year.reset_index()


def assign_annual_drs(df, annual_drs):
    """Attach per-year target default rate and portfolio counts to each rating.

    For every calendar year between the earliest and latest
    ``rating_assignment_date`` the rows assigned in that year receive:

    * ``drp``      - the value of ``annual_drs[year]``;
    * ``total``    - the number of ratings assigned in that year;
    * ``defaults`` - the sum of ``target`` over those ratings.

    Years missing from ``annual_drs`` are reported with a warning and skipped
    (their ``drp`` stays 0.0). Processing continues with later years, so a
    gap in the configuration does not leave the rest of the data unassigned.

    ``df`` is modified in place and also returned.
    """
    assignment_year = df['rating_assignment_date'].dt.year
    first_year = df['rating_assignment_date'].min().year
    last_year = df['rating_assignment_date'].max().year

    # Float initialiser: the rates written below are typically fractional,
    # and writing them into an integer column relies on an implicit upcast.
    df['drp'] = 0.0

    for year in range(first_year, last_year + 1):
        if year not in annual_drs:
            print(f"Warning: No ct value for the year {year}")
            continue

        # Equivalent to the window [1 Jan of year, 1 Jan of year + 1).
        in_year = assignment_year == year

        df.loc[in_year, 'drp'] = annual_drs[year]
        df.loc[in_year, 'total'] = int(in_year.sum())
        df.loc[in_year, 'defaults'] = df.loc[in_year, 'target'].sum()

    return df


def calculate_weight_2(df):
    """Compute ``weight_2``, the class-rebalancing weight.

    Reads the ``total``, ``defaults``, ``drp`` and ``target`` columns. ``drp``
    is treated as a fraction when every value lies in [0, 1] and as a
    percentage (divided by 100) when every value lies in [0, 100]. Otherwise
    ``'Error'`` is printed and ``None`` is returned.

    Rows get ``drp * (total - defaults) / (defaults * (1 - drp))``, and rows
    with ``target == 1`` are then set to 1. The column is written to ``df``
    in place and returned.
    """
    survivors = df['total'] - df['defaults']

    # Detect the scale of drp and bring it to a fraction.
    rate = df['drp']
    non_negative = rate >= 0
    if (non_negative & (rate <= 1)).all():
        default_share = rate
    elif (non_negative & (rate <= 100)).all():
        default_share = rate / 100
    else:
        print('Error')
        return None

    survival_share = 1 - default_share

    # Weight for non-defaulted ratings, using the fractional rate throughout.
    df['weight_2'] = (default_share * survivors) / (df['defaults'] * survival_share)

    # Defaulted ratings keep a unit weight.
    is_default = df['target'] == 1
    df.loc[is_default, 'weight_2'] = 1

    return df['weight_2']


def calculate_final_weight(weight_1:pd.Series, weight_2: pd.Series):
    """Return the element-wise product of the two weight components."""
    combined = weight_1 * weight_2
    return combined


def calculate_weights(data, valid_dates, target_default_rates, target_ttc, target_pit, normalise=False):
    """Run the full weight pipeline and return the weight columns per rating.

    Steps: join ratings with their validity dates, compute daily weights
    (optionally normalised per year), derive ``weight_1`` and ``weight_2``,
    multiply them into ``W`` and calibrate ``W`` to ``target_ttc``.

    ``target_pit`` is accepted but not used inside this function.

    Returns a frame with ``client_id``, ``rating_id``, ``weight_day``,
    ``weight_1``, ``weight_2`` and ``W``.
    """
    merged = data.merge(valid_dates, on=['client_id', 'rating_id'])
    merged['year_quarter'] = merged['rating_assignment_date'].dt.to_period('Q')

    # One weight per distinct assignment date, optionally normalised by year.
    per_day = calculate_daily_weights(merged)
    if normalise:
        per_day = normalize_weights_by_year(per_day)

    # Bring the daily weights back onto the rating rows.
    merged = merged.merge(per_day, on='rating_assignment_date', how='left')

    merged['weight_1'] = calculate_validity_period_weight(merged)
    merged = assign_annual_drs(merged, target_default_rates)
    merged['weight_2'] = calculate_weight_2(merged)
    merged['W'] = calculate_final_weight(
        weight_1=merged['weight_1'],
        weight_2=merged['weight_2'],
    )
    merged['W'] = calibrate_weights_to_target_dr(merged, target_ttc=target_ttc)

    output_columns = ['client_id', 'rating_id', 'weight_day', 'weight_1', 'weight_2', 'W']
    return merged[output_columns]

def calibrate_weights_to_target_dr(df, target_ttc, weight_col='W'):
    """Rescale weights so the weighted default rate equals ``target_ttc``.

    After calibration:
    1. the weights of rows with ``target == 1`` sum to ``len(df) * target_ttc``;
    2. the weights of rows with ``target == 0`` sum to the remainder, so the
       two groups together sum to the number of observations.

    ``df[weight_col]`` is modified in place and returned.
    """
    n_obs = df.shape[0]
    desired_default_mass = n_obs * target_ttc

    is_default = df['target'] == 1
    is_non_default = df['target'] == 0

    # Step 1: scale the default weights to the desired total mass.
    default_factor = desired_default_mass / df.loc[is_default, weight_col].sum()
    df.loc[is_default, weight_col] = df.loc[is_default, weight_col] * default_factor

    # Step 2: scale the non-default weights to fill the remaining mass.
    default_mass = df.loc[is_default, weight_col].sum()
    non_default_mass = df.loc[is_non_default, weight_col].sum()
    non_default_factor = (n_obs - default_mass) / non_default_mass
    df.loc[is_non_default, weight_col] = df.loc[is_non_default, weight_col] * non_default_factor

    return df[weight_col]


def validate_weights(df, weight_col, target_ttc, target_pit, pit_quarters, rounding_to=5):
    """Print whether the weighted default rates match the TTC and PIT targets.

    The TTC rate is the weighted share of ``target == 1`` over all rows; the
    PIT rate is the same share restricted to rows whose assignment quarter
    is listed in ``pit_quarters``. Each rate is compared with its target
    after rounding both to ``rounding_to`` decimals. Nothing is returned.
    """
    def _weighted_default_rate(frame):
        defaults_mass = frame[frame.target == 1][weight_col].sum()
        others_mass = frame[frame.target == 0][weight_col].sum()
        return defaults_mass / (defaults_mass + others_mass)

    # Through-the-cycle check over the whole sample.
    current_ttc = _weighted_default_rate(df)
    ttc_ok = round(current_ttc, rounding_to) == round(target_ttc, rounding_to)
    if ttc_ok:
        print('✅ Получившийся TTC равен целевому (c округлением)')
    else:
        print('❌ Получившийся TTC НЕ равен целевому.')
    print(current_ttc, target_ttc)

    # Point-in-time check over the selected quarters only.
    assignment_quarter = df['rating_assignment_date'].dt.to_period('Q')
    selected_quarters = pd.PeriodIndex(pit_quarters, freq='Q')
    pit_rows = df[assignment_quarter.isin(selected_quarters)]
    current_pit = _weighted_default_rate(pit_rows)

    pit_ok = round(current_pit, rounding_to) == round(target_pit, rounding_to)
    if pit_ok:
        print('✅ Получившийся PIT равен целевому (c округлением).')
    else:
        print('❌ Получившийся PIT НЕ равен целевому.')
    print(current_pit, target_pit)

### 1.1.4 - Расчет весов

In [None]:
# Weights computed from the raw (non-normalised) daily weights.
weights_df = calculate_weights(
    data=data,
    valid_dates=valid_dates,
    target_default_rates=annual_drs,
    target_ttc=ttc_ratio,
    target_pit=pit_ratio,
    normalise=False,
)

In [None]:
# Show the table of weights computed without per-year normalisation.
weights_df

In [None]:
# Same pipeline with per-year normalisation of the daily weights; the weight
# columns get a '_norm' suffix so both variants can sit in one table.
normalised_weights_df = calculate_weights(
    data=data,
    valid_dates=valid_dates,
    target_default_rates=annual_drs,
    target_ttc=ttc_ratio,
    target_pit=pit_ratio,
    normalise=True,
).rename(
    columns={
        'weight_day': 'weight_day_norm',
        'weight_1': 'weight_1_norm',
        'weight_2': 'weight_2_norm',
        'W': 'W_norm',
    }
)

In [None]:
# Show the table of weights computed with per-year normalisation.
normalised_weights_df

In [None]:
# Combine rating attributes, validity dates and both weight variants into one
# table; inner joins keep only ratings present in every input.
df = data[['client_id', 'rating_id', 'rating_assignment_date', 'target']]
df = df.merge(valid_dates, on=['client_id', 'rating_id'], how='inner')
df = df.merge(weights_df, on=['client_id', 'rating_id'], how='inner')
df = df.merge(normalised_weights_df, on=['client_id', 'rating_id'], how='inner')

# Assignment year, used for the per-year charts below.
df['year'] = df['rating_assignment_date'].dt.year

df

In [None]:
# columns_to_save = [
#       'client_id', 'rating_id', 'rating_assignment_date', 'valid_date'
#     , 'total', 'defaults', 'drp', 'target'
#     , 'weight_day', 'weight_1', 'weight_2', 'W'
# ]

In [None]:
# The four weight columns followed by their '_norm' counterparts.
weight_cols = ['weight_day', 'weight_1', 'weight_2', 'W']
weight_cols = weight_cols + [f'{name}_norm' for name in weight_cols]

### 1.1.5 - Визуализация

In [None]:
# ===== COMBINED VIOLIN PLOT WITH DUAL Y-AXIS =====
# Only the empty figure is created here; no traces are added in this cell.
fig_violin = sp.make_subplots(specs=[[{"secondary_y": True}]])


# ===== HISTOGRAM =====
# Top row: original weights; bottom row: their normalised counterparts.
weight_names = ['weight_day', 'weight_1', 'weight_2', 'W']
norm_names = ['weight_day_norm', 'weight_1_norm', 'weight_2_norm', 'W_norm']

fig_hist = sp.make_subplots(
    rows=2,
    cols=4,
    subplot_titles=(*weight_names, *norm_names),
    vertical_spacing=0.15,
    horizontal_spacing=0.05,
)

for grid_row, names in enumerate((weight_names, norm_names), start=1):
    for grid_col, column in enumerate(names, start=1):
        # A normalised column reuses the colour of its base column.
        histogram = go.Histogram(
            x=df[column],
            name=column,
            marker_color=colors[column.replace('_norm', '')],
            opacity=0.7,
            nbinsx=20,
        )
        fig_hist.add_trace(histogram, row=grid_row, col=grid_col)

fig_hist.update_layout(
    title_text="Histogram: Weight Distribution Comparison",
    title_font_size=16,
    height=600,
    showlegend=False,
)

print("\nDisplaying Histogram...")
fig_hist.show()

In [None]:
# Per-year aggregation: number of ratings and number of distinct clients.
stats_df = (
    df.groupby('year')
    .agg(
        total=('rating_assignment_date', 'count'),
        clients=('client_id', 'nunique'),
    )
    .reset_index()
)

# ===== SIDE-BY-SIDE BAR CHART =====
fig = go.Figure(
    data=[
        # Total number of rating records per year.
        go.Bar(
            x=stats_df['year'],
            y=stats_df['total'],
            name='Общее количество рейтингов',
            marker_color='grey',
            opacity=0.7,
        ),
        # Number of distinct clients per year.
        go.Bar(
            x=stats_df['year'],
            y=stats_df['clients'],
            name='Количество клиентов',
            marker_color='blue',
        ),
    ]
)

fig.update_layout(
    # Centred title.
    title={'text': "Статистика портфеля по годам", 'x': 0.5, 'xanchor': 'center'},
    width=1000,
    height=400,
    xaxis={
        'title': {'text': "Год"},
        'tickmode': 'linear',
        'dtick': 1,
        'showgrid': False,
    },
    yaxis={
        'title': {'text': "Количество"},
        'showgrid': True,
        'gridcolor': 'lightgray',  # light gray gridlines
        'gridwidth': 0.5,          # thin gridlines
    },
    barmode='group',
    plot_bgcolor='white',
    showlegend=True,
    # Horizontal legend centred below the plot area (negative y).
    legend={
        'orientation': "h",
        'yanchor': "top",
        'y': -0.3,
        'xanchor': "center",
        'x': 0.5,
        'bgcolor': 'white',
        'bordercolor': 'black',
        'borderwidth': 1,
    },
)

fig.show()

# ===== BOX PLOT BY YEAR =====
# NOTE(review): the chart title and the log message below refer to the final
# weights W / W_norm, but the plotted columns are weight_day and
# weight_day_norm - confirm which pair is intended.
fig_yearly = go.Figure()

df['year'] = df['rating_assignment_date'].dt.year
years = sorted(df['year'].unique())

# One box per (series, year). Only the first box of each series carries the
# legend entry, so the legend shows one item per series.
for column, label, colour, group_no, legend_key in (
    ('weight_day', 'Ненормализованный вес', 'blue', 1, 'original'),
    ('weight_day_norm', 'Нормализованный вес', 'green', 2, 'normalized'),
):
    for i, year in enumerate(years):
        year_data = df[df['year'] == year]
        is_first = i == 0
        fig_yearly.add_trace(
            go.Box(
                y=year_data[column],
                name=label if is_first else '',
                fillcolor=colour,
                line_color=colour,
                offsetgroup=group_no,
                x=[f'{year}'] * len(year_data),
                legendgroup=legend_key,
                showlegend=is_first,
            )
        )

fig_yearly.update_layout(
    title={
        'text': "Ненормализованный и нормализованный итоговый вес",
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 18},
    },
    height=600,
    width=1200,
    xaxis={
        'title': {'text': "Год", 'font': {'size': 18}},
        'tickfont': {'size': 16},
        'showgrid': False,
    },
    yaxis={
        'title': {'text': "Вес", 'font': {'size': 16}},
        'tickfont': {'size': 16},
        'showgrid': True,
        'gridcolor': 'lightgray',
        'gridwidth': 0.5,
    },
    boxmode='group',
    plot_bgcolor='white',
    paper_bgcolor='white',
    showlegend=True,
    legend={
        'orientation': "h",
        'yanchor': "top",
        'y': -0.3,
        'xanchor': "center",
        'x': 0.5,
        'bgcolor': 'white',
        'bordercolor': 'black',
        'borderwidth': 1,
        'font': {'size': 16},
    },
    font={'size': 16},
)

print("Displaying Bar Chart...")
# fig.show()
print("\nDisplaying Yearly Box Plot for W and W_norm...")
fig_yearly.show()

### 1.1.6 - Валидация

In [None]:
# Check the calibrated weights against the TTC and PIT targets, first for the
# non-normalised variant and then for the normalised one.
validate_weights(
    df,
    weight_col='W',
    target_ttc=ttc_ratio,
    target_pit=pit_ratio,
    pit_quarters=pit_quarters,
)
print('----')
validate_weights(
    df,
    weight_col='W_norm',
    target_ttc=ttc_ratio,
    target_pit=pit_ratio,
    pit_quarters=pit_quarters,
)

### 1.1.7 - Выгрузка

In [None]:
# Final table: rows ordered by assignment date, rating and client, without the
# helper 'year' column. sort_values returns a new frame, so df is untouched.
df_final = (
    df.sort_values(['rating_assignment_date', 'rating_id', 'client_id'])
    .drop(columns=['year'])
)
df_final

In [None]:
# Write the final weights table to the parquet path configured as 'rating_weights'.
df_final.to_parquet(filepath['rating_weights'])