In [None]:
import warnings

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from optuna.integration import OptunaSearchCV
from scipy.stats import randint, uniform
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, make_scorer,
                             mean_absolute_error, mean_squared_error,
                             precision_score, recall_score, r2_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (LabelEncoder, OneHotEncoder,
                                   StandardScaler)
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

In [None]:
# Load the raw training table used for the exploratory analysis and preview it.
df = pd.read_csv('train.csv')
df.head(10)

In [None]:
# 3. General information about the data.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
print("Информация о данных:")
df.info()

### Проверка на пропуски

In [None]:
# Count missing values per column and express them as a percentage of all rows.
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100
missing_df = pd.DataFrame({'Пропуски': missing, 'Процент': missing_percent})

# Keep only the columns that actually contain gaps, worst first.
has_gaps = missing_df['Пропуски'] > 0
missing_df = missing_df.loc[has_gaps].sort_values(by='Пропуски', ascending=False)

if not missing_df.empty:
    print("Пропущенные значения:")
    print(missing_df)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=missing_df.index, y=missing_df['Процент'], ax=ax)
    ax.tick_params(axis='x', rotation=90)
    ax.set_title('Процент пропусков по признакам')
    plt.show()
else:
    print("Пропущенных значений нет.")

### Целевая переменная

In [None]:
# Class balance of the target: share of each class (pie) and raw counts (bars).
target_counts = df['Target'].value_counts()

fig, (pie_ax, count_ax) = plt.subplots(1, 2, figsize=(15, 5))

pie_ax.pie(target_counts.values, labels=target_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Распределение Target')

sns.countplot(data=df, x='Target', ax=count_ax)
count_ax.set_title('Количество наблюдений по классам')
count_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

print("Распределение классов:")
print(target_counts)
# Legend explaining the three target classes.
for class_note in (
    '\nGraduate: Успешно окончил учебное заведение.',
    'Dropout: Отчислен (прервал обучение).',
    'Enrolled: Все еще учится (на момент сбора данных).',
):
    print(class_note)

### Корреляционный анализ

In [None]:
# Pairwise correlations between all numeric columns.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
corr_matrix = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, linewidths=0.5, ax=ax)
ax.set_title('Тепловая карта корреляций числовых признаков')
plt.show()

# Highly correlated pairs (|corr| > 0.8); only the upper triangle is scanned
# so each pair is reported once and the diagonal is skipped.
corr_names = corr_matrix.columns
n_features = len(corr_names)
high_corr = [
    (corr_names[row], corr_names[col], corr_matrix.iloc[row, col])
    for row in range(n_features)
    for col in range(row + 1, n_features)
    if abs(corr_matrix.iloc[row, col]) > 0.8
]

if not high_corr:
    print("Нет сильно коррелирующих пар признаков (|corr| > 0.8).")
else:
    print("Высококоррелированные пары признаков (|corr| > 0.8):")
    for first, second, value in high_corr:
        print(f"{first} — {second}: {value:.3f}")

In [None]:
# Russian display names for the plotted features (used in titles and axis labels)
feature_translation = {
    'Age at enrollment': 'Возраст при поступлении',
    'Admission grade': 'Оценка при поступлении',
    'Previous qualification (grade)': 'Оценка предыдущего образования',
    'Unemployment rate': 'Уровень безработицы',
    'GDP': 'ВВП',
    'Nacionality': 'Национальность'
}

# Colour per Target class
target_colors = {
    'Graduate': '#2E8B57',  # green
    'Dropout': '#DC143C',   # red
    'Enrolled': '#4169E1'   # blue
}

key_numeric = ['Age at enrollment', 'Admission grade',
               'Previous qualification (grade)', 'Unemployment rate', 'GDP', 'Nacionality']

# Set the font before any text artist is created so every label uses it.
plt.rcParams['font.family'] = 'DejaVu Sans'

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# zip() bounds the loop by the number of available axes, so no manual slicing
# of key_numeric is needed.
for ax, col in zip(axes, key_numeric):
    # One overlaid histogram (with KDE) per target class.
    for target_class in df['Target'].unique():
        subset = df[df['Target'] == target_class]
        sns.histplot(data=subset, x=col, kde=True,
                     # .get() falls back to seaborn's default colour for a class
                     # missing from target_colors instead of raising KeyError.
                     color=target_colors.get(target_class),
                     label=target_class, alpha=0.6, ax=ax)

    russian_title = feature_translation.get(col, col)
    ax.set_title(f'Распределение: {russian_title}', fontsize=14, fontweight='bold')
    ax.set_xlabel(russian_title, fontsize=12)
    ax.set_ylabel('Количество студентов', fontsize=12)
    ax.tick_params(axis='x', rotation=45, labelsize=10)
    ax.tick_params(axis='y', labelsize=10)

    # The original conditional chose 'upper right' in both branches.
    ax.legend(title='Статус студента', title_fontsize=11, fontsize=10,
              loc='upper right')

    ax.grid(True, alpha=0.3, linestyle='--')

# Remove any axes of the 2x3 grid that received no feature.
for i in range(len(key_numeric), len(axes)):
    fig.delaxes(axes[i])

plt.suptitle('Распределение ключевых числовых признаков по статусу студента',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Russian display names for the categorical features
feature_translation = {
    'Marital status': 'Семейное положение',
    'Daytime/evening attendance': 'Форма обучения',
    'Father\'s qualification': 'Образование отца',
    'Mother\'s qualification': 'Образование матери',
    'Gender': 'Пол',
    'Scholarship holder': 'Стипендия'
}

# Russian display labels for the category codes of selected features
category_translation = {
    'Marital status': {
        1: 'Холост/Не замужем',
        2: 'Женат/Замужем',
        3: 'Вдовец/Вдова',
        4: 'Разведен(а)',
        5: 'Фактический брак',
        6: 'Разлучен(а)'
    },
    'Daytime/evening attendance': {
        1: 'Дневная',
        0: 'Вечерняя'
    },
    'Gender': {
        1: 'Мужской',
        0: 'Женский'
    },
    'Scholarship holder': {
        1: 'Со стипендией',
        0: 'Без стипендии'
    }
}

cat_features = ['Marital status', 'Daytime/evening attendance',
                'Father\'s qualification', 'Mother\'s qualification',
                'Gender', 'Scholarship holder']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# One colour per panel
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c']

for i, col in enumerate(cat_features):
    ax = axes[i]

    # Share of 'Dropout' rows (in percent) within each category of the feature.
    temp_df = df.groupby(col)['Target'].apply(
        lambda x: (x == 'Dropout').mean() * 100
    ).reset_index(name='dropout_rate')

    russian_title = feature_translation.get(col, col)

    # Codes without a translation keep their raw code as the label; a bare
    # .map() would turn them into NaN and their bars would vanish.
    raw_labels = temp_df[col].astype(str)
    if col in category_translation:
        temp_df['category_label'] = (
            temp_df[col].map(category_translation[col]).fillna(raw_labels)
        )
    else:
        temp_df['category_label'] = raw_labels

    # All bars of a panel share one colour, so `color=` is used directly. The
    # former hue + single-element palette made seaborn warn that the palette
    # has fewer colours than hue levels.
    sns.barplot(data=temp_df, x='category_label', y='dropout_rate',
                color=colors[i % len(colors)],
                ax=ax,
                edgecolor='black', linewidth=1.5)

    ax.set_title(f'{russian_title}',
                 fontsize=14, fontweight='bold', pad=20)
    ax.set_xlabel(russian_title, fontsize=12)
    ax.set_ylabel('Процент отчислений (%)', fontsize=12)
    ax.tick_params(axis='x', rotation=45, labelsize=10)
    ax.tick_params(axis='y', labelsize=10)

    # Annotate each bar with its value.
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width()/2., height + 0.5,
                f'{height:.1f}%', ha='center', va='bottom',
                fontsize=10, fontweight='bold')

    ax.grid(True, alpha=0.3, linestyle='--', axis='y')
    ax.set_axisbelow(True)

    # At least a 0-50 range; extended when the highest bar needs more room.
    ax.set_ylim(0, max(temp_df['dropout_rate'].max() + 5, 50))

    # The parental-qualification panels have many categories: rotate fully
    # and shrink the tick labels.
    if col in ['Father\'s qualification', 'Mother\'s qualification']:
        ax.tick_params(axis='x', rotation=90, labelsize=8)

plt.suptitle('Влияние категориальных признаков на процент отчислений студентов',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Signed and unsigned integer columns are cast to the narrowest integer type
    of the same signedness whose range contains the column's min and max
    (bounds inclusive). Floating-point columns wider than 32 bits are cast to
    float32 when their values fit its range. float16 is deliberately not used:
    its precision is too low for values such as grades. All other columns
    (bool, object, category, datetime, extension dtypes) are left untouched.

    Prints the memory usage before and after and the relative saving.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        dtype = df[col].dtype
        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # are not plain numpy dtypes and are skipped.
        if not isinstance(dtype, np.dtype):
            continue

        if np.issubdtype(dtype, np.integer):
            c_min = df[col].min()
            c_max = df[col].max()
            if np.issubdtype(dtype, np.unsignedinteger):
                candidates = unsigned_types
            else:
                candidates = signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                # For an empty column min/max are NaN, every comparison is
                # False and the dtype is left unchanged.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif np.issubdtype(dtype, np.floating) and dtype.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-size frame to avoid ZeroDivisionError.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# Очистка данных

## Укрупнение категориальных данных

In [None]:
# Re-read the raw CSV into a separate frame that the cleaning steps will modify.
train = pd.read_csv('train.csv')

In [None]:
# Show every column when displaying frames (this is a global pandas option).
pd.set_option('display.max_columns', None)
train.head()

In [None]:
# Column dtypes and non-null counts before any downcasting.
train.info()

In [None]:
# Downcast numeric columns; reduce_mem_usage modifies the frame and returns it.
train = reduce_mem_usage(train)

In [None]:
# Maps a raw nationality code to a region id (1-6).
nationality_region_mapping = {
    # Western Europe
    2: 1,   # German
    13: 1,  # Dutch
    14: 1,  # English
    # Southern Europe
    1: 2,  # Portuguese
    6: 2,  # Spanish
    11: 2, # Italian
    62: 2, # Romanian
    # Eastern Europe
    17: 3,  # Lithuanian
    100: 3, # Moldova
    103: 3, # Ukrainian
    105: 3, # Russian
    # Africa
    21: 4,  # Angolan
    22: 4,  # Cape Verdean
    24: 4,  # Guinean
    25: 4,  # Mozambican
    26: 4,  # Santomean
    # Latin America
    41: 5,  # Brazilian
    101: 5, # Mexican
    108: 5, # Cuban
    109: 5, # Colombian
    # Asia
    32: 6  # Turkish
}

In [None]:
# Collapse nationality codes into regions. A code missing from the mapping
# would become NaN (silently turning the column into float and feeding NaN to
# the models), so such codes go to an explicit 0 = "unknown region" bucket.
train['Nationality_Region'] = (
    train['Nacionality']
    .map(nationality_region_mapping)
    .fillna(0)
    .astype(int)
)

In [None]:
# Drop the row identifier. errors='ignore' keeps the cell re-runnable: a second
# execution no longer raises KeyError once 'id' is already gone.
train = train.drop(columns=['id'], errors='ignore')

In [None]:
# Encode the string target as integer codes and print the code -> class table.
label_encoder = LabelEncoder()
train['target_encoded'] = label_encoder.fit_transform(train['Target'])

print("Соответствие классов:")
for code in range(len(label_encoder.classes_)):
    print(f"{code} -> {label_encoder.classes_[code]}")

In [None]:
# The raw nationality code is superseded by Nationality_Region.
# errors='ignore' keeps the cell re-runnable after the column has been removed.
train = train.drop(columns=['Nacionality'], errors='ignore')

In [None]:
# Build the modelling subset: drop every '2nd sem' column and every 'Enrolled'
# row. The explicit .copy() makes the result an independent frame, so the
# assignment below does not write into a slice of `train`
# (SettingWithCopyWarning / undefined write-through).
is_second_sem_col = train.columns.str.contains('2nd sem', case=False, na=False)
train_wout_2nd_sem = train.loc[train['Target'] != 'Enrolled', ~is_second_sem_col].copy()

# With 'Enrolled' removed, code 2 is relabelled to 1 so the target is binary 0/1.
train_wout_2nd_sem['target_encoded'] = train_wout_2nd_sem['target_encoded'].replace(2, 1)

In [None]:
# Display the filtered frame (pandas truncates long frames when rendering).
train_wout_2nd_sem

In [None]:
# Rebinds `df` (until now the raw EDA frame) to a working copy of the
# binary-target table; the grouped features below are added to this copy.
df = train_wout_2nd_sem.copy()

# 1. Application mode (Режим поступления)
def group_application_mode(x):
    """Collapse a raw 'Application mode' code into a coarse admission group.

    Returns
    -------
    int
        1 general contingent, 2 special quotas, 3 other access routes,
        4 international, 5 other higher-education courses,
        0 for any code that is not recognised.
    """
    if x in (1, 17, 18):
        return 1  # general contingent
    if x in (5, 16, 26, 44, 53):
        return 2  # special quotas
    if x in (2, 10, 27, 39, 42, 43, 51, 57):
        return 3  # other access routes
    if x == 15:
        return 4  # international
    if x == 7:
        return 5  # other higher-education courses
    # Unrecognised codes used to be returned unchanged, so a raw code such as
    # 3 or 4 became indistinguishable from group id 3 or 4. They now go to a
    # dedicated 0 bucket, matching the qualification and occupation groupers
    # defined in the same cell.
    return 0

# Element-wise grouping of the raw admission-mode codes.
df['Application_mode_grouped'] = df['Application mode'].map(group_application_mode)

# 2. Course (Курс)
def group_course(x):
    """Map a raw 'Course' code to a study-area group (1-6).

    Codes that belong to no group are returned unchanged.
    """
    study_areas = (
        ((9119, 9130), 1),                     # engineering and IT
        ((9085, 9500, 9556), 2),               # health care
        ((9147, 9670, 9991), 3),               # business and management
        ((9003, 33), 4),                       # agriculture and nature
        ((8014, 9238, 9254, 9773, 9853), 5),   # social sciences and services
        ((171, 9070), 6),                      # design and arts
    )
    for codes, group_id in study_areas:
        if x in codes:
            return group_id
    return x  # unrecognised code: keep as is

# Element-wise grouping of the raw course codes.
df['Course_grouped'] = df['Course'].map(group_course)

# 3. Previous qualification (Предыдущее образование)
def group_qualification(x):
    """Map a raw 'Previous qualification' code to an education level (0-5).

    0 is returned for any code that is not listed (unknown / missing).
    """
    levels = (
        # no education / primary
        ((9, 10, 11, 12, 14, 15, 18, 26, 27, 29, 30, 34, 35, 36, 37, 38), 1),
        # secondary / vocational
        ((1, 19, 39, 42), 2),
        # incomplete higher education
        ((6,), 3),
        # higher education, 1st cycle (bachelor)
        ((2, 3, 40), 4),
        # higher education, 2nd cycle and above (master+)
        ((4, 5, 41, 43, 44), 5),
    )
    for codes, level in levels:
        if x in codes:
            return level
    return 0

# Element-wise grouping of the raw previous-qualification codes.
df['Previous_qualification_grouped'] = df['Previous qualification'].map(group_qualification)

# 4. Mother's qualification (Образование матери)
def group_mother_qualification(x):
    """Map a raw "Mother's qualification" code to an education level (0-5).

    0 is returned for any code that is not listed (unknown / missing).
    """
    levels = (
        # no education / primary
        ((9, 10, 11, 12, 14, 18, 26, 27, 29, 30, 34, 35, 36, 37, 38), 1),
        # secondary / vocational
        ((1, 19, 22, 39, 42), 2),
        # incomplete higher education
        ((6,), 3),
        # higher education, 1st cycle (bachelor)
        ((2, 3, 40, 41), 4),
        # higher education, 2nd cycle and above (master+)
        ((4, 5, 43, 44), 5),
    )
    for codes, level in levels:
        if x in codes:
            return level
    return 0

# Element-wise grouping of the mother's qualification codes.
df['Mother_qualification_grouped'] = df["Mother's qualification"].map(group_mother_qualification)

# 5. Father's qualification (Образование отца)
def group_father_qualification(x):
    """Map a raw "Father's qualification" code to an education level (0-5).

    0 is returned for any code that is not listed (unknown / missing).
    """
    # NOTE(review): code 22 lands in level 1 here but in level 2 in
    # group_mother_qualification; confirm which bucket is intended.
    levels = (
        # no education / primary
        ((9, 10, 11, 12, 13, 14, 18, 20, 22, 25, 26, 27, 29, 30,
          31, 33, 34, 35, 36, 37, 38), 1),
        # secondary / vocational
        ((1, 19, 39, 42), 2),
        # incomplete higher education
        ((6,), 3),
        # higher education, 1st cycle (bachelor)
        ((2, 3, 40, 41), 4),
        # higher education, 2nd cycle and above (master+)
        ((4, 5, 43, 44), 5),
    )
    for codes, level in levels:
        if x in codes:
            return level
    return 0

# Element-wise grouping of the father's qualification codes.
df['Father_qualification_grouped'] = df["Father's qualification"].map(group_father_qualification)

# 6. Mother's occupation (Профессия матери)
def group_mother_occupation(x):
    """Map a raw "Mother's occupation" code to an occupation group (0-8).

    0 is returned for any code that is not listed (other / unknown).
    """
    occupation_groups = (
        ((0, 1), 1),                          # managers and executives
        ((2, 122, 123, 125), 2),              # high-level specialists
        ((3, 131, 132, 134), 3),              # mid-level specialists / technicians
        ((4, 141, 143, 144), 4),              # clerical / administrative staff
        ((5, 151, 152, 153), 5),              # service and sales workers
        ((6, 7, 8, 171, 173, 175), 6),        # skilled workers
        ((9, 191, 192, 193, 194), 7),         # unskilled workers
        ((10,), 8),                           # armed forces
    )
    for codes, group_id in occupation_groups:
        if x in codes:
            return group_id
    return 0

# Element-wise grouping of the mother's occupation codes.
df['Mother_occupation_grouped'] = df["Mother's occupation"].map(group_mother_occupation)

# 7. Father's occupation (Профессия отца)
def group_father_occupation(x):
    """Map a raw "Father's occupation" code to an occupation group (0-8).

    0 is returned for any code that is not listed (other / unknown).
    """
    # NOTE(review): code 194 is grouped as 5 here but as 7 in
    # group_mother_occupation; confirm which bucket is intended.
    occupation_groups = (
        ((0, 1, 101, 102, 112, 114), 1),                               # managers and executives
        ((2, 121, 122, 123, 124), 2),                                  # high-level specialists
        ((3, 103, 131, 132, 134, 135), 3),                             # mid-level specialists / technicians
        ((4, 141, 143, 144), 4),                                       # clerical / administrative staff
        ((5, 151, 152, 153, 154, 161, 163, 182, 183, 194, 195), 5),    # service and sales workers
        ((6, 7, 8, 171, 172, 174, 175, 181), 6),                       # skilled workers
        ((9, 191, 192, 193), 7),                                       # unskilled workers
        ((10,), 8),                                                    # armed forces
    )
    for codes, group_id in occupation_groups:
        if x in codes:
            return group_id
    return 0

# Element-wise grouping of the father's occupation codes.
df['Father_occupation_grouped'] = df["Father's occupation"].map(group_father_occupation)

# 8. Marital Status (Семейное положение)
def group_marital_status(x):
    """Map a raw 'Marital status' code to a relationship group (1-4).

    Codes that belong to no group are returned unchanged.
    """
    relationship_groups = (
        ((1,), 1),        # not in a relationship (single)
        ((2,), 2),        # official relationship (married)
        ((5,), 3),        # unofficial relationship
        ((3, 4, 6), 4),   # previously in a relationship (widowed / divorced / separated)
    )
    for codes, group_id in relationship_groups:
        if x in codes:
            return group_id
    return x

# Element-wise grouping of the raw marital-status codes.
df['Marital_status_grouped'] = df['Marital status'].map(group_marital_status)

In [None]:
# Raw categorical columns that are now represented by their *_grouped versions.
columns_to_drop = [
    'Application mode', 'Course', 'Previous qualification',
    "Mother's qualification", "Father's qualification",
    "Mother's occupation", "Father's occupation", 'Marital status'
]
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError once the columns have been removed.
df = df.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Display the frame after grouping (pandas truncates long frames when rendering).
df

## Убираем студентов без зачтённых предметов за первый семестр

In [None]:
# Target classes among students with zero approved units in the 1st semester.
no_approved_units = df['Curricular units 1st sem (approved)'] == 0
target_distribution = df.loc[no_approved_units, 'Target'].value_counts()

In [None]:
# Show the class counts computed in the previous cell.
target_distribution

In [None]:
# Keep only students with at least one approved unit in the 1st semester.
has_approved_units = df['Curricular units 1st sem (approved)'] > 0
df = df.loc[has_approved_units]

# Логистическая регрессия на очищенных данных

In [None]:
# Independent copy used for modelling, so later changes do not touch `df`.
df_t = df.copy()

In [None]:
# Encode the target only when the frame does not already carry 'target_encoded'.
needs_encoding = 'target_encoded' not in df_t.columns
if needs_encoding:
    le = LabelEncoder()
    df_t['target_encoded'] = le.fit_transform(df_t['Target'])
    class_codes = le.transform(le.classes_)
    print(f"Target classes mapping: {dict(zip(le.classes_, class_codes))}")

In [None]:
# Feature matrix: everything except the raw and the encoded target.
X = df_t.drop(columns=['Target', 'target_encoded'], errors='ignore')
# The previous cell guarantees 'target_encoded' exists. The old fallback
# referenced `le` (undefined when the column was already present) and encoded
# `df` instead of `df_t`, so it could only fail or misalign; it is removed.
y = df_t['target_encoded']

In [None]:
# Hold out 20% of the rows as a test set, stratified on y; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Continuous / count features: standardised before the linear model.
numeric_features = [
    'Application order',
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP'
]

# Flag features passed through unchanged.
binary_features = [
    'Daytime/evening attendance', 'Displaced', 'Educational special needs',
    'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International'
]

# Grouped codes treated as ordered numbers and passed through unchanged.
# NOTE(review): 'Marital_status_grouped' holds category ids rather than an
# ordered scale; confirm it belongs here and not in nominal_features.
ordinal_features = [
    'Previous_qualification_grouped', 'Mother_qualification_grouped',
    'Father_qualification_grouped', 'Marital_status_grouped'
]

# Unordered grouped codes: one-hot encoded.
nominal_features = [
    'Nationality_Region', 'Application_mode_grouped', 'Course_grouped',
    'Mother_occupation_grouped', 'Father_occupation_grouped'
]

# 2. Build the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric: scale
        ('num', StandardScaler(), numeric_features),

        # Binary: keep as is (0/1)
        ('bin', 'passthrough', binary_features),

        # Ordinal: keep as is (numbers)
        ('ord', 'passthrough', ordinal_features),

        # Nominal: one-hot encoding
        ('nom', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), nominal_features)
    ]
)

In [None]:
# Baseline model: preprocessing followed by a class-weighted, L2-regularised
# logistic regression. The classifier settings here are starting values; the
# Optuna objective overrides them via set_params.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        random_state=42,
        max_iter=1000,
        class_weight='balanced',
        penalty='l2',
        C=1.0,
        solver='lbfgs'
    ))
])

In [None]:
def recall_dropout(y_true, y_pred):
    """Return the recall of label 0 (the Dropout class) only."""
    per_label_recall = recall_score(y_true, y_pred, labels=[0], average=None)
    return per_label_recall[0]

# Scorer wrapper so the metric can be passed as `scoring=` to cross-validation.
recall_dropout_scorer = make_scorer(recall_dropout, greater_is_better=True)

In [None]:
# 5-fold stratified CV with shuffling; the fixed seed keeps folds identical across trials.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def objective(trial, pipeline, cv, recall_dropout_scorer):
    """Optuna objective for the logistic-regression pipeline.

    Samples a penalty, a solver allowed for that penalty and the remaining
    hyperparameters, applies them to ``pipeline`` (which is modified in place)
    and returns the mean cross-validated score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters.
    pipeline : sklearn.pipeline.Pipeline
        Pipeline whose final step is named 'classifier'.
    cv : cross-validation splitter
        Passed to ``cross_val_score``.
    recall_dropout_scorer : callable
        Scorer passed as ``scoring``.

    Returns
    -------
    float
        Mean score over the CV folds.

    Notes
    -----
    The training data is read from the notebook-level ``X_train`` / ``y_train``.
    """
    # Regularisation type
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])

    # Only solvers compatible with the sampled penalty are offered. The solver
    # is stored under a penalty-specific name because Optuna requires a
    # parameter name to keep the same choice set across trials.
    l1_ratio = None
    if penalty == 'l1':
        solver = trial.suggest_categorical('solver_l1', ['liblinear', 'saga'])
    elif penalty == 'l2':
        solver = trial.suggest_categorical('solver_l2', ['liblinear', 'saga', 'lbfgs', 'newton-cg', 'sag'])
    else:  # elasticnet
        solver = 'saga'
        l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)

    # Remaining hyperparameters
    C = trial.suggest_float('C', 1, 5, log=True)
    class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
    max_iter = trial.suggest_int('max_iter', 100, 2000)
    tol = trial.suggest_categorical('tol', [1e-4, 1e-3])
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

    params = {
        'classifier__C': C,
        'classifier__penalty': penalty,
        'classifier__solver': solver,
        'classifier__class_weight': class_weight,
        'classifier__max_iter': max_iter,
        'classifier__tol': tol,
        'classifier__fit_intercept': fit_intercept,
        # Always set l1_ratio. The pipeline object is shared between trials,
        # so leaving it out for l1/l2 trials would keep the value of an
        # earlier elasticnet trial on the estimator.
        'classifier__l1_ratio': l1_ratio,
    }

    pipeline.set_params(**params)

    # Cross-validated score on the training split
    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=-1
    )

    return scores.mean()

In [None]:
# Study that maximises the objective; the seeded TPE sampler makes the search reproducible.
study = optuna.create_study(
    direction='maximize',  # or 'minimize', depending on the metric
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
# Run the search; the lambda binds the pipeline, CV splitter and scorer to the objective.
study.optimize(
    lambda trial: objective(trial, pipeline, cv, recall_dropout_scorer),
    n_trials=200,  # number of trials
    show_progress_bar=True
)

In [None]:
# Best hyperparameters found by the study and the score they achieved.
best_params = study.best_params
print("Лучшие параметры:", best_params)
print("Лучшее значение recall:", study.best_value)

In [None]:
# Translate Optuna parameter names into pipeline parameter names.
pipeline_best_params = {}

for key, value in best_params.items():
    # The solver is sampled under a penalty-specific name ('solver_l1' or
    # 'solver_l2'); both map to the single estimator parameter. Previously only
    # 'solver_l2' was handled, so an l1 winner produced the invalid parameter
    # 'classifier__solver_l1'.
    if key in ('solver_l1', 'solver_l2'):
        pipeline_best_params['classifier__solver'] = value
    else:
        pipeline_best_params[f'classifier__{key}'] = value

if best_params.get('penalty') == 'elasticnet':
    # The objective hard-codes 'saga' for elasticnet without sampling it, so it
    # is absent from best_params and must be restored explicitly.
    pipeline_best_params['classifier__solver'] = 'saga'
else:
    # Clear any l1_ratio left on the shared pipeline by an elasticnet trial.
    pipeline_best_params['classifier__l1_ratio'] = None

In [None]:
# Inspect the translated parameter dict before applying it.
pipeline_best_params

In [None]:
# Apply the best hyperparameters and refit on the full training split.
pipeline.set_params(**pipeline_best_params)
pipeline.fit(X_train, y_train)

In [None]:
# Predictions on the held-out test split.
y_pred = pipeline.predict(X_test)

In [None]:
# Per-class recall on the test split, ordered as labels=[0, 1].
recall_scores = recall_score(y_test, y_pred, average=None, labels=[0, 1])
print(f"Recall для Dropout (класс 0): {recall_scores[0]:.4f}")
print(f"Recall для Graduate (класс 1): {recall_scores[1]:.4f}")

# Overall accuracy and the full per-class report.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Dropout', 'Graduate']))

In [None]:
# Извлечение значимости признаков для логистической регрессии
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Pull the fitted model and preprocessor out of the pipeline
classifier = pipeline.named_steps['classifier']
preprocessor = pipeline.named_steps['preprocessor']

# 2. Model coefficients: for binary classification coef_ has shape
#    (1, n_features), so the single row is taken.
coefficients = classifier.coef_[0]

# 3. Feature names after preprocessing. The manual fallback for old
#    scikit-learn versions was removed: the preprocessor is built with
#    OneHotEncoder(sparse_output=...), which needs a version that already
#    provides ColumnTransformer.get_feature_names_out, so the
#    `except AttributeError` branch could never run.
feature_names = preprocessor.get_feature_names_out()

# 4. Coefficient table, sorted by absolute magnitude
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

# 5. Visualisation of the 20 largest coefficients
top_features = feature_importance.head(20)
positions = list(range(len(top_features)))

fig, (signed_ax, abs_ax) = plt.subplots(2, 1, figsize=(14, 10))

# Panel 1: signed coefficients (red = negative, blue = non-negative)
colors = ['red' if coef < 0 else 'blue' for coef in top_features['coefficient']]
signed_ax.barh(positions, top_features['coefficient'], color=colors)
signed_ax.set_yticks(positions)
signed_ax.set_yticklabels(top_features['feature'])
signed_ax.set_xlabel('Значение коэффициента', fontsize=12)
signed_ax.set_title('Топ-20 самых важных признаков (с учетом знака)', fontsize=14, fontweight='bold')
signed_ax.axvline(x=0, color='black', linestyle='--', linewidth=0.5)
signed_ax.invert_yaxis()  # largest coefficient at the top

# Value labels, placed just outside the end of each bar
for i, coef in enumerate(top_features['coefficient']):
    signed_ax.text(coef + (0.01 if coef >= 0 else -0.01), i,
                   f'{coef:.3f}',
                   ha='left' if coef >= 0 else 'right',
                   va='center',
                   fontsize=9)

# Panel 2: absolute values of the same coefficients
abs_ax.barh(positions, top_features['abs_coefficient'])
abs_ax.set_yticks(positions)
abs_ax.set_yticklabels(top_features['feature'])
abs_ax.set_xlabel('Абсолютное значение коэффициента', fontsize=12)
abs_ax.set_title('Топ-20 самых важных признаков (абсолютные значения)', fontsize=14, fontweight='bold')
abs_ax.invert_yaxis()

for i, val in enumerate(top_features['abs_coefficient']):
    abs_ax.text(val + 0.01, i, f'{val:.3f}', ha='left', va='center', fontsize=9)

fig.tight_layout()
plt.show()

# Дерево решений

In [None]:
# Decision-tree pipeline: a StandardScaler followed by a class-weighted tree.
# NOTE(review): unlike the logistic-regression pipeline there is no
# ColumnTransformer here, so the scaler receives every column of X as is;
# confirm that this is intended.
dt_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeClassifier(random_state=42, class_weight='balanced'))
])

In [None]:
def objective(trial, dt_pipeline, cv, recall_dropout_scorer):
    """Optuna objective for the decision-tree pipeline.

    Samples tree hyperparameters, applies them to ``dt_pipeline`` (modified in
    place) and returns the mean cross-validated score computed on the
    notebook-level ``X_train`` / ``y_train``.
    """
    # Core tree hyperparameters
    depth = trial.suggest_int('max_depth', 3, 30)
    split_size = trial.suggest_int('min_samples_split', 2, 20)
    leaf_size = trial.suggest_int('min_samples_leaf', 1, 10)
    split_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    # Class-weight strategies for the two classes
    weight_choice = trial.suggest_categorical(
        'class_weight_option',
        ['balanced', 'dropout_2', 'dropout_3', 'dropout_4', None]
    )

    # Option name -> value expected by the estimator; the None option is not a
    # key, so .get() returns None for it.
    weight_presets = {
        'balanced': 'balanced',
        'dropout_2': {0: 2, 1: 1},  # Dropout weight 2, Graduate weight 1
        'dropout_3': {0: 3, 1: 1},  # Dropout weight 3, Graduate weight 1
        'dropout_4': {0: 4, 1: 1},  # Dropout weight 4, Graduate weight 1
    }
    tree_class_weight = weight_presets.get(weight_choice)

    # Additional (optional) parameters
    feature_limit = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    split_strategy = trial.suggest_categorical('splitter', ['best', 'random'])

    # The top of the sampled depth range (30) stands for "no depth limit".
    depth_limit = None if depth >= 30 else depth

    dt_pipeline.set_params(
        dt__max_depth=depth_limit,
        dt__min_samples_split=split_size,
        dt__min_samples_leaf=leaf_size,
        dt__criterion=split_criterion,
        dt__class_weight=tree_class_weight,
        dt__max_features=feature_limit,
        dt__splitter=split_strategy,
    )

    # Cross-validated score on the training split
    fold_scores = cross_val_score(
        dt_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=-1
    )

    return fold_scores.mean()

In [None]:
# Separate study for the decision tree, with the same seeded TPE sampler.
dt_study = optuna.create_study(
    direction='maximize',  # we want to maximise recall for Dropout
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
# Run the search; `objective` here is whichever definition was executed last in the notebook.
dt_study.optimize(
    lambda trial: objective(trial, dt_pipeline, cv, recall_dropout_scorer),
    n_trials=200,  # number of trials
    show_progress_bar=True
)

In [None]:
# Best decision-tree hyperparameters and the score they achieved.
dt_best_params = dt_study.best_params
print("Лучшие параметры:", dt_best_params)
print("Лучший recall:", dt_study.best_value)

In [None]:
# Translate Optuna parameter names and values into pipeline parameters, using
# the same decoding rules as the decision-tree objective.
class_weight_by_option = {
    'balanced': 'balanced',
    'dropout_2': {0: 2, 1: 1},
    'dropout_3': {0: 3, 1: 1},
    'dropout_4': {0: 4, 1: 1},
}

pipeline_best_params = {}

for key, value in dt_best_params.items():
    if key == 'class_weight_option':
        # Decode the option that actually won. Previously {0: 4, 1: 1} was
        # applied regardless of the tuned value. The None option is not a key,
        # so .get() yields None for it.
        pipeline_best_params['dt__class_weight'] = class_weight_by_option.get(value)
    elif key == 'max_depth':
        # The objective treats the top of the range (30) as "no depth limit".
        pipeline_best_params['dt__max_depth'] = value if value < 30 else None
    else:
        pipeline_best_params[f'dt__{key}'] = value

In [None]:
# Inspect the translated parameter dict before applying it.
pipeline_best_params

In [None]:
# Apply the best hyperparameters to the decision-tree pipeline.
dt_pipeline.set_params(**pipeline_best_params)

In [None]:
# Refit on the full training split.
dt_pipeline.fit(X_train, y_train)

In [None]:
# Predictions on the held-out test split.
y_pred_dt = dt_pipeline.predict(X_test)

In [None]:
# Decision-tree metrics on the held-out test split.
print("Оценка дерева решений на тестовых данных:")
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_dt):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print(f"\nClassification Report:")
# 'Enrolled' rows were removed before modelling, so class 1 is Graduate; the
# report previously mislabelled it as 'Enrolled'.
print(classification_report(y_test, y_pred_dt, target_names=['Dropout', 'Graduate']))

# Случайный лес

In [None]:
# Random-forest pipeline: a StandardScaler followed by a class-weighted forest
# that trains on all available cores (n_jobs=-1).
# NOTE(review): as with the decision-tree pipeline, the scaler receives every
# column of X as is; confirm that this is intended.
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])

In [None]:
def objective(trial, rf_pipeline, cv, recall_dropout_scorer):
    """Optuna objective: mean cross-validated score of a random-forest pipeline.

    Samples one RandomForest configuration from ``trial``, writes it onto the
    ``rf`` step of ``rf_pipeline`` (the pipeline is modified in place) and
    evaluates it with ``cross_val_score`` on the notebook-level ``X_train`` /
    ``y_train``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        rf_pipeline: pipeline containing a step named ``rf``.
        cv: cross-validation splitter passed to ``cross_val_score``.
        recall_dropout_scorer: scorer passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold scores.
    """
    # The suggest_* calls are kept in a fixed order so the sampled parameter
    # sequence stays the same.
    n_trees = trial.suggest_int('n_estimators', 100, 500)
    depth = trial.suggest_int('max_depth', 3, 30)
    split_min = trial.suggest_int('min_samples_split', 2, 20)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)
    split_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])

    # Class-weight strategy: sampled as a label, then resolved to the value
    # expected by RandomForestClassifier (unknown / None labels -> None).
    weight_choice = trial.suggest_categorical(
        'class_weight_option',
        ['balanced', 'balanced_subsample', 'dropout_2', 'dropout_3', 'dropout_4', None]
    )
    weight_lookup = {
        'balanced': 'balanced',
        'balanced_subsample': 'balanced_subsample',
        'dropout_2': {0: 2, 1: 1},
        'dropout_3': {0: 3, 1: 1},
        'dropout_4': {0: 4, 1: 1},
    }
    class_weight = weight_lookup.get(weight_choice)

    feature_cap = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    use_bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # max_samples is only sampled (and only set) when bootstrapping is enabled.
    sample_fraction = (
        trial.suggest_float('max_samples', 0.5, 1.0) if use_bootstrap else None
    )

    rf_pipeline.set_params(
        rf__n_estimators=n_trees,
        # the top of the depth range (30) stands for "no depth limit"
        rf__max_depth=None if depth >= 30 else depth,
        rf__min_samples_split=split_min,
        rf__min_samples_leaf=leaf_min,
        rf__criterion=split_criterion,
        rf__class_weight=class_weight,
        rf__max_features=feature_cap,
        rf__bootstrap=use_bootstrap,
        rf__max_samples=sample_fraction,
    )

    # Score the configured pipeline with cross-validation.
    fold_scores = cross_val_score(
        rf_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
rf_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
rf_study.optimize(
    lambda trial: objective(trial, rf_pipeline, cv, recall_dropout_scorer),
    n_trials=100,
    show_progress_bar=True
)

In [None]:
rf_best_params = rf_study.best_params
print("Лучшие параметры Random Forest:", rf_best_params)
print("Лучший recall:", rf_study.best_value)

In [None]:
# Translate the names stored by the Optuna study back into real
# RandomForestClassifier parameters of the 'rf' pipeline step, mirroring the
# conversions done inside `objective`.
pipeline_best_params = {}

for key, value in rf_best_params.items():
    if key == 'class_weight_option':
        # Use the option the study actually selected instead of a hard-coded
        # {0: 4, 1: 1}: 'dropout_<k>' means {0: k, 1: 1}; 'balanced',
        # 'balanced_subsample' and None are already valid class_weight values.
        if isinstance(value, str) and value.startswith('dropout_'):
            value = {0: int(value.split('_', 1)[1]), 1: 1}
        pipeline_best_params['rf__class_weight'] = value
    elif key == 'max_depth':
        # The objective maps the upper bound (30) to None before fitting.
        pipeline_best_params['rf__max_depth'] = value if value < 30 else None
    else:
        pipeline_best_params[f'rf__{key}'] = value

# 'max_samples' is only sampled when bootstrap=True. Reset it explicitly so a
# value left on the shared pipeline by the last trial cannot be combined with
# bootstrap=False (which RandomForestClassifier rejects).
pipeline_best_params.setdefault('rf__max_samples', None)

In [None]:
pipeline_best_params

In [None]:
rf_pipeline.set_params(**pipeline_best_params)

In [None]:
rf_pipeline.fit(X_train, y_train)

In [None]:
y_pred_rf = rf_pipeline.predict(X_test)

In [None]:
# Hold-out evaluation of the tuned random forest.
print("Оценка случайного леса на тестовых данных:")
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_rf):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"\nClassification Report:")
# NOTE(review): class 1 is labelled 'Enrolled' here, while the XGBoost report
# further down labels it 'Graduate' — confirm which name is correct.
print(classification_report(y_test, y_pred_rf, target_names=['Dropout', 'Enrolled']))

# XGBoost

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated Dropout recall of an XGBoost pipeline.

    Samples one XGBoost configuration from ``trial``, wraps the classifier in a
    ``StandardScaler`` -> ``XGBClassifier`` pipeline and scores it with
    ``cross_val_score`` on the notebook-level ``X_train`` / ``y_train``, using
    the notebook-level ``cv`` splitter and ``recall_dropout_scorer``.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold scores.
    """
    # Search space; the trailing comments give the scipy.stats distribution
    # each range was derived from (Optuna integer bounds are inclusive).
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 11),  # randint(3, 12) -> 3-11 inclusive
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.31),  # uniform(0.01, 0.3)
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),  # uniform(0.6, 0.4)
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),  # uniform(0.6, 0.4)
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),  # uniform(0.5, 0.5)
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),  # uniform(0.5, 0.5)
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0),  # uniform(0.5, 2.5)
        'n_estimators': trial.suggest_int('n_estimators', 300, 999),  # randint(300, 1000)
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.145, 0.435),  # uniform(0.29*0.5, 0.29*1.5)
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 9),  # randint(0, 10)
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 5.1),  # uniform(0.1, 5)
        'objective': 'binary:logistic',
        'random_state': 42,
        # Binary metrics to match the binary objective; the multiclass
        # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
        'eval_metric': ['logloss', 'error']
    }

    # Build the XGBoost model for this trial.
    model = xgb.XGBClassifier(**params)

    # The scaler is optional for tree boosting; it is kept so the tuned
    # pipeline matches the one fitted after the search.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('xgb', model)
    ])

    # Cross-validate the pipeline.
    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=1  # XGBoost may conflict with multiprocessing inside cross-validation
    )

    # Mean Dropout recall across folds.
    return np.mean(scores)

In [None]:
# Optuna study for the XGBoost search, driven by a seeded TPE sampler.
# NOTE(review): the XGBoost `objective` defined above never calls
# trial.report() / trial.should_prune(), so the MedianPruner configured here
# has nothing to act on.
study = optuna.create_study(
    direction='maximize',  # maximise recall for Dropout
    study_name='xgboost_optuna',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=10,
        n_warmup_steps=5,
        interval_steps=1
    )
)

In [None]:
study.optimize(objective, n_trials=150, show_progress_bar=True)

In [None]:
# NOTE(review): `model` is only a local variable inside `objective`; no visible
# cell defines it at notebook scope, so this cell likely raises NameError on a
# fresh Restart & Run All. `pipeline` is rebuilt from `xgb_model` a few cells
# below, so this cell looks like a leftover — confirm and remove.
pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('xgb', model)
    ])

In [None]:
print(f"\nЛучшие параметры: {study.best_params}")
print(f"Лучший Recall 0-го класса: {study.best_value:.4f}")

In [None]:
best_params = study.best_params

In [None]:
best_params

In [None]:
# Final XGBoost classifier built from the best hyperparameters of the study.
xgb_model = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    random_state=42,
    # Binary metrics to match the binary objective; the multiclass
    # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
    eval_metric=['logloss', 'error'],
    n_jobs=-1  # use all cores
)

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', xgb_model)
])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred_xgb = pipeline.predict(X_test)

In [None]:
# Hold-out evaluation of the tuned XGBoost pipeline.
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_xgb):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")

print("\nClassification Report:")
# NOTE(review): class 1 is labelled 'Graduate' here but 'Enrolled' in the
# decision-tree and random-forest reports above — confirm which name is correct.
print(classification_report(y_test, y_pred_xgb, target_names=['Dropout', 'Graduate']))

# Без укрупнения

In [None]:
train = pd.read_csv('train.csv')

In [None]:
pd.set_option('display.max_columns', None)
train.head()

In [None]:
# Encode the string target as integers and show which code maps to which class.
label_encoder = LabelEncoder()
train['target_encoded'] = label_encoder.fit_transform(train['Target'])

print("Соответствие классов:")
for code, label in enumerate(label_encoder.classes_):
    print(f"{code} -> {label}")

In [None]:
# Drop every second-semester column, remove the 'Enrolled' rows and collapse
# the remaining target codes to {0, 1}.
train_wout_2nd_sem = train.loc[:, ~train.columns.str.contains('2nd sem', case=False, na=False)]
# .copy() makes the filtered frame independent of `train`, so the assignment
# below does not write into a slice (avoids SettingWithCopyWarning).
train_wout_2nd_sem = train_wout_2nd_sem[train_wout_2nd_sem['Target'] != 'Enrolled'].copy()
# With 'Enrolled' gone, code 2 is remapped to 1 to get a binary target.
train_wout_2nd_sem['target_encoded'] = train_wout_2nd_sem['target_encoded'].replace(2, 1)

In [None]:
df = train_wout_2nd_sem

In [None]:
target_distribution = df[df['Curricular units 1st sem (approved)'] == 0]['Target'].value_counts()

In [None]:
target_distribution

In [None]:
df = df[df['Curricular units 1st sem (approved)'] > 0]

## Логистическая регрессия

In [None]:
df_t = df.copy()

In [None]:
df_t

In [None]:
df_t = reduce_mem_usage(df_t)

In [None]:
if 'target_encoded' not in df_t.columns:
    le = LabelEncoder()
    df_t['target_encoded'] = le.fit_transform(df_t['Target'])
    print(f"Target classes mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}")

In [None]:
# Feature matrix: everything except the raw and the encoded target.
X = df_t.drop(columns=['Target', 'target_encoded'], errors='ignore')
# The previous cell guarantees 'target_encoded' exists, so the old fallback
# (which referenced a possibly undefined `le` and encoded `df` rather than
# `df_t`) was dead code and is removed.
y = df_t['target_encoded']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Baseline logistic-regression pipeline with a single 'classifier' step.
# NOTE(review): unlike the tree and XGBoost pipelines in this notebook there is
# no StandardScaler step here — confirm this is intentional, since the search
# below also tries the 'sag' / 'saga' solvers.
pipeline = Pipeline([
    ('classifier', LogisticRegression(
        random_state=42,
        max_iter=1000,
        class_weight='balanced',
        penalty='l2',
        C=1.0,
        solver='lbfgs'
    ))
])

In [None]:
def recall_dropout(y_true, y_pred):
    """Return the recall of class 0 (the Dropout class) only.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        Recall computed for label 0.
    """
    # average=None yields one recall per requested label; only label 0 is requested.
    per_label_recall = recall_score(y_true, y_pred, labels=[0], average=None)
    return per_label_recall[0]

# Scorer wrapper so the metric can be passed as `scoring=` to cross-validation.
recall_dropout_scorer = make_scorer(recall_dropout, greater_is_better=True)

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def objective(trial, pipeline, cv, recall_dropout_scorer):
    """Optuna objective: mean cross-validated score of a logistic-regression pipeline.

    Samples a penalty, a solver compatible with it and the remaining
    LogisticRegression settings, applies them to a clone of ``pipeline`` and
    evaluates the clone with ``cross_val_score`` on the notebook-level
    ``X_train`` / ``y_train``.

    The caller's ``pipeline`` is no longer modified: every trial works on a
    fresh clone, so a value set in one trial (e.g. ``l1_ratio`` from an
    elasticnet trial) cannot leak into later trials or later cells.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        pipeline: pipeline containing a step named ``classifier``.
        cv: cross-validation splitter passed to ``cross_val_score``.
        recall_dropout_scorer: scorer passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold scores.
    """
    # Local import keeps this cell self-contained.
    from sklearn.base import clone

    # Regularisation type.
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])

    # Only solvers that support the chosen penalty are offered; each penalty
    # uses its own parameter name because the choice lists differ.
    l1_ratio = None
    if penalty == 'l1':
        solver = trial.suggest_categorical('solver_l1', ['liblinear', 'saga'])
    elif penalty == 'l2':
        solver = trial.suggest_categorical('solver_l2', ['liblinear', 'saga', 'lbfgs', 'newton-cg', 'sag'])
    else:  # elasticnet: 'saga' is the only solver used for it here
        solver = 'saga'
        l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)

    # Remaining hyperparameters.
    C = trial.suggest_float('C', 1, 5, log=True)
    class_weight = trial.suggest_categorical('class_weight', ['balanced', None])
    max_iter = trial.suggest_int('max_iter', 100, 2000)
    tol = trial.suggest_categorical('tol', [1e-4, 1e-3])
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])

    params = {
        'classifier__C': C,
        'classifier__penalty': penalty,
        'classifier__solver': solver,
        'classifier__class_weight': class_weight,
        'classifier__max_iter': max_iter,
        'classifier__tol': tol,
        'classifier__fit_intercept': fit_intercept,
    }

    # l1_ratio is only meaningful (and only set) for elasticnet.
    if penalty == 'elasticnet':
        params['classifier__l1_ratio'] = l1_ratio

    trial_pipeline = clone(pipeline).set_params(**params)

    # Cross-validated score; X_train / y_train come from the notebook scope.
    scores = cross_val_score(
        trial_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=-1
    )

    return scores.mean()

In [None]:
# Optuna study for the logistic-regression search, driven by a seeded TPE sampler.
study = optuna.create_study(
    direction='maximize',  # or 'minimize', depending on the metric
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
study.optimize(
    lambda trial: objective(trial, pipeline, cv, recall_dropout_scorer),
    n_trials=50,  # Количество испытаний
    show_progress_bar=True
)

In [None]:
best_params = study.best_params
print("Лучшие параметры:", best_params)
print("Лучшее значение recall:", study.best_value)

In [None]:
# Translate the names stored by the Optuna study back into LogisticRegression
# parameters of the 'classifier' pipeline step.
pipeline_best_params = {}

for key, value in best_params.items():
    # The objective samples the solver under a penalty-specific name
    # ('solver_l1' / 'solver_l2'); the estimator only knows 'solver'.
    if key in ('solver_l1', 'solver_l2'):
        key = 'solver'
    pipeline_best_params[f'classifier__{key}'] = value

# For elasticnet the objective fixes solver='saga' without sampling it, so it
# is absent from best_params and has to be restored here.
if best_params.get('penalty') == 'elasticnet':
    pipeline_best_params['classifier__solver'] = 'saga'

In [None]:
# NOTE(review): this hard-coded dict replaces `study.best_params` from the cell
# above, so re-running the study has no effect on the final model. It also
# does not set 'tol' or 'fit_intercept' — confirm both points are intended.
best_params = {
    'classifier__C': 4.367688611434853,
    'classifier__class_weight': 'balanced',
    'classifier__l1_ratio': 0.5740737821496418,
    'classifier__max_iter': 1539,
    'classifier__penalty': 'elasticnet',  # l1_ratio requires penalty='elasticnet'
    'classifier__solver': 'saga',  # 'saga' supports elasticnet
    'classifier__random_state': 42
}

In [None]:
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
# Per-class recall on the hold-out set (index 0 = Dropout, index 1 = Graduate).
recall_scores = recall_score(y_test, y_pred, labels=[0, 1], average=None)
for class_id, class_label in enumerate(('Dropout', 'Graduate')):
    print(f"Recall для {class_label} (класс {class_id}): {recall_scores[class_id]:.4f}")

overall_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {overall_accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Dropout', 'Graduate']))

## Дерево решений

In [None]:
dt_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeClassifier(random_state=42, class_weight='balanced'))
])

In [None]:
def objective(trial, dt_pipeline, cv, recall_dropout_scorer):
    """Optuna objective: mean cross-validated score of a decision-tree pipeline.

    Samples one DecisionTree configuration from ``trial``, writes it onto the
    ``dt`` step of ``dt_pipeline`` (the pipeline is modified in place) and
    evaluates it with ``cross_val_score`` on the notebook-level ``X_train`` /
    ``y_train``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        dt_pipeline: pipeline containing a step named ``dt``.
        cv: cross-validation splitter passed to ``cross_val_score``.
        recall_dropout_scorer: scorer passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold scores.
    """
    # The suggest_* calls are kept in a fixed order so the sampled parameter
    # sequence stays the same.
    depth = trial.suggest_int('max_depth', 3, 30)
    split_min = trial.suggest_int('min_samples_split', 2, 20)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)
    split_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    # Class-weight strategy for the two classes: sampled as a label, then
    # resolved to the value expected by the classifier (None label -> None).
    weight_choice = trial.suggest_categorical(
        'class_weight_option',
        ['balanced', 'dropout_2', 'dropout_3', 'dropout_4', None]
    )
    weight_lookup = {
        'balanced': 'balanced',
        'dropout_2': {0: 2, 1: 1},  # Dropout weight 2, Graduate weight 1
        'dropout_3': {0: 3, 1: 1},  # Dropout weight 3, Graduate weight 1
        'dropout_4': {0: 4, 1: 1},  # Dropout weight 4, Graduate weight 1
    }
    class_weight = weight_lookup.get(weight_choice)

    # Additional (optional) tree settings.
    feature_cap = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    split_strategy = trial.suggest_categorical('splitter', ['best', 'random'])

    dt_pipeline.set_params(
        # the top of the depth range (30) stands for "no depth limit"
        dt__max_depth=None if depth >= 30 else depth,
        dt__min_samples_split=split_min,
        dt__min_samples_leaf=leaf_min,
        dt__criterion=split_criterion,
        dt__class_weight=class_weight,
        dt__max_features=feature_cap,
        dt__splitter=split_strategy,
    )

    # Score the configured pipeline with cross-validation.
    fold_scores = cross_val_score(
        dt_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
dt_study = optuna.create_study(
    direction='maximize',  # Мы хотим максимизировать recall для Dropout
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
dt_study.optimize(
    lambda trial: objective(trial, dt_pipeline, cv, recall_dropout_scorer),
    n_trials=200,  # Количество испытаний
    show_progress_bar=True
)

In [None]:
dt_best_params = dt_study.best_params
print("Лучшие параметры:", dt_best_params)
print("Лучший recall:", dt_study.best_value)

In [None]:
# Translate the names stored by the Optuna study back into real
# DecisionTreeClassifier parameters of the 'dt' pipeline step, mirroring the
# conversions done inside `objective`.
pipeline_best_params = {}

for key, value in dt_best_params.items():
    if key == 'class_weight_option':
        # Use the option the study actually selected instead of a hard-coded
        # {0: 4, 1: 1}: 'dropout_<k>' means {0: k, 1: 1}; 'balanced' and None
        # are already valid class_weight values.
        if isinstance(value, str) and value.startswith('dropout_'):
            value = {0: int(value.split('_', 1)[1]), 1: 1}
        pipeline_best_params['dt__class_weight'] = value
    elif key == 'max_depth':
        # The objective maps the upper bound (30) to None before fitting.
        pipeline_best_params['dt__max_depth'] = value if value < 30 else None
    else:
        pipeline_best_params[f'dt__{key}'] = value

In [None]:
pipeline_best_params

In [None]:
dt_pipeline.set_params(**pipeline_best_params)

In [None]:
dt_pipeline.fit(X_train, y_train)

In [None]:
y_pred_dt = dt_pipeline.predict(X_test)

In [None]:
# Hold-out evaluation of the tuned decision tree.
print("Оценка дерева решений на тестовых данных:")
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_dt):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print("\nClassification Report:")
# In this section the 'Enrolled' rows were removed and code 2 was remapped to
# 1, so class 1 is 'Graduate' (as in the other reports of this section).
print(classification_report(y_test, y_pred_dt, target_names=['Dropout', 'Graduate']))

## Случайный лес

In [None]:
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])

In [None]:
def objective(trial, rf_pipeline, cv, recall_dropout_scorer):
    """Optuna objective: mean cross-validated score of a random-forest pipeline.

    Samples one RandomForest configuration from ``trial``, writes it onto the
    ``rf`` step of ``rf_pipeline`` (the pipeline is modified in place) and
    evaluates it with ``cross_val_score`` on the notebook-level ``X_train`` /
    ``y_train``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        rf_pipeline: pipeline containing a step named ``rf``.
        cv: cross-validation splitter passed to ``cross_val_score``.
        recall_dropout_scorer: scorer passed to ``cross_val_score``.

    Returns:
        Mean of the per-fold scores.
    """
    # The suggest_* calls are kept in a fixed order so the sampled parameter
    # sequence stays the same.
    n_trees = trial.suggest_int('n_estimators', 100, 500)
    depth = trial.suggest_int('max_depth', 3, 30)
    split_min = trial.suggest_int('min_samples_split', 2, 20)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)
    split_criterion = trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss'])

    # Class-weight strategy: sampled as a label, then resolved to the value
    # expected by RandomForestClassifier (unknown / None labels -> None).
    weight_choice = trial.suggest_categorical(
        'class_weight_option',
        ['balanced', 'balanced_subsample', 'dropout_2', 'dropout_3', 'dropout_4', None]
    )
    weight_lookup = {
        'balanced': 'balanced',
        'balanced_subsample': 'balanced_subsample',
        'dropout_2': {0: 2, 1: 1},
        'dropout_3': {0: 3, 1: 1},
        'dropout_4': {0: 4, 1: 1},
    }
    class_weight = weight_lookup.get(weight_choice)

    feature_cap = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    use_bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # max_samples is only sampled (and only set) when bootstrapping is enabled.
    sample_fraction = (
        trial.suggest_float('max_samples', 0.5, 1.0) if use_bootstrap else None
    )

    rf_pipeline.set_params(
        rf__n_estimators=n_trees,
        # the top of the depth range (30) stands for "no depth limit"
        rf__max_depth=None if depth >= 30 else depth,
        rf__min_samples_split=split_min,
        rf__min_samples_leaf=leaf_min,
        rf__criterion=split_criterion,
        rf__class_weight=class_weight,
        rf__max_features=feature_cap,
        rf__bootstrap=use_bootstrap,
        rf__max_samples=sample_fraction,
    )

    # Score the configured pipeline with cross-validation.
    fold_scores = cross_val_score(
        rf_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=-1
    )
    return fold_scores.mean()

In [None]:
rf_study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42)
)

In [None]:
rf_study.optimize(
    lambda trial: objective(trial, rf_pipeline, cv, recall_dropout_scorer),
    n_trials=100,
    show_progress_bar=True
)

In [None]:
rf_best_params = rf_study.best_params
print("Лучшие параметры Random Forest:", rf_best_params)
print("Лучший recall:", rf_study.best_value) 

In [None]:
# Translate the names stored by the Optuna study back into real
# RandomForestClassifier parameters of the 'rf' pipeline step, mirroring the
# conversions done inside `objective`.
pipeline_best_params = {}

for key, value in rf_best_params.items():
    if key == 'class_weight_option':
        # Use the option the study actually selected instead of a hard-coded
        # {0: 4, 1: 1}: 'dropout_<k>' means {0: k, 1: 1}; 'balanced',
        # 'balanced_subsample' and None are already valid class_weight values.
        if isinstance(value, str) and value.startswith('dropout_'):
            value = {0: int(value.split('_', 1)[1]), 1: 1}
        pipeline_best_params['rf__class_weight'] = value
    elif key == 'max_depth':
        # The objective maps the upper bound (30) to None before fitting.
        pipeline_best_params['rf__max_depth'] = value if value < 30 else None
    else:
        pipeline_best_params[f'rf__{key}'] = value

# 'max_samples' is only sampled when bootstrap=True. Reset it explicitly so a
# value left on the shared pipeline by the last trial cannot be combined with
# bootstrap=False (which RandomForestClassifier rejects).
pipeline_best_params.setdefault('rf__max_samples', None)

In [None]:
pipeline_best_params

In [None]:
rf_pipeline.set_params(**pipeline_best_params)

In [None]:
rf_pipeline.fit(X_train, y_train)

In [None]:
y_pred_rf = rf_pipeline.predict(X_test)

In [None]:
# Hold-out evaluation of the tuned random forest.
print("Оценка случайного леса на тестовых данных:")
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_rf):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nClassification Report:")
# In this section the 'Enrolled' rows were removed and code 2 was remapped to
# 1, so class 1 is 'Graduate' (as in the other reports of this section).
print(classification_report(y_test, y_pred_rf, target_names=['Dropout', 'Graduate']))

## XGBoost

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated Dropout recall of an XGBoost pipeline.

    Samples one XGBoost configuration from ``trial``, wraps the classifier in a
    ``StandardScaler`` -> ``XGBClassifier`` pipeline and scores it with
    ``cross_val_score`` on the notebook-level ``X_train`` / ``y_train``, using
    the notebook-level ``cv`` splitter and ``recall_dropout_scorer``.

    Args:
        trial: Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold scores.
    """
    # Search space; the trailing comments give the scipy.stats distribution
    # each range was derived from (Optuna integer bounds are inclusive).
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 11),  # randint(3, 12) -> 3-11 inclusive
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.31),  # uniform(0.01, 0.3)
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),  # uniform(0.6, 0.4)
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),  # uniform(0.6, 0.4)
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),  # uniform(0.5, 0.5)
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),  # uniform(0.5, 0.5)
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 3.0),  # uniform(0.5, 2.5)
        'n_estimators': trial.suggest_int('n_estimators', 300, 999),  # randint(300, 1000)
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.145, 0.435),  # uniform(0.29*0.5, 0.29*1.5)
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 9),  # randint(0, 10)
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 5.1),  # uniform(0.1, 5)
        'objective': 'binary:logistic',
        'random_state': 42,
        # Binary metrics to match the binary objective; the multiclass
        # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
        'eval_metric': ['logloss', 'error']
    }

    # Build the XGBoost model for this trial.
    model = xgb.XGBClassifier(**params)

    # The scaler is optional for tree boosting; it is kept so the tuned
    # pipeline matches the one fitted after the search.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('xgb', model)
    ])

    # Cross-validate the pipeline.
    scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=recall_dropout_scorer,
        n_jobs=1  # XGBoost may conflict with multiprocessing inside cross-validation
    )

    # Mean Dropout recall across folds.
    return np.mean(scores)

In [None]:
# Optuna study for the XGBoost search, driven by a seeded TPE sampler.
# NOTE(review): the XGBoost `objective` defined above never calls
# trial.report() / trial.should_prune(), so the MedianPruner configured here
# has nothing to act on.
study = optuna.create_study(
    direction='maximize',  # maximise recall for Dropout
    study_name='xgboost_optuna',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=10,
        n_warmup_steps=5,
        interval_steps=1
    )
)

In [None]:
study.optimize(objective, n_trials=150, show_progress_bar=True)

In [None]:
print(f"\nЛучшие параметры: {study.best_params}")
print(f"Лучший Recall 0-го класса: {study.best_value:.4f}")

In [None]:
best_params = study.best_params

In [None]:
best_params

In [None]:
# Final XGBoost classifier built from the best hyperparameters of the study.
xgb_model = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    random_state=42,
    # Binary metrics to match the binary objective; the multiclass
    # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
    eval_metric=['logloss', 'error'],
    n_jobs=-1  # use all cores
)

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', xgb_model)
])

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred_xgb = pipeline.predict(X_test)

In [None]:
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_xgb):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=['Dropout', 'Graduate']))

### Анализ важности признаков

In [None]:
# NOTE(review): this import lives in the middle of the notebook; consider
# moving it to the import cell at the top.
import shap

# Fitted XGBoost estimator taken from the pipeline (same object, not a copy).
xgb_model = pipeline.named_steps['xgb']

# Apply the pipeline's fitted scaler so SHAP sees the same inputs as the model.
# X_train_scaled is not used further in this cell.
X_train_scaled = pipeline.named_steps['scaler'].transform(X_train)
X_test_scaled = pipeline.named_steps['scaler'].transform(X_test)

explainer = shap.TreeExplainer(xgb_model)

# SHAP values are computed on the scaled hold-out set.
shap_values = explainer.shap_values(X_test_scaled)

print(f"Форма SHAP значений: {np.shape(shap_values)}")
print(f"Форма тестовых данных: {X_test_scaled.shape}")

# Summary plot over all features; feature names come from X_train's columns.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values, X_test_scaled, feature_names=X_train.columns, show=False, max_display=len(X_train.columns))
plt.title("SHAP Summary Plot - Глобальная важность признаков")
plt.tight_layout()
plt.show()

# Bar variant of the summary plot over all features.
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_test_scaled, feature_names=X_train.columns, plot_type="bar", show=False, max_display=len(X_train.columns))
plt.title("SHAP Feature Importance (Mean Absolute SHAP Value)")
plt.tight_layout()
plt.show()

In [None]:
# Features removed for the first reduced-feature experiment.
# NOTE(review): presumably the lowest-ranked features in the SHAP plots above —
# the selection rule is not recorded in the notebook; confirm.
columns_to_drop = [
    'Unemployment rate',
    'Admission grade',
    'Father\'s occupation',
    'Curricular units 1st sem (credited)',
    'Application order',
    'Daytime/evening attendance',
    'Mother\'s occupation',
    'id',
    'Mother\'s qualification',
    'Marital status',
    'GDP',
    'Inflation rate',
    'Previous qualification',
    'Curricular units 1st sem (without evaluations)',
    'Nacionality',
    'International',
    'Educational special needs'
]

In [None]:
X_train_selected = X_train.drop(columns=columns_to_drop, axis=1)

In [None]:
X_test_selected = X_test.drop(columns=columns_to_drop, axis=1)

In [None]:
# Fresh XGBoost classifier (same best hyperparameters) for the reduced feature set.
xgb_model_selected = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    random_state=42,
    # Binary metrics to match the binary objective; the multiclass
    # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
    eval_metric=['logloss', 'error'],
    n_jobs=-1  # use all cores
)

In [None]:
# Use the fresh, unfitted `xgb_model_selected`. `xgb_model` is the estimator
# object living inside `pipeline`; refitting it here on the reduced feature
# set would silently overwrite that pipeline's fitted model.
pipeline_selected = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', xgb_model_selected)
])

In [None]:
pipeline_selected.fit(X_train_selected, y_train)

In [None]:
y_pred_xgb_selected = pipeline_selected.predict(X_test_selected)

In [None]:
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_xgb_selected):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb_selected):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb_selected, target_names=['Dropout', 'Graduate']))

Recall Dropout: 0.8831 -> Recall Dropout: 0.8779


Пробуем убрать ещё больше признаков

In [None]:
# Features removed for the second reduced-feature experiment: the previous
# list plus four more columns. "Father's occupation" was listed twice in the
# original list; the duplicate entry is removed.
columns_to_drop = [
    'Unemployment rate',
    'Admission grade',
    'Father\'s occupation',
    'Curricular units 1st sem (credited)',
    'Application order',
    'Daytime/evening attendance',
    'Mother\'s occupation',
    'id',
    'Mother\'s qualification',
    'Marital status',
    'GDP',
    'Inflation rate',
    'Previous qualification',
    'Curricular units 1st sem (without evaluations)',
    'Nacionality',
    'International',
    'Educational special needs',
    'Previous qualification (grade)',
    'Displaced',
    'Debtor'
]

In [None]:
X_train_selected = X_train.drop(columns=columns_to_drop, axis=1)

In [None]:
X_test_selected = X_test.drop(columns=columns_to_drop, axis=1)

In [None]:
# Fresh XGBoost classifier (same best hyperparameters) for the reduced feature set.
xgb_model_selected = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    random_state=42,
    # Binary metrics to match the binary objective; the multiclass
    # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
    eval_metric=['logloss', 'error'],
    n_jobs=-1  # use all cores
)

In [None]:
# Use the fresh, unfitted `xgb_model_selected`. `xgb_model` is the estimator
# object living inside `pipeline`; refitting it here on the reduced feature
# set would silently overwrite that pipeline's fitted model.
pipeline_selected = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', xgb_model_selected)
])

In [None]:
pipeline_selected.fit(X_train_selected, y_train)

In [None]:
y_pred_xgb_selected = pipeline_selected.predict(X_test_selected)

In [None]:
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_xgb_selected):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb_selected):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb_selected, target_names=['Dropout', 'Graduate']))

Recall Dropout: 0.8831 -> Recall Dropout: 0.8779 -> Recall Dropout: 0.8774

Попробуем оставить только топ-5 признаков

In [None]:
top_5_features = [
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Scholarship holder',
    'Tuition fees up to date',
    'Curricular units 1st sem (evaluations)'
]

In [None]:
X_train_selected = X_train[top_5_features]
X_test_selected = X_test[top_5_features]

In [None]:
# Fresh XGBoost classifier (same best hyperparameters) for the top-5 feature set.
xgb_model_selected = xgb.XGBClassifier(
    **best_params,
    objective='binary:logistic',
    random_state=42,
    # Binary metrics to match the binary objective; the multiclass
    # 'mlogloss' / 'merror' are meant for multi:softmax / multi:softprob.
    eval_metric=['logloss', 'error'],
    n_jobs=-1  # use all cores
)

In [None]:
# Use the fresh, unfitted `xgb_model_selected`. `xgb_model` is the estimator
# object living inside `pipeline`; refitting it here on the reduced feature
# set would silently overwrite that pipeline's fitted model.
pipeline_selected = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', xgb_model_selected)
])

In [None]:
pipeline_selected.fit(X_train_selected, y_train)

In [None]:
y_pred_xgb_selected = pipeline_selected.predict(X_test_selected)

In [None]:
print(f"Recall Dropout: {recall_dropout(y_test, y_pred_xgb_selected):.4f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb_selected):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb_selected, target_names=['Dropout', 'Graduate']))

Recall Dropout: 0.8831 -> Recall Dropout: 0.8779 -> Recall Dropout: 0.8774 -> Recall Dropout: 0.8673