# gridsearch

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import time
from sklearn.base import clone
from GradientBoosting import MyGradientBoosting
from Losses import LogisticLoss

# --- Load the UCI Adult data set and turn it into numeric train/test arrays ---
print("=== Загрузка данных Adult ===")
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# Column names are supplied explicitly through `names=` below.
col_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
# Binary target encoding; any label missing from this mapping becomes NaN.
income_map = {'<=50K': 0, '>50K': 1}

# Fields are split on a comma followed by one whitespace character; a regex
# separator needs the pure-Python parser engine.
df = pd.read_csv(
    url,
    names=col_names,
    sep=r',\s',
    engine='python',
)
income_codes = df['income'].map(income_map)
df['income'] = income_codes
# Drops rows containing NaN (e.g. an income label that was not in income_map).
# NOTE(review): the Adult data set appears to mark unknown feature values with
# '?', which read_csv does not parse as NaN by default, so such rows are
# probably kept and '?' is encoded as just another category - confirm intent.
df = df.dropna()

# Replace every remaining text column with its integer category codes.
categorical = df.select_dtypes(include=['object']).columns
for col in categorical:
    as_category = df[col].astype('category')
    df[col] = as_category.cat.codes

features = df.drop('income', axis=1)
X = features.values
y = df['income'].values
# Only the first 1000 rows are used; they are split 80/20 with a fixed seed.
X_head, y_head = X[:1000], y[:1000]
X_train, X_test, y_train, y_test = train_test_split(
    X_head, y_head, test_size=0.2, random_state=42
)

print(f"Размер тренировочных данных: {X_train.shape}")
print(f"Размер тестовых данных: {X_test.shape}")

# --- Hyper-parameter search for the custom gradient boosting estimator ---
# Corrected GridSearchCV code.
print("\n=== GridSearchCV для моего Gradient Boosting ===")
# 2 * 2 * 1 * 2 = 8 parameter combinations are evaluated.
param_grid = {
    'n_estimators': [3, 5],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3],
    'subsampling': [0.8, 1.0]
}

my_gb_gs = MyGradientBoosting(loss=LogisticLoss())
grid_search = GridSearchCV(
    estimator=my_gb_gs,
    param_grid=param_grid,
    cv=3,                # 3-fold cross-validation per combination
    n_jobs=-1,           # run the fits on all available processors
    scoring='accuracy'   # state the selection metric explicitly
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
grid_search.fit(X_train, y_train)
gs_time = time.perf_counter() - start

# Original author's note: predict() now returns binary predictions, so the
# output can be scored directly with accuracy_score.
# best_estimator_ is available because GridSearchCV refits by default.
best_pred = grid_search.best_estimator_.predict(X_test)
best_acc = accuracy_score(y_test, best_pred)

print(f"GridSearchCV - Время: {gs_time:.2f} сек")
print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"Лучшая точность на тесте: {best_acc:.4f}")


=== Загрузка данных Adult ===
Размер тренировочных данных: (800, 14)
Размер тестовых данных: (200, 14)

=== GridSearchCV для моего Gradient Boosting ===
GridSearchCV - Время: 2.85 сек
Лучшие параметры: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 3, 'subsampling': 0.8}
Лучшая точность на тесте: 0.7600
