In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn import preprocessing
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, get_scorer, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
# Read the competition train and test tables into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s3e23/train.csv",
        "/kaggle/input/playground-series-s3e23/test.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
train_data.info()

In [None]:
# Remove the row identifier so it is not used as a feature.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for 'id'.
train_data = train_data.drop(columns='id', errors='ignore')

In [None]:
# Encode the target as 0/1 integers.
# astype(int) maps True -> 1 and False -> 0 directly, avoiding the implicit
# downcasting that Series.replace performs (deprecated in recent pandas).
# NOTE(review): assumes 'defects' is read as a boolean column with no missing values.
train_data['defects'] = train_data['defects'].astype(int)

In [None]:
# Descriptive statistics of the training columns.
train_data.describe()

In [None]:
# Class balance: number of rows per value of the 'defects' target.
fig, ax = plt.subplots()
sns.countplot(x='defects', data=train_data, ax=ax)
ax.set_title('Defects');

In [None]:
# Pairwise correlations between all columns of the training frame.
corr_matrix = train_data.corr()

# Draw the annotated heatmap on an explicit 15x15 figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Replace groups of source columns with their per-row means, then drop the
# source columns. New columns are appended in the same order as before.
train_data = (
    train_data
    .assign(
        mean_bnv=lambda df: (df['n'] + df['v'] + df['b']) / 3,
        mean_uniqOpOpend=lambda df: (df['uniq_Op'] + df['uniq_Opnd']) / 2,
        mean_totOpOpend=lambda df: (df['total_Op'] + df['total_Opnd']) / 2,
        mean_brcntvg=lambda df: (df['branchCount'] + df['v(g)']) / 2,
    )
    .drop(columns=[
        'n', 'v', 'b',
        'uniq_Op', 'uniq_Opnd',
        'total_Op', 'total_Opnd',
        'branchCount', 'v(g)',
    ])
)

In [None]:
# Recompute the correlations now that the averaged features replaced their sources.
corr_matrix = train_data.corr()

# Draw the annotated heatmap on an explicit 15x15 figure.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Correlation Matrix New')
plt.show()

In [None]:
# Separate the label column from the predictor columns.
target = train_data['defects']
features = train_data.drop(columns=['defects'])

In [None]:
# Hold out 30% of the rows for validation.
# stratify=target keeps the class ratio of 'defects' the same in both splits,
# consistent with the StratifiedKFold used for cross-validation in this notebook.
features_train, features_valid, target_train, target_valid = train_test_split(
    features,
    target,
    test_size=0.3,
    stratify=target,
    random_state=2110,
)

In [None]:
# (rows, columns) of the training split.
features_train.shape

In [None]:
# (rows, columns) of the validation split.
features_valid.shape

In [None]:
# Data scaling: fit the RobustScaler on the training split only, then apply
# the same fitted transform to both the training and validation splits.
scaler = preprocessing.RobustScaler().fit(features_train)
features_train_scaled, features_valid_scaled = (
    scaler.transform(split) for split in (features_train, features_valid)
)

In [None]:
# Shuffled 3-fold stratified splitter used as `cv` for the hyperparameter searches.
stratified_kfold = StratifiedKFold(
    n_splits=3,
    random_state=1010,
    shuffle=True,
)

In [None]:
# ROC-AUC scorer for model selection.
# make_scorer(roc_auc_score) with default settings scores the output of
# `predict` (hard 0/1 labels), which collapses the ROC curve to a single
# threshold. The predefined 'roc_auc' scorer evaluates continuous scores
# (decision_function / predict_proba) instead, matching how the validation
# AUC is computed from predict_proba later in this notebook.
roc_auc_scorer = get_scorer('roc_auc')

Hist Gradient Boosting Classifier

In [None]:
# Hyperparameters matching the "Best params" recorded for this model in the notebook.
hgb_params = {
    'learning_rate': 0.1,
    'max_iter': 32,
    'min_samples_leaf': 11,
    'interaction_cst': 'pairwise',
    'class_weight': None,
    'warm_start': True,
    'random_state': 1410,
}
hgb_classifier = HistGradientBoostingClassifier(**hgb_params)

In [None]:
# Fit the histogram gradient boosting model on the scaled training split.
hgb_classifier.fit(features_train_scaled, target_train)

In [None]:
#hgb_param_grid = {
    #'learning_rate': [0.01, 0.1, 0.2],
    #'max_iter': [30, 31, 32],
    #'min_samples_leaf': [10, 11, 12],
    #'warm_start': [True, False],
    #'class_weight': [None, 'balanced'],
    #'interaction_cst':[None, 'pairwise', 'no_interactions']
#}

In [None]:
#hgb_grid = GridSearchCV(estimator=hgb_classifier, param_grid=hgb_param_grid, cv=stratified_kfold, scoring='roc_auc')

In [None]:
#hgb_grid.fit(features_train_scaled, target_train)

In [None]:
#print('Best params:', hgb_grid.best_params_)
#print('Best score (AUC-ROC):', hgb_grid.best_score_)

In [None]:
#best_hgb_classifier = hgb_grid.best_estimator_

Best params: {'class_weight': None, 'interaction_cst': 'pairwise', 'learning_rate': 0.1, 'max_iter': 32, 'min_samples_leaf': 11, 'warm_start': True}<br>
Best score (AUC-ROC): 0.7910262941077485

XGBoost Classifier

In [None]:
# Fixed settings plus the tuned values (alpha, eta, max_depth, n_estimators,
# subsample) that match the "Best params" recorded for this model in the notebook.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 3,
    'n_estimators': 100,
    'subsample': 1,
    'alpha': 0,
    'verbosity': 0,
    'random_state': 1110,
}
xgboost_cl = XGBClassifier(**xgb_params)

In [None]:
# Fit the XGBoost model on the scaled training split.
xgboost_cl.fit(features_train_scaled, target_train)

In [None]:
#xgboost_cl_params = {
    #'eta': [0.1, 0.2],
    #'max_depth': [3, 4],
    #'subsample': [0.7, 0.8, 0.9, 1],
    #'alpha': [0,0.1],
    #'n_estimators': [90, 100]   
#}

In [None]:
#xgboost_cl_grid = GridSearchCV(
    #estimator=xgboost_cl, param_grid=xgboost_cl_params, cv=stratified_kfold, scoring='roc_auc'
#)

In [None]:
#xgboost_cl_grid.fit(features_train_scaled, target_train)

In [None]:
#print('Best params:', xgboost_cl_grid.best_params_)
#print('Best score (AUC-ROC):', xgboost_cl_grid.best_score_)

In [None]:
#best_xgb_classifier = xgboost_cl_grid.best_estimator_

Best params: {'alpha': 0, 'eta': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}<br>
Best score (AUC-ROC): 0.7920298757887111

LGBM Classifier

In [None]:
# learning_rate, max_depth, min_child_samples, n_estimators and num_leaves match
# the "Best params" recorded for this model in the notebook; the sampling and
# regularization values were not part of that recorded grid.
lgb_params = {
    'learning_rate': 0.1,
    'n_estimators': 150,
    'max_depth': 3,
    'num_leaves': 5,
    'min_child_samples': 35,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.8,
    'reg_lambda': 0.5,
    'random_state': 1210,
}
lgb_cl = lgb.LGBMClassifier(**lgb_params)

In [None]:
# Fit the LightGBM model on the scaled training split.
lgb_cl.fit(features_train_scaled, target_train)

In [None]:
#lgb_cl_params = {
    #'num_leaves': [2, 3, 4, 5],
    #'max_depth': [3, 5, 7],
    #'learning_rate': [0.01, 0.1, 1],
    #'n_estimators': [150],
    #'min_child_samples': [35, 40, 45],
#}

In [None]:
#lgb_cl_grid = GridSearchCV(estimator = lgb_cl, param_grid = lgb_cl_params, cv = stratified_kfold, scoring='roc_auc')

In [None]:
#lgb_cl_grid.fit(features_train_scaled, target_train)

In [None]:
#print('Best params:', lgb_cl_grid.best_params_)
#print('Best score (AUC-ROC):', lgb_cl_grid.best_score_)

In [None]:
#best_lgb_classifier = lgb_cl_grid.best_estimator_

Best params: {'learning_rate': 0.1, 'max_depth': 3, 'min_child_samples': 35, 'n_estimators': 150, 'num_leaves': 5}<br>
Best score (AUC-ROC): 0.7919868551017376

LogisticRegression

In [None]:
#from sklearn.kernel_approximation import Nystroem
#from sklearn.linear_model import LogisticRegression
#from sklearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV

# Create the Nystroem object
#nystroem = Nystroem(kernel='rbf', random_state=1910)

# Define the parameters to search over, including the Nystroem parameters, the C parameter, and additional logistic-regression parameters
#param_grid = {
    #'nystroem__n_components': [900, 1000],  # Different numbers of Nystroem components
    #'logistic__C': [0.64, 0.65],  # Different values of the C parameter for logistic regression
    #'logistic__penalty': ['l2'],  # Different regularization types for logistic regression
    #'logistic__class_weight': ['balanced'],  # Different class-balancing options
    #'logistic__solver': ['newton-cg'],  # Different optimization algorithms
    #'logistic__dual': [False],  # Whether to allow the dual formulation
    #'logistic__max_iter': [100],  # Different numbers of iterations
#}

# Create a logistic regression with default parameters
#lregression = LogisticRegression(random_state=2310)

# Create a pipeline that combines Nystroem and the logistic regression
#pipeline = Pipeline([
    #('nystroem', nystroem),
   # ('logistic', lregression)
#])

# Create the GridSearchCV object
#grid_search = GridSearchCV(pipeline, param_grid, cv=stratified_kfold, n_jobs=-1, scoring='roc_auc')

# Fit GridSearchCV on the training data
#grid_search.fit(features_train_scaled, target_train)

# Get the best parameters and the ROC AUC score
#best_params = grid_search.best_params_
#best_score = grid_search.best_score_

#print("Best params:", best_params)
#print("Best score (ROC AUC):", best_score)


In [None]:
#best_lrny_classifier = grid_search.best_estimator_

In [None]:
#lr_params = {
    #'C': [0.001, 0.01, 0.1, 1, 10, 100],
    #'penalty': ['l1', 'l2'],
    #'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    #'class_weight': [None, 'balanced'],
    #'fit_intercept': [True, False],
    #'max_iter': [100, 200, 300],

#}

In [None]:
#lr_grid = GridSearchCV(lregression, lr_params, cv=stratified_kfold, scoring='roc_auc', n_jobs=-1)

#lr_grid.fit(features_train_scaled, target_train)

In [None]:
#print('Best params:', lr_grid.best_params_)
#print('Best score (AUC-ROC):', lr_grid.best_score_)

In [None]:
#best_lr_classifier = lr_grid.best_estimator_

Best params: {'C': 0.01, 'class_weight': 'balanced', 'fit_intercept': True, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}<br>
Best score (AUC-ROC): 0.78314444755966

Catboost

In [None]:
# CatBoost model trained on CPU with log-loss; training output is silenced.
catb = CatBoostClassifier(
    task_type='CPU',
    loss_function='Logloss',
    iterations=500,
    learning_rate=0.01,
    depth=7,
    l2_leaf_reg=5,
    border_count=30,
    random_strength=0.5,
    bagging_temperature=0.7,
    verbose=False,
)

In [None]:
# Fit the CatBoost model on the scaled training split.
catb.fit(features_train_scaled, target_train)

Ensemble

In [None]:
#from itertools import product
#from sklearn.model_selection import cross_val_score

#weights_combinations = product([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], repeat=4)

#best_score = -1  # Initialize the best score
#best_weights = None  # Initialize the best weights
#best_model = None  # Initialize the best model

#for weights in weights_combinations:
    # Create a VotingClassifier with the current weights
   # ensemble_classifier = VotingClassifier(
        #estimators=[
            #('best_hgb', hgb_classifier),
            #('best_xgb', xgboost_cl),
            #('best_lgb', lgb_cl),
            #('best_lrny', best_lrny_classifier),
       # ],
        #voting='soft',
        #weights=weights
    #)
    
    # Evaluate the model (e.g., using cross-validation and the ROC AUC metric)
    #scores = cross_val_score(
        #ensemble_classifier, 
        #features_train,
        #target_train, 
        #cv=stratified_kfold, 
        #scoring=roc_auc_scorer
    #)
    
    # Compute the mean score across all folds
    #score = np.mean(scores)
    
    # If the current score is better than the previous one, update the best score, best weights, and best model
    #if score > best_score:
        #best_score = score
        #best_weights = weights
        #best_model = ensemble_classifier

#print("Лучшие веса:", best_weights)
#print("Лучшая оценка:", best_score)

In [None]:
# Soft-voting ensemble over the four boosted models.
# Note: no `weights` argument is passed here, so none of the candidate weight
# vectors below is applied to this classifier.
ensemble_members = [
    ('best_hgb', hgb_classifier),
    ('best_xgb', xgboost_cl),
    ('best_lgb', lgb_cl),
    ('catb', catb),
]
best_ensemble_classifier = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
)

# Candidate weight vectors, one weight per ensemble member in the order listed above.
candidate_weights = [
    [0.3, 0.3, 0.3, 0.1],
    [0.1, 0.6, 0.2, 0.1],
    [0.2, 0.3, 0.2, 0.3],
    [0.1, 0.2, 0.6, 0.1],
]
param_grid_ens = {'weights': candidate_weights}

# The weight search is only constructed in this cell; it is never fitted here.
grid_search_ens = GridSearchCV(
    estimator=best_ensemble_classifier,
    param_grid=param_grid_ens,
    scoring=roc_auc_scorer,
    cv=3,
)

# Fit the ensemble on the scaled training split (features_train_scaled, target_train).
best_ensemble_classifier.fit(features_train_scaled, target_train)

In [None]:
#grid_search_ens = GridSearchCV(
    #estimator=ensemble_classifier,
    #param_grid=param_grid_ens,
    #cv=3,  
    #scoring = roc_auc_scorer  
#)

In [None]:
#grid_search_ens.fit(features_train_scaled, target_train)

In [None]:
# Score the hold-out split: column 1 of predict_proba is used as the ranking score.
pred_valid_proba = best_ensemble_classifier.predict_proba(features_valid_scaled)
roc_auc = roc_auc_score(
    y_true=target_valid,
    y_score=pred_valid_proba[:, 1],
)
print('ROC-AUC valid:', roc_auc)

Submission

In [None]:
# Test predictors: everything except the row identifier (test_data itself keeps 'id').
features_test = test_data.drop(columns=['id'])

In [None]:
# Apply the same averaged-feature construction used for the training frame,
# then drop the source columns so the test columns line up with the training ones.
features_test = (
    features_test
    .assign(
        mean_bnv=lambda df: (df['n'] + df['v'] + df['b']) / 3,
        mean_uniqOpOpend=lambda df: (df['uniq_Op'] + df['uniq_Opnd']) / 2,
        mean_totOpOpend=lambda df: (df['total_Op'] + df['total_Opnd']) / 2,
        mean_brcntvg=lambda df: (df['branchCount'] + df['v(g)']) / 2,
    )
    .drop(columns=[
        'n', 'v', 'b',
        'uniq_Op', 'uniq_Opnd',
        'total_Op', 'total_Opnd',
        'branchCount', 'v(g)',
    ])
)

In [None]:
# Data scaling: reuse the scaler fitted on features_train (transform only, no refit).
features_test_scaled = scaler.transform(features_test)

In [None]:
# Column 1 of predict_proba for every test row is the submitted 'defects' value.
test_proba = best_ensemble_classifier.predict_proba(features_test_scaled)
predictions = test_proba[:, 1]

# Pair each test id with its predicted value and write the submission file.
output = test_data[['id']].assign(defects=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")