### 0 Импорты и общие функции

In [None]:
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt, time
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import uniform


def bootstrap_score(model, X_df, y, iters=250, test_size=0.25, random_state=42):
    """Estimate the mean and spread of ``model`` accuracy over bootstrap resamples.

    Every iteration draws ``len(X_df)`` row positions with replacement, holds
    out ``test_size`` of the *distinct* drawn rows, refits ``model`` on the
    remaining draws and scores accuracy on the held-out draws.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit in place on every iteration; on return it is fitted on the
        last resample.
    X_df : pandas.DataFrame or array-like
        Feature rows. Objects exposing ``.iloc`` are indexed positionally
        through it; anything else is converted with ``np.asarray``.
    y : array-like
        Labels aligned with ``X_df`` by position (a pandas Series is
        converted to an array, so its index labels are ignored).
    iters : int
        Number of bootstrap iterations.
    test_size : float or int
        Forwarded to ``train_test_split`` for the split of distinct rows.
    random_state : int
        Seed for the resampling and the per-iteration split seeds.

    Returns
    -------
    tuple of (float, float)
        Mean and standard deviation of the per-iteration accuracies.
    """
    rng = np.random.RandomState(random_state)
    # Positional labels: `y[idx]` on a pandas Series would be label-based and
    # breaks (or silently picks other rows) when the index is not 0..n-1.
    y_arr = np.asarray(y)
    n_rows = len(X_df)
    X_arr = None if hasattr(X_df, "iloc") else np.asarray(X_df)

    def _rows(positions):
        # Select feature rows by position for DataFrames and plain arrays.
        return X_df.iloc[positions] if X_arr is None else X_arr[positions]

    accs = []
    for _ in range(iters):
        drawn = rng.choice(n_rows, size=n_rows, replace=True)

        # Split the distinct rows rather than the draws: a row sampled
        # several times must not land in both train and test, otherwise the
        # model is scored on rows it was trained on.
        distinct = np.unique(drawn)
        try:
            _, held_out = train_test_split(
                distinct, test_size=test_size,
                random_state=rng.randint(0, 1_000_000),
                stratify=y_arr[distinct]
            )
        except ValueError:
            # Stratification is impossible when a class is left with a single
            # distinct row in this resample; fall back to a plain split.
            _, held_out = train_test_split(
                distinct, test_size=test_size,
                random_state=rng.randint(0, 1_000_000),
                stratify=None
            )

        in_test = np.isin(drawn, held_out)
        tr_pos, te_pos = drawn[~in_test], drawn[in_test]

        model.fit(_rows(tr_pos), y_arr[tr_pos])
        accs.append(accuracy_score(y_arr[te_pos], model.predict(_rows(te_pos))))

    return float(np.mean(accs)), float(np.std(accs))

### 1 Загрузка датасетов

In [19]:
# Read both CSV files from the working directory (wine first, then heart).
wine_df, heart_df = (
    pd.read_csv(csv_path)
    for csv_path in ("winequality-red.csv", "heart.csv")
)

# Preview the first rows of each frame.
for frame in (wine_df, heart_df):
    display(frame.head())


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


### 2 Предобработка

In [28]:
def make_Xy_and_ct(df, target, zero_to_nan=None):
    """Split ``df`` into features and labels and build an unfitted preprocessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, the caller's frame is not modified.
    target : str
        Name of the label column.
    zero_to_nan : column label or list of labels, optional
        Columns in which the value 0 is replaced by NaN before imputation.
        Skipped when falsy.

    Returns
    -------
    X : pandas.DataFrame
        All columns except ``target``.
    y : numpy.ndarray
        Values of the ``target`` column.
    ct : ColumnTransformer
        Standard-scales number/bool columns, one-hot encodes object/category
        columns (first level dropped) and drops any other column.
    """
    frame = df.copy()
    if zero_to_nan:
        frame[zero_to_nan] = frame[zero_to_nan].replace(0, np.nan)

    # NOTE(review): medians come from the whole frame (target included), i.e.
    # before any train/test split is made.
    medians = frame.median(numeric_only=True)
    frame = frame.fillna(medians)

    features = frame.drop(columns=[target])
    labels = frame[target].values

    categorical = features.select_dtypes(include=["object", "category"]).columns.tolist()
    numeric = features.select_dtypes(include=["number", "bool"]).columns.tolist()

    steps = [
        ("num", StandardScaler(), numeric),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical),
    ]
    preprocessor = ColumnTransformer(steps, remainder="drop")
    return features, labels, preprocessor

# Wine: the label column is "quality".
X_wine, y_wine, wine_ct = make_Xy_and_ct(wine_df, target="quality")

# Heart: use "target" when that column exists, otherwise "HeartDisease".
if "target" in heart_df.columns:
    target_col = "target"
else:
    target_col = "HeartDisease"
X_heart, y_heart, heart_ct = make_Xy_and_ct(heart_df, target=target_col)

### 3 Train/Test split

In [29]:
# Stratified 80/20 hold-out split per dataset, stored together with the
# dataset's (still unfitted) preprocessor.
_prepared = {
    "wine": (X_wine, y_wine, wine_ct),
    "heart": (X_heart, y_heart, heart_ct),
}

splits = {}
for name, (X, y, ct) in _prepared.items():
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.20, stratify=y, random_state=42
    )
    splits[name] = (X_tr, X_te, y_tr, y_te, ct)


### 4 Модели для сравнений

In [23]:
# Candidate classifiers keyed by the short name used in reports.
# Insertion order is the order in which the models are trained.
base_models = dict(
    LogReg=LogisticRegression(max_iter=2000),
    SVM=SVC(probability=True),
    RF=RandomForestClassifier(n_estimators=400, random_state=42),
    GB=GradientBoostingClassifier(random_state=42),
    NB=GaussianNB(),
    KNN=KNeighborsClassifier(n_neighbors=7),
)

### 5 Обучение и отчёты

In [None]:
# Fit every base model on every dataset; collect hold-out accuracy,
# bootstrap accuracy statistics and the fit duration.
records = []
for ds, (X_tr, X_te, y_tr, y_te, ct) in splits.items():
    for mdl_name, mdl in base_models.items():
        pipe = Pipeline([("prep", ct), ("clf", mdl)])

        # perf_counter is monotonic and high-resolution; time.time() follows
        # the wall clock and can jump if the system clock is adjusted.
        t0 = time.perf_counter()
        pipe.fit(X_tr, y_tr)
        train_time = time.perf_counter() - t0

        # Predict before bootstrapping: bootstrap_score refits `pipe` in place.
        y_pred = pipe.predict(X_te)
        acc    = accuracy_score(y_te, y_pred)
        boot_m, boot_s = bootstrap_score(pipe, X_tr, y_tr, iters=250)
        records.append(dict(Dataset=ds, Model=mdl_name,
                            Accuracy=acc, Boot_Mean=boot_m,
                            Boot_STD=boot_s, Train_Time_s=train_time))
        print(f"\n=== {ds.upper()} – {mdl_name} ===")
        # zero_division=0 reports the same numbers as the default ("warn")
        # without a warning for every class that is never predicted.
        print(classification_report(y_te, y_pred, digits=3, zero_division=0))

# One row per (dataset, model), best hold-out accuracy first within a dataset.
results_df = (pd.DataFrame(records)
              .sort_values(["Dataset", "Accuracy"], ascending=[True,False])
              .reset_index(drop=True))
display(results_df)

### 6 Визуализация снижения размерности

In [None]:
def dr_plot(X_df, y, ct, title):
    """Plot PCA, t-SNE and LDA 2-D embeddings of the preprocessed features.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Raw feature table accepted by ``ct``.
    y : array-like
        Class labels; used for point colours and to fit LDA.
    ct : ColumnTransformer
        Preprocessor; it is fitted in place on ``X_df``.
    title : str
        Figure super-title.

    Notes
    -----
    LDA yields at most ``min(n_features, n_classes - 1)`` components, so a
    binary target gives a single discriminant axis. In that case the second
    plot coordinate is filled with zeros instead of raising.
    """
    X_sc = ct.fit_transform(X_df)
    if hasattr(X_sc, "toarray"):
        # A ColumnTransformer with a one-hot step may return a sparse matrix;
        # LDA (and PCA in older scikit-learn releases) needs dense input.
        X_sc = X_sc.toarray()

    n_samples, n_features = X_sc.shape
    n_classes = len(np.unique(y))
    # Requesting more LDA components than allowed raises ValueError.
    lda_dims = max(1, min(2, n_classes - 1, n_features))
    # t-SNE requires perplexity < n_samples; 35 is kept whenever it fits.
    perplexity = min(35, max(1, n_samples - 1))

    fig, ax = plt.subplots(1,3,figsize=(18,5))
    for a, reducer, name in [
        (ax[0], PCA(n_components=2, random_state=0), "PCA"),
        (ax[1], TSNE(n_components=2, random_state=0, perplexity=perplexity), "t-SNE"),
        (ax[2], LDA(n_components=lda_dims), "LDA")
    ]:
        # LDA is supervised and needs the labels; PCA and t-SNE do not.
        X_emb = reducer.fit_transform(X_sc, y) if name=="LDA" else reducer.fit_transform(X_sc)

        panel_title = name
        if X_emb.shape[1] == 1:
            # Single-axis embedding: draw it along Dim-1 on a flat line.
            X_emb = np.column_stack([X_emb[:, 0], np.zeros(len(X_emb))])
            panel_title = f"{name} (1 component)"

        sns.scatterplot(x=X_emb[:,0], y=X_emb[:,1], hue=y, ax=a,
                        palette="Set2", s=40, alpha=.85, edgecolor="k")
        a.set_title(panel_title); a.set_xlabel("Dim-1"); a.set_ylabel("Dim-2")
    plt.suptitle(title); plt.show()

# Draw the three embeddings for each dataset (wine first, then heart).
for _plot_args in (
    (X_wine, y_wine, wine_ct, "Wine - 2-D embeddings"),
    (X_heart, y_heart, heart_ct, "Heart - 2-D embeddings"),
):
    dr_plot(*_plot_args)

### 7 RandomizedSearchCV для SVM

In [None]:
# Search space for the SVC step of the pipeline.
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so the scale
# is the interval width, not the upper bound.
param_dist = {
    "clf__C":     uniform(loc=0.1, scale=9.9),        # C in [0.1, 10]
    "clf__gamma": uniform(loc=0.0005, scale=0.4995),  # gamma in [0.0005, 0.5]
    "clf__kernel": ["rbf", "poly", "sigmoid"]
}

search_records = []

for ds_name, (X_tr, X_te, y_tr, y_te, ct) in splits.items():
    pipe = Pipeline([("prep", ct), ("clf", SVC(probability=True, random_state=42))])

    # 60 sampled configurations, each scored by 5-fold CV accuracy on the
    # training split only; the hold-out split is used once, below.
    rnd = RandomizedSearchCV(
        pipe, param_dist, n_iter=60, cv=5,
        scoring="accuracy", n_jobs=-1, random_state=42, verbose=1
    )
    rnd.fit(X_tr, y_tr)

    best_cv  = rnd.best_score_
    # rnd.predict uses the best configuration refitted on the full training split.
    test_acc = accuracy_score(y_te, rnd.predict(X_te))

    search_records.append(
        dict(Dataset=ds_name,
             Best_Params=rnd.best_params_,
             CV_Accuracy=best_cv,
             Test_Accuracy=test_acc)
    )
    print(f"{ds_name.upper()} search done. Best CV={best_cv:.3f}")

search_df = pd.DataFrame(search_records)
display(search_df)