In [11]:
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

from  sklearn.linear_model import LogisticRegression
from  sklearn.neighbors import KNeighborsClassifier
from  sklearn.tree import DecisionTreeClassifier
from  sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from src.preprocessing import create_preprocessing_pipeline
from src.config import *

import joblib
from pathlib import Path



def data_load():
    """Read the raw training CSV and split it into features and target.

    The file is located at ``DATA_DIR / 'raw' / TRAIN_FILE``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the loaded frame without
        ``TARGET_COLUMN`` and ``y`` is the ``TARGET_COLUMN`` selection.
    """
    frame = pd.read_csv(DATA_DIR / 'raw' / TRAIN_FILE)

    features = frame.drop(columns=TARGET_COLUMN)
    target = frame[TARGET_COLUMN]
    return features, target



def models_load():
    """Build the candidate classifiers to compare.

    Returns:
        dict: Maps each estimator's class name to a freshly constructed,
        unfitted estimator instance.
    """
    return {
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        # KNeighborsClassifier takes no random_state.
        'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=5),
        'DecisionTreeClassifier': DecisionTreeClassifier(
            max_depth=15, min_samples_leaf=4, random_state=RANDOM_STATE
        ),
        'RandomForestClassifier': RandomForestClassifier(
            max_depth=5, max_samples=100, random_state=RANDOM_STATE
        ),
        'GradientBoostingClassifier': GradientBoostingClassifier(
            max_depth=3, random_state=RANDOM_STATE
        ),
        # probability=True makes SVC shuffle data internally for its
        # probability estimates; seed it like the other stochastic models so
        # repeated runs give the same scores and the same saved model.
        'SVC': SVC(max_iter=1000, probability=True, random_state=RANDOM_STATE),
    }

def evaluateCV(pips, X, y, k_fold = 5):
    """Cross-validate every pipeline in ``pips`` with ROC AUC and PR AUC.

    Args:
        pips: Mapping of model name to estimator/pipeline.
        X: Feature data passed to ``cross_validate``.
        y: Target data passed to ``cross_validate``.
        k_fold: Number of splits for the shuffled ``StratifiedKFold``.

    Returns:
        dict: Maps each model name to the raw ``cross_validate`` result
        for that model (train scores are not requested).
    """
    # A single splitter seeded with RANDOM_STATE is handed to every call.
    splitter = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=RANDOM_STATE)
    scoring = {'roc_auc': 'roc_auc', 'pr_auc': 'average_precision'}

    return {
        label: cross_validate(
            estimator,
            X=X,
            y=y,
            cv=splitter,
            scoring=scoring,
            return_train_score=False,
        )
        for label, estimator in pips.items()
    }

def aggregate_results(res_dict):
    """Collapse per-fold cross-validation scores into one summary row per model.

    Args:
        res_dict: Mapping of model name to a mapping that holds the entries
            ``"test_roc_auc"``, ``"test_pr_auc"``, ``"fit_time"`` and
            ``"score_time"``; each entry must provide ``.mean()`` (and
            ``.std()`` for the two score entries).

    Returns:
        pandas.DataFrame: One row per model with the model name, mean/std of
        both scores and mean fit/score times, rounded to 5 decimals.
    """
    def _summarise(label, fold_scores):
        # Build the summary row for a single model.
        roc = fold_scores["test_roc_auc"]
        pr = fold_scores["test_pr_auc"]
        return {
            "model": label,
            "roc_auc_mean": roc.mean(),
            "roc_auc_std": roc.std(),
            "pr_auc_mean": pr.mean(),
            "pr_auc_std": pr.std(),
            "fit_time_mean": fold_scores["fit_time"].mean(),
            "score_time_mean": fold_scores["score_time"].mean(),
        }

    summary = pd.DataFrame(
        [_summarise(label, fold_scores) for label, fold_scores in res_dict.items()]
    )
    return summary.round(5)


def preprocessing_pipeline(model_name, model):
    """Wrap ``model`` in a Pipeline that runs the project preprocessing first.

    Args:
        model_name: Not used by this function.
        model: Estimator placed as the final ``'model'`` step.

    Returns:
        Pipeline: Two steps, ``'pipeline_preprocessing'`` (the result of
        ``create_preprocessing_pipeline()``) followed by ``'model'``.
    """
    steps = [
        ('pipeline_preprocessing', create_preprocessing_pipeline()),
        ('model', model),
    ]
    return Pipeline(steps=steps)


def save_artifacts(m_metrics, t_models, version = 'v3'):
    """Persist the CV metrics table and every trained model for one version.

    Layout written under ``BEST_MODEL_DIR / version``:
      * ``cv_metrics.csv`` - ``m_metrics`` without the index column;
      * ``models/<name>.joblib`` - one joblib dump per entry of ``t_models``.

    Args:
        m_metrics: Object exposing ``to_csv`` (e.g. a pandas DataFrame).
        t_models: Mapping of model name to the object to dump.
        version: Name of the sub-directory that groups this run's artifacts.
    """
    model_dir = BEST_MODEL_DIR / version
    models_subdir = model_dir / 'models'
    # Create the target directories once, up front; parents=True also
    # creates model_dir, so the metrics file below always has a home.
    models_subdir.mkdir(parents=True, exist_ok=True)

    for m_name, fitted_model in t_models.items():
        joblib.dump(fitted_model, models_subdir / f'{m_name}.joblib')

    m_metrics.to_csv(model_dir / "cv_metrics.csv", index=False)
    # Confirmation message for the user (Russian: "models and their metrics
    # were saved successfully").
    print('модели и их метрики сохранены успешно')

def train_model(pips, X, y):
    """Fit every pipeline in ``pips`` on the full ``X``/``y``.

    Args:
        pips: Mapping of model name to an object with ``fit(X, y)``.
        X: Feature data forwarded to each ``fit`` call.
        y: Target data forwarded to each ``fit`` call.

    Returns:
        dict: Maps each model name to whatever its ``fit(X, y)`` call returned.
    """
    fitted = {}
    for label, pipeline in pips.items():
        fitted[label] = pipeline.fit(X, y)
    return fitted


def main():
    """Run the full flow: load data, cross-validate, train, save artifacts.

    Each candidate model is wrapped in a Pipeline with a ``'preprocessing'``
    step followed by a ``'model'`` step. The pipelines are cross-validated,
    the scores are aggregated, then the pipelines are fitted on all data and
    stored together with the aggregated metrics.
    """
    X, y = data_load()

    pips = {}
    for n_model, estimator in models_load().items():
        pips[n_model] = Pipeline(steps=[
            ('preprocessing', create_preprocessing_pipeline()),
            ('model', estimator),
        ])

    cv_summary = aggregate_results(evaluateCV(pips, X, y))

    fitted_pips = train_model(pips, X, y)
    save_artifacts(cv_summary, fitted_pips)


if __name__ == '__main__':
    main()

Unnamed: 0,fit_time,score_time,test_roc_auc,test_pr_auc
LogisticRegression,0.027,0.0167,0.8617,0.8324
KNeighborsClassifier,0.0178,0.0162,0.827,0.7372
DecisionTreeClassifier,0.0174,0.0133,0.8303,0.7371
RandomForestClassifier,0.1136,0.0172,0.8666,0.8488
GradientBoostingClassifier,0.1364,0.0123,0.8785,0.8592
SVC,0.0816,0.0165,0.8532,0.8146
