## Загрузка данных из первого проекта

In [4]:
import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv

def create_connection():
    """Create a SQLAlchemy engine for the destination PostgreSQL database.

    Calls ``load_dotenv()`` and then reads the connection settings from the
    ``DB_DESTINATION_HOST``, ``DB_DESTINATION_PORT``, ``DB_DESTINATION_NAME``,
    ``DB_DESTINATION_USER`` and ``DB_DESTINATION_PASSWORD`` environment
    variables.

    Returns:
        The object returned by ``create_engine`` for the assembled
        ``postgresql://`` URL, created with ``sslmode=require``.

    Raises:
        RuntimeError: If any of the required environment variables is unset.
    """
    load_dotenv()

    env_names = {
        'host': 'DB_DESTINATION_HOST',
        'port': 'DB_DESTINATION_PORT',
        'db': 'DB_DESTINATION_NAME',
        'username': 'DB_DESTINATION_USER',
        'password': 'DB_DESTINATION_PASSWORD',
    }
    settings = {key: os.environ.get(env) for key, env in env_names.items()}

    # Fail fast: an unset variable would otherwise be interpolated into the
    # URL as the literal string "None".
    missing = [env_names[key] for key, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            'Missing required environment variables: ' + ', '.join(missing)
        )

    host = settings['host']
    port = settings['port']
    db = settings['db']
    username = settings['username']
    password = settings['password']

    # Never echo the password: notebook outputs are saved and shared together
    # with the code.
    print(f'postgresql://{username}:***@{host}:{port}/{db}')
    # NOTE: credentials are interpolated verbatim; values containing special
    # characters (e.g. '@' or '/') would have to be URL-encoded first.
    conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})
    return conn

def get_data():
    """Download the ``clean_flats`` table and save it as a local CSV file.

    The whole table is read through the engine returned by
    ``create_connection`` and written to ``data/initial_data.csv`` without
    the DataFrame index. The ``data`` directory is created if it is missing.

    Returns:
        None. The result is the CSV file written to disk.
    """
    conn = create_connection()
    try:
        data = pd.read_sql('select * from clean_flats', conn)
    finally:
        # Release the engine's connections even when the query fails.
        conn.dispose()

    os.makedirs('data', exist_ok=True)
    data.to_csv('data/initial_data.csv', index=False)

# Fetch clean_flats from the database and store it in data/initial_data.csv.
get_data()

postgresql://mle_20250227_88b3651024:***@rc1b-uh7kdmcx67eomesf.mdb.yandexcloud.net:6432/playground_mle_20250227_88b3651024


## Обучение модели по параметрам из первого проекта

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
import yaml
from sklearn.model_selection import train_test_split

# Experiment settings read here: index column, target column and the
# one-hot `drop` switch.
with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)

data = pd.read_csv('data/initial_data.csv', index_col=params.get('index_col'))
# These three columns are removed before the feature matrix is built.
data.drop(columns=['building_id', 'latitude', 'longitude'], inplace=True)

target_col = params.get('target_col')
one_hot_drop = params.get('one_hot_drop', False)

X = data.drop(columns=[target_col])
y = data[target_col]

# Split the features into three groups:
#   * object/bool columns with exactly two distinct values -> one-hot encoding
#   * the remaining object/bool columns                     -> CatBoost encoding
#   * float/int columns                                     -> passed through as is
cat_features = X.select_dtypes(include=['object', 'bool'])
potential_binary_features = cat_features.nunique() == 2

binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = X.select_dtypes(['float', 'int'])

preprocessor = ColumnTransformer(
    [
        ('binary', OneHotEncoder(drop='if_binary' if one_hot_drop else None), binary_cat_features.columns.tolist()),
        ('cat', CatBoostEncoder(return_df=False), other_cat_features.columns.tolist()),
        ('num', 'passthrough', num_features.columns.tolist())
    ],
    # Any column not listed above is discarded.
    remainder='drop',
    verbose_feature_names_out=False
)

# Seed the forest with the same value used for the split and the CV folds,
# so the fitted model and the logged metrics are reproducible between runs.
model = RandomForestRegressor(n_estimators=300, max_depth=10, random_state=69)

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

# Hold out 20% of the rows for the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=69)
pipeline.fit(X_train, y_train)

In [6]:
from sklearn.model_selection import KFold, cross_validate

# Predictions of the fitted pipeline on the hold-out set.
prediction = pipeline.predict(X_test)

# Filled in once the cross-validation results are aggregated.
metrics = {}

cv_strategy = KFold(n_splits=params['n_splits'], shuffle=True, random_state=69)

# Pass the feature matrix X, not the full `data` frame: `data` still holds the
# target column, and keeping it out of the pipeline should not depend on the
# preprocessor's remainder='drop' setting.
cv_res = cross_validate(
    pipeline, X, y, cv=cv_strategy, n_jobs=params['n_jobs'], scoring=params['metrics']
)


In [8]:
# Average the per-fold cross-validation results, rounded to 3 decimals.
metrics['mean_fit_time'] = round(cv_res['fit_time'].mean(), 3)
metrics['mean_score_time'] = round(cv_res['score_time'].mean(), 3)
metrics['mean_test_r2'] = round(cv_res['test_r2'].mean(), 3)
# The 'neg_mean_absolute_percentage_error' scorer returns MAPE with the sign
# flipped; negate it so 'mean_mape' holds the actual non-negative error.
metrics['mean_mape'] = round(-cv_res['test_neg_mean_absolute_percentage_error'].mean(), 3)

In [17]:
import mlflow

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

RUN_NAME = "project1_model_registration"
EXPERIMENT_NAME = "project2"
REGISTRY_MODEL_NAME = "project1_model"

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
# The S3 credentials must already be present in the environment. Check them
# explicitly: re-assigning os.getenv(...) back into os.environ does nothing
# when the variable is set and raises an opaque TypeError when it is not.
missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if os.getenv(name) is None
]
if missing_aws_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_aws_vars)
    )
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

pip_requirements= "../requirements.txt"
# Signature and input example are derived from the hold-out features and the
# pipeline's predictions for them.
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Hyperparameters of the regressor and the aggregated CV metrics.
    mlflow.log_params(model.get_params())
    mlflow.log_metrics(metrics)
    
    # Store the CV splitter object as its own artifact.
    cv_info = mlflow.sklearn.log_model(cv_strategy, artifact_path='cv')
    
    # Log the full pipeline and register it in the model registry.
    model_info = mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements)

    # Also keep the metrics and the complete pipeline parameters as files.
    mlflow.log_dict(metrics, "metrics.json")
    mlflow.log_text(str(pipeline.get_params()), "project1_pipeline_params.txt")
    
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")
    print(f"Model logged to: {run.info.artifact_uri}/model")

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'project1_model'.
2025/04/20 13:30:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: project1_model, version 1


Run ID: ed2e99f189f94759829768016b856135
Model logged to: s3://s3-student-mle-20250227-88b3651024/18/ed2e99f189f94759829768016b856135/artifacts/model


Created version '1' of model 'project1_model'.
