In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer



from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb


import mlflow 
from mlflow.models import infer_signature

In [13]:
# Address of the MLflow tracking server (loopback interface, port 5000).
TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(uri=TRACKING_URI)

In [14]:
# Read the cleaned training table and separate the target column from the features.
data = pd.read_csv('../data/train_cleaned.csv')
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Plain accuracy wrapped as a scorer object, passed to GridSearchCV later on.
scoring = make_scorer(accuracy_score)

In [15]:
# Seed handed to every estimator that accepts `random_state`, so that re-running the
# notebook reproduces the same cross-validation scores and selected hyper-parameters.
# Same value as the seed used for the train/test split.
RANDOM_STATE = 42

# One single-step pipeline per model family. The step name is the prefix used by the
# matching parameter grid ("<step>__<parameter>").
pipe1 = Pipeline([
    ("regresja", LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)),
])
pipe2 = Pipeline([("drzewo", DecisionTreeClassifier(random_state=RANDOM_STATE))])
pipe3 = Pipeline([("las", RandomForestClassifier(random_state=RANDOM_STATE))])
pipe4 = Pipeline([("SVM", SVC())])
pipe5 = Pipeline([("Bagging", BaggingClassifier(random_state=RANDOM_STATE))])
pipe6 = Pipeline([("KNN", KNeighborsClassifier())])
pipe7 = Pipeline([("XGB", xgb.XGBClassifier(random_state=RANDOM_STATE))])
pipe8 = Pipeline([("Ada", AdaBoostClassifier(random_state=RANDOM_STATE))])

# Search space for the "regresja" (logistic regression) pipeline step.
param_grid1 = {
    "regresja__penalty": ["l1", "l2"],
    # Exponents -1 and 0, i.e. C in [0.1, 1.0].
    "regresja__C": np.power(10.0, np.arange(-1, 1)),
    "regresja__class_weight": ["balanced", None],
}

# Search space for the "drzewo" (decision tree) pipeline step.
param_grid2 = {
    "drzewo__criterion": ["gini", "entropy"],
    "drzewo__splitter": ["best", "random"],
    "drzewo__max_depth": [None, 5],
    "drzewo__min_samples_leaf": [1, 5],
    "drzewo__max_features": [None, "sqrt", "log2"],
}
# Search space for the "las" (random forest) pipeline step.
param_grid3 = {
    "las__criterion": ["gini", "entropy"],
    "las__max_depth": [None, 5, 15],
    "las__min_samples_leaf": [1, 4],
    "las__min_samples_split": [2, 10],
    # "auto" is intentionally absent: it is deprecated in scikit-learn and rejected by
    # newer releases. For classifiers it meant the same as "sqrt", so keeping it only
    # repeated an existing candidate.
    "las__max_features": ["sqrt", "log2"],
    "las__n_estimators": [100, 300],
}


# Search space for the "SVM" pipeline step: one sub-grid per kernel, so that
# kernel-specific settings (gamma, degree) are only combined with the kernel they belong to.
param_grid4 = [
    {
        "SVM__kernel": ["linear"],
        "SVM__C": [1, 100],
    },
    {
        "SVM__kernel": ["rbf"],
        "SVM__gamma": ["scale", "auto", 0.012],
        "SVM__C": [1, 90],
    },
    {
        "SVM__kernel": ["poly"],
        "SVM__degree": [2, 3],
        "SVM__C": [1, 100],
    },
]


# BaggingClassifier's wrapped-model argument is named `estimator` in newer scikit-learn
# releases and `base_estimator` in older ones. Use whichever name the installed version
# exposes, so the grid key is valid either way.
_bagging_inner_param = (
    "estimator"
    if "estimator" in BaggingClassifier().get_params(deep=False)
    else "base_estimator"
)

# Search space for the "Bagging" pipeline step.
param_grid5 = {
    f"Bagging__{_bagging_inner_param}": [
        DecisionTreeClassifier(),
        LogisticRegression(),
        RandomForestClassifier(),
    ],
    "Bagging__max_samples": [0.2, 0.8],
    "Bagging__max_features": [0.2, 0.8],
    "Bagging__bootstrap": [False, True],
    "Bagging__n_estimators": [10, 15, 20],
}


# Search space for the "KNN" pipeline step.
param_grid6 = {
    "KNN__n_neighbors": [5, 20, 50],
    "KNN__weights": ["uniform", "distance"],
    "KNN__p": [1, 2],
}

             
# Search space for the "XGB" pipeline step, as two sub-grids:
#   * gbtree booster - full sweep over the listed settings,
#   * dart booster   - only the sample_type is varied.
param_grid7 = [
    {
        "XGB__booster": ["gbtree"],
        "XGB__min_child_weight": [0, 1],
        "XGB__learning_rate": [0.3, 0.8],
        "XGB__gamma": [0, 0.5],
        "XGB__max_depth": [5, 10],
        "XGB__subsample": [0.5, 1],
        "XGB__alpha": [1, 10],
        "XGB__lambda": [1, 10],
    },
    {
        "XGB__booster": ["dart"],
        "XGB__sample_type": ["uniform", "weighted"],
    },
]


# AdaBoostClassifier's wrapped-model argument is named `estimator` in newer scikit-learn
# releases and `base_estimator` in older ones. Use whichever name the installed version
# exposes, so the grid key is valid either way.
_ada_inner_param = (
    "estimator"
    if "estimator" in AdaBoostClassifier().get_params(deep=False)
    else "base_estimator"
)

# Search space for the "Ada" pipeline step.
param_grid8 = {
    f"Ada__{_ada_inner_param}": [
        DecisionTreeClassifier(),
        LogisticRegression(),
        RandomForestClassifier(),
    ],
    "Ada__n_estimators": [10, 15, 20],
    "Ada__learning_rate": [0.1, 0.5, 1.0],
    # NOTE(review): recent scikit-learn releases appear to deprecate "SAMME.R" - confirm
    # against the installed version before upgrading.
    "Ada__algorithm": ["SAMME", "SAMME.R"],
}


# Pair every pipeline with its search space as a (pipeline, grid) tuple.
Regresja, Drzewo, Las, SVM = (
    (pipe1, param_grid1),
    (pipe2, param_grid2),
    (pipe3, param_grid3),
    (pipe4, param_grid4),
)
Bagging, KNN, XGB, Ada = (
    (pipe5, param_grid5),
    (pipe6, param_grid6),
    (pipe7, param_grid7),
    (pipe8, param_grid8),
)

# Parallel lists: modele_nazwy[i] is the display name of modele[i].
modele_nazwy = ["Regresja", "Drzewo", "Las", "SVM", "Bagging", "KNN", "XGB", "Ada"]
modele = [Regresja, Drzewo, Las, SVM, Bagging, KNN, XGB, Ada]

In [16]:
EXPERIMENT_NAME = "model_selection"

experiment_description = "test mlflow na bazie setu titanic"

# Tags attached to the experiment when it is created; "mlflow.note.content" carries the
# experiment description.
experiment_tags = {
    "project_name": "titanic-mlflow-test",
    "owner": "pkwiecien",
    "mlflow.note.content": experiment_description,
}

# create_experiment raises when an experiment with this name already exists, which made
# the cell fail on every run after the first. Only create it when it is missing, so the
# notebook can be re-run from a fresh kernel.
if mlflow.get_experiment_by_name(EXPERIMENT_NAME) is None:
    mlflow.create_experiment(name=EXPERIMENT_NAME, tags=experiment_tags)

# Make it the active experiment; as the last expression, the returned Experiment is shown.
mlflow.set_experiment(EXPERIMENT_NAME)



<Experiment: artifact_location='mlflow-artifacts:/188875176167796080', creation_time=1709065578681, experiment_id='188875176167796080', last_update_time=1709065578681, lifecycle_stage='active', name='model_selection', tags={'mlflow.note.content': 'test mlflow na bazie setu titanic',
 'owner': 'pkwiecien',
 'project_name': 'titanic-mlflow-test'}>

In [17]:
# Run one grid search per model family. Each search gets its own MLflow run, named after
# the model, holding the winning parameters, the CV accuracy and the refitted pipeline.
for model, nazwa in zip(modele, modele_nazwy):
    pipeline, param_grid = model

    with mlflow.start_run(run_name=nazwa) as run:
        # `scoring` is a single scorer, so `refit` is just an on/off switch; the former
        # refit="accuracy" was treated as True and only looked like a metric selector.
        gs = GridSearchCV(pipeline, param_grid, cv=5, scoring=scoring, refit=True, n_jobs=3)
        gs.fit(X_train, y_train)
        best_model = gs.best_estimator_

        mlflow.log_params(gs.best_params_)
        # best_score_ is the mean cross-validated score of the best candidate.
        mlflow.log_metrics({"accuracy": gs.best_score_})

        mlflow.sklearn.log_model(
            sk_model=best_model,
            # A handful of rows is enough to document the input format; passing the whole
            # training frame stored the entire dataset with every logged model.
            input_example=X_train.head(5),
            signature=infer_signature(X_train, best_model.predict(X_train)),
            artifact_path=nazwa,
            registered_model_name=nazwa,
        )

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'Regresja'.
2024/02/27 21:26:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Regresja, version 1
Created version '1' of model 'Regresja'.
  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'Drzewo'.
2024/02/27 21:26:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Drzewo, version 1
Created version '1' of model 'Drzewo'.
  warn(
  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'Las'.
2024/02/27 21:27:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Las, version 1
Created version '1' of model 'Las'.
  inputs = _infer_schema(model_input) if mod