In [1]:
# pip install --force --upgrade scikit-learn==1.0.2

In [None]:
import pickle
import mlflow
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from random import randint
#from sklearn.metrics._classification import accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

X.rename(columns={
    "home.dest": "homedest"
}, inplace=True)

mlflow_client = mlflow.client.MlflowClient("http://127.0.0.1:5000")
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("RANDOMIZED SEARCH")

mlflow.sklearn.autolog()

#si se usa CV ( CROSS VALIDATION) no dividimos el dataset entre TRAIN y TEST
#seed = randint(0,10000000)
seed = 1234
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

#with mlflow.start_run():
#for pca_components in range(1, 5):
#        with mlflow.start_run(nested=True):
num_pipeline = Pipeline([
    ("imputer", SimpleImputer()),
    ("ss", StandardScaler()),
    ("pca", PCA())
])

#            mlflow.log_param("pca_components", pca_components)
#            mlflow.log_param("seed", seed)

ct = ColumnTransformer([
    ("cat", OneHotEncoder(), ["pclass", "sex", "embarked"]),
    ("num", num_pipeline, ["age", "fare", "sibsp", "parch"])
])

# Entrenamiento 

pipeline = Pipeline([
    ("ct", ct),
    ("model", DecisionTreeClassifier())
])

#paso nuevo METAESTIMACION
param_grid ={
    "ct__num__pca__n_components": [1,2,3,4],
    "model__max_depth": [2,3,4,5,6,7,8]
}

# una opcion
#pipeline = GridSearchCV(pipeline, param_grid, scoring="accuracy")

# una opcion más rápida
pipeline = RandomizedSearchCV(pipeline, param_grid, scoring="accuracy", verbose=True, n_iter=5)

pipeline.fit(X_train, y_train)

: 

In [3]:
#RandomizedSearchCV?

In [4]:
pipeline.best_params_

{'model__max_depth': 3, 'ct__num__pca__n_components': 2}

In [5]:
pipeline.best_estimator_

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('cat', OneHotEncoder(),
                                                  ['pclass', 'sex',
                                                   'embarked']),
                                                 ('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('ss',
                                                                   StandardScaler()),
                                                                  ('pca',
                                                                   PCA(n_components=2))]),
                                                  ['age', 'fare', 'sibsp',
                                                   'parch'])])),
                ('model', DecisionTreeClassifier(max_depth=3))])

In [6]:
pipeline.predict(X_test)

array(['0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '1', '1', '0',
       '0', '0', '1', '0', '0', '0', '0', '1', '0', '1', '0', '0', '1',
       '1', '0', '0', '1', '0', '1', '0', '0', '0', '1', '1', '0', '0',
       '1', '1', '1', '0', '1', '1', '0', '1', '0', '1', '0', '1', '1',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0',
       '0', '1', '1', '0', '1', '1', '1', '1', '0', '0', '0', '1', '1',
       '1', '0', '0', '1', '0', '0', '0', '1', '0', '0', '0', '0', '1',
       '0', '0', '1', '1', '0', '0', '1', '1', '0', '1', '1', '1', '0',
       '0', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '0', '0',
       '1', '1', '1', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0',
       '0', '1', '0', '1', '0', '0', '0', '1', '1', '1', '0', '1', '0',
       '0', '0', '1', '1', '0', '0', '0', '1', '0', '0', '1', '0', '0',
       '0', '0', '1', '1', '0', '0', '0', '1', '1', '0', '0', '0', '1',
       '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0

In [7]:
# Evaluación

#from sklearn.metrics import accuracy_score
###from sklearn.metrics._classification import accuracy_score

y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Acc. en train
#mlflow.log_metric("acc.train", accuracy_score(y_train, y_pred_train))

# Acc. en test
#mlflow.log_metric("acc.test", accuracy_score(y_test, y_pred_test))

# Serialización 
with open("titanic_model.pkl", "wb") as f:
    pickle.dump(pipeline, f)
    mlflow.log_artifact("titanic_model.pkl", "model/pickle")

#Cerramos el run de MLFlow ( no hace falta)
#mlflow.end_run()

PicklingError: Can't pickle <function accuracy_score at 0x7f9c405d4ca0>: it's not the same object as sklearn.metrics._classification.accuracy_score

In [None]:
    # Python program to compute accuracy score using the function accuracy_score  
      
    # Importing the required libraries  
    import pickle
    import numpy as np  
    from sklearn.model_selection import train_test_split  
    from sklearn.metrics import accuracy_score   
    from sklearn.svm import SVC  
    from sklearn.datasets import load_iris  
      
    # Loading the dataset  
    X, Y = load_iris(return_X_y = True)  
      
    # Splitting the dataset in training and test data  
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)  
      
    # Training the model using the Support Vector Classification class of sklearn  
    svc = SVC()  
    svc.fit(X_train, Y_train)  
      
    # Computing the accuracy_score of the model  
    Y_pred = svc.predict(X_test)  
    score = accuracy_score(Y_test, Y_pred)  
    print(score)  

0.9777777777777777


In [None]:
with open("prueba_svc.pkl", "wb") as f:
    pickle.dump(svc, f)