In [18]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score

import warnings

# Notebook-wide display settings: lift pandas' truncation limits so frames are
# rendered with every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Silence all warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

In [20]:
import mlflow
from mlflow.models import infer_signature
# Address of the MLflow tracking server that subsequent mlflow calls talk to.
TRACKING_URI = "http://Mlflow:5000"
mlflow.set_tracking_uri(TRACKING_URI)

In [3]:
import os

# Connection settings for the S3 endpoint MLflow uses for artifacts.
# setdefault() keeps any value that is already exported in the process
# environment instead of overwriting it with the literals below.
# SECURITY: the fallback key pair is a hardcoded development default. Supply
# real credentials through the environment; do not commit them to the notebook.
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "http://Minio:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'supersecret')

In [10]:
# Read the training CSV, normalise the headers to lower snake_case, then narrow
# the frame to the four predictor columns plus the `cover_type` column.
df = (
    pd.read_csv('covertype_train.csv', sep=',', decimal='.', header=0, encoding='utf-8')
    .rename(columns=lambda name: name.replace(' ', '_').lower())
    .loc[:, [
        'elevation',
        'horizontal_distance_to_roadways',
        'hillshade_9am',
        'horizontal_distance_to_fire_points',
        'cover_type',
    ]]
)
df.head()

Unnamed: 0,elevation,horizontal_distance_to_roadways,hillshade_9am,horizontal_distance_to_fire_points,cover_type
0,2991,1015,233,1570,1
1,2876,2495,192,1557,1
2,3171,4374,213,1052,0
3,3087,4774,193,752,0
4,2835,3596,231,3280,1


In [11]:
# Split features from the `cover_type` label and hold out half of the rows for
# evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='cover_type'),
    df['cover_type'],
    test_size=0.5,
    random_state=1,
)

In [22]:
EXPERIMENT_NAME = "Cover-Survived-Classifier-Experiment"

# Make this the active experiment for the session.
mlflow.set_experiment(EXPERIMENT_NAME)

# Look the experiment up by name and keep its fields as a plain dict, so the
# id can be handed to mlflow.start_run() later.
experiment_record = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
current_experiment = dict(experiment_record)
experiment_id = current_experiment['experiment_id']

2024/05/05 13:55:08 INFO mlflow.tracking.fluent: Experiment with name 'Cover-Survived-Classifier-Experiment' does not exist. Creating a new experiment.


In [23]:
model_name = 'Decision Tree'
RUN_NAME = f'Cover Classifier Experiment {model_name}'
params = {'max_depth':3, 'min_samples_split':2}
# NOTE(review): no random_state is passed to the tree, so repeated runs are not
# guaranteed to produce an identical model -- consider adding one to `params`.

# Train, evaluate, log and register the classifier inside a single MLflow run.
# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() call is needed.
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME):

    model = decision_tree = DecisionTreeClassifier(**params)

    model.fit(X_train, y_train)  # Train model
    predictions = model.predict(X_test)  # Predictions on the held-out split

    # Evaluation metrics on the held-out split
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average='weighted')

    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the evaluation metrics (accuracy and weighted F1)
    mlflow.log_metric(f"{model_name}_accuracy", accuracy)
    mlflow.log_metric(f"{model_name}_f1", f1)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", f"{model_name} model for Cover")

    # A few rows are enough to document the input schema. Passing the whole
    # training split as `input_example` would store all of it as a run artifact.
    input_example = X_train.head(5)

    # Infer the model signature from the same small sample
    signature = infer_signature(input_example, model.predict(input_example))

    # Log the model and register it as a new version in the model registry
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path=f"cover_{model_name}_model",
        signature=signature,
        input_example=input_example,
        registered_model_name=f"tracking-cover-{model_name}",
    )

Registered model 'tracking-cover-Decision Tree' already exists. Creating a new version of this model...
2024/05/05 13:55:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-cover-Decision Tree, version 2
Created version '2' of model 'tracking-cover-Decision Tree'.


In [None]:
from mlflow import MlflowClient

# Attach a task=classification tag to the registered model.
client = MlflowClient()
client.set_registered_model_tag(
    name="tracking-cover-Decision Tree",
    key="task",
    value="classification",
)

In [24]:
# Load a specific registered version of the model as a generic pyfunc model.
# NOTE(review): the version is pinned to 1, while the recorded output of the
# training cell shows version 2 being created -- confirm the pin is intended.
model_name = "tracking-cover-Decision Tree"
model_version = 1

registry_uri = f"models:/{model_name}/{model_version}"
lr = mlflow.pyfunc.load_model(model_uri=registry_uri)

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

2024/05/05 13:55:48 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false


In [25]:
# Score the held-out rows with the model loaded from the registry and show the result.
test_predictions = lr.predict(X_test)
test_predictions

array([0, 2, 0, ..., 0, 1, 1])

In [None]:
# Score one hand-entered observation with the loaded model.
# user_input = [elevation, horizontal_distance_to_roadways, hillshade_9am, horizontal_distance_to_fire_points]
user_input = [2998, 319, 233, 955]

# Build the single-row frame directly. Going through DataFrame(user_input).T
# stores every value in one column first, so a mixed int/float input would
# upcast all features to float; a row-wise build keeps each column's own dtype.
df_pred = pd.DataFrame(
    [user_input],
    columns=['elevation', 'horizontal_distance_to_roadways', 'hillshade_9am',
             'horizontal_distance_to_fire_points'],
)
out_model = lr.predict(df_pred)
out_model

In [None]:
# One observation, with values listed in the same order as `columns` below.
user_input = [2998, 319, 233, 955]
columns = [
    'elevation',
    'horizontal_distance_to_roadways',
    'hillshade_9am',
    'horizontal_distance_to_fire_points',
]

# Wrap the values in a one-row frame and run it through the loaded model.
df_pred = pd.DataFrame(data=[user_input], columns=columns)
out_model = lr.predict(df_pred)
out_model

In [26]:
# Prints a fixed marker string. Presumably a "ran to the end" check left over
# from development -- confirm it is still needed, otherwise remove the cell.
print('ok_')

ok_
