In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# MLflow for tracking
import mlflow
import mlflow.sklearn

# Set plotting style
# Order matters: the seaborn call runs second, so wherever both calls touch the
# same matplotlib settings, the seaborn "whitegrid" values are the ones in effect.
# NOTE(review): seaborn's set() applies its own theme defaults as well, so much of
# the ggplot look is probably overridden — confirm which style is intended.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# ## Loading the Iris Dataset
# 
# The Iris dataset is a classic dataset for classification tasks. It includes 3 classes (species of iris flowers) with 50 samples each, and 4 features (sepal length, sepal width, petal length, petal width).

# Fetch the bundled Iris data and unpack the pieces used throughout the notebook
iris = load_iris()
X, y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Assemble a single DataFrame: the measurement columns, the integer class label,
# and the species name looked up from that label
data = pd.DataFrame(X, columns=feature_names).assign(
    target=y,
    target_name=[target_names[label] for label in y],
)

# Report the frame's dimensions and preview its first rows
print(f"Dataset shape: {data.shape}")
print(data.head())

# ## Exploratory Data Analysis
# 
# Let's explore the dataset to understand its structure and characteristics.

# Descriptive statistics (count, mean, std, quartiles, ...) under a heading line
print("\nSummary Statistics:", data.describe(), sep="\n")

# Bar chart of how many samples each species has, written to disk and then closed
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='target_name', data=data, ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Species')
ax.set_ylabel('Count')
# Tilt the species labels on the x axis
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('class_distribution.png')
plt.close(fig)

# Pairplot to visualize relationships between features.
# sns.pairplot is a figure-level function: it creates and returns its own grid/figure,
# so no plt.figure() call precedes it. (A preceding plt.figure() only produced an
# empty, never-closed figure that was displayed as "<Figure ... with 0 Axes>".)
# The overall size of the plot is governed by pairplot's own height/aspect arguments.
pair_grid = sns.pairplot(data, hue='target_name', vars=feature_names)

# Title the grid's own figure explicitly instead of relying on "current figure" state.
pair_grid.figure.suptitle('Pairplot of Iris Features by Species', y=1.02)

# y=1.02 places the title above the figure canvas; a tight bounding box keeps it
# inside the saved image instead of being cropped.
pair_grid.figure.savefig('pairplot.png', bbox_inches='tight')

# Close exactly the figure that was created here.
plt.close(pair_grid.figure)

# One box plot per feature, grouped by species, laid out on a 2x2 grid
fig = plt.figure(figsize=(14, 10))
for position, feature in enumerate(feature_names, start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.boxplot(x='target_name', y=feature, data=data, ax=ax)
    ax.set_title(f'Box Plot of {feature} by Species')
    ax.set_xlabel('Species')
    # Tilt the species labels on the x axis
    plt.setp(ax.get_xticklabels(), rotation=45)

# Tidy the spacing between panels, write the figure to disk, then release it
fig.tight_layout()
fig.savefig('boxplots.png')
plt.close(fig)

# Pairwise correlations of every column except the species-name strings,
# drawn as an annotated heatmap and saved to disk
correlations = data.drop(columns=['target_name']).corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
fig.savefig('correlation_heatmap.png')
plt.close(fig)

# ## Data Preparation
# 
# Now, let's prepare the data for modeling by splitting it into training and test sets.

# Features are every column except the two label columns; the target is the integer label
label_columns = ['target', 'target_name']
X = data.drop(columns=label_columns)
y = data['target']

# Hold out 20% of the rows for testing, stratified by class, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the dimensions of both partitions
print(f"\nTraining set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# ## Model Training
# 
# Let's train a Random Forest model on the Iris dataset.

# Hyperparameters are kept in a dict so the exact same values can be reused later
# (they are passed to MLflow's log_params further down)
model_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# Build the random forest from those settings and fit it on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
model = RandomForestClassifier(**model_params).fit(X_train, y_train)

# ## Model Evaluation
# 
# Now, let's evaluate our model's performance on the test set.

# Predict the held-out samples
y_pred = model.predict(X_test)

# Accuracy, followed by class-weighted precision / recall / F1 (same key order as before)
weighted_scorers = {
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
metrics = {'accuracy': accuracy_score(y_test, y_pred)}
metrics.update(
    (label, scorer(y_test, y_pred, average='weighted'))
    for label, scorer in weighted_scorers.items()
)

# Print each metric to four decimal places
print("\nModel Performance Metrics:")
for label, score in metrics.items():
    print(f"{label}: {score:.4f}")

# Confusion matrix as an annotated heatmap, with species names on both axes
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.savefig('confusion_matrix.png')
plt.close(fig)

# Per-class precision / recall / F1 table
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# ## Feature Importance
# 
# Let's analyze the importance of each feature in our model.

# Pair each feature name with the forest's importance score, most important first
importance_table = pd.DataFrame({
    'feature': feature_names,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:", feature_importance, sep="\n")

# Horizontal bar chart of the ranked importances, saved to disk
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.savefig('feature_importance.png')
plt.close(fig)

# ## Tracking with MLflow
# 
# Now, let's demonstrate how to track our model, parameters, and metrics with MLflow.

try:
    # To log to a tracking server instead of the default location, uncomment:
    # mlflow.set_tracking_uri("http://localhost:5000")

    # Select the experiment that the run below is recorded under
    mlflow.set_experiment("iris-classifier-notebook")

    # Everything logged inside this context belongs to a single run
    with mlflow.start_run() as run:
        # Hyperparameters and evaluation scores
        mlflow.log_params(model_params)
        mlflow.log_metrics(metrics)

        # The fitted estimator itself
        mlflow.sklearn.log_model(model, "model")

        # The importance table, stored as a JSON artifact
        importance_payload = feature_importance.to_dict()
        mlflow.log_dict(importance_payload, "feature_importance.json")

        print(f"\nMLflow Run ID: {run.info.run_id}")
except Exception as tracking_error:
    # Tracking is best-effort: report the problem and let the rest of the script run
    print(
        f"\nMLflow tracking failed: {tracking_error}",
        "Continuing without MLflow tracking...",
        sep="\n",
    )

# ## Model Prediction Example
# 
# Finally, let's demonstrate how to use the model to make predictions on new data.

# Example new data points (one hand-picked measurement row per species)
new_data = np.array([
    [5.1, 3.5, 1.4, 0.2],  # Example of Iris setosa
    [6.7, 3.1, 4.7, 1.5],  # Example of Iris versicolor
    [6.3, 3.3, 6.0, 2.5]   # Example of Iris virginica
])

# The model was fitted on a DataFrame, so it remembers the training column names.
# Predicting on a bare ndarray makes scikit-learn warn that X has no valid feature
# names; wrapping the rows in a frame with the training columns avoids that and
# guarantees the values are matched to the right features.
new_data_frame = pd.DataFrame(new_data, columns=X_train.columns)

# Make predictions: hard class labels plus per-class probabilities
predictions = model.predict(new_data_frame)
prediction_probabilities = model.predict_proba(new_data_frame)

# Display results, one block per example
print("\nPrediction Examples:")
for i, (features, pred, probabilities) in enumerate(
    zip(new_data, predictions, prediction_probabilities)
):
    print(f"Example {i+1}:")
    print(f"  Features: {features}")
    print(f"  Predicted species: {target_names[pred]}")
    print(f"  Probabilities: {probabilities}")
    print("")

# ## Conclusion
# 
# In this script, we explored the Iris dataset, trained a Random Forest classifier, evaluated its performance, and tracked the results with MLflow. This workflow mirrors the pipeline used in the LocalFlow-ML project, demonstrating how ML models can be developed, trained, and tracked in an MLOps environment.
# 
# Key insights:
# - The Iris dataset features are strong predictors for species classification
# - The Random Forest model achieves high accuracy on this dataset
# - Feature importance analysis shows which measurements are most predictive
# - MLflow provides effective tracking and versioning of models and experiments

# Final status line. The PNG files written by this script are class_distribution.png,
# pairplot.png, boxplots.png, correlation_heatmap.png, confusion_matrix.png and
# feature_importance.png (each saved via savefig in the sections above).
print("\nExploration complete. All visualizations have been saved as PNG files.") 

Dataset shape: (150, 6)
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target target_name  
0       0      setosa  
1       0      setosa  
2       0      setosa  
3       0      setosa  
4       0      setosa  

Summary Statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%        

2025/04/07 20:07:54 INFO mlflow.tracking.fluent: Experiment with name 'iris-classifier-notebook' does not exist. Creating a new experiment.



MLflow Run ID: b2190af14ca24de6a5c9a167ae84b8f3

Prediction Examples:
Example 1:
  Features: [5.1 3.5 1.4 0.2]
  Predicted species: setosa
  Probabilities: [1. 0. 0.]

Example 2:
  Features: [6.7 3.1 4.7 1.5]
  Predicted species: versicolor
  Probabilities: [0. 1. 0.]

Example 3:
  Features: [6.3 3.3 6.  2.5]
  Predicted species: virginica
  Probabilities: [0. 0. 1.]


Exploration complete. All visualizations have been saved as PNG files.




<Figure size 1200x1000 with 0 Axes>

In [5]:
%pip install mlflow



Collecting mlflow
  Using cached mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Using cached mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting pyarrow<20,>=4.0.0 (from mlflow)
  Downloading pyarrow-19.0.1-cp39-cp39-win_amd64.whl.metadata (3.4 kB)
Collecting sqlalchemy<3,>=1.4.0 (from mlflow)
  Downloading sqlalchemy-2.0.40-cp39-cp39-win_amd64.whl.metadata (9.9 kB)
Collecting waitress<4 (from mlflow)
  Using cached waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny=