In [1]:
import pandas as pd

def load_data(file_path):
    """Read the CSV file at ``file_path`` and return its contents as a DataFrame."""
    return pd.read_csv(file_path)

# Location of the cleaned literacy-rate CSV (a relative path).
file_path = 'data/literacy_rates_clean.csv'
# Load the file once; `df` is the frame the following cells work on.
df = load_data(file_path)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,Region,Country,Year,Age,Gender,Literacy rate
0,Central and Southern Asia,Afghanistan,2011,15+,female,0.176121
1,Central and Southern Asia,Afghanistan,2011,15+,male,0.454171
2,Central and Southern Asia,Afghanistan,2011,15-24,female,0.321132
3,Central and Southern Asia,Afghanistan,2011,15-24,male,0.618791
4,Central and Southern Asia,Afghanistan,2011,25-64,female,0.084128


In [2]:
def eda(df):
    """Print a quick data-quality report for ``df``.

    The report consists of the ``DataFrame.info()`` summary, a separator
    line, the number of fully duplicated rows and the per-column count of
    missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    duplicates = df.duplicated().sum()
    missing_values = df.isnull().sum()

    # DataFrame.info() writes its summary itself and returns None, so it is
    # called only for that side effect. Printing its return value would add a
    # stray "None" line to the report.
    df.info()

    print('--' * 20)
    print(f"Number of duplicates: {duplicates}")
    print(f"\nNumber of missing values: {missing_values}")
    
# Print the data-quality report for the loaded frame.
eda(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3303 entries, 0 to 3302
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Region         3303 non-null   object 
 1   Country        3303 non-null   object 
 2   Year           3303 non-null   int64  
 3   Age            3303 non-null   object 
 4   Gender         3303 non-null   object 
 5   Literacy rate  3303 non-null   float64
dtypes: float64(1), int64(1), object(4)
memory usage: 155.0+ KB
None
----------------------------------------
Number of duplicates: 0

Number of missing values: Region           0
Country          0
Year             0
Age              0
Gender           0
Literacy rate    0
dtype: int64


In [3]:
from sklearn.model_selection import train_test_split

# Determine the target variable
y = df['Literacy rate']
# Every remaining column of `df` is used as a feature.
X = df.drop(columns='Literacy rate')

# Split the data into training and testing sets
# (20% of the rows are held out for testing; the fixed random_state makes the
# split repeatable between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

# Select categorical and numerical columns
# (selection is by dtype: numeric columns vs. object columns).
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_include='object')

# Instantiate the transformers
# These module-level instances are the ones wired into the pipelines built by
# `preprocessing_pipeline` below.
scaler = StandardScaler()
# Dense output is requested; handle_unknown='ignore' is set so that categories
# not seen while fitting do not raise at transform time (per the scikit-learn
# OneHotEncoder documentation).
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
imputer = KNNImputer(n_neighbors=2)

# Create preprocessing pipeline
def preprocessing_pipeline():
    """Assemble the column-wise preprocessing step for the feature frame.

    Columns picked by ``num_cols`` go through ``imputer`` and then ``scaler``;
    columns picked by ``cat_cols`` go through ``encoder``. All five objects
    are the module-level instances defined before this function.

    Returns
    -------
    ColumnTransformer
        Transformer with two branches named ``'numeric'`` and
        ``'categorical'``; it is returned without being fitted here.
    """
    numeric_branch = Pipeline(steps=[('imputer', imputer), ('scaler', scaler)])
    categorical_branch = Pipeline(steps=[('encoder', encoder)])

    return ColumnTransformer(transformers=[
        ('numeric', numeric_branch, num_cols),
        ('categorical', categorical_branch, cat_cols),
    ])

# Apply the preprocessing pipeline
preprocessor = preprocessing_pipeline()
# Fit on the training split only, then reuse the fitted transformer on the
# test split so that no statistics are learned from the test rows.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [5]:
import warnings
from lazypredict.Supervised import LazyRegressor

# NOTE(review): this installs a global "ignore" filter, so warnings stay
# silenced for the cells that run after this one as well.
warnings.filterwarnings('ignore')

# Instantiate the model
lazy_model = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

# Fit the model
# The raw splits (X_train / X_test) are passed here, not the *_transformed
# arrays produced by the preprocessing cell.
# NOTE(review): `models` and `predictions` are reassigned by later cells, so
# inspect them before running those cells.
models, predictions = lazy_model.fit(X_train, X_test, y_train, y_test)

# Print the top 5 models
# (sorted by adjusted R-squared, best first; head() keeps the first five rows).
top_5 = models.sort_values('Adjusted R-Squared', ascending=False).head()
top_5

 17%|█▋        | 7/42 [00:00<00:01, 27.17it/s]  File "c:\Users\erwin\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\erwin\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\erwin\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1024, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\erwin\AppData\Local\Programs\Python\Python311\Lib\subprocess.py", line 1493, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
100%|██████████| 42/42 [00:04<00:00,  8.76it/s]

XGBRegressor model failed to execute
'super' object has no attribute '__sklearn_tags__'
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 2642, number of used features: 15
[LightGBM] [Info] Start training from score 0.816639





Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,0.87,0.87,0.08,0.1
HistGradientBoostingRegressor,0.87,0.87,0.08,0.4
RandomForestRegressor,0.79,0.79,0.1,0.53
BaggingRegressor,0.75,0.76,0.11,0.08
KNeighborsRegressor,0.75,0.75,0.11,0.03


In [6]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate models
def evaluate_models(y_true, y_pred):
    """Compute, print and return four regression metrics.

    Parameters
    ----------
    y_true
        Observed target values.
    y_pred
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` in that order, where ``rmse`` is the square
        root of ``mse``. Each value is also printed rounded to two decimals.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # One labelled line per metric, in the same order as the returned tuple.
    report = (
        ('Mean Absolute Error', mae),
        ('Mean Squared Error', mse),
        ('Root Mean Squared Error', rmse),
        ('R-Squared', r2),
    )
    for label, value in report:
        print(f'{label}: {value:.2f}')

    return mae, mse, rmse, r2

In [7]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Run models
# Baseline estimators with default hyper-parameters (the forest is seeded so
# its result is repeatable).
models = [
    XGBRegressor(),
    RandomForestRegressor(random_state=42)
]

# Fit each estimator on the preprocessed training split and report its
# metrics on the preprocessed test split.
for estimator in models:
    # Print the heading first so each metric block is labelled by the model it
    # belongs to. The loop variable holds the estimator object itself, hence
    # the name `estimator` rather than `model_name`.
    print(f'Model: {estimator.__class__.__name__}')

    estimator.fit(X_train_transformed, y_train)
    y_pred = estimator.predict(X_test_transformed)
    evaluate_models(y_test, y_pred)

    print('--' * 20)

Mean Absolute Error: 0.04
Mean Squared Error: 0.00
Root Mean Squared Error: 0.06
R-Squared: 0.92
Model: XGBRegressor
----------------------------------------
Mean Absolute Error: 0.04
Mean Squared Error: 0.01
Root Mean Squared Error: 0.08
R-Squared: 0.88
Model: RandomForestRegressor
----------------------------------------


In [8]:
from dotenv import load_dotenv
import os
import dagshub
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from mlflow.tracking import MlflowClient

# Load environment variables
# (python-dotenv; presumably reads a local `.env` file — confirm it defines
# the two DAGSHUB_* variables used below).
load_dotenv()

# Initialize dagshub
# NOTE(review): os.getenv returns None for an unset variable, so a missing
# DAGSHUB_REPO_OWNER / DAGSHUB_REPO_NAME is passed to dagshub.init as None.
dagshub.init(
    repo_owner=os.getenv('DAGSHUB_REPO_OWNER'),
    repo_name=os.getenv('DAGSHUB_REPO_NAME'),
    mlflow=True
)

# Set up MLflow tracking
# Runs started in later cells are recorded under the 'literacy-rate' experiment.
mlflow.set_experiment('literacy-rate')



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=0289778a-278a-460b-b243-81dcf9f7b26f&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=2e8c64040ba09d450850a18844b2aa3db87b24a7ec841668bf99f4e35e7dfd24




<Experiment: artifact_location='mlflow-artifacts:/01dd4db9f9664444976b43ab86f9165c', creation_time=1733667369822, experiment_id='0', last_update_time=1733667369822, lifecycle_stage='active', name='literacy-rate', tags={}>

In [9]:
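# # NOTE(review): this commented-out cell and the RandomForestRegressor cell after it
# # duplicate what `train_and_log_model` (defined in a later cell) now does; kept for reference only.
# # Train and track models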
# with mlflow.start_run(run_name='XGBRegressor'):
    
#     # Log parameters
#     mlflow.log_param('test_size', 0.2)
#     mlflow.log_param('random_state', 42)

#     # Train model
#     xgb_regressor = XGBRegressor()
#     xgb_regressor.fit(X_train_transformed, y_train)

#     # Make predictions and evaluate
#     y_pred = xgb_regressor.predict(X_test_transformed)
#     mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

#     # Log metrics
#     mlflow.log_metrics({
#         'Mean Absolute Error': mae,
#         'Mean Squared Error': mse,
#         'Root Mean Squared Error': rmse,
#         'R-Squared': r2
#     })

#     # Create signature for the model
#     signature = mlflow.models.infer_signature(
#         X_train_transformed,  # input example
#         y_train              # output example
#     )

#     # Log the model with signature and input example
#     mlflow.xgboost.log_model(
#         xgb_regressor, 
#         "model",
#         signature=signature,
#         input_example=X_train_transformed[:5]  # First 5 rows as example
#     )

In [10]:
# with mlflow.start_run(run_name='RandomForestRegressor'):

#     # Log parameters
#     mlflow.log_param('test_size', 0.2)
#     mlflow.log_param('random_state', 42)

#     # Train model
#     rf_regressor = RandomForestRegressor(random_state=42)
#     rf_regressor.fit(X_train_transformed, y_train)

#     # Make predictions
#     y_pred = rf_regressor.predict(X_test_transformed)
#     mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

#     # Log metrics
#     mlflow.log_metrics({
#         'Mean Absolute Error': mae,
#         'Mean Squared Error': mse,
#         'Root Mean Squared Error': rmse,
#         'R-Squared': r2
#     })

#     # Create signature for the model
#     signature = mlflow.models.infer_signature(
#         X_train_transformed,
#         y_train
#     )

#     # Log the model with signature and input example
#     mlflow.sklearn.log_model(
#         rf_regressor, 
#         "model",
#         signature=signature,
#         input_example=X_train_transformed[:5]  # First 5 rows as example
#     )

In [None]:
# List of models and params
# One dict per model to train. The keys mirror keyword arguments of
# `train_and_log_model`:
#   model_class   - estimator class, instantiated with `params`
#   model_name    - used as the MLflow run name
#   mlflow_module - MLflow flavor module whose `log_model` stores the model
#   params        - keyword arguments for the estimator constructor
models_config = [
    {
        'model_class': XGBRegressor,
        'model_name': 'XGBRegressor',
        'mlflow_module': mlflow.xgboost,
        'params': {}  # Add XGBoost specific parameters here
    },
    {
        'model_class': RandomForestRegressor,
        'model_name': 'RandomForestRegressor',
        'mlflow_module': mlflow.sklearn,
        'params': {'random_state': 42}
    }
]

# Function to train and log a model
def train_and_log_model(
    model_class,
    model_name,
    X_train,
    X_test,
    y_train,
    y_test,
    params=None,
    mlflow_module=None
):
    """Train one model inside its own MLflow run and log it.

    Within a run named ``model_name`` this logs the split settings, the
    model's constructor parameters, the evaluation metrics returned by
    ``evaluate_models`` and the fitted model itself (with a signature and an
    input example).

    Parameters
    ----------
    model_class
        Estimator class; called as ``model_class(**params)`` and expected to
        provide ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Name given to the MLflow run.
    X_train, X_test
        Feature matrices for fitting and evaluation. ``X_train[:5]`` is
        stored as the model's input example.
    y_train, y_test
        Targets for fitting and evaluation.
    params : dict, optional
        Constructor keyword arguments for ``model_class``; ``None`` means no
        arguments.
    mlflow_module
        MLflow flavor module (for example ``mlflow.sklearn`` or
        ``mlflow.xgboost``) whose ``log_model`` is used to store the model.

    Returns
    -------
    tuple
        ``(model, y_pred)``: the fitted estimator and its predictions for
        ``X_test``.

    Raises
    ------
    ValueError
        If ``mlflow_module`` is not given. This is checked before the run
        starts, so no half-logged run is left behind.
    """
    # Fail fast: without a flavor module the model could only fail to log
    # after training and metric logging had already happened.
    if mlflow_module is None:
        raise ValueError(
            "mlflow_module is required (e.g. mlflow.sklearn or mlflow.xgboost)"
        )

    model_params = dict(params or {})

    with mlflow.start_run(run_name=model_name):
        # Log common parameters
        # (hard-coded to mirror the values used in the train/test split cell).
        mlflow.log_param('test_size', 0.2)
        mlflow.log_param('random_state', 42)

        # Log the model's own hyper-parameters. The prefix keeps them from
        # colliding with the split parameters logged above.
        if model_params:
            mlflow.log_params(
                {f'model__{key}': value for key, value in model_params.items()}
            )

        # Initialize and train model
        model = model_class(**model_params)
        model.fit(X_train, y_train)

        # Make predictions and evaluate
        y_pred = model.predict(X_test)
        mae, mse, rmse, r2 = evaluate_models(y_test, y_pred)

        # Log metrics
        mlflow.log_metrics({
            'Mean Absolute Error': mae,
            'Mean Squared Error': mse,
            'Root Mean Squared Error': rmse,
            'R-Squared': r2
        })

        # Infer the signature from real model inputs and outputs. The
        # training labels are not what the model returns, so using them would
        # describe the wrong output schema.
        signature = mlflow.models.infer_signature(X_test, y_pred)

        mlflow_module.log_model(
            model,
            "model",
            signature=signature,
            input_example=X_train[:5]
        )

        return model, y_pred

# Train all models
# Each config dict carries exactly the model-specific keyword arguments
# (model_class, model_name, mlflow_module, params), so it is unpacked straight
# into the call next to the shared data splits.
for config in models_config:
    model, predictions = train_and_log_model(
        X_train=X_train_transformed,
        X_test=X_test_transformed,
        y_train=y_train,
        y_test=y_test,
        **config
    )

Mean Absolute Error: 0.04
Mean Squared Error: 0.00
Root Mean Squared Error: 0.06
R-Squared: 0.92


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1730.94it/s]
  "inputs": [
    [
      0.38363884943889986,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      1.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
  

🏃 View run XGBRegressor at: https://dagshub.com/erwincarlogonzales/mlflow-basics.mlflow/#/experiments/0/runs/356612a471b347ec9f138fcfe6cbf756
🧪 View experiment at: https://dagshub.com/erwincarlogonzales/mlflow-basics.mlflow/#/experiments/0
Mean Absolute Error: 0.04
Mean Squared Error: 0.01
Root Mean Squared Error: 0.08
R-Squared: 0.88


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 467.73it/s] 


🏃 View run RandomForestRegressor at: https://dagshub.com/erwincarlogonzales/mlflow-basics.mlflow/#/experiments/0/runs/cc357a3dc68944e49de659f5c1e43c8c
🧪 View experiment at: https://dagshub.com/erwincarlogonzales/mlflow-basics.mlflow/#/experiments/0


In [12]:
from mlflow.tracking import MlflowClient

# Model registry
def register_model(run_id, model_name):
    """Register the ``model`` artifact of run ``run_id`` under ``model_name``.

    Returns whatever ``mlflow.register_model`` returns for the URI
    ``runs:/<run_id>/model``.
    """
    return mlflow.register_model(
        model_uri='runs:/{}/model'.format(run_id),
        name=model_name,
    )

# Promote model
def promote_challenger_to_production(model_name, prod_name):
    """Copy the ``challenger``-aliased version of ``model_name`` into ``prod_name``.

    The source is addressed as ``models:/<model_name>@challenger``. Only the
    copy is performed here; no alias is assigned on the destination model and
    nothing is returned.
    """
    challenger_uri = "models:/{}@challenger".format(model_name)
    MlflowClient().copy_model_version(
        src_model_uri=challenger_uri,
        dst_name=prod_name,
    )

# Retrieve production model
def get_production_champion(prod_name):
    """Load the ``champion``-aliased version of the model ``prod_name``.

    The model is addressed as ``models:/<prod_name>@champion`` and is always
    loaded through the XGBoost flavor (``mlflow.xgboost.load_model``).
    """
    champion_uri = "models:/{}@champion".format(prod_name)
    champion_model = mlflow.xgboost.load_model(champion_uri)
    return champion_model

In [13]:
# NOTE(review): the run id is hard-coded; replace it with the id of the run
# whose logged model should be registered before re-running this cell.
run_id = '26bbc91b56a542c38d5b5afe473601ff' # Get this from MLflow UI
# Registered-model name for the candidate model.
model_name = 'XGBRegressor'
# Registered-model name that promoted versions are copied into.
prod_name = 'literacy-rate-production'

# Register model
# (registers the run's `model` artifact under `model_name` and reports the
# version of the returned model entry).
model_details = register_model(run_id, model_name)
print(f'Registered model version: {model_details.version}')

Registered model 'XGBRegressor' already exists. Creating a new version of this model...
2024/12/28 04:58:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBRegressor, version 5


Registered model version: 5


Created version '5' of model 'XGBRegressor'.


In [14]:
# Promote challenger to production.
# Prerequisite: a version of `model_name` must already carry the `challenger`
# alias (e.g. assigned in the MLflow UI), because the promotion resolves
# `models:/<model_name>@challenger`.
promote_challenger_to_production(model_name, prod_name)

Registered model 'literacy-rate-production' already exists. Creating a new version of this model...
Copied version '1' of model 'XGBRegressor' to version '5' of model 'literacy-rate-production'.
